diff options
author | John Baldwin <jhb@FreeBSD.org> | 2009-06-18 17:25:38 +0000 |
---|---|---|
committer | John Baldwin <jhb@FreeBSD.org> | 2009-06-18 17:25:38 +0000 |
commit | d09b8b92d699d636c73c787b33137c6c0141f706 (patch) | |
tree | 02d9cd2862c5d03a77e3954d049ed56be9a62329 | |
parent | 35d0e300ef56f48353cd4386a0830601ce4e82e1 (diff) |
Import the lockstat(1) sources from OpenSolaris as of 20080410.vendor/opensolaris/20080410b
Notes
Notes:
svn path=/vendor/opensolaris/dist/; revision=194450
svn path=/vendor/opensolaris/20080410b/; revision=194451; tag=vendor/opensolaris/20080410b
-rw-r--r-- | cmd/lockstat/lockstat.c | 1858 | ||||
-rw-r--r-- | cmd/lockstat/sym.c | 253 |
2 files changed, 2111 insertions, 0 deletions
diff --git a/cmd/lockstat/lockstat.c b/cmd/lockstat/lockstat.c new file mode 100644 index 000000000000..4c4cf50b8e08 --- /dev/null +++ b/cmd/lockstat/lockstat.c @@ -0,0 +1,1858 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <stdio.h> +#include <stddef.h> +#include <stdlib.h> +#include <stdarg.h> +#include <string.h> +#include <strings.h> +#include <ctype.h> +#include <fcntl.h> +#include <unistd.h> +#include <errno.h> +#include <limits.h> +#include <sys/types.h> +#include <sys/modctl.h> +#include <sys/stat.h> +#include <sys/wait.h> +#include <dtrace.h> +#include <sys/lockstat.h> +#include <alloca.h> +#include <signal.h> +#include <assert.h> + +#define LOCKSTAT_OPTSTR "x:bths:n:d:i:l:f:e:ckwWgCHEATID:RpPo:V" + +#define LS_MAX_STACK_DEPTH 50 +#define LS_MAX_EVENTS 64 + +typedef struct lsrec { + struct lsrec *ls_next; /* next in hash chain */ + uintptr_t ls_lock; /* lock address */ + uintptr_t ls_caller; /* caller address */ + uint32_t ls_count; /* cumulative event count */ + uint32_t ls_event; /* type of event */ + uintptr_t ls_refcnt; /* cumulative reference count */ + uint64_t ls_time; /* cumulative event duration */ + uint32_t ls_hist[64]; /* log2(duration) histogram */ + uintptr_t ls_stack[LS_MAX_STACK_DEPTH]; +} lsrec_t; + +typedef struct lsdata { + struct lsrec *lsd_next; /* next available */ + int lsd_count; /* number of records */ +} lsdata_t; + +/* + * Definitions for the types of experiments which can be run. They are + * listed in increasing order of memory cost and processing time cost. + * The numerical value of each type is the number of bytes needed per record. + */ +#define LS_BASIC offsetof(lsrec_t, ls_time) +#define LS_TIME offsetof(lsrec_t, ls_hist[0]) +#define LS_HIST offsetof(lsrec_t, ls_stack[0]) +#define LS_STACK(depth) offsetof(lsrec_t, ls_stack[depth]) + +static void report_stats(FILE *, lsrec_t **, size_t, uint64_t, uint64_t); +static void report_trace(FILE *, lsrec_t **); + +extern int symtab_init(void); +extern char *addr_to_sym(uintptr_t, uintptr_t *, size_t *); +extern uintptr_t sym_to_addr(char *name); +extern size_t sym_size(char *name); +extern char *strtok_r(char *, const char *, char **); + +#define DEFAULT_NRECS 10000 +#define DEFAULT_HZ 97 +#define MAX_HZ 1000 +#define MIN_AGGSIZE (16 * 1024) +#define MAX_AGGSIZE (32 * 1024 * 1024) + +static int g_stkdepth; +static int g_topn = INT_MAX; +static hrtime_t g_elapsed; +static int g_rates = 0; +static int g_pflag = 0; +static int g_Pflag = 0; +static int g_wflag = 0; +static int g_Wflag = 0; +static int g_cflag = 0; +static int g_kflag = 0; +static int g_gflag = 0; +static int g_Vflag = 0; +static int g_tracing = 0; +static size_t g_recsize; +static size_t g_nrecs; +static int g_nrecs_used; +static uchar_t g_enabled[LS_MAX_EVENTS]; +static hrtime_t g_min_duration[LS_MAX_EVENTS]; +static dtrace_hdl_t *g_dtp; +static char *g_predicate; +static char *g_ipredicate; +static char *g_prog; +static int g_proglen; +static int g_dropped; + +typedef struct ls_event_info { + char ev_type; + char ev_lhdr[20]; + char ev_desc[80]; + char ev_units[10]; + char ev_name[DTRACE_NAMELEN]; + char *ev_predicate; + char *ev_acquire; +} ls_event_info_t; + +static ls_event_info_t g_event_info[LS_MAX_EVENTS] = { + { 'C', "Lock", "Adaptive mutex spin", "nsec", + "lockstat:::adaptive-spin" }, + { 'C', "Lock", "Adaptive mutex block", "nsec", + "lockstat:::adaptive-block" }, + { 'C', "Lock", "Spin lock spin", "nsec", + "lockstat:::spin-spin" }, + { 'C', "Lock", "Thread lock spin", "nsec", + "lockstat:::thread-spin" }, + { 'C', "Lock", "R/W writer blocked by writer", "nsec", + "lockstat:::rw-block", "arg2 == 0 && arg3 == 1" }, + { 'C', "Lock", "R/W writer blocked by readers", "nsec", + "lockstat:::rw-block", "arg2 == 0 && arg3 == 0 && arg4" }, + { 'C', "Lock", "R/W reader blocked by writer", "nsec", + "lockstat:::rw-block", "arg2 != 0 && arg3 == 1" }, + { 'C', "Lock", "R/W reader blocked by write wanted", "nsec", + "lockstat:::rw-block", "arg2 != 0 && arg3 == 0 && arg4" }, + { 'C', "Lock", "Unknown event (type 8)", "units" }, + { 'C', "Lock", "Unknown event (type 9)", "units" }, + { 'C', "Lock", "Unknown event (type 10)", "units" }, + { 'C', "Lock", "Unknown event (type 11)", "units" }, + { 'C', "Lock", "Unknown event (type 12)", "units" }, + { 'C', "Lock", "Unknown event (type 13)", "units" }, + { 'C', "Lock", "Unknown event (type 14)", "units" }, + { 'C', "Lock", "Unknown event (type 15)", "units" }, + { 'C', "Lock", "Unknown event (type 16)", "units" }, + { 'C', "Lock", "Unknown event (type 17)", "units" }, + { 'C', "Lock", "Unknown event (type 18)", "units" }, + { 'C', "Lock", "Unknown event (type 19)", "units" }, + { 'C', "Lock", "Unknown event (type 20)", "units" }, + { 'C', "Lock", "Unknown event (type 21)", "units" }, + { 'C', "Lock", "Unknown event (type 22)", "units" }, + { 'C', "Lock", "Unknown event (type 23)", "units" }, + { 'C', "Lock", "Unknown event (type 24)", "units" }, + { 'C', "Lock", "Unknown event (type 25)", "units" }, + { 'C', "Lock", "Unknown event (type 26)", "units" }, + { 'C', "Lock", "Unknown event (type 27)", "units" }, + { 'C', "Lock", "Unknown event (type 28)", "units" }, + { 'C', "Lock", "Unknown event (type 29)", "units" }, + { 'C', "Lock", "Unknown event (type 30)", "units" }, + { 'C', "Lock", "Unknown event (type 31)", "units" }, + { 'H', "Lock", "Adaptive mutex hold", "nsec", + "lockstat:::adaptive-release", NULL, + "lockstat:::adaptive-acquire" }, + { 'H', "Lock", "Spin lock hold", "nsec", + "lockstat:::spin-release", NULL, + "lockstat:::spin-acquire" }, + { 'H', "Lock", "R/W writer hold", "nsec", + "lockstat:::rw-release", "arg1 == 0", + "lockstat:::rw-acquire" }, + { 'H', "Lock", "R/W reader hold", "nsec", + "lockstat:::rw-release", "arg1 != 0", + "lockstat:::rw-acquire" }, + { 'H', "Lock", "Unknown event (type 36)", "units" }, + { 'H', "Lock", "Unknown event (type 37)", "units" }, + { 'H', "Lock", "Unknown event (type 38)", "units" }, + { 'H', "Lock", "Unknown event (type 39)", "units" }, + { 'H', "Lock", "Unknown event (type 40)", "units" }, + { 'H', "Lock", "Unknown event (type 41)", "units" }, + { 'H', "Lock", "Unknown event (type 42)", "units" }, + { 'H', "Lock", "Unknown event (type 43)", "units" }, + { 'H', "Lock", "Unknown event (type 44)", "units" }, + { 'H', "Lock", "Unknown event (type 45)", "units" }, + { 'H', "Lock", "Unknown event (type 46)", "units" }, + { 'H', "Lock", "Unknown event (type 47)", "units" }, + { 'H', "Lock", "Unknown event (type 48)", "units" }, + { 'H', "Lock", "Unknown event (type 49)", "units" }, + { 'H', "Lock", "Unknown event (type 50)", "units" }, + { 'H', "Lock", "Unknown event (type 51)", "units" }, + { 'H', "Lock", "Unknown event (type 52)", "units" }, + { 'H', "Lock", "Unknown event (type 53)", "units" }, + { 'H', "Lock", "Unknown event (type 54)", "units" }, + { 'H', "Lock", "Unknown event (type 55)", "units" }, + { 'I', "CPU+PIL", "Profiling interrupt", "nsec", + "profile:::profile-97", NULL }, + { 'I', "Lock", "Unknown event (type 57)", "units" }, + { 'I', "Lock", "Unknown event (type 58)", "units" }, + { 'I', "Lock", "Unknown event (type 59)", "units" }, + { 'E', "Lock", "Recursive lock entry detected", "(N/A)", + "lockstat:::rw-release", NULL, "lockstat:::rw-acquire" }, + { 'E', "Lock", "Lockstat enter failure", "(N/A)" }, + { 'E', "Lock", "Lockstat exit failure", "nsec" }, + { 'E', "Lock", "Lockstat record failure", "(N/A)" }, +}; + +static void +fail(int do_perror, const char *message, ...) +{ + va_list args; + int save_errno = errno; + + va_start(args, message); + (void) fprintf(stderr, "lockstat: "); + (void) vfprintf(stderr, message, args); + va_end(args); + if (do_perror) + (void) fprintf(stderr, ": %s", strerror(save_errno)); + (void) fprintf(stderr, "\n"); + exit(2); +} + +static void +dfail(const char *message, ...) +{ + va_list args; + + va_start(args, message); + (void) fprintf(stderr, "lockstat: "); + (void) vfprintf(stderr, message, args); + va_end(args); + (void) fprintf(stderr, ": %s\n", + dtrace_errmsg(g_dtp, dtrace_errno(g_dtp))); + + exit(2); +} + +static void +show_events(char event_type, char *desc) +{ + int i, first = -1, last; + + for (i = 0; i < LS_MAX_EVENTS; i++) { + ls_event_info_t *evp = &g_event_info[i]; + if (evp->ev_type != event_type || + strncmp(evp->ev_desc, "Unknown event", 13) == 0) + continue; + if (first == -1) + first = i; + last = i; + } + + (void) fprintf(stderr, + "\n%s events (lockstat -%c or lockstat -e %d-%d):\n\n", + desc, event_type, first, last); + + for (i = first; i <= last; i++) + (void) fprintf(stderr, + "%4d = %s\n", i, g_event_info[i].ev_desc); +} + +static void +usage(void) +{ + (void) fprintf(stderr, + "Usage: lockstat [options] command [args]\n" + "\nEvent selection options:\n\n" + " -C watch contention events [on by default]\n" + " -E watch error events [off by default]\n" + " -H watch hold events [off by default]\n" + " -I watch interrupt events [off by default]\n" + " -A watch all lock events [equivalent to -CH]\n" + " -e event_list only watch the specified events (shown below);\n" + " <event_list> is a comma-separated list of\n" + " events or ranges of events, e.g. 1,4-7,35\n" + " -i rate interrupt rate for -I [default: %d Hz]\n" + "\nData gathering options:\n\n" + " -b basic statistics (lock, caller, event count)\n" + " -t timing for all events [default]\n" + " -h histograms for event times\n" + " -s depth stack traces <depth> deep\n" + " -x opt[=val] enable or modify DTrace options\n" + "\nData filtering options:\n\n" + " -n nrecords maximum number of data records [default: %d]\n" + " -l lock[,size] only watch <lock>, which can be specified as a\n" + " symbolic name or hex address; <size> defaults\n" + " to the ELF symbol size if available, 1 if not\n" + " -f func[,size] only watch events generated by <func>\n" + " -d duration only watch events longer than <duration>\n" + " -T trace (rather than sample) events\n" + "\nData reporting options:\n\n" + " -c coalesce lock data for arrays like pse_mutex[]\n" + " -k coalesce PCs within functions\n" + " -g show total events generated by function\n" + " -w wherever: don't distinguish events by caller\n" + " -W whichever: don't distinguish events by lock\n" + " -R display rates rather than counts\n" + " -p parsable output format (awk(1)-friendly)\n" + " -P sort lock data by (count * avg_time) product\n" + " -D n only display top <n> events of each type\n" + " -o filename send output to <filename>\n", + DEFAULT_HZ, DEFAULT_NRECS); + + show_events('C', "Contention"); + show_events('H', "Hold-time"); + show_events('I', "Interrupt"); + show_events('E', "Error"); + (void) fprintf(stderr, "\n"); + + exit(1); +} + +static int +lockcmp(lsrec_t *a, lsrec_t *b) +{ + int i; + + if (a->ls_event < b->ls_event) + return (-1); + if (a->ls_event > b->ls_event) + return (1); + + for (i = g_stkdepth - 1; i >= 0; i--) { + if (a->ls_stack[i] < b->ls_stack[i]) + return (-1); + if (a->ls_stack[i] > b->ls_stack[i]) + return (1); + } + + if (a->ls_caller < b->ls_caller) + return (-1); + if (a->ls_caller > b->ls_caller) + return (1); + + if (a->ls_lock < b->ls_lock) + return (-1); + if (a->ls_lock > b->ls_lock) + return (1); + + return (0); +} + +static int +countcmp(lsrec_t *a, lsrec_t *b) +{ + if (a->ls_event < b->ls_event) + return (-1); + if (a->ls_event > b->ls_event) + return (1); + + return (b->ls_count - a->ls_count); +} + +static int +timecmp(lsrec_t *a, lsrec_t *b) +{ + if (a->ls_event < b->ls_event) + return (-1); + if (a->ls_event > b->ls_event) + return (1); + + if (a->ls_time < b->ls_time) + return (1); + if (a->ls_time > b->ls_time) + return (-1); + + return (0); +} + +static int +lockcmp_anywhere(lsrec_t *a, lsrec_t *b) +{ + if (a->ls_event < b->ls_event) + return (-1); + if (a->ls_event > b->ls_event) + return (1); + + if (a->ls_lock < b->ls_lock) + return (-1); + if (a->ls_lock > b->ls_lock) + return (1); + + return (0); +} + +static int +lock_and_count_cmp_anywhere(lsrec_t *a, lsrec_t *b) +{ + if (a->ls_event < b->ls_event) + return (-1); + if (a->ls_event > b->ls_event) + return (1); + + if (a->ls_lock < b->ls_lock) + return (-1); + if (a->ls_lock > b->ls_lock) + return (1); + + return (b->ls_count - a->ls_count); +} + +static int +sitecmp_anylock(lsrec_t *a, lsrec_t *b) +{ + int i; + + if (a->ls_event < b->ls_event) + return (-1); + if (a->ls_event > b->ls_event) + return (1); + + for (i = g_stkdepth - 1; i >= 0; i--) { + if (a->ls_stack[i] < b->ls_stack[i]) + return (-1); + if (a->ls_stack[i] > b->ls_stack[i]) + return (1); + } + + if (a->ls_caller < b->ls_caller) + return (-1); + if (a->ls_caller > b->ls_caller) + return (1); + + return (0); +} + +static int +site_and_count_cmp_anylock(lsrec_t *a, lsrec_t *b) +{ + int i; + + if (a->ls_event < b->ls_event) + return (-1); + if (a->ls_event > b->ls_event) + return (1); + + for (i = g_stkdepth - 1; i >= 0; i--) { + if (a->ls_stack[i] < b->ls_stack[i]) + return (-1); + if (a->ls_stack[i] > b->ls_stack[i]) + return (1); + } + + if (a->ls_caller < b->ls_caller) + return (-1); + if (a->ls_caller > b->ls_caller) + return (1); + + return (b->ls_count - a->ls_count); +} + +static void +mergesort(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **a, lsrec_t **b, int n) +{ + int m = n / 2; + int i, j; + + if (m > 1) + mergesort(cmp, a, b, m); + if (n - m > 1) + mergesort(cmp, a + m, b + m, n - m); + for (i = m; i > 0; i--) + b[i - 1] = a[i - 1]; + for (j = m - 1; j < n - 1; j++) + b[n + m - j - 2] = a[j + 1]; + while (i < j) + *a++ = cmp(b[i], b[j]) < 0 ? b[i++] : b[j--]; + *a = b[i]; +} + +static void +coalesce(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **lock, int n) +{ + int i, j; + lsrec_t *target, *current; + + target = lock[0]; + + for (i = 1; i < n; i++) { + current = lock[i]; + if (cmp(current, target) != 0) { + target = current; + continue; + } + current->ls_event = LS_MAX_EVENTS; + target->ls_count += current->ls_count; + target->ls_refcnt += current->ls_refcnt; + if (g_recsize < LS_TIME) + continue; + target->ls_time += current->ls_time; + if (g_recsize < LS_HIST) + continue; + for (j = 0; j < 64; j++) + target->ls_hist[j] += current->ls_hist[j]; + } +} + +static void +coalesce_symbol(uintptr_t *addrp) +{ + uintptr_t symoff; + size_t symsize; + + if (addr_to_sym(*addrp, &symoff, &symsize) != NULL && symoff < symsize) + *addrp -= symoff; +} + +static void +predicate_add(char **pred, char *what, char *cmp, uintptr_t value) +{ + char *new; + int len, newlen; + + if (what == NULL) + return; + + if (*pred == NULL) { + *pred = malloc(1); + *pred[0] = '\0'; + } + + len = strlen(*pred); + newlen = len + strlen(what) + 32 + strlen("( && )"); + new = malloc(newlen); + + if (*pred[0] != '\0') { + if (cmp != NULL) { + (void) sprintf(new, "(%s) && (%s %s 0x%p)", + *pred, what, cmp, (void *)value); + } else { + (void) sprintf(new, "(%s) && (%s)", *pred, what); + } + } else { + if (cmp != NULL) { + (void) sprintf(new, "%s %s 0x%p", + what, cmp, (void *)value); + } else { + (void) sprintf(new, "%s", what); + } + } + + free(*pred); + *pred = new; +} + +static void +predicate_destroy(char **pred) +{ + free(*pred); + *pred = NULL; +} + +static void +filter_add(char **filt, char *what, uintptr_t base, uintptr_t size) +{ + char buf[256], *c = buf, *new; + int len, newlen; + + if (*filt == NULL) { + *filt = malloc(1); + *filt[0] = '\0'; + } + + (void) sprintf(c, "%s(%s >= 0x%p && %s < 0x%p)", *filt[0] != '\0' ? + " || " : "", what, (void *)base, what, (void *)(base + size)); + + newlen = (len = strlen(*filt) + 1) + strlen(c); + new = malloc(newlen); + bcopy(*filt, new, len); + (void) strcat(new, c); + free(*filt); + *filt = new; +} + +static void +filter_destroy(char **filt) +{ + free(*filt); + *filt = NULL; +} + +static void +dprog_add(const char *fmt, ...) +{ + va_list args; + int size, offs; + char c; + + va_start(args, fmt); + size = vsnprintf(&c, 1, fmt, args) + 1; + + if (g_proglen == 0) { + offs = 0; + } else { + offs = g_proglen - 1; + } + + g_proglen = offs + size; + + if ((g_prog = realloc(g_prog, g_proglen)) == NULL) + fail(1, "failed to reallocate program text"); + + (void) vsnprintf(&g_prog[offs], size, fmt, args); +} + +/* + * This function may read like an open sewer, but keep in mind that programs + * that generate other programs are rarely pretty. If one has the unenviable + * task of maintaining or -- worse -- extending this code, use the -V option + * to examine the D program as generated by this function. + */ +static void +dprog_addevent(int event) +{ + ls_event_info_t *info = &g_event_info[event]; + char *pred = NULL; + char stack[20]; + const char *arg0, *caller; + char *arg1 = "arg1"; + char buf[80]; + hrtime_t dur; + int depth; + + if (info->ev_name[0] == '\0') + return; + + if (info->ev_type == 'I') { + /* + * For interrupt events, arg0 (normally the lock pointer) is + * the CPU address plus the current pil, and arg1 (normally + * the number of nanoseconds) is the number of nanoseconds + * late -- and it's stored in arg2. + */ + arg0 = "(uintptr_t)curthread->t_cpu + \n" + "\t curthread->t_cpu->cpu_profile_pil"; + caller = "(uintptr_t)arg0"; + arg1 = "arg2"; + } else { + arg0 = "(uintptr_t)arg0"; + caller = "caller"; + } + + if (g_recsize > LS_HIST) { + for (depth = 0; g_recsize > LS_STACK(depth); depth++) + continue; + + if (g_tracing) { + (void) sprintf(stack, "\tstack(%d);\n", depth); + } else { + (void) sprintf(stack, ", stack(%d)", depth); + } + } else { + (void) sprintf(stack, ""); + } + + if (info->ev_acquire != NULL) { + /* + * If this is a hold event, we need to generate an additional + * clause for the acquire; the clause for the release will be + * generated with the aggregating statement, below. + */ + dprog_add("%s\n", info->ev_acquire); + predicate_add(&pred, info->ev_predicate, NULL, 0); + predicate_add(&pred, g_predicate, NULL, 0); + if (pred != NULL) + dprog_add("/%s/\n", pred); + + dprog_add("{\n"); + (void) sprintf(buf, "self->ev%d[(uintptr_t)arg0]", event); + + if (info->ev_type == 'H') { + dprog_add("\t%s = timestamp;\n", buf); + } else { + /* + * If this isn't a hold event, it's the recursive + * error event. For this, we simply bump the + * thread-local, per-lock count. + */ + dprog_add("\t%s++;\n", buf); + } + + dprog_add("}\n\n"); + predicate_destroy(&pred); + pred = NULL; + + if (info->ev_type == 'E') { + /* + * If this is the recursive lock error event, we need + * to generate an additional clause to decrement the + * thread-local, per-lock count. This assures that we + * only execute the aggregating clause if we have + * recursive entry. + */ + dprog_add("%s\n", info->ev_name); + dprog_add("/%s/\n{\n\t%s--;\n}\n\n", buf, buf); + } + + predicate_add(&pred, buf, NULL, 0); + + if (info->ev_type == 'H') { + (void) sprintf(buf, "timestamp -\n\t " + "self->ev%d[(uintptr_t)arg0]", event); + } + + arg1 = buf; + } else { + predicate_add(&pred, info->ev_predicate, NULL, 0); + if (info->ev_type != 'I') + predicate_add(&pred, g_predicate, NULL, 0); + else + predicate_add(&pred, g_ipredicate, NULL, 0); + } + + if ((dur = g_min_duration[event]) != 0) + predicate_add(&pred, arg1, ">=", dur); + + dprog_add("%s\n", info->ev_name); + + if (pred != NULL) + dprog_add("/%s/\n", pred); + predicate_destroy(&pred); + + dprog_add("{\n"); + + if (g_tracing) { + dprog_add("\ttrace(%dULL);\n", event); + dprog_add("\ttrace(%s);\n", arg0); + dprog_add("\ttrace(%s);\n", caller); + dprog_add(stack); + } else { + /* + * The ordering here is important: when we process the + * aggregate, we count on the fact that @avg appears before + * @hist in program order to assure that @avg is assigned the + * first aggregation variable ID and @hist assigned the + * second; see the comment in process_aggregate() for details. + */ + dprog_add("\t@avg[%dULL, %s, %s%s] = avg(%s);\n", + event, arg0, caller, stack, arg1); + + if (g_recsize >= LS_HIST) { + dprog_add("\t@hist[%dULL, %s, %s%s] = quantize" + "(%s);\n", event, arg0, caller, stack, arg1); + } + } + + if (info->ev_acquire != NULL) + dprog_add("\tself->ev%d[arg0] = 0;\n", event); + + dprog_add("}\n\n"); +} + +static void +dprog_compile() +{ + dtrace_prog_t *prog; + dtrace_proginfo_t info; + + if (g_Vflag) { + (void) fprintf(stderr, "lockstat: vvvv D program vvvv\n"); + (void) fputs(g_prog, stderr); + (void) fprintf(stderr, "lockstat: ^^^^ D program ^^^^\n"); + } + + if ((prog = dtrace_program_strcompile(g_dtp, g_prog, + DTRACE_PROBESPEC_NAME, 0, 0, NULL)) == NULL) + dfail("failed to compile program"); + + if (dtrace_program_exec(g_dtp, prog, &info) == -1) + dfail("failed to enable probes"); + + if (dtrace_go(g_dtp) != 0) + dfail("couldn't start tracing"); +} + +static void +status_fire(void) +{} + +static void +status_init(void) +{ + dtrace_optval_t val, status, agg; + struct sigaction act; + struct itimerspec ts; + struct sigevent ev; + timer_t tid; + + if (dtrace_getopt(g_dtp, "statusrate", &status) == -1) + dfail("failed to get 'statusrate'"); + + if (dtrace_getopt(g_dtp, "aggrate", &agg) == -1) + dfail("failed to get 'statusrate'"); + + /* + * We would want to awaken at a rate that is the GCD of the statusrate + * and the aggrate -- but that seems a bit absurd. Instead, we'll + * simply awaken at a rate that is the more frequent of the two, which + * assures that we're never later than the interval implied by the + * more frequent rate. + */ + val = status < agg ? status : agg; + + (void) sigemptyset(&act.sa_mask); + act.sa_flags = 0; + act.sa_handler = status_fire; + (void) sigaction(SIGUSR1, &act, NULL); + + ev.sigev_notify = SIGEV_SIGNAL; + ev.sigev_signo = SIGUSR1; + + if (timer_create(CLOCK_REALTIME, &ev, &tid) == -1) + dfail("cannot create CLOCK_REALTIME timer"); + + ts.it_value.tv_sec = val / NANOSEC; + ts.it_value.tv_nsec = val % NANOSEC; + ts.it_interval = ts.it_value; + + if (timer_settime(tid, TIMER_RELTIME, &ts, NULL) == -1) + dfail("cannot set time on CLOCK_REALTIME timer"); +} + +static void +status_check(void) +{ + if (!g_tracing && dtrace_aggregate_snap(g_dtp) != 0) + dfail("failed to snap aggregate"); + + if (dtrace_status(g_dtp) == -1) + dfail("dtrace_status()"); +} + +static void +lsrec_fill(lsrec_t *lsrec, const dtrace_recdesc_t *rec, int nrecs, caddr_t data) +{ + bzero(lsrec, g_recsize); + lsrec->ls_count = 1; + + if ((g_recsize > LS_HIST && nrecs < 4) || (nrecs < 3)) + fail(0, "truncated DTrace record"); + + if (rec->dtrd_size != sizeof (uint64_t)) + fail(0, "bad event size in first record"); + + /* LINTED - alignment */ + lsrec->ls_event = (uint32_t)*((uint64_t *)(data + rec->dtrd_offset)); + rec++; + + if (rec->dtrd_size != sizeof (uintptr_t)) + fail(0, "bad lock address size in second record"); + + /* LINTED - alignment */ + lsrec->ls_lock = *((uintptr_t *)(data + rec->dtrd_offset)); + rec++; + + if (rec->dtrd_size != sizeof (uintptr_t)) + fail(0, "bad caller size in third record"); + + /* LINTED - alignment */ + lsrec->ls_caller = *((uintptr_t *)(data + rec->dtrd_offset)); + rec++; + + if (g_recsize > LS_HIST) { + int frames, i; + pc_t *stack; + + frames = rec->dtrd_size / sizeof (pc_t); + /* LINTED - alignment */ + stack = (pc_t *)(data + rec->dtrd_offset); + + for (i = 1; i < frames; i++) + lsrec->ls_stack[i - 1] = stack[i]; + } +} + +/*ARGSUSED*/ +static int +count_aggregate(const dtrace_aggdata_t *agg, void *arg) +{ + *((size_t *)arg) += 1; + + return (DTRACE_AGGWALK_NEXT); +} + +static int +process_aggregate(const dtrace_aggdata_t *agg, void *arg) +{ + const dtrace_aggdesc_t *aggdesc = agg->dtada_desc; + caddr_t data = agg->dtada_data; + lsdata_t *lsdata = arg; + lsrec_t *lsrec = lsdata->lsd_next; + const dtrace_recdesc_t *rec; + uint64_t *avg, *quantized; + int i, j; + + assert(lsdata->lsd_count < g_nrecs); + + /* + * Aggregation variable IDs are guaranteed to be generated in program + * order, and they are guaranteed to start from DTRACE_AGGVARIDNONE + * plus one. As "avg" appears before "hist" in program order, we know + * that "avg" will be allocated the first aggregation variable ID, and + * "hist" will be allocated the second aggregation variable ID -- and + * we therefore use the aggregation variable ID to differentiate the + * cases. + */ + if (aggdesc->dtagd_varid > DTRACE_AGGVARIDNONE + 1) { + /* + * If this is the histogram entry. We'll copy the quantized + * data into lc_hist, and jump over the rest. + */ + rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1]; + + if (aggdesc->dtagd_varid != DTRACE_AGGVARIDNONE + 2) + fail(0, "bad variable ID in aggregation record"); + + if (rec->dtrd_size != + DTRACE_QUANTIZE_NBUCKETS * sizeof (uint64_t)) + fail(0, "bad quantize size in aggregation record"); + + /* LINTED - alignment */ + quantized = (uint64_t *)(data + rec->dtrd_offset); + + for (i = DTRACE_QUANTIZE_ZEROBUCKET, j = 0; + i < DTRACE_QUANTIZE_NBUCKETS; i++, j++) + lsrec->ls_hist[j] = quantized[i]; + + goto out; + } + + lsrec_fill(lsrec, &aggdesc->dtagd_rec[1], + aggdesc->dtagd_nrecs - 1, data); + + rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1]; + + if (rec->dtrd_size != 2 * sizeof (uint64_t)) + fail(0, "bad avg size in aggregation record"); + + /* LINTED - alignment */ + avg = (uint64_t *)(data + rec->dtrd_offset); + lsrec->ls_count = (uint32_t)avg[0]; + lsrec->ls_time = (uintptr_t)avg[1]; + + if (g_recsize >= LS_HIST) + return (DTRACE_AGGWALK_NEXT); + +out: + lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize); + lsdata->lsd_count++; + + return (DTRACE_AGGWALK_NEXT); +} + +static int +process_trace(const dtrace_probedata_t *pdata, void *arg) +{ + lsdata_t *lsdata = arg; + lsrec_t *lsrec = lsdata->lsd_next; + dtrace_eprobedesc_t *edesc = pdata->dtpda_edesc; + caddr_t data = pdata->dtpda_data; + + if (lsdata->lsd_count >= g_nrecs) + return (DTRACE_CONSUME_NEXT); + + lsrec_fill(lsrec, edesc->dtepd_rec, edesc->dtepd_nrecs, data); + + lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize); + lsdata->lsd_count++; + + return (DTRACE_CONSUME_NEXT); +} + +static int +process_data(FILE *out, char *data) +{ + lsdata_t lsdata; + + /* LINTED - alignment */ + lsdata.lsd_next = (lsrec_t *)data; + lsdata.lsd_count = 0; + + if (g_tracing) { + if (dtrace_consume(g_dtp, out, + process_trace, NULL, &lsdata) != 0) + dfail("failed to consume buffer"); + + return (lsdata.lsd_count); + } + + if (dtrace_aggregate_walk_keyvarsorted(g_dtp, + process_aggregate, &lsdata) != 0) + dfail("failed to walk aggregate"); + + return (lsdata.lsd_count); +} + +/*ARGSUSED*/ +static int +drophandler(const dtrace_dropdata_t *data, void *arg) +{ + g_dropped++; + (void) fprintf(stderr, "lockstat: warning: %s", data->dtdda_msg); + return (DTRACE_HANDLE_OK); +} + +int +main(int argc, char **argv) +{ + char *data_buf; + lsrec_t *lsp, **current, **first, **sort_buf, **merge_buf; + FILE *out = stdout; + char c; + pid_t child; + int status; + int i, j; + hrtime_t duration; + char *addrp, *offp, *sizep, *evp, *lastp, *p; + uintptr_t addr; + size_t size, off; + int events_specified = 0; + int exec_errno = 0; + uint32_t event; + char *filt = NULL, *ifilt = NULL; + static uint64_t ev_count[LS_MAX_EVENTS + 1]; + static uint64_t ev_time[LS_MAX_EVENTS + 1]; + dtrace_optval_t aggsize; + char aggstr[10]; + long ncpus; + int dynvar = 0; + int err; + + if ((g_dtp = dtrace_open(DTRACE_VERSION, 0, &err)) == NULL) { + fail(0, "cannot open dtrace library: %s", + dtrace_errmsg(NULL, err)); + } + + if (dtrace_handle_drop(g_dtp, &drophandler, NULL) == -1) + dfail("couldn't establish drop handler"); + + if (symtab_init() == -1) + fail(1, "can't load kernel symbols"); + + g_nrecs = DEFAULT_NRECS; + + while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != EOF) { + switch (c) { + case 'b': + g_recsize = LS_BASIC; + break; + + case 't': + g_recsize = LS_TIME; + break; + + case 'h': + g_recsize = LS_HIST; + break; + + case 's': + if (!isdigit(optarg[0])) + usage(); + g_stkdepth = atoi(optarg); + if (g_stkdepth > LS_MAX_STACK_DEPTH) + fail(0, "max stack depth is %d", + LS_MAX_STACK_DEPTH); + g_recsize = LS_STACK(g_stkdepth); + break; + + case 'n': + if (!isdigit(optarg[0])) + usage(); + g_nrecs = atoi(optarg); + break; + + case 'd': + if (!isdigit(optarg[0])) + usage(); + duration = atoll(optarg); + + /* + * XXX -- durations really should be per event + * since the units are different, but it's hard + * to express this nicely in the interface. + * Not clear yet what the cleanest solution is. + */ + for (i = 0; i < LS_MAX_EVENTS; i++) + if (g_event_info[i].ev_type != 'E') + g_min_duration[i] = duration; + + break; + + case 'i': + if (!isdigit(optarg[0])) + usage(); + i = atoi(optarg); + if (i <= 0) + usage(); + if (i > MAX_HZ) + fail(0, "max interrupt rate is %d Hz", MAX_HZ); + + for (j = 0; j < LS_MAX_EVENTS; j++) + if (strcmp(g_event_info[j].ev_desc, + "Profiling interrupt") == 0) + break; + + (void) sprintf(g_event_info[j].ev_name, + "profile:::profile-%d", i); + break; + + case 'l': + case 'f': + addrp = strtok(optarg, ","); + sizep = strtok(NULL, ","); + addrp = strtok(optarg, ",+"); + offp = strtok(NULL, ","); + + size = sizep ? strtoul(sizep, NULL, 0) : 1; + off = offp ? strtoul(offp, NULL, 0) : 0; + + if (addrp[0] == '0') { + addr = strtoul(addrp, NULL, 16) + off; + } else { + addr = sym_to_addr(addrp) + off; + if (sizep == NULL) + size = sym_size(addrp) - off; + if (addr - off == 0) + fail(0, "symbol '%s' not found", addrp); + if (size == 0) + size = 1; + } + + + if (c == 'l') { + filter_add(&filt, "arg0", addr, size); + } else { + filter_add(&filt, "caller", addr, size); + filter_add(&ifilt, "arg0", addr, size); + } + break; + + case 'e': + evp = strtok_r(optarg, ",", &lastp); + while (evp) { + int ev1, ev2; + char *evp2; + + (void) strtok(evp, "-"); + evp2 = strtok(NULL, "-"); + ev1 = atoi(evp); + ev2 = evp2 ? atoi(evp2) : ev1; + if ((uint_t)ev1 >= LS_MAX_EVENTS || + (uint_t)ev2 >= LS_MAX_EVENTS || ev1 > ev2) + fail(0, "-e events out of range"); + for (i = ev1; i <= ev2; i++) + g_enabled[i] = 1; + evp = strtok_r(NULL, ",", &lastp); + } + events_specified = 1; + break; + + case 'c': + g_cflag = 1; + break; + + case 'k': + g_kflag = 1; + break; + + case 'w': + g_wflag = 1; + break; + + case 'W': + g_Wflag = 1; + break; + + case 'g': + g_gflag = 1; + break; + + case 'C': + case 'E': + case 'H': + case 'I': + for (i = 0; i < LS_MAX_EVENTS; i++) + if (g_event_info[i].ev_type == c) + g_enabled[i] = 1; + events_specified = 1; + break; + + case 'A': + for (i = 0; i < LS_MAX_EVENTS; i++) + if (strchr("CH", g_event_info[i].ev_type)) + g_enabled[i] = 1; + events_specified = 1; + break; + + case 'T': + g_tracing = 1; + break; + + case 'D': + if (!isdigit(optarg[0])) + usage(); + g_topn = atoi(optarg); + break; + + case 'R': + g_rates = 1; + break; + + case 'p': + g_pflag = 1; + break; + + case 'P': + g_Pflag = 1; + break; + + case 'o': + if ((out = fopen(optarg, "w")) == NULL) + fail(1, "error opening file"); + break; + + case 'V': + g_Vflag = 1; + break; + + default: + if (strchr(LOCKSTAT_OPTSTR, c) == NULL) + usage(); + } + } + + if (filt != NULL) { + predicate_add(&g_predicate, filt, NULL, 0); + filter_destroy(&filt); + } + + if (ifilt != NULL) { + predicate_add(&g_ipredicate, ifilt, NULL, 0); + filter_destroy(&ifilt); + } + + if (g_recsize == 0) { + if (g_gflag) { + g_stkdepth = LS_MAX_STACK_DEPTH; + g_recsize = LS_STACK(g_stkdepth); + } else { + g_recsize = LS_TIME; + } + } + + if (g_gflag && g_recsize <= LS_STACK(0)) + fail(0, "'-g' requires at least '-s 1' data gathering"); + + /* + * Make sure the alignment is reasonable + */ + g_recsize = -(-g_recsize & -sizeof (uint64_t)); + + for (i = 0; i < LS_MAX_EVENTS; i++) { + /* + * If no events were specified, enable -C. + */ + if (!events_specified && g_event_info[i].ev_type == 'C') + g_enabled[i] = 1; + } + + for (i = 0; i < LS_MAX_EVENTS; i++) { + if (!g_enabled[i]) + continue; + + if (g_event_info[i].ev_acquire != NULL) { + /* + * If we've enabled a hold event, we must explicitly + * allocate dynamic variable space. + */ + dynvar = 1; + } + + dprog_addevent(i); + } + + /* + * Make sure there are remaining arguments to specify a child command + * to execute. + */ + if (argc <= optind) + usage(); + + if ((ncpus = sysconf(_SC_NPROCESSORS_ONLN)) == -1) + dfail("couldn't determine number of online CPUs"); + + /* + * By default, we set our data buffer size to be the number of records + * multiplied by the size of the record, doubled to account for some + * DTrace slop and divided by the number of CPUs. We silently clamp + * the aggregation size at both a minimum and a maximum to prevent + * absurdly low or high values. + */ + if ((aggsize = (g_nrecs * g_recsize * 2) / ncpus) < MIN_AGGSIZE) + aggsize = MIN_AGGSIZE; + + if (aggsize > MAX_AGGSIZE) + aggsize = MAX_AGGSIZE; + + (void) sprintf(aggstr, "%lld", (long long)aggsize); + + if (!g_tracing) { + if (dtrace_setopt(g_dtp, "bufsize", "4k") == -1) + dfail("failed to set 'bufsize'"); + + if (dtrace_setopt(g_dtp, "aggsize", aggstr) == -1) + dfail("failed to set 'aggsize'"); + + if (dynvar) { + /* + * If we're using dynamic variables, we set our + * dynamic variable size to be one megabyte per CPU, + * with a hard-limit of 32 megabytes. This may still + * be too small in some cases, but it can be tuned + * manually via -x if need be. + */ + (void) sprintf(aggstr, "%ldm", ncpus < 32 ? ncpus : 32); + + if (dtrace_setopt(g_dtp, "dynvarsize", aggstr) == -1) + dfail("failed to set 'dynvarsize'"); + } + } else { + if (dtrace_setopt(g_dtp, "bufsize", aggstr) == -1) + dfail("failed to set 'bufsize'"); + } + + if (dtrace_setopt(g_dtp, "statusrate", "10sec") == -1) + dfail("failed to set 'statusrate'"); + + optind = 1; + while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != EOF) { + switch (c) { + case 'x': + if ((p = strchr(optarg, '=')) != NULL) + *p++ = '\0'; + + if (dtrace_setopt(g_dtp, optarg, p) != 0) + dfail("failed to set -x %s", optarg); + break; + } + } + + argc -= optind; + argv += optind; + + dprog_compile(); + status_init(); + + g_elapsed = -gethrtime(); + + /* + * Spawn the specified command and wait for it to complete. + */ + child = fork(); + if (child == -1) + fail(1, "cannot fork"); + if (child == 0) { + (void) dtrace_close(g_dtp); + (void) execvp(argv[0], &argv[0]); + exec_errno = errno; + exit(127); + } + + while (waitpid(child, &status, WEXITED) != child) + status_check(); + + g_elapsed += gethrtime(); + + if (WIFEXITED(status)) { + if (WEXITSTATUS(status) != 0) { + if (exec_errno != 0) { + errno = exec_errno; + fail(1, "could not execute %s", argv[0]); + } + (void) fprintf(stderr, + "lockstat: warning: %s exited with code %d\n", + argv[0], WEXITSTATUS(status)); + } + } else { + (void) fprintf(stderr, + "lockstat: warning: %s died on signal %d\n", + argv[0], WTERMSIG(status)); + } + + if (dtrace_stop(g_dtp) == -1) + dfail("failed to stop dtrace"); + + /* + * Before we read out the results, we need to allocate our buffer. + * If we're tracing, then we'll just use the precalculated size. If + * we're not, then we'll take a snapshot of the aggregate, and walk + * it to count the number of records. + */ + if (!g_tracing) { + if (dtrace_aggregate_snap(g_dtp) != 0) + dfail("failed to snap aggregate"); + + g_nrecs = 0; + + if (dtrace_aggregate_walk(g_dtp, + count_aggregate, &g_nrecs) != 0) + dfail("failed to walk aggregate"); + } + + if ((data_buf = memalign(sizeof (uint64_t), + (g_nrecs + 1) * g_recsize)) == NULL) + fail(1, "Memory allocation failed"); + + /* + * Read out the DTrace data. + */ + g_nrecs_used = process_data(out, data_buf); + + if (g_nrecs_used > g_nrecs || g_dropped) + (void) fprintf(stderr, "lockstat: warning: " + "ran out of data records (use -n for more)\n"); + + /* LINTED - alignment */ + for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, + /* LINTED - alignment */ + lsp = (lsrec_t *)((char *)lsp + g_recsize)) { + ev_count[lsp->ls_event] += lsp->ls_count; + ev_time[lsp->ls_event] += lsp->ls_time; + } + + /* + * If -g was specified, convert stacks into individual records. + */ + if (g_gflag) { + lsrec_t *newlsp, *oldlsp; + + newlsp = memalign(sizeof (uint64_t), + g_nrecs_used * LS_TIME * (g_stkdepth + 1)); + if (newlsp == NULL) + fail(1, "Cannot allocate space for -g processing"); + lsp = newlsp; + /* LINTED - alignment */ + for (i = 0, oldlsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, + /* LINTED - alignment */ + oldlsp = (lsrec_t *)((char *)oldlsp + g_recsize)) { + int fr; + int caller_in_stack = 0; + + if (oldlsp->ls_count == 0) + continue; + + for (fr = 0; fr < g_stkdepth; fr++) { + if (oldlsp->ls_stack[fr] == 0) + break; + if (oldlsp->ls_stack[fr] == oldlsp->ls_caller) + caller_in_stack = 1; + bcopy(oldlsp, lsp, LS_TIME); + lsp->ls_caller = oldlsp->ls_stack[fr]; + /* LINTED - alignment */ + lsp = (lsrec_t *)((char *)lsp + LS_TIME); + } + if (!caller_in_stack) { + bcopy(oldlsp, lsp, LS_TIME); + /* LINTED - alignment */ + lsp = (lsrec_t *)((char *)lsp + LS_TIME); + } + } + g_nrecs = g_nrecs_used = + ((uintptr_t)lsp - (uintptr_t)newlsp) / LS_TIME; + g_recsize = LS_TIME; + g_stkdepth = 0; + free(data_buf); + data_buf = (char *)newlsp; + } + + if ((sort_buf = calloc(2 * (g_nrecs + 1), + sizeof (void *))) == NULL) + fail(1, "Sort buffer allocation failed"); + merge_buf = sort_buf + (g_nrecs + 1); + + /* + * Build the sort buffer, discarding zero-count records along the way. + */ + /* LINTED - alignment */ + for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, + /* LINTED - alignment */ + lsp = (lsrec_t *)((char *)lsp + g_recsize)) { + if (lsp->ls_count == 0) + lsp->ls_event = LS_MAX_EVENTS; + sort_buf[i] = lsp; + } + + if (g_nrecs_used == 0) + exit(0); + + /* + * Add a sentinel after the last record + */ + sort_buf[i] = lsp; + lsp->ls_event = LS_MAX_EVENTS; + + if (g_tracing) { + report_trace(out, sort_buf); + return (0); + } + + /* + * Application of -g may have resulted in multiple records + * with the same signature; coalesce them. + */ + if (g_gflag) { + mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used); + coalesce(lockcmp, sort_buf, g_nrecs_used); + } + + /* + * Coalesce locks within the same symbol if -c option specified. + * Coalesce PCs within the same function if -k option specified. + */ + if (g_cflag || g_kflag) { + for (i = 0; i < g_nrecs_used; i++) { + int fr; + lsp = sort_buf[i]; + if (g_cflag) + coalesce_symbol(&lsp->ls_lock); + if (g_kflag) { + for (fr = 0; fr < g_stkdepth; fr++) + coalesce_symbol(&lsp->ls_stack[fr]); + coalesce_symbol(&lsp->ls_caller); + } + } + mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used); + coalesce(lockcmp, sort_buf, g_nrecs_used); + } + + /* + * Coalesce callers if -w option specified + */ + if (g_wflag) { + mergesort(lock_and_count_cmp_anywhere, + sort_buf, merge_buf, g_nrecs_used); + coalesce(lockcmp_anywhere, sort_buf, g_nrecs_used); + } + + /* + * Coalesce locks if -W option specified + */ + if (g_Wflag) { + mergesort(site_and_count_cmp_anylock, + sort_buf, merge_buf, g_nrecs_used); + coalesce(sitecmp_anylock, sort_buf, g_nrecs_used); + } + + /* + * Sort data by contention count (ls_count) or total time (ls_time), + * depending on g_Pflag. Override g_Pflag if time wasn't measured. + */ + if (g_recsize < LS_TIME) + g_Pflag = 0; + + if (g_Pflag) + mergesort(timecmp, sort_buf, merge_buf, g_nrecs_used); + else + mergesort(countcmp, sort_buf, merge_buf, g_nrecs_used); + + /* + * Display data by event type + */ + first = &sort_buf[0]; + while ((event = (*first)->ls_event) < LS_MAX_EVENTS) { + current = first; + while ((lsp = *current)->ls_event == event) + current++; + report_stats(out, first, current - first, ev_count[event], + ev_time[event]); + first = current; + } + + return (0); +} + +static char * +format_symbol(char *buf, uintptr_t addr, int show_size) +{ + uintptr_t symoff; + char *symname; + size_t symsize; + + symname = addr_to_sym(addr, &symoff, &symsize); + + if (show_size && symoff == 0) + (void) sprintf(buf, "%s[%ld]", symname, (long)symsize); + else if (symoff == 0) + (void) sprintf(buf, "%s", symname); + else if (symoff < 16 && bcmp(symname, "cpu[", 4) == 0) /* CPU+PIL */ + (void) sprintf(buf, "%s+%ld", symname, (long)symoff); + else if (symoff <= symsize || (symoff < 256 && addr != symoff)) + (void) sprintf(buf, "%s+0x%llx", symname, + (unsigned long long)symoff); + else + (void) sprintf(buf, "0x%llx", (unsigned long long)addr); + return (buf); +} + +static void +report_stats(FILE *out, lsrec_t **sort_buf, size_t nrecs, uint64_t total_count, + uint64_t total_time) +{ + uint32_t event = sort_buf[0]->ls_event; + lsrec_t *lsp; + double ptotal = 0.0; + double percent; + int i, j, fr; + int displayed; + int first_bin, last_bin, max_bin_count, total_bin_count; + int rectype; + char buf[256]; + char lhdr[80], chdr[80]; + + rectype = g_recsize; + + if (g_topn == 0) { + (void) fprintf(out, "%20llu %s\n", + g_rates == 0 ? total_count : + ((unsigned long long)total_count * NANOSEC) / g_elapsed, + g_event_info[event].ev_desc); + return; + } + + (void) sprintf(lhdr, "%s%s", + g_Wflag ? "Hottest " : "", g_event_info[event].ev_lhdr); + (void) sprintf(chdr, "%s%s", + g_wflag ? "Hottest " : "", "Caller"); + + if (!g_pflag) + (void) fprintf(out, + "\n%s: %.0f events in %.3f seconds (%.0f events/sec)\n\n", + g_event_info[event].ev_desc, (double)total_count, + (double)g_elapsed / NANOSEC, + (double)total_count * NANOSEC / g_elapsed); + + if (!g_pflag && rectype < LS_HIST) { + (void) sprintf(buf, "%s", g_event_info[event].ev_units); + (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n", + g_rates ? "ops/s" : "Count", + g_gflag ? "genr" : "indv", + "cuml", "rcnt", rectype >= LS_TIME ? buf : "", lhdr, chdr); + (void) fprintf(out, "---------------------------------" + "----------------------------------------------\n"); + } + + displayed = 0; + for (i = 0; i < nrecs; i++) { + lsp = sort_buf[i]; + + if (displayed++ >= g_topn) + break; + + if (g_pflag) { + int j; + + (void) fprintf(out, "%u %u", + lsp->ls_event, lsp->ls_count); + (void) fprintf(out, " %s", + format_symbol(buf, lsp->ls_lock, g_cflag)); + (void) fprintf(out, " %s", + format_symbol(buf, lsp->ls_caller, 0)); + (void) fprintf(out, " %f", + (double)lsp->ls_refcnt / lsp->ls_count); + if (rectype >= LS_TIME) + (void) fprintf(out, " %llu", + (unsigned long long)lsp->ls_time); + if (rectype >= LS_HIST) { + for (j = 0; j < 64; j++) + (void) fprintf(out, " %u", + lsp->ls_hist[j]); + } + for (j = 0; j < LS_MAX_STACK_DEPTH; j++) { + if (rectype <= LS_STACK(j) || + lsp->ls_stack[j] == 0) + break; + (void) fprintf(out, " %s", + format_symbol(buf, lsp->ls_stack[j], 0)); + } + (void) fprintf(out, "\n"); + continue; + } + + if (rectype >= LS_HIST) { + (void) fprintf(out, "---------------------------------" + "----------------------------------------------\n"); + (void) sprintf(buf, "%s", + g_event_info[event].ev_units); + (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n", + g_rates ? "ops/s" : "Count", + g_gflag ? "genr" : "indv", + "cuml", "rcnt", buf, lhdr, chdr); + } + + if (g_Pflag && total_time != 0) + percent = (lsp->ls_time * 100.00) / total_time; + else + percent = (lsp->ls_count * 100.00) / total_count; + + ptotal += percent; + + if (rectype >= LS_TIME) + (void) sprintf(buf, "%llu", + (unsigned long long)(lsp->ls_time / lsp->ls_count)); + else + buf[0] = '\0'; + + (void) fprintf(out, "%5llu ", + g_rates == 0 ? lsp->ls_count : + ((uint64_t)lsp->ls_count * NANOSEC) / g_elapsed); + + (void) fprintf(out, "%3.0f%% ", percent); + + if (g_gflag) + (void) fprintf(out, "---- "); + else + (void) fprintf(out, "%3.0f%% ", ptotal); + + (void) fprintf(out, "%4.2f %8s ", + (double)lsp->ls_refcnt / lsp->ls_count, buf); + + (void) fprintf(out, "%-22s ", + format_symbol(buf, lsp->ls_lock, g_cflag)); + + (void) fprintf(out, "%-24s\n", + format_symbol(buf, lsp->ls_caller, 0)); + + if (rectype < LS_HIST) + continue; + + (void) fprintf(out, "\n"); + (void) fprintf(out, "%10s %31s %-9s %-24s\n", + g_event_info[event].ev_units, + "------ Time Distribution ------", + g_rates ? "ops/s" : "count", + rectype > LS_STACK(0) ? "Stack" : ""); + + first_bin = 0; + while (lsp->ls_hist[first_bin] == 0) + first_bin++; + + last_bin = 63; + while (lsp->ls_hist[last_bin] == 0) + last_bin--; + + max_bin_count = 0; + total_bin_count = 0; + for (j = first_bin; j <= last_bin; j++) { + total_bin_count += lsp->ls_hist[j]; + if (lsp->ls_hist[j] > max_bin_count) + max_bin_count = lsp->ls_hist[j]; + } + + /* + * If we went a few frames below the caller, ignore them + */ + for (fr = 3; fr > 0; fr--) + if (lsp->ls_stack[fr] == lsp->ls_caller) + break; + + for (j = first_bin; j <= last_bin; j++) { + uint_t depth = (lsp->ls_hist[j] * 30) / total_bin_count; + (void) fprintf(out, "%10llu |%s%s %-9u ", + 1ULL << j, + "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + 30 - depth, + " " + depth, + g_rates == 0 ? lsp->ls_hist[j] : + (uint_t)(((uint64_t)lsp->ls_hist[j] * NANOSEC) / + g_elapsed)); + if (rectype <= LS_STACK(fr) || lsp->ls_stack[fr] == 0) { + (void) fprintf(out, "\n"); + continue; + } + (void) fprintf(out, "%-24s\n", + format_symbol(buf, lsp->ls_stack[fr], 0)); + fr++; + } + while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) { + (void) fprintf(out, "%15s %-36s %-24s\n", "", "", + format_symbol(buf, lsp->ls_stack[fr], 0)); + fr++; + } + } + + if (!g_pflag) + (void) fprintf(out, "---------------------------------" + "----------------------------------------------\n"); + + (void) fflush(out); +} + +static void +report_trace(FILE *out, lsrec_t **sort_buf) +{ + lsrec_t *lsp; + int i, fr; + int rectype; + char buf[256], buf2[256]; + + rectype = g_recsize; + + if (!g_pflag) { + (void) fprintf(out, "%5s %7s %11s %-24s %-24s\n", + "Event", "Time", "Owner", "Lock", "Caller"); + (void) fprintf(out, "---------------------------------" + "----------------------------------------------\n"); + } + + for (i = 0; i < g_nrecs_used; i++) { + + lsp = sort_buf[i]; + + if (lsp->ls_event >= LS_MAX_EVENTS || lsp->ls_count == 0) + continue; + + (void) fprintf(out, "%2d %10llu %11p %-24s %-24s\n", + lsp->ls_event, (unsigned long long)lsp->ls_time, + (void *)lsp->ls_next, + format_symbol(buf, lsp->ls_lock, 0), + format_symbol(buf2, lsp->ls_caller, 0)); + + if (rectype <= LS_STACK(0)) + continue; + + /* + * If we went a few frames below the caller, ignore them + */ + for (fr = 3; fr > 0; fr--) + if (lsp->ls_stack[fr] == lsp->ls_caller) + break; + + while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) { + (void) fprintf(out, "%53s %-24s\n", "", + format_symbol(buf, lsp->ls_stack[fr], 0)); + fr++; + } + (void) fprintf(out, "\n"); + } + + (void) fflush(out); +} diff --git a/cmd/lockstat/sym.c b/cmd/lockstat/sym.c new file mode 100644 index 000000000000..a4a5080428dc --- /dev/null +++ b/cmd/lockstat/sym.c @@ -0,0 +1,253 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1997-1999 by Sun Microsystems, Inc. + * All rights reserved. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <stdio.h> +#include <fcntl.h> +#include <ctype.h> +#include <string.h> +#include <signal.h> +#include <errno.h> +#include <stdlib.h> +#include <stdarg.h> +#include <unistd.h> +#include <limits.h> +#include <sys/types.h> +#include <sys/stat.h> + +#include <libelf.h> +#include <link.h> +#include <elf.h> +#include <sys/machelf.h> + +#include <kstat.h> +#include <sys/cpuvar.h> + +typedef struct syment { + uintptr_t addr; + char *name; + size_t size; +} syment_t; + +static syment_t *symbol_table; +static int nsyms, maxsyms; +static char maxsymname[64]; + +#ifdef _ELF64 +#define elf_getshdr elf64_getshdr +#else +#define elf_getshdr elf32_getshdr +#endif + +static void +add_symbol(char *name, uintptr_t addr, size_t size) +{ + syment_t *sep; + + if (nsyms >= maxsyms) { + maxsyms += 10000; + symbol_table = realloc(symbol_table, maxsyms * sizeof (*sep)); + if (symbol_table == NULL) { + (void) fprintf(stderr, "can't allocate symbol table\n"); + exit(3); + } + } + sep = &symbol_table[nsyms++]; + + sep->name = name; + sep->addr = addr; + sep->size = size; +} + +static void +remove_symbol(uintptr_t addr) +{ + int i; + syment_t *sep = symbol_table; + + for (i = 0; i < nsyms; i++, sep++) + if (sep->addr == addr) + sep->addr = 0; +} + +static void +fake_up_certain_popular_kernel_symbols(void) +{ + kstat_ctl_t *kc; + kstat_t *ksp; + char *name; + + if ((kc = kstat_open()) == NULL) + return; + + for (ksp = kc->kc_chain; ksp; ksp = ksp->ks_next) { + if (strcmp(ksp->ks_module, "cpu_info") == 0) { + if ((name = malloc(20)) == NULL) + break; + /* + * For consistency, keep cpu[0] and toss cpu0 + * or any other such symbols. + */ + if (ksp->ks_instance == 0) + remove_symbol((uintptr_t)ksp->ks_private); + (void) sprintf(name, "cpu[%d]", ksp->ks_instance); + add_symbol(name, (uintptr_t)ksp->ks_private, + sizeof (struct cpu)); + } + } + (void) kstat_close(kc); +} + +static int +symcmp(const void *p1, const void *p2) +{ + uintptr_t a1 = ((syment_t *)p1)->addr; + uintptr_t a2 = ((syment_t *)p2)->addr; + + if (a1 < a2) + return (-1); + if (a1 > a2) + return (1); + return (0); +} + +int +symtab_init(void) +{ + Elf *elf; + Elf_Scn *scn = NULL; + Sym *symtab, *symp, *lastsym; + char *strtab; + uint_t cnt; + int fd; + int i; + int strindex = -1; + + if ((fd = open("/dev/ksyms", O_RDONLY)) == -1) + return (-1); + + (void) elf_version(EV_CURRENT); + + elf = elf_begin(fd, ELF_C_READ, NULL); + + for (cnt = 1; (scn = elf_nextscn(elf, scn)) != NULL; cnt++) { + Shdr *shdr = elf_getshdr(scn); + if (shdr->sh_type == SHT_SYMTAB) { + symtab = (Sym *)elf_getdata(scn, NULL)->d_buf; + nsyms = shdr->sh_size / shdr->sh_entsize; + strindex = shdr->sh_link; + } + } + + for (cnt = 1; (scn = elf_nextscn(elf, scn)) != NULL; cnt++) { + if (cnt == strindex) + strtab = (char *)elf_getdata(scn, NULL)->d_buf; + } + + lastsym = symtab + nsyms; + nsyms = 0; + for (symp = symtab; symp < lastsym; symp++) + if ((uint_t)ELF32_ST_TYPE(symp->st_info) <= STT_FUNC && + symp->st_size != 0) + add_symbol(symp->st_name + strtab, + (uintptr_t)symp->st_value, (size_t)symp->st_size); + + fake_up_certain_popular_kernel_symbols(); + (void) sprintf(maxsymname, "0x%lx", ULONG_MAX); + add_symbol(maxsymname, ULONG_MAX, 1); + + qsort(symbol_table, nsyms, sizeof (syment_t), symcmp); + + /* + * Destroy all duplicate symbols, then sort it again. + */ + for (i = 0; i < nsyms - 1; i++) + if (symbol_table[i].addr == symbol_table[i + 1].addr) + symbol_table[i].addr = 0; + + qsort(symbol_table, nsyms, sizeof (syment_t), symcmp); + + while (symbol_table[1].addr == 0) { + symbol_table++; + nsyms--; + } + symbol_table[0].name = "(usermode)"; + symbol_table[0].addr = 0; + symbol_table[0].size = 1; + + return (0); +} + +char * +addr_to_sym(uintptr_t addr, uintptr_t *offset, size_t *sizep) +{ + int lo = 0; + int hi = nsyms - 1; + int mid; + syment_t *sep; + + while (hi - lo > 1) { + mid = (lo + hi) / 2; + if (addr >= symbol_table[mid].addr) { + lo = mid; + } else { + hi = mid; + } + } + sep = &symbol_table[lo]; + *offset = addr - sep->addr; + *sizep = sep->size; + return (sep->name); +} + +uintptr_t +sym_to_addr(char *name) +{ + int i; + syment_t *sep = symbol_table; + + for (i = 0; i < nsyms; i++) { + if (strcmp(name, sep->name) == 0) + return (sep->addr); + sep++; + } + return (NULL); +} + +size_t +sym_size(char *name) +{ + int i; + syment_t *sep = symbol_table; + + for (i = 0; i < nsyms; i++) { + if (strcmp(name, sep->name) == 0) + return (sep->size); + sep++; + } + return (0); +} |