1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <stdlib.h>
30 #include <strings.h>
31 #include <errno.h>
32 #include <unistd.h>
33 #include <dt_impl.h>
34 #include <assert.h>
35 
/*
 * Number of buckets given to the aggregation hash table when it is first
 * allocated; hash values are reduced modulo this size to pick a bucket.
 */
#define	DTRACE_AHASHSIZE	32779		/* big 'ol prime */
37 
/*
 * Additive aggregating action:  adds each 64-bit word of the new data to the
 * corresponding word of the existing data.
 *
 *	existing	accumulated values, updated in place
 *	new		values to fold into the accumulated ones
 *	size		size of each buffer in bytes; a trailing partial word
 *			is ignored
 */
static void
dt_aggregate_count(uint64_t *existing, uint64_t *new, size_t size)
{
	size_t nwords = size / sizeof (uint64_t);
	size_t i;

	/* The index is a size_t so it is never compared across signedness. */
	for (i = 0; i < nwords; i++)
		existing[i] += new[i];
}
46 
/*
 * Three-way comparison of the 64-bit values that lhs and rhs point to.
 * Returns 1 if *lhs is greater, -1 if it is smaller, and 0 if they are equal.
 */
static int
dt_aggregate_countcmp(uint64_t *lhs, uint64_t *rhs)
{
	uint64_t left = lhs[0], right = rhs[0];

	if (left == right)
		return (0);

	return (left < right ? -1 : 1);
}
61 
/*
 * Aggregating action for min():  replaces *existing with *new when the new
 * value is smaller.  The size argument is not consulted.
 */
/*ARGSUSED*/
static void
dt_aggregate_min(uint64_t *existing, uint64_t *new, size_t size)
{
	uint64_t candidate = new[0];

	if (existing[0] > candidate)
		existing[0] = candidate;
}
69 
/*
 * Aggregating action for max():  replaces *existing with *new when the new
 * value is larger.  The size argument is not consulted.
 */
/*ARGSUSED*/
static void
dt_aggregate_max(uint64_t *existing, uint64_t *new, size_t size)
{
	uint64_t candidate = new[0];

	if (existing[0] < candidate)
		existing[0] = candidate;
}
77 
/*
 * Three-way comparison of two two-word values by the integer quotient
 * word[1] / word[0] (presumably a sum divided by a count -- confirm against
 * the producer of the data).  A zero word[0] yields a quotient of 0 instead
 * of dividing.  Returns 1, -1 or 0.
 */
static int
dt_aggregate_averagecmp(uint64_t *lhs, uint64_t *rhs)
{
	uint64_t lavg = 0, ravg = 0;

	if (lhs[0] != 0)
		lavg = lhs[1] / lhs[0];

	if (rhs[0] != 0)
		ravg = rhs[1] / rhs[0];

	if (lavg == ravg)
		return (0);

	return (lavg > ravg ? 1 : -1);
}
92 
/*
 * Aggregating action for lquantize().  Word 0 of the existing data is the
 * encoded lquantize argument and is left untouched; the levels + 2 words that
 * follow it are summed with the matching words of the new data.  The size
 * argument is not consulted.
 */
/*ARGSUSED*/
static void
dt_aggregate_lquantize(uint64_t *existing, uint64_t *new, size_t size)
{
	uint64_t arg = existing[0];
	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
	uint64_t *dst = &existing[1];
	uint64_t *src = &new[1];
	int bucket;

	for (bucket = 0; bucket <= levels + 1; bucket++)
		dst[bucket] += src[bucket];
}
104 
/*
 * Computes a weighted sum over lquantize() data for use in sorting.  Word 0
 * holds the encoded base, step and level count.  The first bucket is weighted
 * by base - 1, each of the next `levels' buckets by base, base + step, ...,
 * and the final bucket by one more than the value reached after the last
 * step.
 */
static long double
dt_aggregate_lquantizedsum(uint64_t *lquanta)
{
	uint64_t arg = lquanta[0];
	uint64_t *bucket = &lquanta[1];
	int32_t value = DTRACE_LQUANTIZE_BASE(arg);
	uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
	uint16_t n;
	long double sum;

	sum = (long double)bucket[0] * (long double)(value - 1);

	for (n = 0; n < levels; n++) {
		sum += (long double)bucket[n + 1] * (long double)value;
		value += step;
	}

	sum += (long double)bucket[levels + 1] * (long double)(value + 1);

	return (sum);
}
120 
/*
 * Three-way comparison of two lquantize() values by their weighted sums as
 * computed by dt_aggregate_lquantizedsum().  Returns 1, -1 or 0.
 */
static int
dt_aggregate_lquantizedcmp(uint64_t *lhs, uint64_t *rhs)
{
	long double left = dt_aggregate_lquantizedsum(lhs);
	long double right = dt_aggregate_lquantizedsum(rhs);

	if (left > right)
		return (1);

	return (left < right ? -1 : 0);
}
135 
136 static int
137 dt_aggregate_quantizedcmp(uint64_t *lhs, uint64_t *rhs)
138 {
139 	int nbuckets = DTRACE_QUANTIZE_NBUCKETS, i;
140 	long double ltotal = 0, rtotal = 0;
141 
142 	for (i = 0; i < nbuckets; i++) {
143 		int64_t bucketval = DTRACE_QUANTIZE_BUCKETVAL(i);
144 
145 		ltotal += (long double)bucketval * (long double)lhs[i];
146 		rtotal += (long double)bucketval * (long double)rhs[i];
147 	}
148 
149 	if (ltotal > rtotal)
150 		return (1);
151 
152 	if (ltotal < rtotal)
153 		return (-1);
154 
155 	return (0);
156 }
157 
158 static void
159 dt_aggregate_usym(dtrace_hdl_t *dtp, uint64_t *data)
160 {
161 	uint64_t pid = data[0];
162 	uint64_t *pc = &data[1];
163 	struct ps_prochandle *P;
164 	GElf_Sym sym;
165 
166 	if (dtp->dt_vector != NULL)
167 		return;
168 
169 	if ((P = dt_proc_grab(dtp, pid, PGRAB_RDONLY | PGRAB_FORCE, 0)) == NULL)
170 		return;
171 
172 	dt_proc_lock(dtp, P);
173 
174 	if (Plookup_by_addr(P, *pc, NULL, 0, &sym) == 0)
175 		*pc = sym.st_value;
176 
177 	dt_proc_unlock(dtp, P);
178 	dt_proc_release(dtp, P);
179 }
180 
181 static void
182 dt_aggregate_umod(dtrace_hdl_t *dtp, uint64_t *data)
183 {
184 	uint64_t pid = data[0];
185 	uint64_t *pc = &data[1];
186 	struct ps_prochandle *P;
187 	const prmap_t *map;
188 
189 	if (dtp->dt_vector != NULL)
190 		return;
191 
192 	if ((P = dt_proc_grab(dtp, pid, PGRAB_RDONLY | PGRAB_FORCE, 0)) == NULL)
193 		return;
194 
195 	dt_proc_lock(dtp, P);
196 
197 	if ((map = Paddr_to_map(P, *pc)) != NULL)
198 		*pc = map->pr_vaddr;
199 
200 	dt_proc_unlock(dtp, P);
201 	dt_proc_release(dtp, P);
202 }
203 
204 static void
205 dt_aggregate_sym(dtrace_hdl_t *dtp, uint64_t *data)
206 {
207 	GElf_Sym sym;
208 	uint64_t *pc = data;
209 
210 	if (dtrace_lookup_by_addr(dtp, *pc, &sym, NULL) == 0)
211 		*pc = sym.st_value;
212 }
213 
214 static void
215 dt_aggregate_mod(dtrace_hdl_t *dtp, uint64_t *data)
216 {
217 	uint64_t *pc = data;
218 	dt_module_t *dmp;
219 
220 	if (dtp->dt_vector != NULL) {
221 		/*
222 		 * We don't have a way of just getting the module for a
223 		 * vectored open, and it doesn't seem to be worth defining
224 		 * one.  This means that use of mod() won't get true
225 		 * aggregation in the postmortem case (some modules may
226 		 * appear more than once in aggregation output).  It seems
227 		 * unlikely that anyone will ever notice or care...
228 		 */
229 		return;
230 	}
231 
232 	for (dmp = dt_list_next(&dtp->dt_modlist); dmp != NULL;
233 	    dmp = dt_list_next(dmp)) {
234 		if (*pc - dmp->dm_text_va < dmp->dm_text_size) {
235 			*pc = dmp->dm_text_va;
236 			return;
237 		}
238 	}
239 }
240 
241 static int
242 dt_aggregate_snap_cpu(dtrace_hdl_t *dtp, processorid_t cpu)
243 {
244 	dtrace_epid_t id;
245 	uint64_t hashval;
246 	size_t offs, roffs, size, ndx;
247 	int i, j, rval;
248 	caddr_t addr, data;
249 	dtrace_recdesc_t *rec;
250 	dt_aggregate_t *agp = &dtp->dt_aggregate;
251 	dtrace_aggdesc_t *agg;
252 	dt_ahash_t *hash = &agp->dtat_hash;
253 	dt_ahashent_t *h;
254 	dtrace_bufdesc_t b = agp->dtat_buf, *buf = &b;
255 	dtrace_aggdata_t *aggdata;
256 	int flags = agp->dtat_flags;
257 
258 	buf->dtbd_cpu = cpu;
259 
260 	if (dt_ioctl(dtp, DTRACEIOC_AGGSNAP, buf) == -1) {
261 		if (errno == ENOENT) {
262 			/*
263 			 * If that failed with ENOENT, it may be because the
264 			 * CPU was unconfigured.  This is okay; we'll just
265 			 * do nothing but return success.
266 			 */
267 			return (0);
268 		}
269 
270 		return (dt_set_errno(dtp, errno));
271 	}
272 
273 	if (buf->dtbd_drops != 0) {
274 		if (dt_handle_cpudrop(dtp, cpu,
275 		    DTRACEDROP_AGGREGATION, buf->dtbd_drops) == -1)
276 			return (-1);
277 	}
278 
279 	if (buf->dtbd_size == 0)
280 		return (0);
281 
282 	if (hash->dtah_hash == NULL) {
283 		size_t size;
284 
285 		hash->dtah_size = DTRACE_AHASHSIZE;
286 		size = hash->dtah_size * sizeof (dt_ahashent_t *);
287 
288 		if ((hash->dtah_hash = malloc(size)) == NULL)
289 			return (dt_set_errno(dtp, EDT_NOMEM));
290 
291 		bzero(hash->dtah_hash, size);
292 	}
293 
294 	for (offs = 0; offs < buf->dtbd_size; ) {
295 		/*
296 		 * We're guaranteed to have an ID.
297 		 */
298 		id = *((dtrace_epid_t *)((uintptr_t)buf->dtbd_data +
299 		    (uintptr_t)offs));
300 
301 		if (id == DTRACE_AGGIDNONE) {
302 			/*
303 			 * This is filler to assure proper alignment of the
304 			 * next record; we simply ignore it.
305 			 */
306 			offs += sizeof (id);
307 			continue;
308 		}
309 
310 		if ((rval = dt_aggid_lookup(dtp, id, &agg)) != 0)
311 			return (rval);
312 
313 		addr = buf->dtbd_data + offs;
314 		size = agg->dtagd_size;
315 		hashval = 0;
316 
317 		for (j = 0; j < agg->dtagd_nrecs - 1; j++) {
318 			rec = &agg->dtagd_rec[j];
319 			roffs = rec->dtrd_offset;
320 
321 			switch (rec->dtrd_action) {
322 			case DTRACEACT_USYM:
323 				dt_aggregate_usym(dtp,
324 				    /* LINTED - alignment */
325 				    (uint64_t *)&addr[roffs]);
326 				break;
327 
328 			case DTRACEACT_UMOD:
329 				dt_aggregate_umod(dtp,
330 				    /* LINTED - alignment */
331 				    (uint64_t *)&addr[roffs]);
332 				break;
333 
334 			case DTRACEACT_SYM:
335 				/* LINTED - alignment */
336 				dt_aggregate_sym(dtp, (uint64_t *)&addr[roffs]);
337 				break;
338 
339 			case DTRACEACT_MOD:
340 				/* LINTED - alignment */
341 				dt_aggregate_mod(dtp, (uint64_t *)&addr[roffs]);
342 				break;
343 
344 			default:
345 				break;
346 			}
347 
348 			for (i = 0; i < rec->dtrd_size; i++)
349 				hashval += addr[roffs + i];
350 		}
351 
352 		ndx = hashval % hash->dtah_size;
353 
354 		for (h = hash->dtah_hash[ndx]; h != NULL; h = h->dtahe_next) {
355 			if (h->dtahe_hashval != hashval)
356 				continue;
357 
358 			if (h->dtahe_size != size)
359 				continue;
360 
361 			aggdata = &h->dtahe_data;
362 			data = aggdata->dtada_data;
363 
364 			for (j = 0; j < agg->dtagd_nrecs - 1; j++) {
365 				rec = &agg->dtagd_rec[j];
366 				roffs = rec->dtrd_offset;
367 
368 				for (i = 0; i < rec->dtrd_size; i++)
369 					if (addr[roffs + i] != data[roffs + i])
370 						goto hashnext;
371 			}
372 
373 			/*
374 			 * We found it.  Now we need to apply the aggregating
375 			 * action on the data here.
376 			 */
377 			rec = &agg->dtagd_rec[agg->dtagd_nrecs - 1];
378 			roffs = rec->dtrd_offset;
379 			/* LINTED - alignment */
380 			h->dtahe_aggregate((uint64_t *)&data[roffs],
381 			    /* LINTED - alignment */
382 			    (uint64_t *)&addr[roffs], rec->dtrd_size);
383 
384 			/*
385 			 * If we're keeping per CPU data, apply the aggregating
386 			 * action there as well.
387 			 */
388 			if (aggdata->dtada_percpu != NULL) {
389 				data = aggdata->dtada_percpu[cpu];
390 
391 				/* LINTED - alignment */
392 				h->dtahe_aggregate((uint64_t *)data,
393 				    /* LINTED - alignment */
394 				    (uint64_t *)&addr[roffs], rec->dtrd_size);
395 			}
396 
397 			goto bufnext;
398 hashnext:
399 			continue;
400 		}
401 
402 		/*
403 		 * If we're here, we couldn't find an entry for this record.
404 		 */
405 		if ((h = malloc(sizeof (dt_ahashent_t))) == NULL)
406 			return (dt_set_errno(dtp, EDT_NOMEM));
407 		bzero(h, sizeof (dt_ahashent_t));
408 		aggdata = &h->dtahe_data;
409 
410 		if ((aggdata->dtada_data = malloc(size)) == NULL) {
411 			free(h);
412 			return (dt_set_errno(dtp, EDT_NOMEM));
413 		}
414 
415 		bcopy(addr, aggdata->dtada_data, size);
416 		aggdata->dtada_size = size;
417 		aggdata->dtada_desc = agg;
418 		aggdata->dtada_handle = dtp;
419 		(void) dt_epid_lookup(dtp, agg->dtagd_epid,
420 		    &aggdata->dtada_edesc, &aggdata->dtada_pdesc);
421 		aggdata->dtada_normal = 1;
422 
423 		h->dtahe_hashval = hashval;
424 		h->dtahe_size = size;
425 
426 		rec = &agg->dtagd_rec[agg->dtagd_nrecs - 1];
427 
428 		if (flags & DTRACE_A_PERCPU) {
429 			int max_cpus = agp->dtat_maxcpu;
430 			caddr_t *percpu = malloc(max_cpus * sizeof (caddr_t));
431 
432 			if (percpu == NULL) {
433 				free(aggdata->dtada_data);
434 				free(h);
435 				return (dt_set_errno(dtp, EDT_NOMEM));
436 			}
437 
438 			for (j = 0; j < max_cpus; j++) {
439 				percpu[j] = malloc(rec->dtrd_size);
440 
441 				if (percpu[j] == NULL) {
442 					while (--j >= 0)
443 						free(percpu[j]);
444 
445 					free(aggdata->dtada_data);
446 					free(h);
447 					return (dt_set_errno(dtp, EDT_NOMEM));
448 				}
449 
450 				if (j == cpu) {
451 					bcopy(&addr[rec->dtrd_offset],
452 					    percpu[j], rec->dtrd_size);
453 				} else {
454 					bzero(percpu[j], rec->dtrd_size);
455 				}
456 			}
457 
458 			aggdata->dtada_percpu = percpu;
459 		}
460 
461 		switch (rec->dtrd_action) {
462 		case DTRACEAGG_MIN:
463 			h->dtahe_aggregate = dt_aggregate_min;
464 			break;
465 
466 		case DTRACEAGG_MAX:
467 			h->dtahe_aggregate = dt_aggregate_max;
468 			break;
469 
470 		case DTRACEAGG_LQUANTIZE:
471 			h->dtahe_aggregate = dt_aggregate_lquantize;
472 			break;
473 
474 		case DTRACEAGG_COUNT:
475 		case DTRACEAGG_SUM:
476 		case DTRACEAGG_AVG:
477 		case DTRACEAGG_QUANTIZE:
478 			h->dtahe_aggregate = dt_aggregate_count;
479 			break;
480 
481 		default:
482 			return (dt_set_errno(dtp, EDT_BADAGG));
483 		}
484 
485 		if (hash->dtah_hash[ndx] != NULL)
486 			hash->dtah_hash[ndx]->dtahe_prev = h;
487 
488 		h->dtahe_next = hash->dtah_hash[ndx];
489 		hash->dtah_hash[ndx] = h;
490 
491 		if (hash->dtah_all != NULL)
492 			hash->dtah_all->dtahe_prevall = h;
493 
494 		h->dtahe_nextall = hash->dtah_all;
495 		hash->dtah_all = h;
496 bufnext:
497 		offs += agg->dtagd_size;
498 	}
499 
500 	return (0);
501 }
502 
503 int
504 dtrace_aggregate_snap(dtrace_hdl_t *dtp)
505 {
506 	int i, rval;
507 	dt_aggregate_t *agp = &dtp->dt_aggregate;
508 	hrtime_t now = gethrtime();
509 	dtrace_optval_t interval = dtp->dt_options[DTRACEOPT_AGGRATE];
510 
511 	if (dtp->dt_lastagg != 0) {
512 		if (now - dtp->dt_lastagg < interval)
513 			return (0);
514 
515 		dtp->dt_lastagg += interval;
516 	} else {
517 		dtp->dt_lastagg = now;
518 	}
519 
520 	if (!dtp->dt_active)
521 		return (dt_set_errno(dtp, EINVAL));
522 
523 	if (agp->dtat_buf.dtbd_size == 0)
524 		return (0);
525 
526 	for (i = 0; i < agp->dtat_ncpus; i++) {
527 		if (rval = dt_aggregate_snap_cpu(dtp, agp->dtat_cpus[i]))
528 			return (rval);
529 	}
530 
531 	return (0);
532 }
533 
534 static int
535 dt_aggregate_hashcmp(const void *lhs, const void *rhs)
536 {
537 	dt_ahashent_t *lh = *((dt_ahashent_t **)lhs);
538 	dt_ahashent_t *rh = *((dt_ahashent_t **)rhs);
539 	dtrace_aggdesc_t *lagg = lh->dtahe_data.dtada_desc;
540 	dtrace_aggdesc_t *ragg = rh->dtahe_data.dtada_desc;
541 
542 	if (lagg->dtagd_nrecs < ragg->dtagd_nrecs)
543 		return (-1);
544 
545 	if (lagg->dtagd_nrecs > ragg->dtagd_nrecs)
546 		return (1);
547 
548 	return (0);
549 }
550 
551 static int
552 dt_aggregate_varcmp(const void *lhs, const void *rhs)
553 {
554 	dt_ahashent_t *lh = *((dt_ahashent_t **)lhs);
555 	dt_ahashent_t *rh = *((dt_ahashent_t **)rhs);
556 	dtrace_aggdesc_t *lagg = lh->dtahe_data.dtada_desc;
557 	dtrace_aggdesc_t *ragg = rh->dtahe_data.dtada_desc;
558 	caddr_t ldata = lh->dtahe_data.dtada_data;
559 	caddr_t rdata = rh->dtahe_data.dtada_data;
560 	dtrace_recdesc_t *lrec, *rrec;
561 	uint64_t lid, rid;
562 
563 	/*
564 	 * We know that we have a compiler-generated ID as the first record.
565 	 */
566 	lrec = lagg->dtagd_rec;
567 	rrec = ragg->dtagd_rec;
568 
569 	lid = *((uint64_t *)(uintptr_t)(ldata + lrec->dtrd_offset));
570 	rid = *((uint64_t *)(uintptr_t)(rdata + rrec->dtrd_offset));
571 
572 	if (lid < rid)
573 		return (-1);
574 
575 	if (lid > rid)
576 		return (1);
577 
578 	return (0);
579 }
580 
581 static int
582 dt_aggregate_keycmp(const void *lhs, const void *rhs)
583 {
584 	dt_ahashent_t *lh = *((dt_ahashent_t **)lhs);
585 	dt_ahashent_t *rh = *((dt_ahashent_t **)rhs);
586 	dtrace_aggdesc_t *lagg = lh->dtahe_data.dtada_desc;
587 	dtrace_aggdesc_t *ragg = rh->dtahe_data.dtada_desc;
588 	dtrace_recdesc_t *lrec, *rrec;
589 	char *ldata, *rdata;
590 	int rval, i, j;
591 
592 	if ((rval = dt_aggregate_hashcmp(lhs, rhs)) != 0)
593 		return (rval);
594 
595 	for (i = 1; i < lagg->dtagd_nrecs - 1; i++) {
596 		uint64_t lval, rval;
597 
598 		lrec = &lagg->dtagd_rec[i];
599 		rrec = &ragg->dtagd_rec[i];
600 
601 		ldata = lh->dtahe_data.dtada_data + lrec->dtrd_offset;
602 		rdata = rh->dtahe_data.dtada_data + rrec->dtrd_offset;
603 
604 		if (lrec->dtrd_size < rrec->dtrd_size)
605 			return (-1);
606 
607 		if (lrec->dtrd_size > rrec->dtrd_size)
608 			return (1);
609 
610 		switch (lrec->dtrd_size) {
611 		case sizeof (uint64_t):
612 			/* LINTED - alignment */
613 			lval = *((uint64_t *)ldata);
614 			/* LINTED - alignment */
615 			rval = *((uint64_t *)rdata);
616 			break;
617 
618 		case sizeof (uint32_t):
619 			/* LINTED - alignment */
620 			lval = *((uint32_t *)ldata);
621 			/* LINTED - alignment */
622 			rval = *((uint32_t *)rdata);
623 			break;
624 
625 		case sizeof (uint16_t):
626 			/* LINTED - alignment */
627 			lval = *((uint16_t *)ldata);
628 			/* LINTED - alignment */
629 			rval = *((uint16_t *)rdata);
630 			break;
631 
632 		case sizeof (uint8_t):
633 			lval = *((uint8_t *)ldata);
634 			rval = *((uint8_t *)rdata);
635 			break;
636 
637 		default:
638 			for (j = 0; j < lrec->dtrd_size; j++) {
639 				lval = ((uint8_t *)ldata)[j];
640 				rval = ((uint8_t *)rdata)[j];
641 
642 				if (lval < rval)
643 					return (-1);
644 
645 				if (lval > rval)
646 					return (1);
647 			}
648 
649 			continue;
650 		}
651 
652 		if (lval < rval)
653 			return (-1);
654 
655 		if (lval > rval)
656 			return (1);
657 	}
658 
659 	return (0);
660 }
661 
662 static int
663 dt_aggregate_valcmp(const void *lhs, const void *rhs)
664 {
665 	dt_ahashent_t *lh = *((dt_ahashent_t **)lhs);
666 	dt_ahashent_t *rh = *((dt_ahashent_t **)rhs);
667 	dtrace_aggdesc_t *lagg = lh->dtahe_data.dtada_desc;
668 	dtrace_aggdesc_t *ragg = rh->dtahe_data.dtada_desc;
669 	caddr_t ldata = lh->dtahe_data.dtada_data;
670 	caddr_t rdata = rh->dtahe_data.dtada_data;
671 	dtrace_recdesc_t *lrec, *rrec;
672 	uint64_t *laddr, *raddr;
673 	int rval, i;
674 
675 	if ((rval = dt_aggregate_hashcmp(lhs, rhs)) != 0)
676 		return (rval);
677 
678 	if (lagg->dtagd_nrecs < ragg->dtagd_nrecs)
679 		return (-1);
680 
681 	if (lagg->dtagd_nrecs > ragg->dtagd_nrecs)
682 		return (1);
683 
684 	for (i = 0; i < lagg->dtagd_nrecs; i++) {
685 		lrec = &lagg->dtagd_rec[i];
686 		rrec = &ragg->dtagd_rec[i];
687 
688 		if (lrec->dtrd_offset < rrec->dtrd_offset)
689 			return (-1);
690 
691 		if (lrec->dtrd_offset > rrec->dtrd_offset)
692 			return (1);
693 
694 		if (lrec->dtrd_action < rrec->dtrd_action)
695 			return (-1);
696 
697 		if (lrec->dtrd_action > rrec->dtrd_action)
698 			return (1);
699 	}
700 
701 	laddr = (uint64_t *)(uintptr_t)(ldata + lrec->dtrd_offset);
702 	raddr = (uint64_t *)(uintptr_t)(rdata + rrec->dtrd_offset);
703 
704 	switch (lrec->dtrd_action) {
705 	case DTRACEAGG_AVG:
706 		rval = dt_aggregate_averagecmp(laddr, raddr);
707 		break;
708 
709 	case DTRACEAGG_QUANTIZE:
710 		rval = dt_aggregate_quantizedcmp(laddr, raddr);
711 		break;
712 
713 	case DTRACEAGG_LQUANTIZE:
714 		rval = dt_aggregate_lquantizedcmp(laddr, raddr);
715 		break;
716 
717 	case DTRACEAGG_COUNT:
718 	case DTRACEAGG_SUM:
719 	case DTRACEAGG_MIN:
720 	case DTRACEAGG_MAX:
721 		rval = dt_aggregate_countcmp(laddr, raddr);
722 		break;
723 
724 	default:
725 		assert(0);
726 	}
727 
728 	if (rval != 0)
729 		return (rval);
730 
731 	/*
732 	 * If we're here, the values for the two aggregation elements are
733 	 * equal.  We already know that the key layout is the same for the two
734 	 * elements; we must now compare the keys themselves as a tie-breaker.
735 	 */
736 	return (dt_aggregate_keycmp(lhs, rhs));
737 }
738 
/*
 * Comparator that orders by key (dt_aggregate_keycmp()) and breaks ties by
 * aggregation variable (dt_aggregate_varcmp()).
 */
static int
dt_aggregate_keyvarcmp(const void *lhs, const void *rhs)
{
	int order = dt_aggregate_keycmp(lhs, rhs);

	return (order != 0 ? order : dt_aggregate_varcmp(lhs, rhs));
}
749 
/*
 * Comparator that orders by aggregation variable (dt_aggregate_varcmp()) and
 * breaks ties by key (dt_aggregate_keycmp()).
 */
static int
dt_aggregate_varkeycmp(const void *lhs, const void *rhs)
{
	int order = dt_aggregate_varcmp(lhs, rhs);

	return (order != 0 ? order : dt_aggregate_keycmp(lhs, rhs));
}
760 
/*
 * Comparator that orders by aggregated value (dt_aggregate_valcmp()) and
 * breaks ties by aggregation variable (dt_aggregate_varcmp()).
 */
static int
dt_aggregate_valvarcmp(const void *lhs, const void *rhs)
{
	int order = dt_aggregate_valcmp(lhs, rhs);

	return (order != 0 ? order : dt_aggregate_varcmp(lhs, rhs));
}
771 
/*
 * Comparator that orders by aggregation variable (dt_aggregate_varcmp()) and
 * breaks ties by aggregated value (dt_aggregate_valcmp()).
 */
static int
dt_aggregate_varvalcmp(const void *lhs, const void *rhs)
{
	int order = dt_aggregate_varcmp(lhs, rhs);

	return (order != 0 ? order : dt_aggregate_valcmp(lhs, rhs));
}
782 
/*
 * Reverse of dt_aggregate_keyvarcmp():  the operands are passed in swapped
 * order.
 */
static int
dt_aggregate_keyvarrevcmp(const void *lhs, const void *rhs)
{
	int reversed = dt_aggregate_keyvarcmp(rhs, lhs);

	return (reversed);
}
788 
/*
 * Reverse of dt_aggregate_varkeycmp():  the operands are passed in swapped
 * order.
 */
static int
dt_aggregate_varkeyrevcmp(const void *lhs, const void *rhs)
{
	int reversed = dt_aggregate_varkeycmp(rhs, lhs);

	return (reversed);
}
794 
/*
 * Reverse of dt_aggregate_valvarcmp():  the operands are passed in swapped
 * order.
 */
static int
dt_aggregate_valvarrevcmp(const void *lhs, const void *rhs)
{
	int reversed = dt_aggregate_valvarcmp(rhs, lhs);

	return (reversed);
}
800 
/*
 * Reverse of dt_aggregate_varvalcmp():  the operands are passed in swapped
 * order.
 */
static int
dt_aggregate_varvalrevcmp(const void *lhs, const void *rhs)
{
	int reversed = dt_aggregate_varvalcmp(rhs, lhs);

	return (reversed);
}
806 
807 int
808 dt_aggregate_go(dtrace_hdl_t *dtp)
809 {
810 	dt_aggregate_t *agp = &dtp->dt_aggregate;
811 	dtrace_optval_t size, cpu;
812 	dtrace_bufdesc_t *buf = &agp->dtat_buf;
813 	int rval, i;
814 
815 	assert(agp->dtat_maxcpu == 0);
816 	assert(agp->dtat_ncpu == 0);
817 	assert(agp->dtat_cpus == NULL);
818 
819 	agp->dtat_maxcpu = dt_sysconf(dtp, _SC_CPUID_MAX) + 1;
820 	agp->dtat_ncpu = dt_sysconf(dtp, _SC_NPROCESSORS_MAX);
821 	agp->dtat_cpus = malloc(agp->dtat_ncpu * sizeof (processorid_t));
822 
823 	if (agp->dtat_cpus == NULL)
824 		return (dt_set_errno(dtp, EDT_NOMEM));
825 
826 	/*
827 	 * Use the aggregation buffer size as reloaded from the kernel.
828 	 */
829 	size = dtp->dt_options[DTRACEOPT_AGGSIZE];
830 
831 	rval = dtrace_getopt(dtp, "aggsize", &size);
832 	assert(rval == 0);
833 
834 	if (size == 0 || size == DTRACEOPT_UNSET)
835 		return (0);
836 
837 	buf = &agp->dtat_buf;
838 	buf->dtbd_size = size;
839 
840 	if ((buf->dtbd_data = malloc(buf->dtbd_size)) == NULL)
841 		return (dt_set_errno(dtp, EDT_NOMEM));
842 
843 	/*
844 	 * Now query for the CPUs enabled.
845 	 */
846 	rval = dtrace_getopt(dtp, "cpu", &cpu);
847 	assert(rval == 0 && cpu != DTRACEOPT_UNSET);
848 
849 	if (cpu != DTRACE_CPUALL) {
850 		assert(cpu < agp->dtat_ncpu);
851 		agp->dtat_cpus[agp->dtat_ncpus++] = (processorid_t)cpu;
852 
853 		return (0);
854 	}
855 
856 	agp->dtat_ncpus = 0;
857 	for (i = 0; i < agp->dtat_maxcpu; i++) {
858 		if (dt_status(dtp, i) == -1)
859 			continue;
860 
861 		agp->dtat_cpus[agp->dtat_ncpus++] = i;
862 	}
863 
864 	return (0);
865 }
866 
867 static int
868 dt_aggwalk_rval(dtrace_hdl_t *dtp, dt_ahashent_t *h, int rval)
869 {
870 	dt_aggregate_t *agp = &dtp->dt_aggregate;
871 	dtrace_aggdata_t *data;
872 	dtrace_aggdesc_t *aggdesc;
873 	dtrace_recdesc_t *rec;
874 	int i;
875 
876 	switch (rval) {
877 	case DTRACE_AGGWALK_NEXT:
878 		break;
879 
880 	case DTRACE_AGGWALK_CLEAR: {
881 		uint32_t size, offs = 0;
882 
883 		aggdesc = h->dtahe_data.dtada_desc;
884 		rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
885 		size = rec->dtrd_size;
886 		data = &h->dtahe_data;
887 
888 		if (rec->dtrd_action == DTRACEAGG_LQUANTIZE) {
889 			offs = sizeof (uint64_t);
890 			size -= sizeof (uint64_t);
891 		}
892 
893 		bzero(&data->dtada_data[rec->dtrd_offset] + offs, size);
894 
895 		if (data->dtada_percpu == NULL)
896 			break;
897 
898 		for (i = 0; i < dtp->dt_aggregate.dtat_maxcpu; i++)
899 			bzero(data->dtada_percpu[i] + offs, size);
900 		break;
901 	}
902 
903 	case DTRACE_AGGWALK_ERROR:
904 		/*
905 		 * We assume that errno is already set in this case.
906 		 */
907 		return (dt_set_errno(dtp, errno));
908 
909 	case DTRACE_AGGWALK_ABORT:
910 		return (dt_set_errno(dtp, EDT_DIRABORT));
911 
912 	case DTRACE_AGGWALK_DENORMALIZE:
913 		h->dtahe_data.dtada_normal = 1;
914 		return (0);
915 
916 	case DTRACE_AGGWALK_NORMALIZE:
917 		if (h->dtahe_data.dtada_normal == 0) {
918 			h->dtahe_data.dtada_normal = 1;
919 			return (dt_set_errno(dtp, EDT_BADRVAL));
920 		}
921 
922 		return (0);
923 
924 	case DTRACE_AGGWALK_REMOVE: {
925 		dtrace_aggdata_t *aggdata = &h->dtahe_data;
926 		int i, max_cpus = agp->dtat_maxcpu;
927 
928 		/*
929 		 * First, remove this hash entry from its hash chain.
930 		 */
931 		if (h->dtahe_prev != NULL) {
932 			h->dtahe_prev->dtahe_next = h->dtahe_next;
933 		} else {
934 			dt_ahash_t *hash = &agp->dtat_hash;
935 			size_t ndx = h->dtahe_hashval % hash->dtah_size;
936 
937 			assert(hash->dtah_hash[ndx] == h);
938 			hash->dtah_hash[ndx] = h->dtahe_next;
939 		}
940 
941 		if (h->dtahe_next != NULL)
942 			h->dtahe_next->dtahe_prev = h->dtahe_prev;
943 
944 		/*
945 		 * Now remove it from the list of all hash entries.
946 		 */
947 		if (h->dtahe_prevall != NULL) {
948 			h->dtahe_prevall->dtahe_nextall = h->dtahe_nextall;
949 		} else {
950 			dt_ahash_t *hash = &agp->dtat_hash;
951 
952 			assert(hash->dtah_all == h);
953 			hash->dtah_all = h->dtahe_nextall;
954 		}
955 
956 		if (h->dtahe_nextall != NULL)
957 			h->dtahe_nextall->dtahe_prevall = h->dtahe_prevall;
958 
959 		/*
960 		 * We're unlinked.  We can safely destroy the data.
961 		 */
962 		if (aggdata->dtada_percpu != NULL) {
963 			for (i = 0; i < max_cpus; i++)
964 				free(aggdata->dtada_percpu[i]);
965 			free(aggdata->dtada_percpu);
966 		}
967 
968 		free(aggdata->dtada_data);
969 		free(h);
970 
971 		return (0);
972 	}
973 
974 	default:
975 		return (dt_set_errno(dtp, EDT_BADRVAL));
976 	}
977 
978 	return (0);
979 }
980 
981 int
982 dtrace_aggregate_walk(dtrace_hdl_t *dtp, dtrace_aggregate_f *func, void *arg)
983 {
984 	dt_ahashent_t *h, *next;
985 	dt_ahash_t *hash = &dtp->dt_aggregate.dtat_hash;
986 
987 	for (h = hash->dtah_all; h != NULL; h = next) {
988 		/*
989 		 * dt_aggwalk_rval() can potentially remove the current hash
990 		 * entry; we need to load the next hash entry before calling
991 		 * into it.
992 		 */
993 		next = h->dtahe_nextall;
994 
995 		if (dt_aggwalk_rval(dtp, h, func(&h->dtahe_data, arg)) == -1)
996 			return (-1);
997 	}
998 
999 	return (0);
1000 }
1001 
1002 static int
1003 dt_aggregate_walk_sorted(dtrace_hdl_t *dtp,
1004     dtrace_aggregate_f *func, void *arg,
1005     int (*sfunc)(const void *, const void *))
1006 {
1007 	dt_aggregate_t *agp = &dtp->dt_aggregate;
1008 	dt_ahashent_t *h, **sorted;
1009 	dt_ahash_t *hash = &agp->dtat_hash;
1010 	size_t i, nentries = 0;
1011 
1012 	for (h = hash->dtah_all; h != NULL; h = h->dtahe_nextall)
1013 		nentries++;
1014 
1015 	sorted = malloc(nentries * sizeof (dt_ahashent_t *));
1016 
1017 	if (sorted == NULL)
1018 		return (dt_set_errno(dtp, EDT_NOMEM));
1019 
1020 	for (h = hash->dtah_all, i = 0; h != NULL; h = h->dtahe_nextall)
1021 		sorted[i++] = h;
1022 
1023 	qsort(sorted, nentries, sizeof (dt_ahashent_t *), sfunc);
1024 
1025 	for (i = 0; i < nentries; i++) {
1026 		h = sorted[i];
1027 
1028 		if (dt_aggwalk_rval(dtp, h, func(&h->dtahe_data, arg)) == -1)
1029 			return (-1);
1030 	}
1031 
1032 	free(sorted);
1033 	return (0);
1034 }
1035 
1036 int
1037 dtrace_aggregate_walk_keysorted(dtrace_hdl_t *dtp,
1038     dtrace_aggregate_f *func, void *arg)
1039 {
1040 	return (dt_aggregate_walk_sorted(dtp, func,
1041 	    arg, dt_aggregate_varkeycmp));
1042 }
1043 
1044 int
1045 dtrace_aggregate_walk_valsorted(dtrace_hdl_t *dtp,
1046     dtrace_aggregate_f *func, void *arg)
1047 {
1048 	return (dt_aggregate_walk_sorted(dtp, func,
1049 	    arg, dt_aggregate_varvalcmp));
1050 }
1051 
1052 int
1053 dtrace_aggregate_walk_keyvarsorted(dtrace_hdl_t *dtp,
1054     dtrace_aggregate_f *func, void *arg)
1055 {
1056 	return (dt_aggregate_walk_sorted(dtp, func,
1057 	    arg, dt_aggregate_keyvarcmp));
1058 }
1059 
1060 int
1061 dtrace_aggregate_walk_valvarsorted(dtrace_hdl_t *dtp,
1062     dtrace_aggregate_f *func, void *arg)
1063 {
1064 	return (dt_aggregate_walk_sorted(dtp, func,
1065 	    arg, dt_aggregate_valvarcmp));
1066 }
1067 
1068 int
1069 dtrace_aggregate_walk_keyrevsorted(dtrace_hdl_t *dtp,
1070     dtrace_aggregate_f *func, void *arg)
1071 {
1072 	return (dt_aggregate_walk_sorted(dtp, func,
1073 	    arg, dt_aggregate_varkeyrevcmp));
1074 }
1075 
1076 int
1077 dtrace_aggregate_walk_valrevsorted(dtrace_hdl_t *dtp,
1078     dtrace_aggregate_f *func, void *arg)
1079 {
1080 	return (dt_aggregate_walk_sorted(dtp, func,
1081 	    arg, dt_aggregate_varvalrevcmp));
1082 }
1083 
1084 int
1085 dtrace_aggregate_walk_keyvarrevsorted(dtrace_hdl_t *dtp,
1086     dtrace_aggregate_f *func, void *arg)
1087 {
1088 	return (dt_aggregate_walk_sorted(dtp, func,
1089 	    arg, dt_aggregate_keyvarrevcmp));
1090 }
1091 
1092 int
1093 dtrace_aggregate_walk_valvarrevsorted(dtrace_hdl_t *dtp,
1094     dtrace_aggregate_f *func, void *arg)
1095 {
1096 	return (dt_aggregate_walk_sorted(dtp, func,
1097 	    arg, dt_aggregate_valvarrevcmp));
1098 }
1099 
1100 int
1101 dtrace_aggregate_print(dtrace_hdl_t *dtp, FILE *fp,
1102     dtrace_aggregate_walk_f *func)
1103 {
1104 	dt_print_aggdata_t pd;
1105 
1106 	pd.dtpa_dtp = dtp;
1107 	pd.dtpa_fp = fp;
1108 	pd.dtpa_allunprint = 1;
1109 
1110 	if (func == NULL)
1111 		func = dtrace_aggregate_walk_valsorted;
1112 
1113 	if ((*func)(dtp, dt_print_agg, &pd) == -1)
1114 		return (dt_set_errno(dtp, dtp->dt_errno));
1115 
1116 	return (0);
1117 }
1118 
1119 void
1120 dtrace_aggregate_clear(dtrace_hdl_t *dtp)
1121 {
1122 	dt_aggregate_t *agp = &dtp->dt_aggregate;
1123 	dt_ahash_t *hash = &agp->dtat_hash;
1124 	dt_ahashent_t *h;
1125 	dtrace_aggdata_t *data;
1126 	dtrace_aggdesc_t *aggdesc;
1127 	dtrace_recdesc_t *rec;
1128 	int i, max_cpus = agp->dtat_maxcpu;
1129 
1130 	for (h = hash->dtah_all; h != NULL; h = h->dtahe_nextall) {
1131 		aggdesc = h->dtahe_data.dtada_desc;
1132 		rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
1133 		data = &h->dtahe_data;
1134 
1135 		bzero(&data->dtada_data[rec->dtrd_offset], rec->dtrd_size);
1136 
1137 		if (data->dtada_percpu == NULL)
1138 			continue;
1139 
1140 		for (i = 0; i < max_cpus; i++)
1141 			bzero(data->dtada_percpu[i], rec->dtrd_size);
1142 	}
1143 }
1144 
1145 void
1146 dt_aggregate_destroy(dtrace_hdl_t *dtp)
1147 {
1148 	dt_aggregate_t *agp = &dtp->dt_aggregate;
1149 	dt_ahash_t *hash = &agp->dtat_hash;
1150 	dt_ahashent_t *h, *next;
1151 	dtrace_aggdata_t *aggdata;
1152 	int i, max_cpus = agp->dtat_maxcpu;
1153 
1154 	if (hash->dtah_hash == NULL) {
1155 		assert(hash->dtah_all == NULL);
1156 	} else {
1157 		free(hash->dtah_hash);
1158 
1159 		for (h = hash->dtah_all; h != NULL; h = next) {
1160 			next = h->dtahe_nextall;
1161 
1162 			aggdata = &h->dtahe_data;
1163 
1164 			if (aggdata->dtada_percpu != NULL) {
1165 				for (i = 0; i < max_cpus; i++)
1166 					free(aggdata->dtada_percpu[i]);
1167 				free(aggdata->dtada_percpu);
1168 			}
1169 
1170 			free(aggdata->dtada_data);
1171 			free(h);
1172 		}
1173 
1174 		hash->dtah_hash = NULL;
1175 		hash->dtah_all = NULL;
1176 		hash->dtah_size = 0;
1177 	}
1178 
1179 	free(agp->dtat_buf.dtbd_data);
1180 	free(agp->dtat_cpus);
1181 }
1182