xref: /illumos-gate/usr/src/uts/common/dtrace/dtrace.c (revision ed1faac1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2019 Joyent, Inc.
25  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
26  */
27 
28 /*
29  * DTrace - Dynamic Tracing for Solaris
30  *
31  * This is the implementation of the Solaris Dynamic Tracing framework
32  * (DTrace).  The user-visible interface to DTrace is described at length in
33  * the "Solaris Dynamic Tracing Guide".  The interfaces between the libdtrace
34  * library, the in-kernel DTrace framework, and the DTrace providers are
35  * described in the block comments in the <sys/dtrace.h> header file.  The
36  * internal architecture of DTrace is described in the block comments in the
37  * <sys/dtrace_impl.h> header file.  The comments contained within the DTrace
38  * implementation very much assume mastery of all of these sources; if one has
39  * an unanswered question about the implementation, one should consult them
40  * first.
41  *
42  * The functions here are ordered roughly as follows:
43  *
44  *   - Probe context functions
45  *   - Probe hashing functions
46  *   - Non-probe context utility functions
47  *   - Matching functions
48  *   - Provider-to-Framework API functions
49  *   - Probe management functions
50  *   - DIF object functions
51  *   - Format functions
52  *   - Predicate functions
53  *   - ECB functions
54  *   - Buffer functions
55  *   - Enabling functions
56  *   - DOF functions
57  *   - Anonymous enabling functions
58  *   - Consumer state functions
59  *   - Helper functions
60  *   - Hook functions
61  *   - Driver cookbook functions
62  *
63  * Each group of functions begins with a block comment labelled the "DTrace
64  * [Group] Functions", allowing one to find each block by searching forward
65  * on capital-f functions.
66  */
67 #include <sys/errno.h>
68 #include <sys/stat.h>
69 #include <sys/modctl.h>
70 #include <sys/conf.h>
71 #include <sys/systm.h>
72 #include <sys/ddi.h>
73 #include <sys/sunddi.h>
74 #include <sys/cpuvar.h>
75 #include <sys/kmem.h>
76 #include <sys/strsubr.h>
77 #include <sys/sysmacros.h>
78 #include <sys/dtrace_impl.h>
79 #include <sys/atomic.h>
80 #include <sys/cmn_err.h>
81 #include <sys/mutex_impl.h>
82 #include <sys/rwlock_impl.h>
83 #include <sys/ctf_api.h>
84 #include <sys/panic.h>
85 #include <sys/priv_impl.h>
86 #include <sys/policy.h>
87 #include <sys/cred_impl.h>
88 #include <sys/procfs_isa.h>
89 #include <sys/taskq.h>
90 #include <sys/mkdev.h>
91 #include <sys/kdi.h>
92 #include <sys/zone.h>
93 #include <sys/socket.h>
94 #include <netinet/in.h>
95 #include "strtolctype.h"
96 
97 /*
98  * DTrace Tunable Variables
99  *
100  * The following variables may be tuned by adding a line to /etc/system that
101  * includes both the name of the DTrace module ("dtrace") and the name of the
102  * variable.  For example:
103  *
104  *   set dtrace:dtrace_destructive_disallow = 1
105  *
106  * In general, the only variables that one should be tuning this way are those
107  * that affect system-wide DTrace behavior, and for which the default behavior
108  * is undesirable.  Most of these variables are tunable on a per-consumer
109  * basis using DTrace options, and need not be tuned on a system-wide basis.
110  * When tuning these variables, avoid pathological values; while some attempt
111  * is made to verify the integrity of these variables, they are not considered
112  * part of the supported interface to DTrace, and they are therefore not
113  * checked comprehensively.  Further, these variables should not be tuned
114  * dynamically via "mdb -kw" or other means; they should only be tuned via
115  * /etc/system.
116  */
117 int		dtrace_destructive_disallow = 0;
118 dtrace_optval_t	dtrace_nonroot_maxsize = (16 * 1024 * 1024);
119 size_t		dtrace_difo_maxsize = (256 * 1024);
120 dtrace_optval_t	dtrace_dof_maxsize = (8 * 1024 * 1024);
121 size_t		dtrace_statvar_maxsize = (16 * 1024);
122 size_t		dtrace_actions_max = (16 * 1024);
123 size_t		dtrace_retain_max = 1024;
124 dtrace_optval_t	dtrace_helper_actions_max = 1024;
125 dtrace_optval_t	dtrace_helper_providers_max = 32;
126 dtrace_optval_t	dtrace_dstate_defsize = (1 * 1024 * 1024);
127 size_t		dtrace_strsize_default = 256;
128 dtrace_optval_t	dtrace_cleanrate_default = 9900990;		/* 101 hz */
129 dtrace_optval_t	dtrace_cleanrate_min = 200000;			/* 5000 hz */
130 dtrace_optval_t	dtrace_cleanrate_max = (uint64_t)60 * NANOSEC;	/* 1/minute */
131 dtrace_optval_t	dtrace_aggrate_default = NANOSEC;		/* 1 hz */
132 dtrace_optval_t	dtrace_statusrate_default = NANOSEC;		/* 1 hz */
133 dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC;	 /* 6/minute */
134 dtrace_optval_t	dtrace_switchrate_default = NANOSEC;		/* 1 hz */
135 dtrace_optval_t	dtrace_nspec_default = 1;
136 dtrace_optval_t	dtrace_specsize_default = 32 * 1024;
137 dtrace_optval_t dtrace_stackframes_default = 20;
138 dtrace_optval_t dtrace_ustackframes_default = 20;
139 dtrace_optval_t dtrace_jstackframes_default = 50;
140 dtrace_optval_t dtrace_jstackstrsize_default = 512;
141 int		dtrace_msgdsize_max = 128;
142 hrtime_t	dtrace_chill_max = MSEC2NSEC(500);		/* 500 ms */
143 hrtime_t	dtrace_chill_interval = NANOSEC;		/* 1000 ms */
144 int		dtrace_devdepth_max = 32;
145 int		dtrace_err_verbose;
146 hrtime_t	dtrace_deadman_interval = NANOSEC;
147 hrtime_t	dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
148 hrtime_t	dtrace_deadman_user = (hrtime_t)30 * NANOSEC;
149 hrtime_t	dtrace_unregister_defunct_reap = (hrtime_t)60 * NANOSEC;
150 
151 /*
152  * DTrace External Variables
153  *
154  * As dtrace(7D) is a kernel module, any DTrace variables are obviously
155  * available to DTrace consumers via the backtick (`) syntax.  One of these,
156  * dtrace_zero, is made deliberately so:  it is provided as a source of
157  * well-known, zero-filled memory.  While this variable is not documented,
158  * it is used by some translators as an implementation detail.
159  */
160 const char	dtrace_zero[256] = { 0 };	/* zero-filled memory */
161 
162 /*
163  * DTrace Internal Variables
164  */
165 static dev_info_t	*dtrace_devi;		/* device info */
166 static vmem_t		*dtrace_arena;		/* probe ID arena */
167 static vmem_t		*dtrace_minor;		/* minor number arena */
168 static taskq_t		*dtrace_taskq;		/* task queue */
169 static dtrace_probe_t	**dtrace_probes;	/* array of all probes */
170 static int		dtrace_nprobes;		/* number of probes */
171 static dtrace_provider_t *dtrace_provider;	/* provider list */
172 static dtrace_meta_t	*dtrace_meta_pid;	/* user-land meta provider */
173 static int		dtrace_opens;		/* number of opens */
174 static int		dtrace_helpers;		/* number of helpers */
175 static int		dtrace_getf;		/* number of unpriv getf()s */
176 static void		*dtrace_softstate;	/* softstate pointer */
177 static dtrace_hash_t	*dtrace_bymod;		/* probes hashed by module */
178 static dtrace_hash_t	*dtrace_byfunc;		/* probes hashed by function */
179 static dtrace_hash_t	*dtrace_byname;		/* probes hashed by name */
180 static dtrace_toxrange_t *dtrace_toxrange;	/* toxic range array */
181 static int		dtrace_toxranges;	/* number of toxic ranges */
182 static int		dtrace_toxranges_max;	/* size of toxic range array */
183 static dtrace_anon_t	dtrace_anon;		/* anonymous enabling */
184 static kmem_cache_t	*dtrace_state_cache;	/* cache for dynamic state */
185 static uint64_t		dtrace_vtime_references; /* number of vtimestamp refs */
186 static kthread_t	*dtrace_panicked;	/* panicking thread */
187 static dtrace_ecb_t	*dtrace_ecb_create_cache; /* cached created ECB */
188 static dtrace_genid_t	dtrace_probegen;	/* current probe generation */
189 static dtrace_helpers_t *dtrace_deferred_pid;	/* deferred helper list */
190 static dtrace_enabling_t *dtrace_retained;	/* list of retained enablings */
191 static dtrace_genid_t	dtrace_retained_gen;	/* current retained enab gen */
192 static dtrace_dynvar_t	dtrace_dynhash_sink;	/* end of dynamic hash chains */
193 static int		dtrace_dynvar_failclean; /* dynvars failed to clean */
194 
195 /*
196  * DTrace Locking
197  * DTrace is protected by three (relatively coarse-grained) locks:
198  *
199  * (1) dtrace_lock is required to manipulate essentially any DTrace state,
200  *     including enabling state, probes, ECBs, consumer state, helper state,
201  *     etc.  Importantly, dtrace_lock is _not_ required when in probe context;
202  *     probe context is lock-free -- synchronization is handled via the
203  *     dtrace_sync() cross call mechanism.
204  *
205  * (2) dtrace_provider_lock is required when manipulating provider state, or
206  *     when provider state must be held constant.
207  *
208  * (3) dtrace_meta_lock is required when manipulating meta provider state, or
209  *     when meta provider state must be held constant.
210  *
211  * The lock ordering between these three locks is dtrace_meta_lock before
212  * dtrace_provider_lock before dtrace_lock.  (In particular, there are
213  * several places where dtrace_provider_lock is held by the framework as it
214  * calls into the providers -- which then call back into the framework,
215  * grabbing dtrace_lock.)
216  *
217  * There are two other locks in the mix:  mod_lock and cpu_lock.  With respect
218  * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
219  * role as a coarse-grained lock; it is acquired before both of these locks.
220  * With respect to dtrace_meta_lock, its behavior is stranger:  cpu_lock must
221  * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
222  * mod_lock is similar with respect to dtrace_provider_lock in that it must be
223  * acquired _between_ dtrace_provider_lock and dtrace_lock.
224  */
225 static kmutex_t		dtrace_lock;		/* probe state lock */
226 static kmutex_t		dtrace_provider_lock;	/* provider state lock */
227 static kmutex_t		dtrace_meta_lock;	/* meta-provider state lock */
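
/*
 * Illustrative sketch (not part of the original source): taken together, the
 * ordering rules above mean that non-probe context code acquiring several of
 * these locks must do so in the following order to avoid deadlock:
 *
 *	mutex_enter(&dtrace_meta_lock);		(if needed)
 *	mutex_enter(&cpu_lock);
 *	mutex_enter(&dtrace_provider_lock);
 *	mutex_enter(&mod_lock);			(if needed)
 *	mutex_enter(&dtrace_lock);
 *	...
 *	mutex_exit(&dtrace_lock);
 *	mutex_exit(&mod_lock);
 *	mutex_exit(&dtrace_provider_lock);
 *	mutex_exit(&cpu_lock);
 *	mutex_exit(&dtrace_meta_lock);
 */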
228 
229 /*
230  * DTrace Provider Variables
231  *
232  * These are the variables relating to DTrace as a provider (that is, the
233  * provider of the BEGIN, END, and ERROR probes).
234  */
235 static dtrace_pattr_t	dtrace_provider_attr = {
236 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
237 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
238 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
239 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
240 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
241 };
242 
243 static void
244 dtrace_nullop_provide(void *arg __unused,
245     const dtrace_probedesc_t *spec __unused)
246 {
247 }
248 
249 static void
250 dtrace_nullop_module(void *arg __unused, struct modctl *mp __unused)
251 {
252 }
253 
254 static void
255 dtrace_nullop(void *arg __unused, dtrace_id_t id __unused, void *parg __unused)
256 {
257 }
258 
259 static int
260 dtrace_enable_nullop(void *arg __unused, dtrace_id_t id __unused,
261     void *parg __unused)
262 {
263 	return (0);
264 }
265 
266 static dtrace_pops_t	dtrace_provider_ops = {
267 	.dtps_provide = dtrace_nullop_provide,
268 	.dtps_provide_module = dtrace_nullop_module,
269 	.dtps_enable = dtrace_enable_nullop,
270 	.dtps_disable = dtrace_nullop,
271 	.dtps_suspend = dtrace_nullop,
272 	.dtps_resume = dtrace_nullop,
273 	.dtps_getargdesc = NULL,
274 	.dtps_getargval = NULL,
275 	.dtps_mode = NULL,
276 	.dtps_destroy = dtrace_nullop
277 };
278 
279 static dtrace_id_t	dtrace_probeid_begin;	/* special BEGIN probe */
280 static dtrace_id_t	dtrace_probeid_end;	/* special END probe */
281 dtrace_id_t		dtrace_probeid_error;	/* special ERROR probe */
282 
283 /*
284  * DTrace Helper Tracing Variables
285  *
286  * These variables should be set dynamically to enable helper tracing.  The
287  * only variables that should be set are dtrace_helptrace_enable (which should
288  * be set to a non-zero value to allocate helper tracing buffers on the next
289  * open of /dev/dtrace) and dtrace_helptrace_disable (which should be set to a
290  * non-zero value to deallocate helper tracing buffers on the next close of
291  * /dev/dtrace).  When (and only when) helper tracing is disabled, the
292  * buffer size may also be set via dtrace_helptrace_bufsize.
293  */
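/*
 * For example (illustrative only), helper tracing may be enabled on a live
 * system by writing a non-zero value to dtrace_helptrace_enable, e.g. with
 * mdb -kw:
 *
 *	> dtrace_helptrace_enable/W 1
 *
 * The tracing buffers are then allocated on the next open of /dev/dtrace, as
 * noted above.
 */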
294 int			dtrace_helptrace_enable = 0;
295 int			dtrace_helptrace_disable = 0;
296 int			dtrace_helptrace_bufsize = 16 * 1024 * 1024;
297 uint32_t		dtrace_helptrace_nlocals;
298 static dtrace_helptrace_t *dtrace_helptrace_buffer;
299 static uint32_t		dtrace_helptrace_next = 0;
300 static int		dtrace_helptrace_wrapped = 0;
301 
302 /*
303  * DTrace Error Hashing
304  *
305  * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
306  * table.  This is very useful for checking coverage of tests that are
307  * expected to induce DIF or DOF processing errors, and may be useful for
308  * debugging problems in the DIF code generator or in DOF generation.  The
309  * error hash may be examined with the ::dtrace_errhash MDB dcmd.
310  */
311 #ifdef DEBUG
312 static dtrace_errhash_t	dtrace_errhash[DTRACE_ERRHASHSZ];
313 static const char *dtrace_errlast;
314 static kthread_t *dtrace_errthread;
315 static kmutex_t dtrace_errlock;
316 #endif
317 
318 /*
319  * DTrace Macros and Constants
320  *
321  * These are various macros that are useful in various spots in the
322  * implementation, along with a few random constants that have no meaning
323  * outside of the implementation.  There is no real structure to this cpp
324  * mishmash -- but is there ever?
325  */
326 #define	DTRACE_HASHSTR(hash, probe)	\
327 	dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs)))
328 
329 #define	DTRACE_HASHNEXT(hash, probe)	\
330 	(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs)
331 
332 #define	DTRACE_HASHPREV(hash, probe)	\
333 	(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs)
334 
335 #define	DTRACE_HASHEQ(hash, lhs, rhs)	\
336 	(strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \
337 	    *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)
338 
339 #define	DTRACE_AGGHASHSIZE_SLEW		17
340 
341 #define	DTRACE_V4MAPPED_OFFSET		(sizeof (uint32_t) * 3)
342 
343 /*
344  * The key for a thread-local variable consists of the lower 61 bits of the
345  * t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
346  * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
347  * equal to a variable identifier.  This is necessary (but not sufficient) to
348  * assure that global associative arrays never collide with thread-local
349  * variables.  To guarantee that they cannot collide, we must also define the
350  * order for keying dynamic variables.  That order is:
351  *
352  *   [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
353  *
354  * Because the variable-key and the tls-key are in orthogonal spaces, there is
355  * no way for a global variable key signature to match a thread-local key
356  * signature.
357  */
358 #define	DTRACE_TLS_THRKEY(where) { \
359 	uint_t intr = 0; \
360 	uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \
361 	for (; actv; actv >>= 1) \
362 		intr++; \
363 	ASSERT(intr < (1 << 3)); \
364 	(where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \
365 	    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
366 }
367 
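/*
 * Byte-swap helpers, built up from the 8-bit swap:  each DT_BSWAP_<n>()
 * reverses the byte order of an n-bit quantity.
 */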
368 #define	DT_BSWAP_8(x)	((x) & 0xff)
369 #define	DT_BSWAP_16(x)	((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
370 #define	DT_BSWAP_32(x)	((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
371 #define	DT_BSWAP_64(x)	((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))
372 
373 #define	DT_MASK_LO 0x00000000FFFFFFFFULL
374 
375 #define	DTRACE_STORE(type, tomax, offset, what) \
376 	*((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);
377 
378 #ifndef __x86
379 #define	DTRACE_ALIGNCHECK(addr, size, flags)				\
380 	if (addr & (size - 1)) {					\
381 		*flags |= CPU_DTRACE_BADALIGN;				\
382 		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;	\
383 		return (0);						\
384 	}
385 #else
386 #define	DTRACE_ALIGNCHECK(addr, size, flags)
387 #endif
388 
389 /*
390  * Test whether a range of memory starting at testaddr of size testsz falls
391  * within the range of memory described by addr, sz.  We take care to avoid
392  * problems with overflow and underflow of the unsigned quantities, and
393  * disallow all negative sizes.  Ranges of size 0 are allowed.
394  */
395 #define	DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
396 	((testaddr) - (uintptr_t)(baseaddr) < (basesz) && \
397 	(testaddr) + (testsz) - (uintptr_t)(baseaddr) <= (basesz) && \
398 	(testaddr) + (testsz) >= (testaddr))
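/*
 * For example (illustrative):  with testaddr == (uintptr_t)-1 and testsz == 2,
 * testaddr + testsz wraps around to 1; the third clause catches this overflow,
 * which could otherwise allow the second comparison to pass spuriously.
 */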
399 
400 #define	DTRACE_RANGE_REMAIN(remp, addr, baseaddr, basesz)		\
401 do {									\
402 	if ((remp) != NULL) {						\
403 		*(remp) = (uintptr_t)(baseaddr) + (basesz) - (addr);	\
404 	}								\
405 _NOTE(CONSTCOND) } while (0)
406 
407 
408 /*
409  * Test whether alloc_sz bytes will fit in the scratch region.  We isolate
410  * alloc_sz on the righthand side of the comparison in order to avoid overflow
411  * or underflow in the comparison with it.  This is simpler than the INRANGE
412  * check above, because we know that the dtms_scratch_ptr is valid in the
413  * range.  Allocations of size zero are allowed.
414  */
415 #define	DTRACE_INSCRATCH(mstate, alloc_sz) \
416 	((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
417 	(mstate)->dtms_scratch_ptr >= (alloc_sz))
418 
419 #define	DTRACE_LOADFUNC(bits)						\
420 /*CSTYLED*/								\
421 uint##bits##_t								\
422 dtrace_load##bits(uintptr_t addr)					\
423 {									\
424 	size_t size = bits / NBBY;					\
425 	/*CSTYLED*/							\
426 	uint##bits##_t rval;						\
427 	int i;								\
428 	volatile uint16_t *flags = (volatile uint16_t *)		\
429 	    &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;			\
430 									\
431 	DTRACE_ALIGNCHECK(addr, size, flags);				\
432 									\
433 	for (i = 0; i < dtrace_toxranges; i++) {			\
434 		if (addr >= dtrace_toxrange[i].dtt_limit)		\
435 			continue;					\
436 									\
437 		if (addr + size <= dtrace_toxrange[i].dtt_base)		\
438 			continue;					\
439 									\
440 		/*							\
441 		 * This address falls within a toxic region; return 0.	\
442 		 */							\
443 		*flags |= CPU_DTRACE_BADADDR;				\
444 		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;	\
445 		return (0);						\
446 	}								\
447 									\
448 	*flags |= CPU_DTRACE_NOFAULT;					\
449 	/*CSTYLED*/							\
450 	rval = *((volatile uint##bits##_t *)addr);			\
451 	*flags &= ~CPU_DTRACE_NOFAULT;					\
452 									\
453 	return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0);		\
454 }
455 
456 #ifdef _LP64
457 #define	dtrace_loadptr	dtrace_load64
458 #else
459 #define	dtrace_loadptr	dtrace_load32
460 #endif
461 
462 #define	DTRACE_DYNHASH_FREE	0
463 #define	DTRACE_DYNHASH_SINK	1
464 #define	DTRACE_DYNHASH_VALID	2
465 
466 #define	DTRACE_MATCH_FAIL	-1
467 #define	DTRACE_MATCH_NEXT	0
468 #define	DTRACE_MATCH_DONE	1
469 #define	DTRACE_ANCHORED(probe)	((probe)->dtpr_func[0] != '\0')
470 #define	DTRACE_STATE_ALIGN	64
471 
472 #define	DTRACE_FLAGS2FLT(flags)						\
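/*
 * Map a CPU's cpuc_dtrace_flags fault bits to the corresponding DTRACEFLT_*
 * fault code reported to consumers; the first matching flag wins.
 */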
473 	(((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR :		\
474 	((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP :		\
475 	((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO :		\
476 	((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV :		\
477 	((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV :		\
478 	((flags) & CPU_DTRACE_TUPOFLOW) ?  DTRACEFLT_TUPOFLOW :		\
479 	((flags) & CPU_DTRACE_BADALIGN) ?  DTRACEFLT_BADALIGN :		\
480 	((flags) & CPU_DTRACE_NOSCRATCH) ?  DTRACEFLT_NOSCRATCH :	\
481 	((flags) & CPU_DTRACE_BADSTACK) ?  DTRACEFLT_BADSTACK :		\
482 	DTRACEFLT_UNKNOWN)
483 
484 #define	DTRACEACT_ISSTRING(act)						\
485 	((act)->dta_kind == DTRACEACT_DIFEXPR &&			\
486 	(act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)
487 
488 static size_t dtrace_strlen(const char *, size_t);
489 static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
490 static void dtrace_enabling_provide(dtrace_provider_t *);
491 static int dtrace_enabling_match(dtrace_enabling_t *, int *);
492 static void dtrace_enabling_matchall(void);
493 static void dtrace_enabling_reap(void);
494 static dtrace_state_t *dtrace_anon_grab(void);
495 static uint64_t dtrace_helper(int, dtrace_mstate_t *,
496     dtrace_state_t *, uint64_t, uint64_t);
497 static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
498 static void dtrace_buffer_drop(dtrace_buffer_t *);
499 static int dtrace_buffer_consumed(dtrace_buffer_t *, hrtime_t when);
500 static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
501     dtrace_state_t *, dtrace_mstate_t *);
502 static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
503     dtrace_optval_t);
504 static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
505 static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
506 static int dtrace_priv_proc(dtrace_state_t *, dtrace_mstate_t *);
507 static void dtrace_getf_barrier(void);
508 static int dtrace_canload_remains(uint64_t, size_t, size_t *,
509     dtrace_mstate_t *, dtrace_vstate_t *);
510 static int dtrace_canstore_remains(uint64_t, size_t, size_t *,
511     dtrace_mstate_t *, dtrace_vstate_t *);
512 
513 /*
514  * DTrace Probe Context Functions
515  *
516  * These functions are called from probe context.  Because probe context is
517  * any context in which C may be called, arbitrary locks may be held,
518  * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
519  * As a result, functions called from probe context may only call other DTrace
520  * support functions -- they may not interact at all with the system at large.
521  * (Note that the ASSERT macro is made probe-context safe by redefining it in
522  * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
523  * loads are to be performed from probe context, they _must_ be in terms of
524  * the safe dtrace_load*() variants.
525  *
526  * Some functions in this block are not actually called from probe context;
527  * for these functions, there will be a comment above the function reading
528  * "Note:  not called from probe context."
529  */
530 void
531 dtrace_panic(const char *format, ...)
532 {
533 	va_list alist;
534 
535 	va_start(alist, format);
536 	dtrace_vpanic(format, alist);
537 	va_end(alist);
538 }
539 
540 int
541 dtrace_assfail(const char *a, const char *f, int l)
542 {
543 	dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l);
544 
545 	/*
546 	 * We just need something here that even the most clever compiler
547 	 * cannot optimize away.
548 	 */
549 	return (a[(uintptr_t)f]);
550 }
551 
552 /*
553  * Atomically increment a specified error counter from probe context.
554  */
555 static void
556 dtrace_error(uint32_t *counter)
557 {
558 	/*
559 	 * Most counters stored to in probe context are per-CPU counters.
560 	 * However, there are some error conditions that are sufficiently
561 	 * arcane that they don't merit per-CPU storage.  If these counters
562 	 * are incremented concurrently on different CPUs, scalability will be
563 	 * adversely affected -- but we don't expect them to be white-hot in a
564 	 * correctly constructed enabling...
565 	 */
566 	uint32_t oval, nval;
567 
568 	do {
569 		oval = *counter;
570 
571 		if ((nval = oval + 1) == 0) {
572 			/*
573 			 * If the counter would wrap, set it to 1 -- assuring
574 			 * that the counter is never zero when we have seen
575 			 * errors.  (The counter must be 32-bits because we
576 			 * aren't guaranteed a 64-bit compare&swap operation.)
577 			 * To save this code both the infamy of being fingered
578 			 * by a priggish news story and the indignity of being
579 			 * the target of a neo-puritan witch trial, we're
580 			 * carefully avoiding any colorful description of the
581 			 * likelihood of this condition -- but suffice it to
582 			 * say that it is only slightly more likely than the
583 			 * overflow of predicate cache IDs, as discussed in
584 			 * dtrace_predicate_create().
585 			 */
586 			nval = 1;
587 		}
588 	} while (dtrace_cas32(counter, oval, nval) != oval);
589 }
590 
591 /*
592  * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
593  * uint8_t, a uint16_t, a uint32_t and a uint64_t.
594  */
595 /* BEGIN CSTYLED */
596 DTRACE_LOADFUNC(8)
597 DTRACE_LOADFUNC(16)
598 DTRACE_LOADFUNC(32)
599 DTRACE_LOADFUNC(64)
600 /* END CSTYLED */
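/*
 * The generated dtrace_load8/16/32/64() functions return the value at the
 * given address, or 0 if the address lies in a toxic range, is misaligned (on
 * platforms that perform the alignment check), or faults during the load, with
 * the corresponding CPU_DTRACE_* error flag set.
 */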
601 
602 static int
603 dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
604 {
605 	if (dest < mstate->dtms_scratch_base)
606 		return (0);
607 
608 	if (dest + size < dest)
609 		return (0);
610 
611 	if (dest + size > mstate->dtms_scratch_ptr)
612 		return (0);
613 
614 	return (1);
615 }
616 
617 static int
618 dtrace_canstore_statvar(uint64_t addr, size_t sz, size_t *remain,
619     dtrace_statvar_t **svars, int nsvars)
620 {
621 	int i;
622 	size_t maxglobalsize, maxlocalsize;
623 
624 	if (nsvars == 0)
625 		return (0);
626 
627 	maxglobalsize = dtrace_statvar_maxsize + sizeof (uint64_t);
628 	maxlocalsize = maxglobalsize * NCPU;
629 
630 	for (i = 0; i < nsvars; i++) {
631 		dtrace_statvar_t *svar = svars[i];
632 		uint8_t scope;
633 		size_t size;
634 
635 		if (svar == NULL || (size = svar->dtsv_size) == 0)
636 			continue;
637 
638 		scope = svar->dtsv_var.dtdv_scope;
639 
640 		/*
641 		 * We verify that our size is valid in the spirit of providing
642 		 * defense in depth:  we want to prevent attackers from using
643 		 * DTrace to escalate an orthogonal kernel heap corruption bug
644 		 * into the ability to store to arbitrary locations in memory.
645 		 */
646 		VERIFY((scope == DIFV_SCOPE_GLOBAL && size <= maxglobalsize) ||
647 		    (scope == DIFV_SCOPE_LOCAL && size <= maxlocalsize));
648 
649 		if (DTRACE_INRANGE(addr, sz, svar->dtsv_data,
650 		    svar->dtsv_size)) {
651 			DTRACE_RANGE_REMAIN(remain, addr, svar->dtsv_data,
652 			    svar->dtsv_size);
653 			return (1);
654 		}
655 	}
656 
657 	return (0);
658 }
659 
660 /*
661  * Check to see if the address is within a memory region to which a store may
662  * be issued.  This includes the DTrace scratch areas, and any DTrace variable
663  * region.  The caller of dtrace_canstore() is responsible for performing any
664  * alignment checks that are needed before stores are actually executed.
665  */
666 static int
667 dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
668     dtrace_vstate_t *vstate)
669 {
670 	return (dtrace_canstore_remains(addr, sz, NULL, mstate, vstate));
671 }
672 
673 /*
674  * Implementation of dtrace_canstore which communicates the upper bound of the
675  * allowed memory region.
676  */
677 static int
678 dtrace_canstore_remains(uint64_t addr, size_t sz, size_t *remain,
679     dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
680 {
681 	/*
682 	 * First, check to see if the address is in scratch space...
683 	 */
684 	if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
685 	    mstate->dtms_scratch_size)) {
686 		DTRACE_RANGE_REMAIN(remain, addr, mstate->dtms_scratch_base,
687 		    mstate->dtms_scratch_size);
688 		return (1);
689 	}
690 
691 	/*
692 	 * Now check to see if it's a dynamic variable.  This check will pick
693 	 * up both thread-local variables and any global dynamically-allocated
694 	 * variables.
695 	 */
696 	if (DTRACE_INRANGE(addr, sz, vstate->dtvs_dynvars.dtds_base,
697 	    vstate->dtvs_dynvars.dtds_size)) {
698 		dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
699 		uintptr_t base = (uintptr_t)dstate->dtds_base +
700 		    (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
701 		uintptr_t chunkoffs;
702 		dtrace_dynvar_t *dvar;
703 
704 		/*
705 		 * Before we assume that we can store here, we need to make
706 		 * sure that it isn't in our metadata -- storing to our
707 		 * dynamic variable metadata would corrupt our state.  For
708 		 * the range to not include any dynamic variable metadata,
709 		 * it must:
710 		 *
711 		 *	(1) Start above the hash table that is at the base of
712 		 *	the dynamic variable space
713 		 *
714 		 *	(2) Have a starting chunk offset that is beyond the
715 		 *	dtrace_dynvar_t that is at the base of every chunk
716 		 *
717 		 *	(3) Not span a chunk boundary
718 		 *
719 		 *	(4) Not be in the tuple space of a dynamic variable
720 		 *
721 		 */
722 		if (addr < base)
723 			return (0);
724 
725 		chunkoffs = (addr - base) % dstate->dtds_chunksize;
726 
727 		if (chunkoffs < sizeof (dtrace_dynvar_t))
728 			return (0);
729 
730 		if (chunkoffs + sz > dstate->dtds_chunksize)
731 			return (0);
732 
733 		dvar = (dtrace_dynvar_t *)((uintptr_t)addr - chunkoffs);
734 
735 		if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE)
736 			return (0);
737 
738 		if (chunkoffs < sizeof (dtrace_dynvar_t) +
739 		    ((dvar->dtdv_tuple.dtt_nkeys - 1) * sizeof (dtrace_key_t)))
740 			return (0);
741 
742 		DTRACE_RANGE_REMAIN(remain, addr, dvar, dstate->dtds_chunksize);
743 		return (1);
744 	}
745 
746 	/*
747 	 * Finally, check the static local and global variables.  These checks
748 	 * take the longest, so we perform them last.
749 	 */
750 	if (dtrace_canstore_statvar(addr, sz, remain,
751 	    vstate->dtvs_locals, vstate->dtvs_nlocals))
752 		return (1);
753 
754 	if (dtrace_canstore_statvar(addr, sz, remain,
755 	    vstate->dtvs_globals, vstate->dtvs_nglobals))
756 		return (1);
757 
758 	return (0);
759 }
760 
761 
762 /*
763  * Convenience routine to check to see if the address is within a memory
764  * region in which a load may be issued given the user's privilege level;
765  * if not, it sets the appropriate error flags and loads 'addr' into the
766  * illegal value slot.
767  *
768  * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
769  * appropriate memory access protection.
770  */
771 static int
772 dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
773     dtrace_vstate_t *vstate)
774 {
775 	return (dtrace_canload_remains(addr, sz, NULL, mstate, vstate));
776 }
777 
778 /*
779  * Implementation of dtrace_canload which communicates the upper bound of the
780  * allowed memory region.
781  */
782 static int
783 dtrace_canload_remains(uint64_t addr, size_t sz, size_t *remain,
784     dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
785 {
786 	volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
787 	file_t *fp;
788 
789 	/*
790 	 * If we hold the privilege to read from kernel memory, then
791 	 * everything is readable.
792 	 */
793 	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
794 		DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
795 		return (1);
796 	}
797 
798 	/*
799 	 * You can obviously read that which you can store.
800 	 */
801 	if (dtrace_canstore_remains(addr, sz, remain, mstate, vstate))
802 		return (1);
803 
804 	/*
805 	 * We're allowed to read from our own string table.
806 	 */
807 	if (DTRACE_INRANGE(addr, sz, mstate->dtms_difo->dtdo_strtab,
808 	    mstate->dtms_difo->dtdo_strlen)) {
809 		DTRACE_RANGE_REMAIN(remain, addr,
810 		    mstate->dtms_difo->dtdo_strtab,
811 		    mstate->dtms_difo->dtdo_strlen);
812 		return (1);
813 	}
814 
815 	if (vstate->dtvs_state != NULL &&
816 	    dtrace_priv_proc(vstate->dtvs_state, mstate)) {
817 		proc_t *p;
818 
819 		/*
820 		 * When we have privileges to the current process, there are
821 		 * several context-related kernel structures that are safe to
822 		 * read, even absent the privilege to read from kernel memory.
823 		 * These reads are safe because these structures contain only
824 		 * state that (1) we're permitted to read, (2) is harmless or
825 		 * (3) contains pointers to additional kernel state that we're
826 		 * not permitted to read (and as such, do not present an
827 		 * opportunity for privilege escalation).  Finally (and
828 		 * critically), because of the nature of their relation with
829 		 * the current thread context, the memory associated with these
830 		 * structures cannot change over the duration of probe context,
831 		 * and it is therefore impossible for this memory to be
832 		 * deallocated and reallocated as something else while it's
833 		 * being operated upon.
834 		 */
835 		if (DTRACE_INRANGE(addr, sz, curthread, sizeof (kthread_t))) {
836 			DTRACE_RANGE_REMAIN(remain, addr, curthread,
837 			    sizeof (kthread_t));
838 			return (1);
839 		}
840 
841 		if ((p = curthread->t_procp) != NULL && DTRACE_INRANGE(addr,
842 		    sz, curthread->t_procp, sizeof (proc_t))) {
843 			DTRACE_RANGE_REMAIN(remain, addr, curthread->t_procp,
844 			    sizeof (proc_t));
845 			return (1);
846 		}
847 
848 		if (curthread->t_cred != NULL && DTRACE_INRANGE(addr, sz,
849 		    curthread->t_cred, sizeof (cred_t))) {
850 			DTRACE_RANGE_REMAIN(remain, addr, curthread->t_cred,
851 			    sizeof (cred_t));
852 			return (1);
853 		}
854 
855 		if (p != NULL && p->p_pidp != NULL && DTRACE_INRANGE(addr, sz,
856 		    &(p->p_pidp->pid_id), sizeof (pid_t))) {
857 			DTRACE_RANGE_REMAIN(remain, addr, &(p->p_pidp->pid_id),
858 			    sizeof (pid_t));
859 			return (1);
860 		}
861 
862 		if (curthread->t_cpu != NULL && DTRACE_INRANGE(addr, sz,
863 		    curthread->t_cpu, offsetof(cpu_t, cpu_pause_thread))) {
864 			DTRACE_RANGE_REMAIN(remain, addr, curthread->t_cpu,
865 			    offsetof(cpu_t, cpu_pause_thread));
866 			return (1);
867 		}
868 	}
869 
870 	if ((fp = mstate->dtms_getf) != NULL) {
871 		uintptr_t psz = sizeof (void *);
872 		vnode_t *vp;
873 		vnodeops_t *op;
874 
875 		/*
876 		 * When getf() returns a file_t, the enabling is implicitly
877 		 * granted the (transient) right to read the returned file_t
878 		 * as well as the v_path and v_op->vnop_name of the underlying
879 		 * vnode.  These accesses are allowed after a successful
880 		 * getf() because the members that they refer to cannot change
881 		 * once set -- and the barrier logic in the kernel's closef()
882 		 * path assures that the file_t and its referenced vnode_t
883 		 * cannot themselves be stale (that is, it is impossible for
884 		 * either dtms_getf itself or its f_vnode member to reference
885 		 * freed memory).
886 		 */
887 		if (DTRACE_INRANGE(addr, sz, fp, sizeof (file_t))) {
888 			DTRACE_RANGE_REMAIN(remain, addr, fp, sizeof (file_t));
889 			return (1);
890 		}
891 
892 		if ((vp = fp->f_vnode) != NULL) {
893 			size_t slen;
894 
895 			if (DTRACE_INRANGE(addr, sz, &vp->v_path, psz)) {
896 				DTRACE_RANGE_REMAIN(remain, addr, &vp->v_path,
897 				    psz);
898 				return (1);
899 			}
900 
901 			slen = strlen(vp->v_path) + 1;
902 			if (DTRACE_INRANGE(addr, sz, vp->v_path, slen)) {
903 				DTRACE_RANGE_REMAIN(remain, addr, vp->v_path,
904 				    slen);
905 				return (1);
906 			}
907 
908 			if (DTRACE_INRANGE(addr, sz, &vp->v_op, psz)) {
909 				DTRACE_RANGE_REMAIN(remain, addr, &vp->v_op,
910 				    psz);
911 				return (1);
912 			}
913 
914 			if ((op = vp->v_op) != NULL &&
915 			    DTRACE_INRANGE(addr, sz, &op->vnop_name, psz)) {
916 				DTRACE_RANGE_REMAIN(remain, addr,
917 				    &op->vnop_name, psz);
918 				return (1);
919 			}
920 
921 			if (op != NULL && op->vnop_name != NULL &&
922 			    DTRACE_INRANGE(addr, sz, op->vnop_name,
923 			    (slen = strlen(op->vnop_name) + 1))) {
924 				DTRACE_RANGE_REMAIN(remain, addr,
925 				    op->vnop_name, slen);
926 				return (1);
927 			}
928 		}
929 	}
930 
931 	DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
932 	*illval = addr;
933 	return (0);
934 }
935 
936 /*
937  * Convenience routine to check to see if a given string is within a memory
938  * region in which a load may be issued given the user's privilege level;
939  * this exists so that we don't need to issue unnecessary dtrace_strlen()
940  * calls in the event that the user has all privileges.
941  */
942 static int
943 dtrace_strcanload(uint64_t addr, size_t sz, size_t *remain,
944     dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
945 {
946 	size_t rsize;
947 
948 	/*
949 	 * If we hold the privilege to read from kernel memory, then
950 	 * everything is readable.
951 	 */
952 	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
953 		DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
954 		return (1);
955 	}
956 
957 	/*
958 	 * Even if the caller is uninterested in querying the remaining valid
959 	 * range, it is required to ensure that the access is allowed.
960 	 */
961 	if (remain == NULL) {
962 		remain = &rsize;
963 	}
964 	if (dtrace_canload_remains(addr, 0, remain, mstate, vstate)) {
965 		size_t strsz;
966 		/*
967 		 * Perform the strlen after determining the length of the
968 		 * memory region which is accessible.  This prevents timing
969 		 * information from being used to find NULs in memory which is
970 		 * not accessible to the caller.
971 		 */
972 		strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr,
973 		    MIN(sz, *remain));
974 		if (strsz <= *remain) {
975 			return (1);
976 		}
977 	}
978 
979 	return (0);
980 }
981 
982 /*
983  * Convenience routine to check to see if a given variable is within a memory
984  * region in which a load may be issued given the user's privilege level.
985  */
986 static int
987 dtrace_vcanload(void *src, dtrace_diftype_t *type, size_t *remain,
988     dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
989 {
990 	size_t sz;
991 	ASSERT(type->dtdt_flags & DIF_TF_BYREF);
992 
993 	/*
994 	 * Calculate the max size before performing any checks since even
995 	 * DTRACE_ACCESS_KERNEL-credentialed callers expect that this function
996 	 * return the max length via 'remain'.
997 	 */
998 	if (type->dtdt_kind == DIF_TYPE_STRING) {
999 		dtrace_state_t *state = vstate->dtvs_state;
1000 
1001 		if (state != NULL) {
1002 			sz = state->dts_options[DTRACEOPT_STRSIZE];
1003 		} else {
1004 			/*
1005 			 * In helper context, we have a NULL state; fall back
1006 			 * to using the system-wide default for the string size
1007 			 * in this case.
1008 			 */
1009 			sz = dtrace_strsize_default;
1010 		}
1011 	} else {
1012 		sz = type->dtdt_size;
1013 	}
1014 
1015 	/*
1016 	 * If we hold the privilege to read from kernel memory, then
1017 	 * everything is readable.
1018 	 */
1019 	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
1020 		DTRACE_RANGE_REMAIN(remain, (uintptr_t)src, src, sz);
1021 		return (1);
1022 	}
1023 
1024 	if (type->dtdt_kind == DIF_TYPE_STRING) {
1025 		return (dtrace_strcanload((uintptr_t)src, sz, remain, mstate,
1026 		    vstate));
1027 	}
1028 	return (dtrace_canload_remains((uintptr_t)src, sz, remain, mstate,
1029 	    vstate));
1030 }
1031 
1032 /*
1033  * Convert a string to a signed integer using safe loads.
1034  *
1035  * NOTE: This function uses various macros from strtolctype.h to manipulate
1036  * digit values, etc -- these have all been checked to ensure they make
1037  * no additional function calls.
1038  */
1039 static int64_t
1040 dtrace_strtoll(char *input, int base, size_t limit)
1041 {
1042 	uintptr_t pos = (uintptr_t)input;
1043 	int64_t val = 0;
1044 	int x;
1045 	boolean_t neg = B_FALSE;
1046 	char c, cc, ccc;
1047 	uintptr_t end = pos + limit;
1048 
1049 	/*
1050 	 * Consume any whitespace preceding digits.
1051 	 */
1052 	while ((c = dtrace_load8(pos)) == ' ' || c == '\t')
1053 		pos++;
1054 
1055 	/*
1056 	 * Handle an explicit sign if one is present.
1057 	 */
1058 	if (c == '-' || c == '+') {
1059 		if (c == '-')
1060 			neg = B_TRUE;
1061 		c = dtrace_load8(++pos);
1062 	}
1063 
1064 	/*
1065 	 * Check for an explicit hexadecimal prefix ("0x" or "0X") and skip it
1066 	 * if present.
1067 	 */
1068 	if (base == 16 && c == '0' && ((cc = dtrace_load8(pos + 1)) == 'x' ||
1069 	    cc == 'X') && isxdigit(ccc = dtrace_load8(pos + 2))) {
1070 		pos += 2;
1071 		c = ccc;
1072 	}
1073 
1074 	/*
1075 	 * Read in contiguous digits until the first non-digit character.
1076 	 */
1077 	for (; pos < end && c != '\0' && lisalnum(c) && (x = DIGIT(c)) < base;
1078 	    c = dtrace_load8(++pos))
1079 		val = val * base + x;
1080 
1081 	return (neg ? -val : val);
1082 }
1083 
1084 /*
1085  * Compare two strings using safe loads.
1086  */
1087 static int
1088 dtrace_strncmp(char *s1, char *s2, size_t limit)
1089 {
1090 	uint8_t c1, c2;
1091 	volatile uint16_t *flags;
1092 
1093 	if (s1 == s2 || limit == 0)
1094 		return (0);
1095 
1096 	flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
1097 
1098 	do {
1099 		if (s1 == NULL) {
1100 			c1 = '\0';
1101 		} else {
1102 			c1 = dtrace_load8((uintptr_t)s1++);
1103 		}
1104 
1105 		if (s2 == NULL) {
1106 			c2 = '\0';
1107 		} else {
1108 			c2 = dtrace_load8((uintptr_t)s2++);
1109 		}
1110 
1111 		if (c1 != c2)
1112 			return (c1 - c2);
1113 	} while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));
1114 
1115 	return (0);
1116 }
1117 
1118 /*
1119  * Compute strlen(s) for a string using safe memory accesses.  The additional
1120  * len parameter is used to specify a maximum length to ensure completion.
1121  * lim parameter is used to specify a maximum length to ensure completion.
1122 static size_t
1123 dtrace_strlen(const char *s, size_t lim)
1124 {
1125 	uint_t len;
1126 
1127 	for (len = 0; len != lim; len++) {
1128 		if (dtrace_load8((uintptr_t)s++) == '\0')
1129 			break;
1130 	}
1131 
1132 	return (len);
1133 }
1134 
1135 /*
1136  * Check if an address falls within a toxic region.
1137  */
1138 static int
1139 dtrace_istoxic(uintptr_t kaddr, size_t size)
1140 {
1141 	uintptr_t taddr, tsize;
1142 	int i;
1143 
1144 	for (i = 0; i < dtrace_toxranges; i++) {
1145 		taddr = dtrace_toxrange[i].dtt_base;
1146 		tsize = dtrace_toxrange[i].dtt_limit - taddr;
1147 
1148 		if (kaddr - taddr < tsize) {
1149 			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1150 			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = kaddr;
1151 			return (1);
1152 		}
1153 
1154 		if (taddr - kaddr < size) {
1155 			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1156 			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = taddr;
1157 			return (1);
1158 		}
1159 	}
1160 
1161 	return (0);
1162 }
1163 
1164 /*
1165  * Copy src to dst using safe memory accesses.  The src is assumed to be unsafe
1166  * memory specified by the DIF program.  The dst is assumed to be safe memory
1167  * that we can store to directly because it is managed by DTrace.  As with
1168  * standard bcopy, overlapping copies are handled properly.
1169  */
1170 static void
1171 dtrace_bcopy(const void *src, void *dst, size_t len)
1172 {
1173 	if (len != 0) {
1174 		uint8_t *s1 = dst;
1175 		const uint8_t *s2 = src;
1176 
1177 		if (s1 <= s2) {
1178 			do {
1179 				*s1++ = dtrace_load8((uintptr_t)s2++);
1180 			} while (--len != 0);
1181 		} else {
1182 			s2 += len;
1183 			s1 += len;
1184 
1185 			do {
1186 				*--s1 = dtrace_load8((uintptr_t)--s2);
1187 			} while (--len != 0);
1188 		}
1189 	}
1190 }
1191 
1192 /*
1193  * Copy src to dst using safe memory accesses, up to either the specified
1194  * length, or the point that a nul byte is encountered.  The src is assumed to
1195  * be unsafe memory specified by the DIF program.  The dst is assumed to be
1196  * safe memory that we can store to directly because it is managed by DTrace.
1197  * Unlike dtrace_bcopy(), overlapping regions are not handled.
1198  */
1199 static void
1200 dtrace_strcpy(const void *src, void *dst, size_t len)
1201 {
1202 	if (len != 0) {
1203 		uint8_t *s1 = dst, c;
1204 		const uint8_t *s2 = src;
1205 
1206 		do {
1207 			*s1++ = c = dtrace_load8((uintptr_t)s2++);
1208 		} while (--len != 0 && c != '\0');
1209 	}
1210 }
1211 
1212 /*
1213  * Copy src to dst, deriving the size and type from the specified (BYREF)
1214  * variable type.  The src is assumed to be unsafe memory specified by the DIF
1215  * program.  The dst is assumed to be DTrace variable memory that is of the
1216  * specified type; we assume that we can store to it directly.
1217  */
1218 static void
1219 dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type, size_t limit)
1220 {
1221 	ASSERT(type->dtdt_flags & DIF_TF_BYREF);
1222 
1223 	if (type->dtdt_kind == DIF_TYPE_STRING) {
1224 		dtrace_strcpy(src, dst, MIN(type->dtdt_size, limit));
1225 	} else {
1226 		dtrace_bcopy(src, dst, MIN(type->dtdt_size, limit));
1227 	}
1228 }
1229 
1230 /*
1231  * Compare s1 to s2 using safe memory accesses.  The s1 data is assumed to be
1232  * unsafe memory specified by the DIF program.  The s2 data is assumed to be
1233  * safe memory that we can access directly because it is managed by DTrace.
1234  */
1235 static int
1236 dtrace_bcmp(const void *s1, const void *s2, size_t len)
1237 {
1238 	volatile uint16_t *flags;
1239 
1240 	flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
1241 
1242 	if (s1 == s2)
1243 		return (0);
1244 
1245 	if (s1 == NULL || s2 == NULL)
1246 		return (1);
1247 
1248 	if (s1 != s2 && len != 0) {
1249 		const uint8_t *ps1 = s1;
1250 		const uint8_t *ps2 = s2;
1251 
1252 		do {
1253 			if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
1254 				return (1);
1255 		} while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
1256 	}
1257 	return (0);
1258 }
1259 
1260 /*
1261  * Zero the specified region using a simple byte-by-byte loop.  Note that this
1262  * is for safe DTrace-managed memory only.
1263  */
1264 static void
1265 dtrace_bzero(void *dst, size_t len)
1266 {
1267 	uchar_t *cp;
1268 
1269 	for (cp = dst; len != 0; len--)
1270 		*cp++ = 0;
1271 }
1272 
1273 static void
1274 dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
1275 {
1276 	uint64_t result[2];
1277 
1278 	result[0] = addend1[0] + addend2[0];
1279 	result[1] = addend1[1] + addend2[1] +
1280 	    (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);
1281 
1282 	sum[0] = result[0];
1283 	sum[1] = result[1];
1284 }
1285 
1286 /*
1287  * Shift the 128-bit value in a by b. If b is positive, shift left.
1288  * If b is negative, shift right.
1289  */
1290 static void
1291 dtrace_shift_128(uint64_t *a, int b)
1292 {
1293 	uint64_t mask;
1294 
1295 	if (b == 0)
1296 		return;
1297 
1298 	if (b < 0) {
1299 		b = -b;
1300 		if (b >= 64) {
1301 			a[0] = a[1] >> (b - 64);
1302 			a[1] = 0;
1303 		} else {
1304 			a[0] >>= b;
1305 			mask = 1LL << (64 - b);
1306 			mask -= 1;
1307 			a[0] |= ((a[1] & mask) << (64 - b));
1308 			a[1] >>= b;
1309 		}
1310 	} else {
1311 		if (b >= 64) {
1312 			a[1] = a[0] << (b - 64);
1313 			a[0] = 0;
1314 		} else {
1315 			a[1] <<= b;
1316 			mask = a[0] >> (64 - b);
1317 			a[1] |= mask;
1318 			a[0] <<= b;
1319 		}
1320 	}
1321 }
1322 
1323 /*
1324  * The basic idea is to break the 2 64-bit values into 4 32-bit values,
1325  * use native multiplication on those, and then re-combine into the
1326  * resulting 128-bit value.
1327  *
1328  * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
1329  *     hi1 * hi2 << 64 +
1330  *     hi1 * lo2 << 32 +
1331  *     hi2 * lo1 << 32 +
1332  *     lo1 * lo2
1333  */
1334 static void
1335 dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
1336 {
1337 	uint64_t hi1, hi2, lo1, lo2;
1338 	uint64_t tmp[2];
1339 
1340 	hi1 = factor1 >> 32;
1341 	hi2 = factor2 >> 32;
1342 
1343 	lo1 = factor1 & DT_MASK_LO;
1344 	lo2 = factor2 & DT_MASK_LO;
1345 
1346 	product[0] = lo1 * lo2;
1347 	product[1] = hi1 * hi2;
1348 
1349 	tmp[0] = hi1 * lo2;
1350 	tmp[1] = 0;
1351 	dtrace_shift_128(tmp, 32);
1352 	dtrace_add_128(product, tmp, product);
1353 
1354 	tmp[0] = hi2 * lo1;
1355 	tmp[1] = 0;
1356 	dtrace_shift_128(tmp, 32);
1357 	dtrace_add_128(product, tmp, product);
1358 }
1359 
1360 /*
1361  * This privilege check should be used by actions and subroutines to
1362  * verify that the user credentials of the process that enabled the
1363  * invoking ECB match the target credentials.
1364  */
1365 static int
1366 dtrace_priv_proc_common_user(dtrace_state_t *state)
1367 {
1368 	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1369 
1370 	/*
1371 	 * We should always have a non-NULL state cred here, since if cred
1372 	 * is null (anonymous tracing), we fast-path bypass this routine.
1373 	 */
1374 	ASSERT(s_cr != NULL);
1375 
1376 	if ((cr = CRED()) != NULL &&
1377 	    s_cr->cr_uid == cr->cr_uid &&
1378 	    s_cr->cr_uid == cr->cr_ruid &&
1379 	    s_cr->cr_uid == cr->cr_suid &&
1380 	    s_cr->cr_gid == cr->cr_gid &&
1381 	    s_cr->cr_gid == cr->cr_rgid &&
1382 	    s_cr->cr_gid == cr->cr_sgid)
1383 		return (1);
1384 
1385 	return (0);
1386 }
1387 
1388 /*
1389  * This privilege check should be used by actions and subroutines to
1390  * verify that the zone of the process that enabled the invoking ECB
1391  * matches the target credentials.
1392  */
1393 static int
1394 dtrace_priv_proc_common_zone(dtrace_state_t *state)
1395 {
1396 	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1397 
1398 	/*
1399 	 * We should always have a non-NULL state cred here, since if cred
1400 	 * is null (anonymous tracing), we fast-path bypass this routine.
1401 	 */
1402 	ASSERT(s_cr != NULL);
1403 
1404 	if ((cr = CRED()) != NULL && s_cr->cr_zone == cr->cr_zone)
1405 		return (1);
1406 
1407 	return (0);
1408 }
1409 
1410 /*
1411  * This privilege check should be used by actions and subroutines to
1412  * verify that the process has not setuid or changed credentials.
1413  */
1414 static int
1415 dtrace_priv_proc_common_nocd()
1416 {
1417 	proc_t *proc;
1418 
1419 	if ((proc = ttoproc(curthread)) != NULL &&
1420 	    !(proc->p_flag & SNOCD))
1421 		return (1);
1422 
1423 	return (0);
1424 }
1425 
1426 static int
1427 dtrace_priv_proc_destructive(dtrace_state_t *state, dtrace_mstate_t *mstate)
1428 {
1429 	int action = state->dts_cred.dcr_action;
1430 
1431 	if (!(mstate->dtms_access & DTRACE_ACCESS_PROC))
1432 		goto bad;
1433 
1434 	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
1435 	    dtrace_priv_proc_common_zone(state) == 0)
1436 		goto bad;
1437 
1438 	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
1439 	    dtrace_priv_proc_common_user(state) == 0)
1440 		goto bad;
1441 
1442 	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
1443 	    dtrace_priv_proc_common_nocd() == 0)
1444 		goto bad;
1445 
1446 	return (1);
1447 
1448 bad:
1449 	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1450 
1451 	return (0);
1452 }
1453 
1454 static int
1455 dtrace_priv_proc_control(dtrace_state_t *state, dtrace_mstate_t *mstate)
1456 {
1457 	if (mstate->dtms_access & DTRACE_ACCESS_PROC) {
1458 		if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
1459 			return (1);
1460 
1461 		if (dtrace_priv_proc_common_zone(state) &&
1462 		    dtrace_priv_proc_common_user(state) &&
1463 		    dtrace_priv_proc_common_nocd())
1464 			return (1);
1465 	}
1466 
1467 	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1468 
1469 	return (0);
1470 }
1471 
1472 static int
1473 dtrace_priv_proc(dtrace_state_t *state, dtrace_mstate_t *mstate)
1474 {
1475 	if ((mstate->dtms_access & DTRACE_ACCESS_PROC) &&
1476 	    (state->dts_cred.dcr_action & DTRACE_CRA_PROC))
1477 		return (1);
1478 
1479 	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1480 
1481 	return (0);
1482 }
1483 
1484 static int
1485 dtrace_priv_kernel(dtrace_state_t *state)
1486 {
1487 	if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
1488 		return (1);
1489 
1490 	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1491 
1492 	return (0);
1493 }
1494 
1495 static int
1496 dtrace_priv_kernel_destructive(dtrace_state_t *state)
1497 {
1498 	if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
1499 		return (1);
1500 
1501 	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1502 
1503 	return (0);
1504 }
1505 
1506 /*
1507  * Determine if the dte_cond of the specified ECB allows for processing of
1508  * the current probe to continue.  Note that this routine may allow continued
1509  * processing, but with access(es) stripped from the mstate's dtms_access
1510  * field.
1511  */
1512 static int
1513 dtrace_priv_probe(dtrace_state_t *state, dtrace_mstate_t *mstate,
1514     dtrace_ecb_t *ecb)
1515 {
1516 	dtrace_probe_t *probe = ecb->dte_probe;
1517 	dtrace_provider_t *prov = probe->dtpr_provider;
1518 	dtrace_pops_t *pops = &prov->dtpv_pops;
1519 	int mode = DTRACE_MODE_NOPRIV_DROP;
1520 
1521 	ASSERT(ecb->dte_cond);
1522 
1523 	if (pops->dtps_mode != NULL) {
1524 		mode = pops->dtps_mode(prov->dtpv_arg,
1525 		    probe->dtpr_id, probe->dtpr_arg);
1526 
1527 		ASSERT(mode & (DTRACE_MODE_USER | DTRACE_MODE_KERNEL));
1528 		ASSERT(mode & (DTRACE_MODE_NOPRIV_RESTRICT |
1529 		    DTRACE_MODE_NOPRIV_DROP));
1530 	}
1531 
1532 	/*
1533 	 * If the dte_cond bits indicate that this consumer is only allowed to
1534 	 * see user-mode firings of this probe, check that the probe was fired
1535 	 * while in a user context.  If that's not the case, use the policy
1536 	 * specified by the provider to determine if we drop the probe or
1537 	 * merely restrict operation.
1538 	 */
1539 	if (ecb->dte_cond & DTRACE_COND_USERMODE) {
1540 		ASSERT(mode != DTRACE_MODE_NOPRIV_DROP);
1541 
1542 		if (!(mode & DTRACE_MODE_USER)) {
1543 			if (mode & DTRACE_MODE_NOPRIV_DROP)
1544 				return (0);
1545 
1546 			mstate->dtms_access &= ~DTRACE_ACCESS_ARGS;
1547 		}
1548 	}
1549 
1550 	/*
1551 	 * This is more subtle than it looks. We have to be absolutely certain
1552 	 * that CRED() isn't going to change out from under us so it's only
1553 	 * legit to examine that structure if we're in constrained situations.
1554 	 * Currently, the only time we'll perform this check is if a non-super-user
1555 	 * has enabled the profile or syscall providers -- providers that
1556 	 * allow visibility of all processes. For the profile case, the check
1557 	 * above will ensure that we're examining a user context.
1558 	 */
1559 	if (ecb->dte_cond & DTRACE_COND_OWNER) {
1560 		cred_t *cr;
1561 		cred_t *s_cr = state->dts_cred.dcr_cred;
1562 		proc_t *proc;
1563 
1564 		ASSERT(s_cr != NULL);
1565 
1566 		if ((cr = CRED()) == NULL ||
1567 		    s_cr->cr_uid != cr->cr_uid ||
1568 		    s_cr->cr_uid != cr->cr_ruid ||
1569 		    s_cr->cr_uid != cr->cr_suid ||
1570 		    s_cr->cr_gid != cr->cr_gid ||
1571 		    s_cr->cr_gid != cr->cr_rgid ||
1572 		    s_cr->cr_gid != cr->cr_sgid ||
1573 		    (proc = ttoproc(curthread)) == NULL ||
1574 		    (proc->p_flag & SNOCD)) {
1575 			if (mode & DTRACE_MODE_NOPRIV_DROP)
1576 				return (0);
1577 
1578 			mstate->dtms_access &= ~DTRACE_ACCESS_PROC;
1579 		}
1580 	}
1581 
1582 	/*
1583 	 * If our dte_cond is set to DTRACE_COND_ZONEOWNER and we are not
1584 	 * in our zone, check to see if our mode policy is to restrict rather
1585 	 * than to drop; if to restrict, strip away both DTRACE_ACCESS_PROC
1586 	 * and DTRACE_ACCESS_ARGS
1587 	 */
1588 	if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
1589 		cred_t *cr;
1590 		cred_t *s_cr = state->dts_cred.dcr_cred;
1591 
1592 		ASSERT(s_cr != NULL);
1593 
1594 		if ((cr = CRED()) == NULL ||
1595 		    s_cr->cr_zone->zone_id != cr->cr_zone->zone_id) {
1596 			if (mode & DTRACE_MODE_NOPRIV_DROP)
1597 				return (0);
1598 
1599 			mstate->dtms_access &=
1600 			    ~(DTRACE_ACCESS_PROC | DTRACE_ACCESS_ARGS);
1601 		}
1602 	}
1603 
1604 	/*
1605 	 * By merits of being in this code path at all, we have limited
1606 	 * privileges.  If the provider has indicated that limited privileges
1607 	 * are to denote restricted operation, strip off the ability to access
1608 	 * arguments.
1609 	 */
1610 	if (mode & DTRACE_MODE_LIMITEDPRIV_RESTRICT)
1611 		mstate->dtms_access &= ~DTRACE_ACCESS_ARGS;
1612 
1613 	return (1);
1614 }
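
/*
 * Illustrative summary (not part of the original source):  the net effect of
 * dtrace_priv_probe() on the mstate when the provider's mode policy is to
 * restrict rather than to drop:
 *
 *	DTRACE_COND_USERMODE  (non-user firing)     -> clear DTRACE_ACCESS_ARGS
 *	DTRACE_COND_OWNER     (credential mismatch) -> clear DTRACE_ACCESS_PROC
 *	DTRACE_COND_ZONEOWNER (zone mismatch)       -> clear DTRACE_ACCESS_PROC
 *	                                               and DTRACE_ACCESS_ARGS
 *	DTRACE_MODE_LIMITEDPRIV_RESTRICT            -> clear DTRACE_ACCESS_ARGS
 *
 * With a DTRACE_MODE_NOPRIV_DROP policy, each of the first three cases
 * instead returns 0 and the probe firing is dropped outright.
 */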
1615 
1616 /*
1617  * Note:  not called from probe context.  This function is called
1618  * asynchronously (and at a regular interval) from outside of probe context to
1619  * clean the dirty dynamic variable lists on all CPUs.  Dynamic variable
1620  * cleaning is explained in detail in <sys/dtrace_impl.h>.
1621  */
1622 void
1623 dtrace_dynvar_clean(dtrace_dstate_t *dstate)
1624 {
1625 	dtrace_dynvar_t *dirty;
1626 	dtrace_dstate_percpu_t *dcpu;
1627 	dtrace_dynvar_t **rinsep;
1628 	int i, j, work = 0;
1629 
1630 	for (i = 0; i < NCPU; i++) {
1631 		dcpu = &dstate->dtds_percpu[i];
1632 		rinsep = &dcpu->dtdsc_rinsing;
1633 
1634 		/*
1635 		 * If the dirty list is NULL, there is no dirty work to do.
1636 		 */
1637 		if (dcpu->dtdsc_dirty == NULL)
1638 			continue;
1639 
1640 		if (dcpu->dtdsc_rinsing != NULL) {
1641 			/*
1642 			 * If the rinsing list is non-NULL, then it is because
1643 			 * this CPU was selected to accept another CPU's
1644 			 * dirty list -- and since that time, dirty buffers
1645 			 * have accumulated.  This is a highly unlikely
1646 			 * condition, but we choose to ignore the dirty
1647 			 * buffers -- they'll be picked up in a future cleanse.
1648 			 */
1649 			continue;
1650 		}
1651 
1652 		if (dcpu->dtdsc_clean != NULL) {
1653 			/*
1654 			 * If the clean list is non-NULL, then we're in a
1655 			 * situation where a CPU has done deallocations (we
1656 			 * have a non-NULL dirty list) but no allocations (we
1657 			 * also have a non-NULL clean list).  We can't simply
1658 			 * move the dirty list into the clean list on this
1659 			 * CPU, yet we also don't want to allow this condition
1660 			 * to persist, lest a short clean list prevent a
1661 			 * massive dirty list from being cleaned (which in
1662 			 * turn could lead to otherwise avoidable dynamic
1663 			 * drops).  To deal with this, we look for some CPU
1664 			 * with a NULL clean list, NULL dirty list, and NULL
1665 			 * rinsing list -- and then we borrow this CPU to
1666 			 * rinse our dirty list.
1667 			 */
1668 			for (j = 0; j < NCPU; j++) {
1669 				dtrace_dstate_percpu_t *rinser;
1670 
1671 				rinser = &dstate->dtds_percpu[j];
1672 
1673 				if (rinser->dtdsc_rinsing != NULL)
1674 					continue;
1675 
1676 				if (rinser->dtdsc_dirty != NULL)
1677 					continue;
1678 
1679 				if (rinser->dtdsc_clean != NULL)
1680 					continue;
1681 
1682 				rinsep = &rinser->dtdsc_rinsing;
1683 				break;
1684 			}
1685 
1686 			if (j == NCPU) {
1687 				/*
1688 				 * We were unable to find another CPU that
1689 				 * could accept this dirty list -- we are
1690 				 * therefore unable to clean it now.
1691 				 */
1692 				dtrace_dynvar_failclean++;
1693 				continue;
1694 			}
1695 		}
1696 
1697 		work = 1;
1698 
1699 		/*
1700 		 * Atomically move the dirty list aside.
1701 		 */
1702 		do {
1703 			dirty = dcpu->dtdsc_dirty;
1704 
1705 			/*
1706 			 * Before we zap the dirty list, set the rinsing list.
1707 			 * (This allows for a potential assertion in
1708 			 * dtrace_dynvar():  if a free dynamic variable appears
1709 			 * on a hash chain, either the dirty list or the
1710 			 * rinsing list for some CPU must be non-NULL.)
1711 			 */
1712 			*rinsep = dirty;
1713 			dtrace_membar_producer();
1714 		} while (dtrace_casptr(&dcpu->dtdsc_dirty,
1715 		    dirty, NULL) != dirty);
1716 	}
1717 
1718 	if (!work) {
1719 		/*
1720 		 * We have no work to do; we can simply return.
1721 		 */
1722 		return;
1723 	}
1724 
1725 	dtrace_sync();
1726 
1727 	for (i = 0; i < NCPU; i++) {
1728 		dcpu = &dstate->dtds_percpu[i];
1729 
1730 		if (dcpu->dtdsc_rinsing == NULL)
1731 			continue;
1732 
1733 		/*
1734 		 * We are now guaranteed that no hash chain contains a pointer
1735 		 * into this dirty list; we can make it clean.
1736 		 */
1737 		ASSERT(dcpu->dtdsc_clean == NULL);
1738 		dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
1739 		dcpu->dtdsc_rinsing = NULL;
1740 	}
1741 
1742 	/*
1743 	 * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
1744 	 * sure that all CPUs have seen all of the dtdsc_clean pointers.
1745 	 * This prevents a race whereby a CPU incorrectly decides that
1746 	 * the state should be something other than DTRACE_DSTATE_CLEAN
1747 	 * after dtrace_dynvar_clean() has completed.
1748 	 */
1749 	dtrace_sync();
1750 
1751 	dstate->dtds_state = DTRACE_DSTATE_CLEAN;
1752 }
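
/*
 * Illustrative sketch (not part of the original source):  the life cycle of a
 * dynamic variable chunk as it moves through the per-CPU lists managed above
 * and in dtrace_dynvar():
 *
 *	dtdsc_free -> (live; linked into a hash chain) -> dtdsc_dirty
 *	    -> dtdsc_rinsing (moved aside here, followed by dtrace_sync())
 *	    -> dtdsc_clean (once all CPUs have seen the removal)
 *	    -> dtdsc_free (swung back by an allocating CPU in dtrace_dynvar())
 */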
1753 
1754 /*
1755  * Depending on the value of the op parameter, this function looks up,
1756  * allocates or deallocates an arbitrarily-keyed dynamic variable.  If an
1757  * allocation is requested, this function will return a pointer to a
1758  * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
1759  * variable can be allocated.  If NULL is returned, the appropriate counter
1760  * will be incremented.
1761  */
1762 dtrace_dynvar_t *
1763 dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
1764     dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
1765     dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1766 {
1767 	uint64_t hashval = DTRACE_DYNHASH_VALID;
1768 	dtrace_dynhash_t *hash = dstate->dtds_hash;
1769 	dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
1770 	processorid_t me = CPU->cpu_id, cpu = me;
1771 	dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
1772 	size_t bucket, ksize;
1773 	size_t chunksize = dstate->dtds_chunksize;
1774 	uintptr_t kdata, lock, nstate;
1775 	uint_t i;
1776 
1777 	ASSERT(nkeys != 0);
1778 
1779 	/*
1780 	 * Hash the key.  As with aggregations, we use Jenkins' "One-at-a-time"
1781 	 * algorithm.  For the by-value portions, we perform the algorithm in
1782 	 * 16-bit chunks (as opposed to 8-bit chunks).  This speeds things up a
1783 	 * bit, and seems to have only a minute effect on distribution.  For
1784 	 * the by-reference data, we perform "One-at-a-time" iterating (safely)
1785 	 * over each referenced byte.  It's painful to do this, but it's much
1786 	 * better than pathological hash distribution.  The efficacy of the
1787 	 * hashing algorithm (and a comparison with other algorithms) may be
1788 	 * found by running the ::dtrace_dynstat MDB dcmd.
1789 	 */
1790 	for (i = 0; i < nkeys; i++) {
1791 		if (key[i].dttk_size == 0) {
1792 			uint64_t val = key[i].dttk_value;
1793 
1794 			hashval += (val >> 48) & 0xffff;
1795 			hashval += (hashval << 10);
1796 			hashval ^= (hashval >> 6);
1797 
1798 			hashval += (val >> 32) & 0xffff;
1799 			hashval += (hashval << 10);
1800 			hashval ^= (hashval >> 6);
1801 
1802 			hashval += (val >> 16) & 0xffff;
1803 			hashval += (hashval << 10);
1804 			hashval ^= (hashval >> 6);
1805 
1806 			hashval += val & 0xffff;
1807 			hashval += (hashval << 10);
1808 			hashval ^= (hashval >> 6);
1809 		} else {
1810 			/*
1811 			 * This is incredibly painful, but it beats the hell
1812 			 * out of the alternative.
1813 			 */
1814 			uint64_t j, size = key[i].dttk_size;
1815 			uintptr_t base = (uintptr_t)key[i].dttk_value;
1816 
1817 			if (!dtrace_canload(base, size, mstate, vstate))
1818 				break;
1819 
1820 			for (j = 0; j < size; j++) {
1821 				hashval += dtrace_load8(base + j);
1822 				hashval += (hashval << 10);
1823 				hashval ^= (hashval >> 6);
1824 			}
1825 		}
1826 	}
1827 
1828 	if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
1829 		return (NULL);
1830 
1831 	hashval += (hashval << 3);
1832 	hashval ^= (hashval >> 11);
1833 	hashval += (hashval << 15);
1834 
1835 	/*
1836 	 * There is a remote chance (ideally, 1 in 2^31) that our hashval
1837 	 * comes out to be one of our two sentinel hash values.  If this
1838 	 * actually happens, we set the hashval to be a value known to be a
1839 	 * non-sentinel value.
1840 	 */
1841 	if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
1842 		hashval = DTRACE_DYNHASH_VALID;
1843 
1844 	/*
1845 	 * Yes, it's painful to do a divide here.  If the cycle count becomes
1846 	 * important here, tricks can be pulled to reduce it.  (However, it's
1847 	 * critical that hash collisions be kept to an absolute minimum;
1848 	 * they're much more painful than a divide.)  It's better to have a
1849 	 * solution that generates few collisions and still keeps things
1850 	 * relatively simple.
1851 	 */
1852 	bucket = hashval % dstate->dtds_hashsize;
1853 
1854 	if (op == DTRACE_DYNVAR_DEALLOC) {
1855 		volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
1856 
1857 		for (;;) {
1858 			while ((lock = *lockp) & 1)
1859 				continue;
1860 
1861 			if (dtrace_casptr((void *)lockp,
1862 			    (void *)lock, (void *)(lock + 1)) == (void *)lock)
1863 				break;
1864 		}
1865 
1866 		dtrace_membar_producer();
1867 	}
1868 
1869 top:
1870 	prev = NULL;
1871 	lock = hash[bucket].dtdh_lock;
1872 
1873 	dtrace_membar_consumer();
1874 
1875 	start = hash[bucket].dtdh_chain;
1876 	ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
1877 	    start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
1878 	    op != DTRACE_DYNVAR_DEALLOC));
1879 
1880 	for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
1881 		dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
1882 		dtrace_key_t *dkey = &dtuple->dtt_key[0];
1883 
1884 		if (dvar->dtdv_hashval != hashval) {
1885 			if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
1886 				/*
1887 				 * We've reached the sink, and therefore the
1888 				 * end of the hash chain; we can kick out of
1889 				 * the loop knowing that we have seen a valid
1890 				 * snapshot of state.
1891 				 */
1892 				ASSERT(dvar->dtdv_next == NULL);
1893 				ASSERT(dvar == &dtrace_dynhash_sink);
1894 				break;
1895 			}
1896 
1897 			if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
1898 				/*
1899 				 * We've gone off the rails:  somewhere along
1900 				 * the line, one of the members of this hash
1901 				 * chain was deleted.  Note that we could also
1902 				 * detect this by simply letting this loop run
1903 				 * to completion, as we would eventually hit
1904 				 * the end of the dirty list.  However, we
1905 				 * want to avoid running the length of the
1906 				 * dirty list unnecessarily (it might be quite
1907 				 * long), so we catch this as early as
1908 				 * possible by detecting the hash marker.  In
1909 				 * this case, we simply set dvar to NULL and
1910 				 * break; the conditional after the loop will
1911 				 * send us back to top.
1912 				 */
1913 				dvar = NULL;
1914 				break;
1915 			}
1916 
1917 			goto next;
1918 		}
1919 
1920 		if (dtuple->dtt_nkeys != nkeys)
1921 			goto next;
1922 
1923 		for (i = 0; i < nkeys; i++, dkey++) {
1924 			if (dkey->dttk_size != key[i].dttk_size)
1925 				goto next; /* size or type mismatch */
1926 
1927 			if (dkey->dttk_size != 0) {
1928 				if (dtrace_bcmp(
1929 				    (void *)(uintptr_t)key[i].dttk_value,
1930 				    (void *)(uintptr_t)dkey->dttk_value,
1931 				    dkey->dttk_size))
1932 					goto next;
1933 			} else {
1934 				if (dkey->dttk_value != key[i].dttk_value)
1935 					goto next;
1936 			}
1937 		}
1938 
1939 		if (op != DTRACE_DYNVAR_DEALLOC)
1940 			return (dvar);
1941 
1942 		ASSERT(dvar->dtdv_next == NULL ||
1943 		    dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
1944 
1945 		if (prev != NULL) {
1946 			ASSERT(hash[bucket].dtdh_chain != dvar);
1947 			ASSERT(start != dvar);
1948 			ASSERT(prev->dtdv_next == dvar);
1949 			prev->dtdv_next = dvar->dtdv_next;
1950 		} else {
1951 			if (dtrace_casptr(&hash[bucket].dtdh_chain,
1952 			    start, dvar->dtdv_next) != start) {
1953 				/*
1954 				 * We have failed to atomically swing the
1955 				 * hash table head pointer, presumably because
1956 				 * of a conflicting allocation on another CPU.
1957 				 * We need to reread the hash chain and try
1958 				 * again.
1959 				 */
1960 				goto top;
1961 			}
1962 		}
1963 
1964 		dtrace_membar_producer();
1965 
1966 		/*
1967 		 * Now set the hash value to indicate that it's free.
1968 		 */
1969 		ASSERT(hash[bucket].dtdh_chain != dvar);
1970 		dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
1971 
1972 		dtrace_membar_producer();
1973 
1974 		/*
1975 		 * Set the next pointer to point at the dirty list, and
1976 		 * atomically swing the dirty pointer to the newly freed dvar.
1977 		 */
1978 		do {
1979 			next = dcpu->dtdsc_dirty;
1980 			dvar->dtdv_next = next;
1981 		} while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
1982 
1983 		/*
1984 		 * Finally, unlock this hash bucket.
1985 		 */
1986 		ASSERT(hash[bucket].dtdh_lock == lock);
1987 		ASSERT(lock & 1);
1988 		hash[bucket].dtdh_lock++;
1989 
1990 		return (NULL);
1991 next:
1992 		prev = dvar;
1993 		continue;
1994 	}
1995 
1996 	if (dvar == NULL) {
1997 		/*
1998 		 * If dvar is NULL, it is because we went off the rails:
1999 		 * one of the elements that we traversed in the hash chain
2000 		 * was deleted while we were traversing it.  In this case,
2001 		 * we assert that we aren't doing a dealloc (deallocs lock
2002 		 * the hash bucket to prevent themselves from racing with
2003 		 * one another), and retry the hash chain traversal.
2004 		 */
2005 		ASSERT(op != DTRACE_DYNVAR_DEALLOC);
2006 		goto top;
2007 	}
2008 
2009 	if (op != DTRACE_DYNVAR_ALLOC) {
2010 		/*
2011 		 * If we are not to allocate a new variable, we want to
2012 		 * return NULL now.  Before we return, check that the value
2013 		 * of the lock word hasn't changed.  If it has, we may have
2014 		 * seen an inconsistent snapshot.
2015 		 */
2016 		if (op == DTRACE_DYNVAR_NOALLOC) {
2017 			if (hash[bucket].dtdh_lock != lock)
2018 				goto top;
2019 		} else {
2020 			ASSERT(op == DTRACE_DYNVAR_DEALLOC);
2021 			ASSERT(hash[bucket].dtdh_lock == lock);
2022 			ASSERT(lock & 1);
2023 			hash[bucket].dtdh_lock++;
2024 		}
2025 
2026 		return (NULL);
2027 	}
2028 
2029 	/*
2030 	 * We need to allocate a new dynamic variable.  The size we need is the
2031 	 * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
2032 	 * size of any auxiliary key data (rounded up to 8-byte alignment) plus
2033 	 * the size of any referred-to data (dsize).  We then round the final
2034 	 * size up to the chunksize for allocation.
2035 	 */
2036 	for (ksize = 0, i = 0; i < nkeys; i++)
2037 		ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
2038 
2039 	/*
2040 	 * This should be pretty much impossible, but could happen if, say,
2041 	 * strange DIF specified the tuple.  Ideally, this should be an
2042 	 * assertion and not an error condition -- but that requires that the
2043 	 * chunksize calculation in dtrace_difo_chunksize() be absolutely
2044 	 * bullet-proof.  (That is, it must not be able to be fooled by
2045 	 * malicious DIF.)  Given the lack of backwards branches in DIF,
2046 	 * solving this would presumably not amount to solving the Halting
2047 	 * Problem -- but it still seems awfully hard.
2048 	 */
2049 	if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
2050 	    ksize + dsize > chunksize) {
2051 		dcpu->dtdsc_drops++;
2052 		return (NULL);
2053 	}
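	/*
	 * Illustrative example (not part of the original source):  for a
	 * hypothetical two-key tuple consisting of one by-value key
	 * (dttk_size == 0) and one 13-byte by-reference key, the computation
	 * above yields:
	 *
	 *	ksize  = 0 + P2ROUNDUP(13, 8) = 16
	 *	needed = sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * 1
	 *	    + 16 + dsize
	 *
	 * If "needed" exceeds the chunksize, the allocation is counted as a
	 * dynamic variable drop rather than attempted.
	 */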
2054 
2055 	nstate = DTRACE_DSTATE_EMPTY;
2056 
2057 	do {
2058 retry:
2059 		free = dcpu->dtdsc_free;
2060 
2061 		if (free == NULL) {
2062 			dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
2063 			void *rval;
2064 
2065 			if (clean == NULL) {
2066 				/*
2067 				 * We're out of dynamic variable space on
2068 				 * this CPU.  Unless we have tried all CPUs,
2069 				 * we'll try to allocate from a different
2070 				 * CPU.
2071 				 */
2072 				switch (dstate->dtds_state) {
2073 				case DTRACE_DSTATE_CLEAN: {
2074 					void *sp = &dstate->dtds_state;
2075 
2076 					if (++cpu >= NCPU)
2077 						cpu = 0;
2078 
2079 					if (dcpu->dtdsc_dirty != NULL &&
2080 					    nstate == DTRACE_DSTATE_EMPTY)
2081 						nstate = DTRACE_DSTATE_DIRTY;
2082 
2083 					if (dcpu->dtdsc_rinsing != NULL)
2084 						nstate = DTRACE_DSTATE_RINSING;
2085 
2086 					dcpu = &dstate->dtds_percpu[cpu];
2087 
2088 					if (cpu != me)
2089 						goto retry;
2090 
2091 					(void) dtrace_cas32(sp,
2092 					    DTRACE_DSTATE_CLEAN, nstate);
2093 
2094 					/*
2095 					 * To increment the correct bean
2096 					 * counter, take another lap.
2097 					 */
2098 					goto retry;
2099 				}
2100 
2101 				case DTRACE_DSTATE_DIRTY:
2102 					dcpu->dtdsc_dirty_drops++;
2103 					break;
2104 
2105 				case DTRACE_DSTATE_RINSING:
2106 					dcpu->dtdsc_rinsing_drops++;
2107 					break;
2108 
2109 				case DTRACE_DSTATE_EMPTY:
2110 					dcpu->dtdsc_drops++;
2111 					break;
2112 				}
2113 
2114 				DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
2115 				return (NULL);
2116 			}
2117 
2118 			/*
2119 			 * The clean list appears to be non-empty.  We want to
2120 			 * move the clean list to the free list; we start by
2121 			 * moving the clean pointer aside.
2122 			 */
2123 			if (dtrace_casptr(&dcpu->dtdsc_clean,
2124 			    clean, NULL) != clean) {
2125 				/*
2126 				 * We are in one of two situations:
2127 				 *
2128 				 *  (a)	The clean list was switched to the
2129 				 *	free list by another CPU.
2130 				 *
2131 				 *  (b)	The clean list was added to by the
2132 				 *	cleansing cyclic.
2133 				 *
2134 				 * In either of these situations, we can
2135 				 * just reattempt the free list allocation.
2136 				 */
2137 				goto retry;
2138 			}
2139 
2140 			ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
2141 
2142 			/*
2143 			 * Now we'll move the clean list to our free list.
2144 			 * It's impossible for this to fail:  the only way
2145 			 * the free list can be updated is through this
2146 			 * code path, and only one CPU can own the clean list.
2147 			 * Thus, it would only be possible for this to fail if
2148 			 * this code were racing with dtrace_dynvar_clean().
2149 			 * (That is, if dtrace_dynvar_clean() updated the clean
2150 			 * list, and we ended up racing to update the free
2151 			 * list.)  This race is prevented by the dtrace_sync()
2152 			 * in dtrace_dynvar_clean() -- which flushes the
2153 			 * owners of the clean lists out before resetting
2154 			 * the clean lists.
2155 			 */
2156 			dcpu = &dstate->dtds_percpu[me];
2157 			rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
2158 			ASSERT(rval == NULL);
2159 			goto retry;
2160 		}
2161 
2162 		dvar = free;
2163 		new_free = dvar->dtdv_next;
2164 	} while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
2165 
2166 	/*
2167 	 * We have now allocated a new chunk.  We copy the tuple keys into the
2168 	 * tuple array and copy any referenced key data into the data space
2169 	 * following the tuple array.  As we do this, we relocate dttk_value
2170 	 * in the final tuple to point to the key data address in the chunk.
2171 	 */
2172 	kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
2173 	dvar->dtdv_data = (void *)(kdata + ksize);
2174 	dvar->dtdv_tuple.dtt_nkeys = nkeys;
2175 
2176 	for (i = 0; i < nkeys; i++) {
2177 		dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
2178 		size_t kesize = key[i].dttk_size;
2179 
2180 		if (kesize != 0) {
2181 			dtrace_bcopy(
2182 			    (const void *)(uintptr_t)key[i].dttk_value,
2183 			    (void *)kdata, kesize);
2184 			dkey->dttk_value = kdata;
2185 			kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
2186 		} else {
2187 			dkey->dttk_value = key[i].dttk_value;
2188 		}
2189 
2190 		dkey->dttk_size = kesize;
2191 	}
2192 
2193 	ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
2194 	dvar->dtdv_hashval = hashval;
2195 	dvar->dtdv_next = start;
2196 
2197 	if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
2198 		return (dvar);
2199 
2200 	/*
2201 	 * The cas has failed.  Either another CPU is adding an element to
2202 	 * this hash chain, or another CPU is deleting an element from this
2203 	 * hash chain.  The simplest way to deal with both of these cases
2204 	 * (though not necessarily the most efficient) is to free our
2205 	 * allocated block and re-attempt it all.  Note that the free is
2206 	 * to the dirty list and _not_ to the free list.  This is to prevent
2207 	 * races with allocators, above.
2208 	 */
2209 	dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
2210 
2211 	dtrace_membar_producer();
2212 
2213 	do {
2214 		free = dcpu->dtdsc_dirty;
2215 		dvar->dtdv_next = free;
2216 	} while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
2217 
2218 	goto top;
2219 }
2220 
2221 /*ARGSUSED*/
2222 static void
2223 dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
2224 {
2225 	if ((int64_t)nval < (int64_t)*oval)
2226 		*oval = nval;
2227 }
2228 
2229 /*ARGSUSED*/
2230 static void
2231 dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
2232 {
2233 	if ((int64_t)nval > (int64_t)*oval)
2234 		*oval = nval;
2235 }
2236 
2237 static void
2238 dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
2239 {
2240 	int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
2241 	int64_t val = (int64_t)nval;
2242 
2243 	if (val < 0) {
2244 		for (i = 0; i < zero; i++) {
2245 			if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
2246 				quanta[i] += incr;
2247 				return;
2248 			}
2249 		}
2250 	} else {
2251 		for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
2252 			if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
2253 				quanta[i - 1] += incr;
2254 				return;
2255 			}
2256 		}
2257 
2258 		quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
2259 		return;
2260 	}
2261 
2262 	ASSERT(0);
2263 }
2264 
2265 static void
2266 dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
2267 {
2268 	uint64_t arg = *lquanta++;
2269 	int32_t base = DTRACE_LQUANTIZE_BASE(arg);
2270 	uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
2271 	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
2272 	int32_t val = (int32_t)nval, level;
2273 
2274 	ASSERT(step != 0);
2275 	ASSERT(levels != 0);
2276 
2277 	if (val < base) {
2278 		/*
2279 		 * This is an underflow.
2280 		 */
2281 		lquanta[0] += incr;
2282 		return;
2283 	}
2284 
2285 	level = (val - base) / step;
2286 
2287 	if (level < levels) {
2288 		lquanta[level + 1] += incr;
2289 		return;
2290 	}
2291 
2292 	/*
2293 	 * This is an overflow.
2294 	 */
2295 	lquanta[levels + 1] += incr;
2296 }
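
/*
 * Illustrative example (not part of the original source):  with an
 * lquantize() argument encoding base = 0, step = 10 and levels = 20, a value
 * of 37 yields level = (37 - 0) / 10 = 3 and increments lquanta[4]; a value
 * of -5 is an underflow and increments lquanta[0]; a value of 250 yields
 * level = 25 >= levels and therefore increments the overflow bucket,
 * lquanta[21].
 */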
2297 
2298 static int
2299 dtrace_aggregate_llquantize_bucket(uint16_t factor, uint16_t low,
2300     uint16_t high, uint16_t nsteps, int64_t value)
2301 {
2302 	int64_t this = 1, last, next;
2303 	int base = 1, order;
2304 
2305 	ASSERT(factor <= nsteps);
2306 	ASSERT(nsteps % factor == 0);
2307 
2308 	for (order = 0; order < low; order++)
2309 		this *= factor;
2310 
2311 	/*
2312 	 * If our value is less than our factor taken to the power of the
2313 	 * low order of magnitude, it goes into the zeroth bucket.
2314 	 */
2315 	if (value < (last = this))
2316 		return (0);
2317 
2318 	for (this *= factor; order <= high; order++) {
2319 		int nbuckets = this > nsteps ? nsteps : this;
2320 
2321 		if ((next = this * factor) < this) {
2322 			/*
2323 			 * We should not generally get log/linear quantizations
2324 			 * with a high magnitude that allows 64-bits to
2325 			 * overflow, but we nonetheless protect against this
2326 			 * by explicitly checking for overflow, and clamping
2327 			 * our value accordingly.
2328 			 */
2329 			value = this - 1;
2330 		}
2331 
2332 		if (value < this) {
2333 			/*
2334 			 * If our value lies within this order of magnitude,
2335 			 * determine its position by taking the offset within
2336 			 * the order of magnitude, dividing by the bucket
2337 			 * width, and adding to our (accumulated) base.
2338 			 */
2339 			return (base + (value - last) / (this / nbuckets));
2340 		}
2341 
2342 		base += nbuckets - (nbuckets / factor);
2343 		last = this;
2344 		this = next;
2345 	}
2346 
2347 	/*
2348 	 * Our value is greater than or equal to our factor taken to the
2349 	 * power of one plus the high magnitude -- return the top bucket.
2350 	 */
2351 	return (base);
2352 }
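
/*
 * Illustrative example (not part of the original source):  with factor = 10,
 * low = 0, high = 2 and nsteps = 10, the function above maps a value of 0 to
 * bucket 0 (it is below factor^low = 1), a value of 7 to bucket
 * 1 + (7 - 1) / 1 = 7 within the first order of magnitude [1, 10), and a
 * value of 42 to bucket 10 + (42 - 10) / 10 = 13 -- the order [1, 10)
 * contributes 9 buckets to the accumulated base before the linear offset
 * within [10, 100) is added.
 */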
2353 
2354 static void
2355 dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
2356 {
2357 	uint64_t arg = *llquanta++;
2358 	uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg);
2359 	uint16_t low = DTRACE_LLQUANTIZE_LOW(arg);
2360 	uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg);
2361 	uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg);
2362 
2363 	llquanta[dtrace_aggregate_llquantize_bucket(factor,
2364 	    low, high, nsteps, nval)] += incr;
2365 }
2366 
2367 /*ARGSUSED*/
2368 static void
2369 dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
2370 {
2371 	data[0]++;
2372 	data[1] += nval;
2373 }
2374 
2375 /*ARGSUSED*/
2376 static void
2377 dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
2378 {
2379 	int64_t snval = (int64_t)nval;
2380 	uint64_t tmp[2];
2381 
2382 	data[0]++;
2383 	data[1] += nval;
2384 
2385 	/*
2386 	 * What we want to say here is:
2387 	 *
2388 	 * data[2] += nval * nval;
2389 	 *
2390 	 * But given that nval is 64-bit, we could easily overflow, so
2391 	 * we do this as 128-bit arithmetic.
2392 	 */
2393 	if (snval < 0)
2394 		snval = -snval;
2395 
2396 	dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp);
2397 	dtrace_add_128(data + 2, tmp, data + 2);
2398 }
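
/*
 * Note (not part of the original source):  data[0], data[1] and the 128-bit
 * quantity in data[2] and data[3] hold the count, the sum and the sum of
 * squares respectively; a consumer can derive the population standard
 * deviation from them as sqrt(sum-of-squares / count - (sum / count)^2).
 */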
2399 
2400 /*ARGSUSED*/
2401 static void
2402 dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
2403 {
2404 	*oval = *oval + 1;
2405 }
2406 
2407 /*ARGSUSED*/
2408 static void
2409 dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
2410 {
2411 	*oval += nval;
2412 }
2413 
2414 /*
2415  * Aggregate given the tuple in the principal data buffer, and the aggregating
2416  * action denoted by the specified dtrace_aggregation_t.  The aggregation
2417  * buffer is specified as the buf parameter.  This routine does not return
2418  * failure; if there is no space in the aggregation buffer, the data will be
2419  * dropped, and a corresponding counter incremented.
2420  */
2421 static void
2422 dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
2423     intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
2424 {
2425 	dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
2426 	uint32_t i, ndx, size, fsize;
2427 	uint32_t align = sizeof (uint64_t) - 1;
2428 	dtrace_aggbuffer_t *agb;
2429 	dtrace_aggkey_t *key;
2430 	uint32_t hashval = 0, limit, isstr;
2431 	caddr_t tomax, data, kdata;
2432 	dtrace_actkind_t action;
2433 	dtrace_action_t *act;
2434 	uintptr_t offs;
2435 
2436 	if (buf == NULL)
2437 		return;
2438 
2439 	if (!agg->dtag_hasarg) {
2440 		/*
2441 		 * Currently, only quantize() and lquantize() take additional
2442 		 * arguments, and they have the same semantics:  an increment
2443 		 * value that defaults to 1 when not present.  If additional
2444 		 * aggregating actions take arguments, the setting of the
2445 		 * default argument value will presumably have to become more
2446 		 * sophisticated...
2447 		 */
2448 		arg = 1;
2449 	}
2450 
2451 	action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
2452 	size = rec->dtrd_offset - agg->dtag_base;
2453 	fsize = size + rec->dtrd_size;
2454 
2455 	ASSERT(dbuf->dtb_tomax != NULL);
2456 	data = dbuf->dtb_tomax + offset + agg->dtag_base;
2457 
2458 	if ((tomax = buf->dtb_tomax) == NULL) {
2459 		dtrace_buffer_drop(buf);
2460 		return;
2461 	}
2462 
2463 	/*
2464 	 * The metastructure is always at the bottom of the buffer.
2465 	 */
2466 	agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
2467 	    sizeof (dtrace_aggbuffer_t));
2468 
2469 	if (buf->dtb_offset == 0) {
2470 		/*
2471 		 * We just kludge up approximately 1/8th of the size to be
2472 		 * buckets.  If this guess ends up being routinely
2473 		 * off-the-mark, we may need to dynamically readjust this
2474 		 * based on past performance.
2475 		 */
2476 		uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
2477 
2478 		if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
2479 		    (uintptr_t)tomax || hashsize == 0) {
2480 			/*
2481 			 * We've been given a ludicrously small buffer;
2482 			 * increment our drop count and leave.
2483 			 */
2484 			dtrace_buffer_drop(buf);
2485 			return;
2486 		}
2487 
2488 		/*
2489 		 * And now, a pathetic attempt to try to get an odd (or
2490 		 * perchance, a prime) hash size for better hash distribution.
2491 		 */
2492 		if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
2493 			hashsize -= DTRACE_AGGHASHSIZE_SLEW;
2494 
2495 		agb->dtagb_hashsize = hashsize;
2496 		agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
2497 		    agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
2498 		agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
2499 
2500 		for (i = 0; i < agb->dtagb_hashsize; i++)
2501 			agb->dtagb_hash[i] = NULL;
2502 	}
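	/*
	 * Illustrative layout sketch (not part of the original source) of an
	 * aggregation buffer once it has been initialized above, from low
	 * addresses to high:
	 *
	 *	dtb_tomax:  key/value data, laid down upward from dtb_offset
	 *	    ...     free space
	 *	            dtrace_aggkey_t structures, carved downward from
	 *	            dtagb_free
	 *	            the bucket array dtagb_hash (dtagb_hashsize entries)
	 *	            the dtrace_aggbuffer_t metastructure, at the top
	 */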
2503 
2504 	ASSERT(agg->dtag_first != NULL);
2505 	ASSERT(agg->dtag_first->dta_intuple);
2506 
2507 	/*
2508 	 * Calculate the hash value based on the key.  Note that we _don't_
2509 	 * include the aggid in the hashing (but we will store it as part of
2510 	 * the key).  The hashing algorithm is Bob Jenkins' "One-at-a-time"
2511 	 * algorithm: a simple, quick algorithm that has no known funnels, and
2512 	 * gets good distribution in practice.  The efficacy of the hashing
2513 	 * algorithm (and a comparison with other algorithms) may be found by
2514 	 * running the ::dtrace_aggstat MDB dcmd.
2515 	 */
2516 	for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2517 		i = act->dta_rec.dtrd_offset - agg->dtag_base;
2518 		limit = i + act->dta_rec.dtrd_size;
2519 		ASSERT(limit <= size);
2520 		isstr = DTRACEACT_ISSTRING(act);
2521 
2522 		for (; i < limit; i++) {
2523 			hashval += data[i];
2524 			hashval += (hashval << 10);
2525 			hashval ^= (hashval >> 6);
2526 
2527 			if (isstr && data[i] == '\0')
2528 				break;
2529 		}
2530 	}
2531 
2532 	hashval += (hashval << 3);
2533 	hashval ^= (hashval >> 11);
2534 	hashval += (hashval << 15);
2535 
2536 	/*
2537 	 * Yes, the divide here is expensive -- but it's generally the least
2538 	 * of the performance issues given the amount of data that we iterate
2539 	 * over to compute hash values, compare data, etc.
2540 	 */
2541 	ndx = hashval % agb->dtagb_hashsize;
2542 
2543 	for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
2544 		ASSERT((caddr_t)key >= tomax);
2545 		ASSERT((caddr_t)key < tomax + buf->dtb_size);
2546 
2547 		if (hashval != key->dtak_hashval || key->dtak_size != size)
2548 			continue;
2549 
2550 		kdata = key->dtak_data;
2551 		ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
2552 
2553 		for (act = agg->dtag_first; act->dta_intuple;
2554 		    act = act->dta_next) {
2555 			i = act->dta_rec.dtrd_offset - agg->dtag_base;
2556 			limit = i + act->dta_rec.dtrd_size;
2557 			ASSERT(limit <= size);
2558 			isstr = DTRACEACT_ISSTRING(act);
2559 
2560 			for (; i < limit; i++) {
2561 				if (kdata[i] != data[i])
2562 					goto next;
2563 
2564 				if (isstr && data[i] == '\0')
2565 					break;
2566 			}
2567 		}
2568 
2569 		if (action != key->dtak_action) {
2570 			/*
2571 			 * We are aggregating on the same value in the same
2572 			 * aggregation with two different aggregating actions.
2573 			 * (This should have been picked up in the compiler,
2574 			 * so we may be dealing with errant or devious DIF.)
2575 			 * This is an error condition; we indicate as much,
2576 			 * and return.
2577 			 */
2578 			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2579 			return;
2580 		}
2581 
2582 		/*
2583 		 * This is a hit:  we need to apply the aggregator to
2584 		 * the value at this key.
2585 		 */
2586 		agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
2587 		return;
2588 next:
2589 		continue;
2590 	}
2591 
2592 	/*
2593 	 * We didn't find it.  We need to allocate some zero-filled space,
2594 	 * link it into the hash table appropriately, and apply the aggregator
2595 	 * to the (zero-filled) value.
2596 	 */
2597 	offs = buf->dtb_offset;
2598 	while (offs & (align - 1))
2599 		offs += sizeof (uint32_t);
2600 
2601 	/*
2602 	 * If we don't have enough room to both allocate a new key _and_
2603 	 * its associated data, increment the drop count and return.
2604 	 */
2605 	if ((uintptr_t)tomax + offs + fsize >
2606 	    agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
2607 		dtrace_buffer_drop(buf);
2608 		return;
2609 	}
2610 
2611 	/*CONSTCOND*/
2612 	ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
2613 	key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
2614 	agb->dtagb_free -= sizeof (dtrace_aggkey_t);
2615 
2616 	key->dtak_data = kdata = tomax + offs;
2617 	buf->dtb_offset = offs + fsize;
2618 
2619 	/*
2620 	 * Now copy the data across.
2621 	 */
2622 	*((dtrace_aggid_t *)kdata) = agg->dtag_id;
2623 
2624 	for (i = sizeof (dtrace_aggid_t); i < size; i++)
2625 		kdata[i] = data[i];
2626 
2627 	/*
2628 	 * Because strings are not zeroed out by default, we need to iterate
2629 	 * looking for actions that store strings, and we need to explicitly
2630 	 * pad these strings out with zeroes.
2631 	 */
2632 	for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2633 		int nul;
2634 
2635 		if (!DTRACEACT_ISSTRING(act))
2636 			continue;
2637 
2638 		i = act->dta_rec.dtrd_offset - agg->dtag_base;
2639 		limit = i + act->dta_rec.dtrd_size;
2640 		ASSERT(limit <= size);
2641 
2642 		for (nul = 0; i < limit; i++) {
2643 			if (nul) {
2644 				kdata[i] = '\0';
2645 				continue;
2646 			}
2647 
2648 			if (data[i] != '\0')
2649 				continue;
2650 
2651 			nul = 1;
2652 		}
2653 	}
2654 
2655 	for (i = size; i < fsize; i++)
2656 		kdata[i] = 0;
2657 
2658 	key->dtak_hashval = hashval;
2659 	key->dtak_size = size;
2660 	key->dtak_action = action;
2661 	key->dtak_next = agb->dtagb_hash[ndx];
2662 	agb->dtagb_hash[ndx] = key;
2663 
2664 	/*
2665 	 * Finally, apply the aggregator.
2666 	 */
2667 	*((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
2668 	agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
2669 }
2670 
2671 /*
2672  * Given consumer state, this routine finds a speculation in the INACTIVE
2673  * state and transitions it into the ACTIVE state.  If there is no speculation
2674  * in the INACTIVE state, 0 is returned.  In this case, no error counter is
2675  * incremented -- it is up to the caller to take appropriate action.
2676  */
2677 static int
2678 dtrace_speculation(dtrace_state_t *state)
2679 {
2680 	int i = 0;
2681 	dtrace_speculation_state_t current;
2682 	uint32_t *stat = &state->dts_speculations_unavail, count;
2683 
2684 	while (i < state->dts_nspeculations) {
2685 		dtrace_speculation_t *spec = &state->dts_speculations[i];
2686 
2687 		current = spec->dtsp_state;
2688 
2689 		if (current != DTRACESPEC_INACTIVE) {
2690 			if (current == DTRACESPEC_COMMITTINGMANY ||
2691 			    current == DTRACESPEC_COMMITTING ||
2692 			    current == DTRACESPEC_DISCARDING)
2693 				stat = &state->dts_speculations_busy;
2694 			i++;
2695 			continue;
2696 		}
2697 
2698 		if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2699 		    current, DTRACESPEC_ACTIVE) == current)
2700 			return (i + 1);
2701 	}
2702 
2703 	/*
2704 	 * We couldn't find a speculation.  If we found as much as a single
2705 	 * busy speculation buffer, we'll attribute this failure as "busy"
2706 	 * instead of "unavail".
2707 	 */
2708 	do {
2709 		count = *stat;
2710 	} while (dtrace_cas32(stat, count, count + 1) != count);
2711 
2712 	return (0);
2713 }
2714 
2715 /*
2716  * This routine commits an active speculation.  If the specified speculation
2717  * is not in a valid state to perform a commit(), this routine will silently do
2718  * nothing.  The state of the specified speculation is transitioned according
2719  * to the state transition diagram outlined in <sys/dtrace_impl.h>
2720  */
2721 static void
2722 dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
2723     dtrace_specid_t which)
2724 {
2725 	dtrace_speculation_t *spec;
2726 	dtrace_buffer_t *src, *dest;
2727 	uintptr_t daddr, saddr, dlimit, slimit;
2728 	dtrace_speculation_state_t current, new;
2729 	intptr_t offs;
2730 	uint64_t timestamp;
2731 
2732 	if (which == 0)
2733 		return;
2734 
2735 	if (which > state->dts_nspeculations) {
2736 		cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2737 		return;
2738 	}
2739 
2740 	spec = &state->dts_speculations[which - 1];
2741 	src = &spec->dtsp_buffer[cpu];
2742 	dest = &state->dts_buffer[cpu];
2743 
2744 	do {
2745 		current = spec->dtsp_state;
2746 
2747 		if (current == DTRACESPEC_COMMITTINGMANY)
2748 			break;
2749 
2750 		switch (current) {
2751 		case DTRACESPEC_INACTIVE:
2752 		case DTRACESPEC_DISCARDING:
2753 			return;
2754 
2755 		case DTRACESPEC_COMMITTING:
2756 			/*
2757 			 * This is only possible if we are (a) commit()'ing
2758 			 * without having done a prior speculate() on this CPU
2759 			 * and (b) racing with another commit() on a different
2760 			 * CPU.  There's nothing to do -- we just assert that
2761 			 * our offset is 0.
2762 			 */
2763 			ASSERT(src->dtb_offset == 0);
2764 			return;
2765 
2766 		case DTRACESPEC_ACTIVE:
2767 			new = DTRACESPEC_COMMITTING;
2768 			break;
2769 
2770 		case DTRACESPEC_ACTIVEONE:
2771 			/*
2772 			 * This speculation is active on one CPU.  If our
2773 			 * buffer offset is non-zero, we know that the one CPU
2774 			 * must be us.  Otherwise, we are committing on a
2775 			 * different CPU from the speculate(), and we must
2776 			 * rely on being asynchronously cleaned.
2777 			 */
2778 			if (src->dtb_offset != 0) {
2779 				new = DTRACESPEC_COMMITTING;
2780 				break;
2781 			}
2782 			/*FALLTHROUGH*/
2783 
2784 		case DTRACESPEC_ACTIVEMANY:
2785 			new = DTRACESPEC_COMMITTINGMANY;
2786 			break;
2787 
2788 		default:
2789 			ASSERT(0);
2790 		}
2791 	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2792 	    current, new) != current);
2793 
2794 	/*
2795 	 * We have set the state to indicate that we are committing this
2796 	 * speculation.  Now reserve the necessary space in the destination
2797 	 * buffer.
2798 	 */
2799 	if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
2800 	    sizeof (uint64_t), state, NULL)) < 0) {
2801 		dtrace_buffer_drop(dest);
2802 		goto out;
2803 	}
2804 
2805 	/*
2806 	 * We have sufficient space to copy the speculative buffer into the
2807 	 * primary buffer.  First, modify the speculative buffer, filling
2808 	 * in the timestamp of all entries with the current time.  The data
2809 	 * must have the commit() time rather than the time it was traced,
2810 	 * so that all entries in the primary buffer are in timestamp order.
2811 	 */
2812 	timestamp = dtrace_gethrtime();
2813 	saddr = (uintptr_t)src->dtb_tomax;
2814 	slimit = saddr + src->dtb_offset;
2815 	while (saddr < slimit) {
2816 		size_t size;
2817 		dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;
2818 
2819 		if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
2820 			saddr += sizeof (dtrace_epid_t);
2821 			continue;
2822 		}
2823 		ASSERT3U(dtrh->dtrh_epid, <=, state->dts_necbs);
2824 		size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;
2825 
2826 		ASSERT3U(saddr + size, <=, slimit);
2827 		ASSERT3U(size, >=, sizeof (dtrace_rechdr_t));
2828 		ASSERT3U(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh), ==, UINT64_MAX);
2829 
2830 		DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);
2831 
2832 		saddr += size;
2833 	}
2834 
2835 	/*
2836 	 * Copy the buffer across.  (Note that this is a
2837 	 * highly suboptimal bcopy(); in the unlikely event that this becomes
2838 	 * a serious performance issue, a high-performance DTrace-specific
2839 	 * bcopy() should obviously be invented.)
2840 	 */
2841 	daddr = (uintptr_t)dest->dtb_tomax + offs;
2842 	dlimit = daddr + src->dtb_offset;
2843 	saddr = (uintptr_t)src->dtb_tomax;
2844 
2845 	/*
2846 	 * First, the aligned portion.
2847 	 */
2848 	while (dlimit - daddr >= sizeof (uint64_t)) {
2849 		*((uint64_t *)daddr) = *((uint64_t *)saddr);
2850 
2851 		daddr += sizeof (uint64_t);
2852 		saddr += sizeof (uint64_t);
2853 	}
2854 
2855 	/*
2856 	 * Now any left-over bit...
2857 	 */
2858 	while (dlimit - daddr)
2859 		*((uint8_t *)daddr++) = *((uint8_t *)saddr++);
2860 
2861 	/*
2862 	 * Finally, commit the reserved space in the destination buffer.
2863 	 */
2864 	dest->dtb_offset = offs + src->dtb_offset;
2865 
2866 out:
2867 	/*
2868 	 * If we're lucky enough to be the only active CPU on this speculation
2869 	 * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
2870 	 */
2871 	if (current == DTRACESPEC_ACTIVE ||
2872 	    (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
2873 		uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
2874 		    DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
2875 
2876 		ASSERT(rval == DTRACESPEC_COMMITTING);
2877 	}
2878 
2879 	src->dtb_offset = 0;
2880 	src->dtb_xamot_drops += src->dtb_drops;
2881 	src->dtb_drops = 0;
2882 }
2883 
2884 /*
2885  * This routine discards an active speculation.  If the specified speculation
2886  * is not in a valid state to perform a discard(), this routine will silently
2887  * do nothing.  The state of the specified speculation is transitioned
2888  * according to the state transition diagram outlined in <sys/dtrace_impl.h>
2889  */
2890 static void
2891 dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
2892     dtrace_specid_t which)
2893 {
2894 	dtrace_speculation_t *spec;
2895 	dtrace_speculation_state_t current, new;
2896 	dtrace_buffer_t *buf;
2897 
2898 	if (which == 0)
2899 		return;
2900 
2901 	if (which > state->dts_nspeculations) {
2902 		cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2903 		return;
2904 	}
2905 
2906 	spec = &state->dts_speculations[which - 1];
2907 	buf = &spec->dtsp_buffer[cpu];
2908 
2909 	do {
2910 		current = spec->dtsp_state;
2911 
2912 		switch (current) {
2913 		case DTRACESPEC_INACTIVE:
2914 		case DTRACESPEC_COMMITTINGMANY:
2915 		case DTRACESPEC_COMMITTING:
2916 		case DTRACESPEC_DISCARDING:
2917 			return;
2918 
2919 		case DTRACESPEC_ACTIVE:
2920 		case DTRACESPEC_ACTIVEMANY:
2921 			new = DTRACESPEC_DISCARDING;
2922 			break;
2923 
2924 		case DTRACESPEC_ACTIVEONE:
2925 			if (buf->dtb_offset != 0) {
2926 				new = DTRACESPEC_INACTIVE;
2927 			} else {
2928 				new = DTRACESPEC_DISCARDING;
2929 			}
2930 			break;
2931 
2932 		default:
2933 			ASSERT(0);
2934 		}
2935 	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2936 	    current, new) != current);
2937 
2938 	buf->dtb_offset = 0;
2939 	buf->dtb_drops = 0;
2940 }
2941 
2942 /*
2943  * Note:  not called from probe context.  This function is called
2944  * asynchronously from cross call context to clean any speculations that are
2945  * in the COMMITTINGMANY or DISCARDING states.  These speculations may not be
2946  * transitioned back to the INACTIVE state until all CPUs have cleaned the
2947  * speculation.
2948  */
2949 static void
2950 dtrace_speculation_clean_here(dtrace_state_t *state)
2951 {
2952 	dtrace_icookie_t cookie;
2953 	processorid_t cpu = CPU->cpu_id;
2954 	dtrace_buffer_t *dest = &state->dts_buffer[cpu];
2955 	dtrace_specid_t i;
2956 
2957 	cookie = dtrace_interrupt_disable();
2958 
2959 	if (dest->dtb_tomax == NULL) {
2960 		dtrace_interrupt_enable(cookie);
2961 		return;
2962 	}
2963 
2964 	for (i = 0; i < state->dts_nspeculations; i++) {
2965 		dtrace_speculation_t *spec = &state->dts_speculations[i];
2966 		dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
2967 
2968 		if (src->dtb_tomax == NULL)
2969 			continue;
2970 
2971 		if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
2972 			src->dtb_offset = 0;
2973 			continue;
2974 		}
2975 
2976 		if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2977 			continue;
2978 
2979 		if (src->dtb_offset == 0)
2980 			continue;
2981 
2982 		dtrace_speculation_commit(state, cpu, i + 1);
2983 	}
2984 
2985 	dtrace_interrupt_enable(cookie);
2986 }
2987 
2988 /*
2989  * Note:  not called from probe context.  This function is called
2990  * asynchronously (and at a regular interval) to clean any speculations that
2991  * are in the COMMITTINGMANY or DISCARDING states.  If it discovers that there
2992  * is work to be done, it cross calls all CPUs to perform that work;
2993  * COMMITTINGMANY and DISCARDING speculations may not be transitioned back to
2994  * INACTIVE state until they have been cleaned by all CPUs.
2995  */
2996 static void
2997 dtrace_speculation_clean(dtrace_state_t *state)
2998 {
2999 	int work = 0, rv;
3000 	dtrace_specid_t i;
3001 
3002 	for (i = 0; i < state->dts_nspeculations; i++) {
3003 		dtrace_speculation_t *spec = &state->dts_speculations[i];
3004 
3005 		ASSERT(!spec->dtsp_cleaning);
3006 
3007 		if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
3008 		    spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
3009 			continue;
3010 
3011 		work++;
3012 		spec->dtsp_cleaning = 1;
3013 	}
3014 
3015 	if (!work)
3016 		return;
3017 
3018 	dtrace_xcall(DTRACE_CPUALL,
3019 	    (dtrace_xcall_t)dtrace_speculation_clean_here, state);
3020 
3021 	/*
3022 	 * We now know that all CPUs have committed or discarded their
3023 	 * speculation buffers, as appropriate.  We can now set the state
3024 	 * to inactive.
3025 	 */
3026 	for (i = 0; i < state->dts_nspeculations; i++) {
3027 		dtrace_speculation_t *spec = &state->dts_speculations[i];
3028 		dtrace_speculation_state_t current, new;
3029 
3030 		if (!spec->dtsp_cleaning)
3031 			continue;
3032 
3033 		current = spec->dtsp_state;
3034 		ASSERT(current == DTRACESPEC_DISCARDING ||
3035 		    current == DTRACESPEC_COMMITTINGMANY);
3036 
3037 		new = DTRACESPEC_INACTIVE;
3038 
3039 		rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
3040 		ASSERT(rv == current);
3041 		spec->dtsp_cleaning = 0;
3042 	}
3043 }
3044 
3045 /*
3046  * Called as part of a speculate() to get the speculative buffer associated
3047  * with a given speculation.  Returns NULL if the specified speculation is not
3048  * in an ACTIVE state.  If the speculation is in the ACTIVEONE state -- and
3049  * the active CPU is not the specified CPU -- the speculation will be
3050  * atomically transitioned into the ACTIVEMANY state.
3051  */
3052 static dtrace_buffer_t *
3053 dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
3054     dtrace_specid_t which)
3055 {
3056 	dtrace_speculation_t *spec;
3057 	dtrace_speculation_state_t current, new;
3058 	dtrace_buffer_t *buf;
3059 
3060 	if (which == 0)
3061 		return (NULL);
3062 
3063 	if (which > state->dts_nspeculations) {
3064 		cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3065 		return (NULL);
3066 	}
3067 
3068 	spec = &state->dts_speculations[which - 1];
3069 	buf = &spec->dtsp_buffer[cpuid];
3070 
3071 	do {
3072 		current = spec->dtsp_state;
3073 
3074 		switch (current) {
3075 		case DTRACESPEC_INACTIVE:
3076 		case DTRACESPEC_COMMITTINGMANY:
3077 		case DTRACESPEC_DISCARDING:
3078 			return (NULL);
3079 
3080 		case DTRACESPEC_COMMITTING:
3081 			ASSERT(buf->dtb_offset == 0);
3082 			return (NULL);
3083 
3084 		case DTRACESPEC_ACTIVEONE:
3085 			/*
3086 			 * This speculation is currently active on one CPU.
3087 			 * Check the offset in the buffer; if it's non-zero,
3088 			 * that CPU must be us (and we leave the state alone).
3089 			 * If it's zero, assume that we're starting on a new
3090 			 * CPU -- and change the state to indicate that the
3091 			 * speculation is active on more than one CPU.
3092 			 */
3093 			if (buf->dtb_offset != 0)
3094 				return (buf);
3095 
3096 			new = DTRACESPEC_ACTIVEMANY;
3097 			break;
3098 
3099 		case DTRACESPEC_ACTIVEMANY:
3100 			return (buf);
3101 
3102 		case DTRACESPEC_ACTIVE:
3103 			new = DTRACESPEC_ACTIVEONE;
3104 			break;
3105 
3106 		default:
3107 			ASSERT(0);
3108 		}
3109 	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
3110 	    current, new) != current);
3111 
3112 	ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
3113 	return (buf);
3114 }
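
/*
 * Illustrative summary (not part of the original source) of the speculation
 * state transitions performed by the routines above; the authoritative
 * diagram lives in <sys/dtrace_impl.h>:
 *
 *	INACTIVE -> ACTIVE                      dtrace_speculation()
 *	ACTIVE -> ACTIVEONE -> ACTIVEMANY       dtrace_speculation_buffer()
 *	ACTIVE/ACTIVEONE -> COMMITTING          dtrace_speculation_commit()
 *	ACTIVEONE/ACTIVEMANY -> COMMITTINGMANY  dtrace_speculation_commit()
 *	ACTIVE/ACTIVEONE/ACTIVEMANY -> DISCARDING (or directly to INACTIVE
 *	    when the discarding CPU is the lone speculating CPU)
 *	                                        dtrace_speculation_discard()
 *	COMMITTING -> INACTIVE                  dtrace_speculation_commit()
 *	COMMITTINGMANY/DISCARDING -> INACTIVE   dtrace_speculation_clean()
 */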
3115 
3116 /*
3117  * Return a string.  In the event that the user lacks the privilege to access
3118  * arbitrary kernel memory, we copy the string out to scratch memory so that we
3119  * don't fail access checking.
3120  *
3121  * dtrace_dif_variable() uses this routine as a helper for various
3122  * builtin values such as 'execname' and 'probefunc.'
3123  */
3124 uintptr_t
3125 dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
3126     dtrace_mstate_t *mstate)
3127 {
3128 	uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3129 	uintptr_t ret;
3130 	size_t strsz;
3131 
3132 	/*
3133 	 * The easy case: this probe is allowed to read all of memory, so
3134 	 * we can just return this as a vanilla pointer.
3135 	 */
3136 	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
3137 		return (addr);
3138 
3139 	/*
3140 	 * This is the tougher case: we copy the string in question from
3141 	 * kernel memory into scratch memory and return it that way: this
3142 	 * ensures that we won't trip up when access checking tests the
3143 	 * BYREF return value.
3144 	 */
3145 	strsz = dtrace_strlen((char *)addr, size) + 1;
3146 
3147 	if (mstate->dtms_scratch_ptr + strsz >
3148 	    mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
3149 		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3150 		return (0);
3151 	}
3152 
3153 	dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
3154 	    strsz);
3155 	ret = mstate->dtms_scratch_ptr;
3156 	mstate->dtms_scratch_ptr += strsz;
3157 	return (ret);
3158 }
3159 
3160 /*
3161  * This function implements the DIF emulator's variable lookups.  The emulator
3162  * passes a reserved variable identifier and an optional built-in array index.
3163  */
3164 static uint64_t
3165 dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
3166     uint64_t ndx)
3167 {
3168 	/*
3169 	 * If we're accessing one of the uncached arguments, we'll turn this
3170 	 * into a reference in the args array.
3171 	 */
3172 	if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
3173 		ndx = v - DIF_VAR_ARG0;
3174 		v = DIF_VAR_ARGS;
3175 	}
3176 
3177 	switch (v) {
3178 	case DIF_VAR_ARGS:
3179 		if (!(mstate->dtms_access & DTRACE_ACCESS_ARGS)) {
3180 			cpu_core[CPU->cpu_id].cpuc_dtrace_flags |=
3181 			    CPU_DTRACE_KPRIV;
3182 			return (0);
3183 		}
3184 
3185 		ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
3186 		if (ndx >= sizeof (mstate->dtms_arg) /
3187 		    sizeof (mstate->dtms_arg[0])) {
3188 			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3189 			dtrace_provider_t *pv;
3190 			uint64_t val;
3191 
3192 			pv = mstate->dtms_probe->dtpr_provider;
3193 			if (pv->dtpv_pops.dtps_getargval != NULL)
3194 				val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
3195 				    mstate->dtms_probe->dtpr_id,
3196 				    mstate->dtms_probe->dtpr_arg, ndx, aframes);
3197 			else
3198 				val = dtrace_getarg(ndx, aframes);
3199 
3200 			/*
3201 			 * This is regrettably required to keep the compiler
3202 			 * from tail-optimizing the call to dtrace_getarg().
3203 			 * The condition always evaluates to true, but the
3204 			 * compiler has no way of figuring that out a priori.
3205 			 * (None of this would be necessary if the compiler
3206 			 * could be relied upon to _always_ tail-optimize
3207 			 * the call to dtrace_getarg() -- but it can't.)
3208 			 */
3209 			if (mstate->dtms_probe != NULL)
3210 				return (val);
3211 
3212 			ASSERT(0);
3213 		}
3214 
3215 		return (mstate->dtms_arg[ndx]);
3216 
3217 	case DIF_VAR_UREGS: {
3218 		klwp_t *lwp;
3219 
3220 		if (!dtrace_priv_proc(state, mstate))
3221 			return (0);
3222 
3223 		if ((lwp = curthread->t_lwp) == NULL) {
3224 			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
3225 			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = 0;
3226 			return (0);
3227 		}
3228 
3229 		return (dtrace_getreg(lwp->lwp_regs, ndx));
3230 	}
3231 
3232 	case DIF_VAR_VMREGS: {
3233 		uint64_t rval;
3234 
3235 		if (!dtrace_priv_kernel(state))
3236 			return (0);
3237 
3238 		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3239 
3240 		rval = dtrace_getvmreg(ndx,
3241 		    &cpu_core[CPU->cpu_id].cpuc_dtrace_flags);
3242 
3243 		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3244 
3245 		return (rval);
3246 	}
3247 
3248 	case DIF_VAR_CURTHREAD:
3249 		if (!dtrace_priv_proc(state, mstate))
3250 			return (0);
3251 		return ((uint64_t)(uintptr_t)curthread);
3252 
3253 	case DIF_VAR_TIMESTAMP:
3254 		if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
3255 			mstate->dtms_timestamp = dtrace_gethrtime();
3256 			mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
3257 		}
3258 		return (mstate->dtms_timestamp);
3259 
3260 	case DIF_VAR_VTIMESTAMP:
3261 		ASSERT(dtrace_vtime_references != 0);
3262 		return (curthread->t_dtrace_vtime);
3263 
3264 	case DIF_VAR_WALLTIMESTAMP:
3265 		if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
3266 			mstate->dtms_walltimestamp = dtrace_gethrestime();
3267 			mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
3268 		}
3269 		return (mstate->dtms_walltimestamp);
3270 
3271 	case DIF_VAR_IPL:
3272 		if (!dtrace_priv_kernel(state))
3273 			return (0);
3274 		if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
3275 			mstate->dtms_ipl = dtrace_getipl();
3276 			mstate->dtms_present |= DTRACE_MSTATE_IPL;
3277 		}
3278 		return (mstate->dtms_ipl);
3279 
3280 	case DIF_VAR_EPID:
3281 		ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
3282 		return (mstate->dtms_epid);
3283 
3284 	case DIF_VAR_ID:
3285 		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3286 		return (mstate->dtms_probe->dtpr_id);
3287 
3288 	case DIF_VAR_STACKDEPTH:
3289 		if (!dtrace_priv_kernel(state))
3290 			return (0);
3291 		if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
3292 			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3293 
3294 			mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
3295 			mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
3296 		}
3297 		return (mstate->dtms_stackdepth);
3298 
3299 	case DIF_VAR_USTACKDEPTH:
3300 		if (!dtrace_priv_proc(state, mstate))
3301 			return (0);
3302 		if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
3303 			/*
3304 			 * See comment in DIF_VAR_PID.
3305 			 */
3306 			if (DTRACE_ANCHORED(mstate->dtms_probe) &&
3307 			    CPU_ON_INTR(CPU)) {
3308 				mstate->dtms_ustackdepth = 0;
3309 			} else {
3310 				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3311 				mstate->dtms_ustackdepth =
3312 				    dtrace_getustackdepth();
3313 				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3314 			}
3315 			mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
3316 		}
3317 		return (mstate->dtms_ustackdepth);
3318 
3319 	case DIF_VAR_CALLER:
3320 		if (!dtrace_priv_kernel(state))
3321 			return (0);
3322 		if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
3323 			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3324 
3325 			if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
3326 				/*
3327 				 * If this is an unanchored probe, we are
3328 				 * required to go through the slow path:
3329 				 * dtrace_caller() only guarantees correct
3330 				 * results for anchored probes.
3331 				 */
3332 				pc_t caller[2];
3333 
3334 				dtrace_getpcstack(caller, 2, aframes,
3335 				    (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
3336 				mstate->dtms_caller = caller[1];
3337 			} else if ((mstate->dtms_caller =
3338 			    dtrace_caller(aframes)) == -1) {
3339 				/*
3340 				 * We have failed to do this the quick way;
3341 				 * we must resort to the slower approach of
3342 				 * calling dtrace_getpcstack().
3343 				 */
3344 				pc_t caller;
3345 
3346 				dtrace_getpcstack(&caller, 1, aframes, NULL);
3347 				mstate->dtms_caller = caller;
3348 			}
3349 
3350 			mstate->dtms_present |= DTRACE_MSTATE_CALLER;
3351 		}
3352 		return (mstate->dtms_caller);
3353 
3354 	case DIF_VAR_UCALLER:
3355 		if (!dtrace_priv_proc(state, mstate))
3356 			return (0);
3357 
3358 		if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
3359 			uint64_t ustack[3];
3360 
3361 			/*
3362 			 * dtrace_getupcstack() fills in the first uint64_t
3363 			 * with the current PID.  The second uint64_t will
3364 			 * be the program counter at user-level.  The third
3365 			 * uint64_t will contain the caller, which is what
3366 			 * we're after.
3367 			 */
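			/*
			 * Layout sketch of ustack[], per the description
			 * above:
			 *
			 *	ustack[0]	current PID
			 *	ustack[1]	user-level program counter
			 *	ustack[2]	user-level caller (cached as
			 *			dtms_ucaller below)
			 */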
3368 			ustack[2] = 0;
3369 			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3370 			dtrace_getupcstack(ustack, 3);
3371 			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3372 			mstate->dtms_ucaller = ustack[2];
3373 			mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
3374 		}
3375 
3376 		return (mstate->dtms_ucaller);
3377 
3378 	case DIF_VAR_PROBEPROV:
3379 		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3380 		return (dtrace_dif_varstr(
3381 		    (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
3382 		    state, mstate));
3383 
3384 	case DIF_VAR_PROBEMOD:
3385 		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3386 		return (dtrace_dif_varstr(
3387 		    (uintptr_t)mstate->dtms_probe->dtpr_mod,
3388 		    state, mstate));
3389 
3390 	case DIF_VAR_PROBEFUNC:
3391 		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3392 		return (dtrace_dif_varstr(
3393 		    (uintptr_t)mstate->dtms_probe->dtpr_func,
3394 		    state, mstate));
3395 
3396 	case DIF_VAR_PROBENAME:
3397 		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3398 		return (dtrace_dif_varstr(
3399 		    (uintptr_t)mstate->dtms_probe->dtpr_name,
3400 		    state, mstate));
3401 
3402 	case DIF_VAR_PID:
3403 		if (!dtrace_priv_proc(state, mstate))
3404 			return (0);
3405 
3406 		/*
3407 		 * Note that we are assuming that an unanchored probe is
3408 		 * always due to a high-level interrupt.  (And we're assuming
3409 		 * that there is only a single high-level interrupt.)
3410 		 */
3411 		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3412 			return (pid0.pid_id);
3413 
3414 		/*
3415 		 * It is always safe to dereference one's own t_procp pointer:
3416 		 * it always points to a valid, allocated proc structure.
3417 		 * Further, it is always safe to dereference the p_pidp member
3418 		 * of one's own proc structure.  (These are truisms because
3419 		 * threads and processes don't clean up their own state --
3420 		 * they leave that task to whoever reaps them.)
3421 		 */
3422 		return ((uint64_t)curthread->t_procp->p_pidp->pid_id);
3423 
3424 	case DIF_VAR_PPID:
3425 		if (!dtrace_priv_proc(state, mstate))
3426 			return (0);
3427 
3428 		/*
3429 		 * See comment in DIF_VAR_PID.
3430 		 */
3431 		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3432 			return (pid0.pid_id);
3433 
3434 		/*
3435 		 * It is always safe to dereference one's own t_procp pointer:
3436 		 * it always points to a valid, allocated proc structure.
3437 		 * (This is true because threads don't clean up their own
3438 		 * state -- they leave that task to whoever reaps them.)
3439 		 */
3440 		return ((uint64_t)curthread->t_procp->p_ppid);
3441 
3442 	case DIF_VAR_TID:
3443 		/*
3444 		 * See comment in DIF_VAR_PID.
3445 		 */
3446 		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3447 			return (0);
3448 
3449 		return ((uint64_t)curthread->t_tid);
3450 
3451 	case DIF_VAR_EXECNAME:
3452 		if (!dtrace_priv_proc(state, mstate))
3453 			return (0);
3454 
3455 		/*
3456 		 * See comment in DIF_VAR_PID.
3457 		 */
3458 		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3459 			return ((uint64_t)(uintptr_t)p0.p_user.u_comm);
3460 
3461 		/*
3462 		 * It is always safe to dereference one's own t_procp pointer:
3463 		 * it always points to a valid, allocated proc structure.
3464 		 * (This is true because threads don't clean up their own
3465 		 * state -- they leave that task to whoever reaps them.)
3466 		 */
3467 		return (dtrace_dif_varstr(
3468 		    (uintptr_t)curthread->t_procp->p_user.u_comm,
3469 		    state, mstate));
3470 
3471 	case DIF_VAR_ZONENAME:
3472 		if (!dtrace_priv_proc(state, mstate))
3473 			return (0);
3474 
3475 		/*
3476 		 * See comment in DIF_VAR_PID.
3477 		 */
3478 		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3479 			return ((uint64_t)(uintptr_t)p0.p_zone->zone_name);
3480 
3481 		/*
3482 		 * It is always safe to dereference one's own t_procp pointer:
3483 		 * it always points to a valid, allocated proc structure.
3484 		 * (This is true because threads don't clean up their own
3485 		 * state -- they leave that task to whoever reaps them.)
3486 		 */
3487 		return (dtrace_dif_varstr(
3488 		    (uintptr_t)curthread->t_procp->p_zone->zone_name,
3489 		    state, mstate));
3490 
3491 	case DIF_VAR_UID:
3492 		if (!dtrace_priv_proc(state, mstate))
3493 			return (0);
3494 
3495 		/*
3496 		 * See comment in DIF_VAR_PID.
3497 		 */
3498 		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3499 			return ((uint64_t)p0.p_cred->cr_uid);
3500 
3501 		/*
3502 		 * It is always safe to dereference one's own t_procp pointer:
3503 		 * it always points to a valid, allocated proc structure.
3504 		 * (This is true because threads don't clean up their own
3505 		 * state -- they leave that task to whoever reaps them.)
3506 		 *
3507 		 * Additionally, it is safe to dereference one's own process
3508 		 * credential, since this is never NULL after process birth.
3509 		 */
3510 		return ((uint64_t)curthread->t_procp->p_cred->cr_uid);
3511 
3512 	case DIF_VAR_GID:
3513 		if (!dtrace_priv_proc(state, mstate))
3514 			return (0);
3515 
3516 		/*
3517 		 * See comment in DIF_VAR_PID.
3518 		 */
3519 		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3520 			return ((uint64_t)p0.p_cred->cr_gid);
3521 
3522 		/*
3523 		 * It is always safe to dereference one's own t_procp pointer:
3524 		 * it always points to a valid, allocated proc structure.
3525 		 * (This is true because threads don't clean up their own
3526 		 * state -- they leave that task to whoever reaps them.)
3527 		 *
3528 		 * Additionally, it is safe to dereference one's own process
3529 		 * credential, since this is never NULL after process birth.
3530 		 */
3531 		return ((uint64_t)curthread->t_procp->p_cred->cr_gid);
3532 
3533 	case DIF_VAR_ERRNO: {
3534 		klwp_t *lwp;
3535 		if (!dtrace_priv_proc(state, mstate))
3536 			return (0);
3537 
3538 		/*
3539 		 * See comment in DIF_VAR_PID.
3540 		 */
3541 		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3542 			return (0);
3543 
3544 		/*
3545 		 * It is always safe to dereference one's own t_lwp pointer in
3546 		 * the event that this pointer is non-NULL.  (This is true
3547 		 * because threads and lwps don't clean up their own state --
3548 		 * they leave that task to whoever reaps them.)
3549 		 */
3550 		if ((lwp = curthread->t_lwp) == NULL)
3551 			return (0);
3552 
3553 		return ((uint64_t)lwp->lwp_errno);
3554 	}
3555 
3556 	case DIF_VAR_THREADNAME:
3557 		/*
3558 		 * See comment in DIF_VAR_PID.
3559 		 */
3560 		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3561 			return (0);
3562 
3563 		if (curthread->t_name == NULL)
3564 			return (0);
3565 
3566 		/*
3567 		 * Once set, ->t_name itself is never changed: any updates are
3568 		 * made to the same buffer that we are pointing to.  So we are
3569 		 * safe to dereference it here.
3570 		 */
3571 		return (dtrace_dif_varstr((uintptr_t)curthread->t_name,
3572 		    state, mstate));
3573 
3574 	default:
3575 		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3576 		return (0);
3577 	}
3578 }
3579 
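/*
 * A minimal, self-contained sketch of the compute-once caching pattern that
 * dtrace_dif_variable() uses above for timestamp, walltimestamp, ipl, caller
 * and the stack depths: the value is computed on first reference, a "present"
 * bit is recorded, and later references during the same probe processing
 * return the cached copy so that repeated uses observe a consistent value.
 * (The ex_* names below are hypothetical and purely illustrative.)
 */
#include <stdint.h>

#define	EX_PRESENT_TIMESTAMP	0x01

typedef struct ex_mstate {
	uint32_t	exms_present;		/* bitmask of cached members */
	uint64_t	exms_timestamp;		/* cached high-res timestamp */
} ex_mstate_t;

static uint64_t
ex_dif_timestamp(ex_mstate_t *mstate, uint64_t (*gethrtime_fn)(void))
{
	if (!(mstate->exms_present & EX_PRESENT_TIMESTAMP)) {
		mstate->exms_timestamp = gethrtime_fn();
		mstate->exms_present |= EX_PRESENT_TIMESTAMP;
	}
	return (mstate->exms_timestamp);
}
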
3580 static void
3581 dtrace_dif_variable_write(dtrace_mstate_t *mstate, dtrace_state_t *state,
3582     uint64_t v, uint64_t ndx, uint64_t data)
3583 {
3584 	switch (v) {
3585 	case DIF_VAR_UREGS: {
3586 		klwp_t *lwp;
3587 
3588 		if (dtrace_destructive_disallow ||
3589 		    !dtrace_priv_proc_control(state, mstate)) {
3590 			return;
3591 		}
3592 
3593 		if ((lwp = curthread->t_lwp) == NULL) {
3594 			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
3595 			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = 0;
3596 			return;
3597 		}
3598 
3599 		dtrace_setreg(lwp->lwp_regs, ndx, data);
3600 		return;
3601 	}
3602 
3603 	default:
3604 		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3605 		return;
3606 	}
3607 }
3608 
3609 typedef enum dtrace_json_state {
3610 	DTRACE_JSON_REST = 1,
3611 	DTRACE_JSON_OBJECT,
3612 	DTRACE_JSON_STRING,
3613 	DTRACE_JSON_STRING_ESCAPE,
3614 	DTRACE_JSON_STRING_ESCAPE_UNICODE,
3615 	DTRACE_JSON_COLON,
3616 	DTRACE_JSON_COMMA,
3617 	DTRACE_JSON_VALUE,
3618 	DTRACE_JSON_IDENTIFIER,
3619 	DTRACE_JSON_NUMBER,
3620 	DTRACE_JSON_NUMBER_FRAC,
3621 	DTRACE_JSON_NUMBER_EXP,
3622 	DTRACE_JSON_COLLECT_OBJECT
3623 } dtrace_json_state_t;
3624 
3625 /*
3626  * This function possesses just enough knowledge about JSON to extract a single
3627  * value from a JSON string and store it in the scratch buffer.  It is able
3628  * to extract nested object values and members of arrays by index.
3629  *
3630  * elemlist is a list of JSON keys, stored as packed NUL-terminated strings, to
3631  * be looked up as we descend into the object tree.  e.g.
3632  *
3633  *    foo[0].bar.baz[32] --> "foo" NUL "0" NUL "bar" NUL "baz" NUL "32" NUL
3634  *       with nelems = 5.
3635  *
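 *    As a sketch, the same packing can be written as a C literal by relying
 *    on adjacent string-literal concatenation, e.g.:
 *
 *	const char elemlist[] = "foo\0" "0\0" "bar\0" "baz\0" "32";
 *	int nelems = 5;
 *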
3636  * The run time of this function must be bounded above by strsize to limit the
3637  * amount of work done in probe context.  As such, it is implemented as a
3638  * simple state machine, reading one character at a time using safe loads
3639  * until we find the requested element, hit a parsing error or run off the
3640  * end of the object or string.
3641  *
3642  * As there is no way for a subroutine to return an error without interrupting
3643  * clause execution, we simply return NULL in the event of a missing key or any
3644  * other error condition.  Each NULL return in this function is commented with
3645  * the error condition it represents -- parsing or otherwise.
3646  *
3647  * The set of states for the state machine closely matches the JSON
3648  * specification (http://json.org/).  Briefly:
3649  *
3650  *   DTRACE_JSON_REST:
3651  *     Skip whitespace until we find either a top-level Object, moving
3652  *     to DTRACE_JSON_OBJECT; or an Array, moving to DTRACE_JSON_VALUE.
3653  *
3654  *   DTRACE_JSON_OBJECT:
3655  *     Locate the next key String in an Object.  Sets a flag to denote
3656  *     the next String as a key string and moves to DTRACE_JSON_STRING.
3657  *
3658  *   DTRACE_JSON_COLON:
3659  *     Skip whitespace until we find the colon that separates key Strings
3660  *     from their values.  Once found, move to DTRACE_JSON_VALUE.
3661  *
3662  *   DTRACE_JSON_VALUE:
3663  *     Detects the type of the next value (String, Number, Identifier, Object
3664  *     or Array) and routes to the states that process that type.  Here we also
3665  *     deal with the element selector list if we are requested to traverse down
3666  *     into the object tree.
3667  *
3668  *   DTRACE_JSON_COMMA:
3669