1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2019 Joyent, Inc.
25 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
26 */
27
28/*
29 * DTrace - Dynamic Tracing for Solaris
30 *
31 * This is the implementation of the Solaris Dynamic Tracing framework
32 * (DTrace).  The user-visible interface to DTrace is described at length in
33 * the "Solaris Dynamic Tracing Guide".  The interfaces between the libdtrace
34 * library, the in-kernel DTrace framework, and the DTrace providers are
35 * described in the block comments in the <sys/dtrace.h> header file.  The
36 * internal architecture of DTrace is described in the block comments in the
37 * <sys/dtrace_impl.h> header file.  The comments contained within the DTrace
38 * implementation very much assume mastery of all of these sources; if one has
39 * an unanswered question about the implementation, one should consult them
40 * first.
41 *
42 * The functions here are ordered roughly as follows:
43 *
44 *   - Probe context functions
45 *   - Probe hashing functions
46 *   - Non-probe context utility functions
47 *   - Matching functions
48 *   - Provider-to-Framework API functions
49 *   - Probe management functions
50 *   - DIF object functions
51 *   - Format functions
52 *   - Predicate functions
53 *   - ECB functions
54 *   - Buffer functions
55 *   - Enabling functions
56 *   - DOF functions
57 *   - Anonymous enabling functions
58 *   - Consumer state functions
59 *   - Helper functions
60 *   - Hook functions
61 *   - Driver cookbook functions
62 *
63 * Each group of functions begins with a block comment labelled the "DTrace
64 * [Group] Functions", allowing one to find each block by searching forward
65 * on capital-f functions.
66 */
67#include <sys/errno.h>
68#include <sys/stat.h>
69#include <sys/modctl.h>
70#include <sys/conf.h>
71#include <sys/systm.h>
72#include <sys/ddi.h>
73#include <sys/sunddi.h>
74#include <sys/cpuvar.h>
75#include <sys/kmem.h>
76#include <sys/strsubr.h>
77#include <sys/sysmacros.h>
78#include <sys/dtrace_impl.h>
79#include <sys/atomic.h>
80#include <sys/cmn_err.h>
81#include <sys/mutex_impl.h>
82#include <sys/rwlock_impl.h>
83#include <sys/ctf_api.h>
84#include <sys/panic.h>
85#include <sys/priv_impl.h>
86#include <sys/policy.h>
87#include <sys/cred_impl.h>
88#include <sys/procfs_isa.h>
89#include <sys/taskq.h>
90#include <sys/mkdev.h>
91#include <sys/kdi.h>
92#include <sys/zone.h>
93#include <sys/socket.h>
94#include <netinet/in.h>
95#include "strtolctype.h"
96
97/*
98 * DTrace Tunable Variables
99 *
100 * The following variables may be tuned by adding a line to /etc/system that
101 * includes both the name of the DTrace module ("dtrace") and the name of the
102 * variable.  For example:
103 *
104 *   set dtrace:dtrace_destructive_disallow = 1
105 *
106 * In general, the only variables that one should be tuning this way are those
107 * that affect system-wide DTrace behavior, and for which the default behavior
108 * is undesirable.  Most of these variables are tunable on a per-consumer
109 * basis using DTrace options, and need not be tuned on a system-wide basis.
110 * When tuning these variables, avoid pathological values; while some attempt
111 * is made to verify the integrity of these variables, they are not considered
112 * part of the supported interface to DTrace, and they are therefore not
113 * checked comprehensively.  Further, these variables should not be tuned
114 * dynamically via "mdb -kw" or other means; they should only be tuned via
115 * /etc/system.
116 */
117int		dtrace_destructive_disallow = 0;
118dtrace_optval_t	dtrace_nonroot_maxsize = (16 * 1024 * 1024);
119size_t		dtrace_difo_maxsize = (256 * 1024);
120dtrace_optval_t	dtrace_dof_maxsize = (8 * 1024 * 1024);
121size_t		dtrace_statvar_maxsize = (16 * 1024);
122size_t		dtrace_actions_max = (16 * 1024);
123size_t		dtrace_retain_max = 1024;
124dtrace_optval_t	dtrace_helper_actions_max = 1024;
125dtrace_optval_t	dtrace_helper_providers_max = 32;
126dtrace_optval_t	dtrace_dstate_defsize = (1 * 1024 * 1024);
127size_t		dtrace_strsize_default = 256;
128dtrace_optval_t	dtrace_cleanrate_default = 9900990;		/* 101 hz */
129dtrace_optval_t	dtrace_cleanrate_min = 200000;			/* 5000 hz */
130dtrace_optval_t	dtrace_cleanrate_max = (uint64_t)60 * NANOSEC;	/* 1/minute */
131dtrace_optval_t	dtrace_aggrate_default = NANOSEC;		/* 1 hz */
132dtrace_optval_t	dtrace_statusrate_default = NANOSEC;		/* 1 hz */
133dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC;	 /* 6/minute */
134dtrace_optval_t	dtrace_switchrate_default = NANOSEC;		/* 1 hz */
135dtrace_optval_t	dtrace_nspec_default = 1;
136dtrace_optval_t	dtrace_specsize_default = 32 * 1024;
137dtrace_optval_t dtrace_stackframes_default = 20;
138dtrace_optval_t dtrace_ustackframes_default = 20;
139dtrace_optval_t dtrace_jstackframes_default = 50;
140dtrace_optval_t dtrace_jstackstrsize_default = 512;
141int		dtrace_msgdsize_max = 128;
142hrtime_t	dtrace_chill_max = MSEC2NSEC(500);		/* 500 ms */
143hrtime_t	dtrace_chill_interval = NANOSEC;		/* 1000 ms */
144int		dtrace_devdepth_max = 32;
145int		dtrace_err_verbose;
146hrtime_t	dtrace_deadman_interval = NANOSEC;
147hrtime_t	dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
148hrtime_t	dtrace_deadman_user = (hrtime_t)30 * NANOSEC;
149hrtime_t	dtrace_unregister_defunct_reap = (hrtime_t)60 * NANOSEC;
150
151/*
152 * DTrace External Variables
153 *
154 * As dtrace(7D) is a kernel module, any DTrace variables are obviously
155 * available to DTrace consumers via the backtick (`) syntax.  One of these,
156 * dtrace_zero, is made deliberately so:  it is provided as a source of
157 * well-known, zero-filled memory.  While this variable is not documented,
158 * it is used by some translators as an implementation detail.
159 */
160const char	dtrace_zero[256] = { 0 };	/* zero-filled memory */
161
162/*
163 * DTrace Internal Variables
164 */
165static dev_info_t	*dtrace_devi;		/* device info */
166static vmem_t		*dtrace_arena;		/* probe ID arena */
167static vmem_t		*dtrace_minor;		/* minor number arena */
168static taskq_t		*dtrace_taskq;		/* task queue */
169static dtrace_probe_t	**dtrace_probes;	/* array of all probes */
170static int		dtrace_nprobes;		/* number of probes */
171static dtrace_provider_t *dtrace_provider;	/* provider list */
172static dtrace_meta_t	*dtrace_meta_pid;	/* user-land meta provider */
173static int		dtrace_opens;		/* number of opens */
174static int		dtrace_helpers;		/* number of helpers */
175static int		dtrace_getf;		/* number of unpriv getf()s */
176static void		*dtrace_softstate;	/* softstate pointer */
177static dtrace_hash_t	*dtrace_bymod;		/* probes hashed by module */
178static dtrace_hash_t	*dtrace_byfunc;		/* probes hashed by function */
179static dtrace_hash_t	*dtrace_byname;		/* probes hashed by name */
180static dtrace_toxrange_t *dtrace_toxrange;	/* toxic range array */
181static int		dtrace_toxranges;	/* number of toxic ranges */
182static int		dtrace_toxranges_max;	/* size of toxic range array */
183static dtrace_anon_t	dtrace_anon;		/* anonymous enabling */
184static kmem_cache_t	*dtrace_state_cache;	/* cache for dynamic state */
185static uint64_t		dtrace_vtime_references; /* number of vtimestamp refs */
186static kthread_t	*dtrace_panicked;	/* panicking thread */
187static dtrace_ecb_t	*dtrace_ecb_create_cache; /* cached created ECB */
188static dtrace_genid_t	dtrace_probegen;	/* current probe generation */
189static dtrace_helpers_t *dtrace_deferred_pid;	/* deferred helper list */
190static dtrace_enabling_t *dtrace_retained;	/* list of retained enablings */
191static dtrace_genid_t	dtrace_retained_gen;	/* current retained enab gen */
192static dtrace_dynvar_t	dtrace_dynhash_sink;	/* end of dynamic hash chains */
193static int		dtrace_dynvar_failclean; /* dynvars failed to clean */
194
195/*
196 * DTrace Locking
197 * DTrace is protected by three (relatively coarse-grained) locks:
198 *
199 * (1) dtrace_lock is required to manipulate essentially any DTrace state,
200 *     including enabling state, probes, ECBs, consumer state, helper state,
201 *     etc.  Importantly, dtrace_lock is _not_ required when in probe context;
202 *     probe context is lock-free -- synchronization is handled via the
203 *     dtrace_sync() cross call mechanism.
204 *
205 * (2) dtrace_provider_lock is required when manipulating provider state, or
206 *     when provider state must be held constant.
207 *
208 * (3) dtrace_meta_lock is required when manipulating meta provider state, or
209 *     when meta provider state must be held constant.
210 *
211 * The lock ordering between these three locks is dtrace_meta_lock before
212 * dtrace_provider_lock before dtrace_lock.  (In particular, there are
213 * several places where dtrace_provider_lock is held by the framework as it
214 * calls into the providers -- which then call back into the framework,
215 * grabbing dtrace_lock.)
216 *
217 * There are two other locks in the mix:  mod_lock and cpu_lock.  With respect
218 * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
219 * role as a coarse-grained lock; it is acquired before both of these locks.
220 * With respect to dtrace_meta_lock, its behavior is stranger:  cpu_lock must
221 * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
222 * mod_lock is similar with respect to dtrace_provider_lock in that it must be
223 * acquired _between_ dtrace_provider_lock and dtrace_lock.
224 */
225static kmutex_t		dtrace_lock;		/* probe state lock */
226static kmutex_t		dtrace_provider_lock;	/* provider state lock */
227static kmutex_t		dtrace_meta_lock;	/* meta-provider state lock */
228
229/*
230 * DTrace Provider Variables
231 *
232 * These are the variables relating to DTrace as a provider (that is, the
233 * provider of the BEGIN, END, and ERROR probes).
234 */
235static dtrace_pattr_t	dtrace_provider_attr = {
236{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
237{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
238{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
239{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
240{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
241};
242
243static void
244dtrace_nullop_provide(void *arg __unused,
245    const dtrace_probedesc_t *spec __unused)
246{
247}
248
249static void
250dtrace_nullop_module(void *arg __unused, struct modctl *mp __unused)
251{
252}
253
254static void
255dtrace_nullop(void *arg __unused, dtrace_id_t id __unused, void *parg __unused)
256{
257}
258
259static int
260dtrace_enable_nullop(void *arg __unused, dtrace_id_t id __unused,
261    void *parg __unused)
262{
263	return (0);
264}
265
266static dtrace_pops_t	dtrace_provider_ops = {
267	.dtps_provide = dtrace_nullop_provide,
268	.dtps_provide_module = dtrace_nullop_module,
269	.dtps_enable = dtrace_enable_nullop,
270	.dtps_disable = dtrace_nullop,
271	.dtps_suspend = dtrace_nullop,
272	.dtps_resume = dtrace_nullop,
273	.dtps_getargdesc = NULL,
274	.dtps_getargval = NULL,
275	.dtps_mode = NULL,
276	.dtps_destroy = dtrace_nullop
277};
278
279static dtrace_id_t	dtrace_probeid_begin;	/* special BEGIN probe */
280static dtrace_id_t	dtrace_probeid_end;	/* special END probe */
281dtrace_id_t		dtrace_probeid_error;	/* special ERROR probe */
282
283/*
284 * DTrace Helper Tracing Variables
285 *
286 * These variables should be set dynamically to enable helper tracing.  The
287 * only variables that should be set are dtrace_helptrace_enable (which should
288 * be set to a non-zero value to allocate helper tracing buffers on the next
289 * open of /dev/dtrace) and dtrace_helptrace_disable (which should be set to a
290 * non-zero value to deallocate helper tracing buffers on the next close of
291 * /dev/dtrace).  When (and only when) helper tracing is disabled, the
292 * buffer size may also be set via dtrace_helptrace_bufsize.
293 */
294int			dtrace_helptrace_enable = 0;
295int			dtrace_helptrace_disable = 0;
296int			dtrace_helptrace_bufsize = 16 * 1024 * 1024;
297uint32_t		dtrace_helptrace_nlocals;
298static dtrace_helptrace_t *dtrace_helptrace_buffer;
299static uint32_t		dtrace_helptrace_next = 0;
300static int		dtrace_helptrace_wrapped = 0;
301
302/*
303 * DTrace Error Hashing
304 *
305 * On DEBUG kernels, DTrace will track the errors that has seen in a hash
306 * table.  This is very useful for checking coverage of tests that are
307 * expected to induce DIF or DOF processing errors, and may be useful for
308 * debugging problems in the DIF code generator or in DOF generation .  The
309 * error hash may be examined with the ::dtrace_errhash MDB dcmd.
310 */
311#ifdef DEBUG
312static dtrace_errhash_t	dtrace_errhash[DTRACE_ERRHASHSZ];
313static const char *dtrace_errlast;
314static kthread_t *dtrace_errthread;
315static kmutex_t dtrace_errlock;
316#endif
317
318/*
319 * DTrace Macros and Constants
320 *
321 * These are various macros that are useful in various spots in the
322 * implementation, along with a few random constants that have no meaning
323 * outside of the implementation.  There is no real structure to this cpp
324 * mishmash -- but is there ever?
325 */
326#define	DTRACE_HASHSTR(hash, probe)	\
327	dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs)))
328
329#define	DTRACE_HASHNEXT(hash, probe)	\
330	(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs)
331
332#define	DTRACE_HASHPREV(hash, probe)	\
333	(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs)
334
335#define	DTRACE_HASHEQ(hash, lhs, rhs)	\
336	(strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \
337	    *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)
338
339#define	DTRACE_AGGHASHSIZE_SLEW		17
340
341#define	DTRACE_V4MAPPED_OFFSET		(sizeof (uint32_t) * 3)
342
343/*
344 * The key for a thread-local variable consists of the lower 61 bits of the
345 * t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
346 * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
347 * equal to a variable identifier.  This is necessary (but not sufficient) to
348 * assure that global associative arrays never collide with thread-local
349 * variables.  To guarantee that they cannot collide, we must also define the
350 * order for keying dynamic variables.  That order is:
351 *
352 *   [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
353 *
354 * Because the variable-key and the tls-key are in orthogonal spaces, there is
355 * no way for a global variable key signature to match a thread-local key
356 * signature.
357 */
358#define	DTRACE_TLS_THRKEY(where) { \
359	uint_t intr = 0; \
360	uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \
361	for (; actv; actv >>= 1) \
362		intr++; \
363	ASSERT(intr < (1 << 3)); \
364	(where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \
365	    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
366}
367
368#define	DT_BSWAP_8(x)	((x) & 0xff)
369#define	DT_BSWAP_16(x)	((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
370#define	DT_BSWAP_32(x)	((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
371#define	DT_BSWAP_64(x)	((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))
372
373#define	DT_MASK_LO 0x00000000FFFFFFFFULL
374
375#define	DTRACE_STORE(type, tomax, offset, what) \
376	*((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);
377
378#ifndef __x86
379#define	DTRACE_ALIGNCHECK(addr, size, flags)				\
380	if (addr & (size - 1)) {					\
381		*flags |= CPU_DTRACE_BADALIGN;				\
382		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;	\
383		return (0);						\
384	}
385#else
386#define	DTRACE_ALIGNCHECK(addr, size, flags)
387#endif
388
389/*
390 * Test whether a range of memory starting at testaddr of size testsz falls
391 * within the range of memory described by addr, sz.  We take care to avoid
392 * problems with overflow and underflow of the unsigned quantities, and
393 * disallow all negative sizes.  Ranges of size 0 are allowed.
394 */
395#define	DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
396	((testaddr) - (uintptr_t)(baseaddr) < (basesz) && \
397	(testaddr) + (testsz) - (uintptr_t)(baseaddr) <= (basesz) && \
398	(testaddr) + (testsz) >= (testaddr))
399
400#define	DTRACE_RANGE_REMAIN(remp, addr, baseaddr, basesz)		\
401do {									\
402	if ((remp) != NULL) {						\
403		*(remp) = (uintptr_t)(baseaddr) + (basesz) - (addr);	\
404	}								\
405_NOTE(CONSTCOND) } while (0)
406
407
408/*
409 * Test whether alloc_sz bytes will fit in the scratch region.  We isolate
410 * alloc_sz on the righthand side of the comparison in order to avoid overflow
411 * or underflow in the comparison with it.  This is simpler than the INRANGE
412 * check above, because we know that the dtms_scratch_ptr is valid in the
413 * range.  Allocations of size zero are allowed.
414 */
415#define	DTRACE_INSCRATCH(mstate, alloc_sz) \
416	((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
417	(mstate)->dtms_scratch_ptr >= (alloc_sz))
418
419#define	DTRACE_LOADFUNC(bits)						\
420/*CSTYLED*/								\
421uint##bits##_t								\
422dtrace_load##bits(uintptr_t addr)					\
423{									\
424	size_t size = bits / NBBY;					\
425	/*CSTYLED*/							\
426	uint##bits##_t rval;						\
427	int i;								\
428	volatile uint16_t *flags = (volatile uint16_t *)		\
429	    &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;			\
430									\
431	DTRACE_ALIGNCHECK(addr, size, flags);				\
432									\
433	for (i = 0; i < dtrace_toxranges; i++) {			\
434		if (addr >= dtrace_toxrange[i].dtt_limit)		\
435			continue;					\
436									\
437		if (addr + size <= dtrace_toxrange[i].dtt_base)		\
438			continue;					\
439									\
440		/*							\
441		 * This address falls within a toxic region; return 0.	\
442		 */							\
443		*flags |= CPU_DTRACE_BADADDR;				\
444		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;	\
445		return (0);						\
446	}								\
447									\
448	*flags |= CPU_DTRACE_NOFAULT;					\
449	/*CSTYLED*/							\
450	rval = *((volatile uint##bits##_t *)addr);			\
451	*flags &= ~CPU_DTRACE_NOFAULT;					\
452									\
453	return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0);		\
454}
455
456#ifdef _LP64
457#define	dtrace_loadptr	dtrace_load64
458#else
459#define	dtrace_loadptr	dtrace_load32
460#endif
461
462#define	DTRACE_DYNHASH_FREE	0
463#define	DTRACE_DYNHASH_SINK	1
464#define	DTRACE_DYNHASH_VALID	2
465
466#define	DTRACE_MATCH_FAIL	-1
467#define	DTRACE_MATCH_NEXT	0
468#define	DTRACE_MATCH_DONE	1
469#define	DTRACE_ANCHORED(probe)	((probe)->dtpr_func[0] != '\0')
470#define	DTRACE_STATE_ALIGN	64
471
472#define	DTRACE_FLAGS2FLT(flags)						\
473	(((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR :		\
474	((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP :		\
475	((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO :		\
476	((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV :		\
477	((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV :		\
478	((flags) & CPU_DTRACE_TUPOFLOW) ?  DTRACEFLT_TUPOFLOW :		\
479	((flags) & CPU_DTRACE_BADALIGN) ?  DTRACEFLT_BADALIGN :		\
480	((flags) & CPU_DTRACE_NOSCRATCH) ?  DTRACEFLT_NOSCRATCH :	\
481	((flags) & CPU_DTRACE_BADSTACK) ?  DTRACEFLT_BADSTACK :		\
482	DTRACEFLT_UNKNOWN)
483
484#define	DTRACEACT_ISSTRING(act)						\
485	((act)->dta_kind == DTRACEACT_DIFEXPR &&			\
486	(act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)
487
488static size_t dtrace_strlen(const char *, size_t);
489static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
490static void dtrace_enabling_provide(dtrace_provider_t *);
491static int dtrace_enabling_match(dtrace_enabling_t *, int *);
492static void dtrace_enabling_matchall(void);
493static void dtrace_enabling_reap(void);
494static dtrace_state_t *dtrace_anon_grab(void);
495static uint64_t dtrace_helper(int, dtrace_mstate_t *,
496    dtrace_state_t *, uint64_t, uint64_t);
497static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
498static void dtrace_buffer_drop(dtrace_buffer_t *);
499static int dtrace_buffer_consumed(dtrace_buffer_t *, hrtime_t when);
500static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
501    dtrace_state_t *, dtrace_mstate_t *);
502static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
503    dtrace_optval_t);
504static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
505static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
506static int dtrace_priv_proc(dtrace_state_t *, dtrace_mstate_t *);
507static void dtrace_getf_barrier(void);
508static int dtrace_canload_remains(uint64_t, size_t, size_t *,
509    dtrace_mstate_t *, dtrace_vstate_t *);
510static int dtrace_canstore_remains(uint64_t, size_t, size_t *,
511    dtrace_mstate_t *, dtrace_vstate_t *);
512
513/*
514 * DTrace Probe Context Functions
515 *
516 * These functions are called from probe context.  Because probe context is
517 * any context in which C may be called, arbitrarily locks may be held,
518 * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
519 * As a result, functions called from probe context may only call other DTrace
520 * support functions -- they may not interact at all with the system at large.
521 * (Note that the ASSERT macro is made probe-context safe by redefining it in
522 * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
523 * loads are to be performed from probe context, they _must_ be in terms of
524 * the safe dtrace_load*() variants.
525 *
526 * Some functions in this block are not actually called from probe context;
527 * for these functions, there will be a comment above the function reading
528 * "Note:  not called from probe context."
529 */
530void
531dtrace_panic(const char *format, ...)
532{
533	va_list alist;
534
535	va_start(alist, format);
536	dtrace_vpanic(format, alist);
537	va_end(alist);
538}
539
540int
541dtrace_assfail(const char *a, const char *f, int l)
542{
543	dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l);
544
545	/*
546	 * We just need something here that even the most clever compiler
547	 * cannot optimize away.
548	 */
549	return (a[(uintptr_t)f]);
550}
551
552/*
553 * Atomically increment a specified error counter from probe context.
554 */
555static void
556dtrace_error(uint32_t *counter)
557{
558	/*
559	 * Most counters stored to in probe context are per-CPU counters.
560	 * However, there are some error conditions that are sufficiently
561	 * arcane that they don't merit per-CPU storage.  If these counters
562	 * are incremented concurrently on different CPUs, scalability will be
563	 * adversely affected -- but we don't expect them to be white-hot in a
564	 * correctly constructed enabling...
565	 */
566	uint32_t oval, nval;
567
568	do {
569		oval = *counter;
570
571		if ((nval = oval + 1) == 0) {
572			/*
573			 * If the counter would wrap, set it to 1 -- assuring
574			 * that the counter is never zero when we have seen
575			 * errors.  (The counter must be 32-bits because we
576			 * aren't guaranteed a 64-bit compare&swap operation.)
577			 * To save this code both the infamy of being fingered
578			 * by a priggish news story and the indignity of being
579			 * the target of a neo-puritan witch trial, we're
580			 * carefully avoiding any colorful description of the
581			 * likelihood of this condition -- but suffice it to
582			 * say that it is only slightly more likely than the
583			 * overflow of predicate cache IDs, as discussed in
584			 * dtrace_predicate_create().
585			 */
586			nval = 1;
587		}
588	} while (dtrace_cas32(counter, oval, nval) != oval);
589}
590
591/*
592 * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
593 * uint8_t, a uint16_t, a uint32_t and a uint64_t.
594 */
595/* BEGIN CSTYLED */
596DTRACE_LOADFUNC(8)
597DTRACE_LOADFUNC(16)
598DTRACE_LOADFUNC(32)
599DTRACE_LOADFUNC(64)
600/* END CSTYLED */
601
602static int
603dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
604{
605	if (dest < mstate->dtms_scratch_base)
606		return (0);
607
608	if (dest + size < dest)
609		return (0);
610
611	if (dest + size > mstate->dtms_scratch_ptr)
612		return (0);
613
614	return (1);
615}
616
617static int
618dtrace_canstore_statvar(uint64_t addr, size_t sz, size_t *remain,
619    dtrace_statvar_t **svars, int nsvars)
620{
621	int i;
622	size_t maxglobalsize, maxlocalsize;
623
624	if (nsvars == 0)
625		return (0);
626
627	maxglobalsize = dtrace_statvar_maxsize + sizeof (uint64_t);
628	maxlocalsize = maxglobalsize * NCPU;
629
630	for (i = 0; i < nsvars; i++) {
631		dtrace_statvar_t *svar = svars[i];
632		uint8_t scope;
633		size_t size;
634
635		if (svar == NULL || (size = svar->dtsv_size) == 0)
636			continue;
637
638		scope = svar->dtsv_var.dtdv_scope;
639
640		/*
641		 * We verify that our size is valid in the spirit of providing
642		 * defense in depth:  we want to prevent attackers from using
643		 * DTrace to escalate an orthogonal kernel heap corruption bug
644		 * into the ability to store to arbitrary locations in memory.
645		 */
646		VERIFY((scope == DIFV_SCOPE_GLOBAL && size <= maxglobalsize) ||
647		    (scope == DIFV_SCOPE_LOCAL && size <= maxlocalsize));
648
649		if (DTRACE_INRANGE(addr, sz, svar->dtsv_data,
650		    svar->dtsv_size)) {
651			DTRACE_RANGE_REMAIN(remain, addr, svar->dtsv_data,
652			    svar->dtsv_size);
653			return (1);
654		}
655	}
656
657	return (0);
658}
659
660/*
661 * Check to see if the address is within a memory region to which a store may
662 * be issued.  This includes the DTrace scratch areas, and any DTrace variable
663 * region.  The caller of dtrace_canstore() is responsible for performing any
664 * alignment checks that are needed before stores are actually executed.
665 */
666static int
667dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
668    dtrace_vstate_t *vstate)
669{
670	return (dtrace_canstore_remains(addr, sz, NULL, mstate, vstate));
671}
672
673/*
674 * Implementation of dtrace_canstore which communicates the upper bound of the
675 * allowed memory region.
676 */
677static int
678dtrace_canstore_remains(uint64_t addr, size_t sz, size_t *remain,
679    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
680{
681	/*
682	 * First, check to see if the address is in scratch space...
683	 */
684	if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
685	    mstate->dtms_scratch_size)) {
686		DTRACE_RANGE_REMAIN(remain, addr, mstate->dtms_scratch_base,
687		    mstate->dtms_scratch_size);
688		return (1);
689	}
690
691	/*
692	 * Now check to see if it's a dynamic variable.  This check will pick
693	 * up both thread-local variables and any global dynamically-allocated
694	 * variables.
695	 */
696	if (DTRACE_INRANGE(addr, sz, vstate->dtvs_dynvars.dtds_base,
697	    vstate->dtvs_dynvars.dtds_size)) {
698		dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
699		uintptr_t base = (uintptr_t)dstate->dtds_base +
700		    (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
701		uintptr_t chunkoffs;
702		dtrace_dynvar_t *dvar;
703
704		/*
705		 * Before we assume that we can store here, we need to make
706		 * sure that it isn't in our metadata -- storing to our
707		 * dynamic variable metadata would corrupt our state.  For
708		 * the range to not include any dynamic variable metadata,
709		 * it must:
710		 *
711		 *	(1) Start above the hash table that is at the base of
712		 *	the dynamic variable space
713		 *
714		 *	(2) Have a starting chunk offset that is beyond the
715		 *	dtrace_dynvar_t that is at the base of every chunk
716		 *
717		 *	(3) Not span a chunk boundary
718		 *
719		 *	(4) Not be in the tuple space of a dynamic variable
720		 *
721		 */
722		if (addr < base)
723			return (0);
724
725		chunkoffs = (addr - base) % dstate->dtds_chunksize;
726
727		if (chunkoffs < sizeof (dtrace_dynvar_t))
728			return (0);
729
730		if (chunkoffs + sz > dstate->dtds_chunksize)
731			return (0);
732
733		dvar = (dtrace_dynvar_t *)((uintptr_t)addr - chunkoffs);
734
735		if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE)
736			return (0);
737
738		if (chunkoffs < sizeof (dtrace_dynvar_t) +
739		    ((dvar->dtdv_tuple.dtt_nkeys - 1) * sizeof (dtrace_key_t)))
740			return (0);
741
742		DTRACE_RANGE_REMAIN(remain, addr, dvar, dstate->dtds_chunksize);
743		return (1);
744	}
745
746	/*
747	 * Finally, check the static local and global variables.  These checks
748	 * take the longest, so we perform them last.
749	 */
750	if (dtrace_canstore_statvar(addr, sz, remain,
751	    vstate->dtvs_locals, vstate->dtvs_nlocals))
752		return (1);
753
754	if (dtrace_canstore_statvar(addr, sz, remain,
755	    vstate->dtvs_globals, vstate->dtvs_nglobals))
756		return (1);
757
758	return (0);
759}
760
761
762/*
763 * Convenience routine to check to see if the address is within a memory
764 * region in which a load may be issued given the user's privilege level;
765 * if not, it sets the appropriate error flags and loads 'addr' into the
766 * illegal value slot.
767 *
768 * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
769 * appropriate memory access protection.
770 */
771static int
772dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
773    dtrace_vstate_t *vstate)
774{
775	return (dtrace_canload_remains(addr, sz, NULL, mstate, vstate));
776}
777
778/*
779 * Implementation of dtrace_canload which communicates the upper bound of the
780 * allowed memory region.
781 */
782static int
783dtrace_canload_remains(uint64_t addr, size_t sz, size_t *remain,
784    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
785{
786	volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
787	file_t *fp;
788
789	/*
790	 * If we hold the privilege to read from kernel memory, then
791	 * everything is readable.
792	 */
793	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
794		DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
795		return (1);
796	}
797
798	/*
799	 * You can obviously read that which you can store.
800	 */
801	if (dtrace_canstore_remains(addr, sz, remain, mstate, vstate))
802		return (1);
803
804	/*
805	 * We're allowed to read from our own string table.
806	 */
807	if (DTRACE_INRANGE(addr, sz, mstate->dtms_difo->dtdo_strtab,
808	    mstate->dtms_difo->dtdo_strlen)) {
809		DTRACE_RANGE_REMAIN(remain, addr,
810		    mstate->dtms_difo->dtdo_strtab,
811		    mstate->dtms_difo->dtdo_strlen);
812		return (1);
813	}
814
815	if (vstate->dtvs_state != NULL &&
816	    dtrace_priv_proc(vstate->dtvs_state, mstate)) {
817		proc_t *p;
818
819		/*
820		 * When we have privileges to the current process, there are
821		 * several context-related kernel structures that are safe to
822		 * read, even absent the privilege to read from kernel memory.
823		 * These reads are safe because these structures contain only
824		 * state that (1) we're permitted to read, (2) is harmless or
825		 * (3) contains pointers to additional kernel state that we're
826		 * not permitted to read (and as such, do not present an
827		 * opportunity for privilege escalation).  Finally (and
828		 * critically), because of the nature of their relation with
829		 * the current thread context, the memory associated with these
830		 * structures cannot change over the duration of probe context,
831		 * and it is therefore impossible for this memory to be
832		 * deallocated and reallocated as something else while it's
833		 * being operated upon.
834		 */
835		if (DTRACE_INRANGE(addr, sz, curthread, sizeof (kthread_t))) {
836			DTRACE_RANGE_REMAIN(remain, addr, curthread,
837			    sizeof (kthread_t));
838			return (1);
839		}
840
841		if ((p = curthread->t_procp) != NULL && DTRACE_INRANGE(addr,
842		    sz, curthread->t_procp, sizeof (proc_t))) {
843			DTRACE_RANGE_REMAIN(remain, addr, curthread->t_procp,
844			    sizeof (proc_t));
845			return (1);
846		}
847
848		if (curthread->t_cred != NULL && DTRACE_INRANGE(addr, sz,
849		    curthread->t_cred, sizeof (cred_t))) {
850			DTRACE_RANGE_REMAIN(remain, addr, curthread->t_cred,
851			    sizeof (cred_t));
852			return (1);
853		}
854
855		if (p != NULL && p->p_pidp != NULL && DTRACE_INRANGE(addr, sz,
856		    &(p->p_pidp->pid_id), sizeof (pid_t))) {
857			DTRACE_RANGE_REMAIN(remain, addr, &(p->p_pidp->pid_id),
858			    sizeof (pid_t));
859			return (1);
860		}
861
862		if (curthread->t_cpu != NULL && DTRACE_INRANGE(addr, sz,
863		    curthread->t_cpu, offsetof(cpu_t, cpu_pause_thread))) {
864			DTRACE_RANGE_REMAIN(remain, addr, curthread->t_cpu,
865			    offsetof(cpu_t, cpu_pause_thread));
866			return (1);
867		}
868	}
869
870	if ((fp = mstate->dtms_getf) != NULL) {
871		uintptr_t psz = sizeof (void *);
872		vnode_t *vp;
873		vnodeops_t *op;
874
875		/*
876		 * When getf() returns a file_t, the enabling is implicitly
877		 * granted the (transient) right to read the returned file_t
878		 * as well as the v_path and v_op->vnop_name of the underlying
879		 * vnode.  These accesses are allowed after a successful
880		 * getf() because the members that they refer to cannot change
881		 * once set -- and the barrier logic in the kernel's closef()
882		 * path assures that the file_t and its referenced vode_t
883		 * cannot themselves be stale (that is, it impossible for
884		 * either dtms_getf itself or its f_vnode member to reference
885		 * freed memory).
886		 */
887		if (DTRACE_INRANGE(addr, sz, fp, sizeof (file_t))) {
888			DTRACE_RANGE_REMAIN(remain, addr, fp, sizeof (file_t));
889			return (1);
890		}
891
892		if ((vp = fp->f_vnode) != NULL) {
893			size_t slen;
894
895			if (DTRACE_INRANGE(addr, sz, &vp->v_path, psz)) {
896				DTRACE_RANGE_REMAIN(remain, addr, &vp->v_path,
897				    psz);
898				return (1);
899			}
900
901			slen = strlen(vp->v_path) + 1;
902			if (DTRACE_INRANGE(addr, sz, vp->v_path, slen)) {
903				DTRACE_RANGE_REMAIN(remain, addr, vp->v_path,
904				    slen);
905				return (1);
906			}
907
908			if (DTRACE_INRANGE(addr, sz, &vp->v_op, psz)) {
909				DTRACE_RANGE_REMAIN(remain, addr, &vp->v_op,
910				    psz);
911				return (1);
912			}
913
914			if ((op = vp->v_op) != NULL &&
915			    DTRACE_INRANGE(addr, sz, &op->vnop_name, psz)) {
916				DTRACE_RANGE_REMAIN(remain, addr,
917				    &op->vnop_name, psz);
918				return (1);
919			}
920
921			if (op != NULL && op->vnop_name != NULL &&
922			    DTRACE_INRANGE(addr, sz, op->vnop_name,
923			    (slen = strlen(op->vnop_name) + 1))) {
924				DTRACE_RANGE_REMAIN(remain, addr,
925				    op->vnop_name, slen);
926				return (1);
927			}
928		}
929	}
930
931	DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
932	*illval = addr;
933	return (0);
934}
935
936/*
937 * Convenience routine to check to see if a given string is within a memory
938 * region in which a load may be issued given the user's privilege level;
939 * this exists so that we don't need to issue unnecessary dtrace_strlen()
940 * calls in the event that the user has all privileges.
941 */
942static int
943dtrace_strcanload(uint64_t addr, size_t sz, size_t *remain,
944    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
945{
946	size_t rsize;
947
948	/*
949	 * If we hold the privilege to read from kernel memory, then
950	 * everything is readable.
951	 */
952	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
953		DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
954		return (1);
955	}
956
957	/*
958	 * Even if the caller is uninterested in querying the remaining valid
959	 * range, it is required to ensure that the access is allowed.
960	 */
961	if (remain == NULL) {
962		remain = &rsize;
963	}
964	if (dtrace_canload_remains(addr, 0, remain, mstate, vstate)) {
965		size_t strsz;
966		/*
967		 * Perform the strlen after determining the length of the
968		 * memory region which is accessible.  This prevents timing
969		 * information from being used to find NULs in memory which is
970		 * not accessible to the caller.
971		 */
972		strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr,
973		    MIN(sz, *remain));
974		if (strsz <= *remain) {
975			return (1);
976		}
977	}
978
979	return (0);
980}
981
982/*
983 * Convenience routine to check to see if a given variable is within a memory
984 * region in which a load may be issued given the user's privilege level.
985 */
986static int
987dtrace_vcanload(void *src, dtrace_diftype_t *type, size_t *remain,
988    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
989{
990	size_t sz;
991	ASSERT(type->dtdt_flags & DIF_TF_BYREF);
992
993	/*
994	 * Calculate the max size before performing any checks since even
995	 * DTRACE_ACCESS_KERNEL-credentialed callers expect that this function
996	 * return the max length via 'remain'.
997	 */
998	if (type->dtdt_kind == DIF_TYPE_STRING) {
999		dtrace_state_t *state = vstate->dtvs_state;
1000
1001		if (state != NULL) {
1002			sz = state->dts_options[DTRACEOPT_STRSIZE];
1003		} else {
1004			/*
1005			 * In helper context, we have a NULL state; fall back
1006			 * to using the system-wide default for the string size
1007			 * in this case.
1008			 */
1009			sz = dtrace_strsize_default;
1010		}
1011	} else {
1012		sz = type->dtdt_size;
1013	}
1014
1015	/*
1016	 * If we hold the privilege to read from kernel memory, then
1017	 * everything is readable.
1018	 */
1019	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
1020		DTRACE_RANGE_REMAIN(remain, (uintptr_t)src, src, sz);
1021		return (1);
1022	}
1023
1024	if (type->dtdt_kind == DIF_TYPE_STRING) {
1025		return (dtrace_strcanload((uintptr_t)src, sz, remain, mstate,
1026		    vstate));
1027	}
1028	return (dtrace_canload_remains((uintptr_t)src, sz, remain, mstate,
1029	    vstate));
1030}
1031
1032/*
1033 * Convert a string to a signed integer using safe loads.
1034 *
1035 * NOTE: This function uses various macros from strtolctype.h to manipulate
1036 * digit values, etc -- these have all been checked to ensure they make
1037 * no additional function calls.
1038 */
1039static int64_t
1040dtrace_strtoll(char *input, int base, size_t limit)
1041{
1042	uintptr_t pos = (uintptr_t)input;
1043	int64_t val = 0;
1044	int x;
1045	boolean_t neg = B_FALSE;
1046	char c, cc, ccc;
1047	uintptr_t end = pos + limit;
1048
1049	/*
1050	 * Consume any whitespace preceding digits.
1051	 */
1052	while ((c = dtrace_load8(pos)) == ' ' || c == '\t')
1053		pos++;
1054
1055	/*
1056	 * Handle an explicit sign if one is present.
1057	 */
1058	if (c == '-' || c == '+') {
1059		if (c == '-')
1060			neg = B_TRUE;
1061		c = dtrace_load8(++pos);
1062	}
1063
1064	/*
1065	 * Check for an explicit hexadecimal prefix ("0x" or "0X") and skip it
1066	 * if present.
1067	 */
1068	if (base == 16 && c == '0' && ((cc = dtrace_load8(pos + 1)) == 'x' ||
1069	    cc == 'X') && isxdigit(ccc = dtrace_load8(pos + 2))) {
1070		pos += 2;
1071		c = ccc;
1072	}
1073
1074	/*
1075	 * Read in contiguous digits until the first non-digit character.
1076	 */
1077	for (; pos < end && c != '\0' && lisalnum(c) && (x = DIGIT(c)) < base;
1078	    c = dtrace_load8(++pos))
1079		val = val * base + x;
1080
1081	return (neg ? -val : val);
1082}
1083
1084/*
1085 * Compare two strings using safe loads.
1086 */
1087static int
1088dtrace_strncmp(char *s1, char *s2, size_t limit)
1089{
1090	uint8_t c1, c2;
1091	volatile uint16_t *flags;
1092
1093	if (s1 == s2 || limit == 0)
1094		return (0);
1095
1096	flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
1097
1098	do {
1099		if (s1 == NULL) {
1100			c1 = '\0';
1101		} else {
1102			c1 = dtrace_load8((uintptr_t)s1++);
1103		}
1104
1105		if (s2 == NULL) {
1106			c2 = '\0';
1107		} else {
1108			c2 = dtrace_load8((uintptr_t)s2++);
1109		}
1110
1111		if (c1 != c2)
1112			return (c1 - c2);
1113	} while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));
1114
1115	return (0);
1116}
1117
1118/*
1119 * Compute strlen(s) for a string using safe memory accesses.  The additional
1120 * len parameter is used to specify a maximum length to ensure completion.
1121 */
1122static size_t
1123dtrace_strlen(const char *s, size_t lim)
1124{
1125	uint_t len;
1126
1127	for (len = 0; len != lim; len++) {
1128		if (dtrace_load8((uintptr_t)s++) == '\0')
1129			break;
1130	}
1131
1132	return (len);
1133}
1134
1135/*
1136 * Check if an address falls within a toxic region.
1137 */
1138static int
1139dtrace_istoxic(uintptr_t kaddr, size_t size)
1140{
1141	uintptr_t taddr, tsize;
1142	int i;
1143
1144	for (i = 0; i < dtrace_toxranges; i++) {
1145		taddr = dtrace_toxrange[i].dtt_base;
1146		tsize = dtrace_toxrange[i].dtt_limit - taddr;
1147
1148		if (kaddr - taddr < tsize) {
1149			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1150			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = kaddr;
1151			return (1);
1152		}
1153
1154		if (taddr - kaddr < size) {
1155			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1156			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = taddr;
1157			return (1);
1158		}
1159	}
1160
1161	return (0);
1162}
1163
1164/*
1165 * Copy src to dst using safe memory accesses.  The src is assumed to be unsafe
1166 * memory specified by the DIF program.  The dst is assumed to be safe memory
1167 * that we can store to directly because it is managed by DTrace.  As with
1168 * standard bcopy, overlapping copies are handled properly.
1169 */
1170static void
1171dtrace_bcopy(const void *src, void *dst, size_t len)
1172{
1173	if (len != 0) {
1174		uint8_t *s1 = dst;
1175		const uint8_t *s2 = src;
1176
1177		if (s1 <= s2) {
1178			do {
1179				*s1++ = dtrace_load8((uintptr_t)s2++);
1180			} while (--len != 0);
1181		} else {
1182			s2 += len;
1183			s1 += len;
1184
1185			do {
1186				*--s1 = dtrace_load8((uintptr_t)--s2);
1187			} while (--len != 0);
1188		}
1189	}
1190}
1191
1192/*
1193 * Copy src to dst using safe memory accesses, up to either the specified
1194 * length, or the point that a nul byte is encountered.  The src is assumed to
1195 * be unsafe memory specified by the DIF program.  The dst is assumed to be
1196 * safe memory that we can store to directly because it is managed by DTrace.
1197 * Unlike dtrace_bcopy(), overlapping regions are not handled.
1198 */
1199static void
1200dtrace_strcpy(const void *src, void *dst, size_t len)
1201{
1202	if (len != 0) {
1203		uint8_t *s1 = dst, c;
1204		const uint8_t *s2 = src;
1205
1206		do {
1207			*s1++ = c = dtrace_load8((uintptr_t)s2++);
1208		} while (--len != 0 && c != '\0');
1209	}
1210}
1211
1212/*
1213 * Copy src to dst, deriving the size and type from the specified (BYREF)
1214 * variable type.  The src is assumed to be unsafe memory specified by the DIF
1215 * program.  The dst is assumed to be DTrace variable memory that is of the
1216 * specified type; we assume that we can store to directly.
1217 */
1218static void
1219dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type, size_t limit)
1220{
1221	ASSERT(type->dtdt_flags & DIF_TF_BYREF);
1222
1223	if (type->dtdt_kind == DIF_TYPE_STRING) {
1224		dtrace_strcpy(src, dst, MIN(type->dtdt_size, limit));
1225	} else {
1226		dtrace_bcopy(src, dst, MIN(type->dtdt_size, limit));
1227	}
1228}
1229
1230/*
1231 * Compare s1 to s2 using safe memory accesses.  The s1 data is assumed to be
1232 * unsafe memory specified by the DIF program.  The s2 data is assumed to be
1233 * safe memory that we can access directly because it is managed by DTrace.
1234 */
1235static int
1236dtrace_bcmp(const void *s1, const void *s2, size_t len)
1237{
1238	volatile uint16_t *flags;
1239
1240	flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
1241
1242	if (s1 == s2)
1243		return (0);
1244
1245	if (s1 == NULL || s2 == NULL)
1246		return (1);
1247
1248	if (s1 != s2 && len != 0) {
1249		const uint8_t *ps1 = s1;
1250		const uint8_t *ps2 = s2;
1251
1252		do {
1253			if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
1254				return (1);
1255		} while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
1256	}
1257	return (0);
1258}
1259
1260/*
1261 * Zero the specified region using a simple byte-by-byte loop.  Note that this
1262 * is for safe DTrace-managed memory only.
1263 */
1264static void
1265dtrace_bzero(void *dst, size_t len)
1266{
1267	uchar_t *cp;
1268
1269	for (cp = dst; len != 0; len--)
1270		*cp++ = 0;
1271}
1272
1273static void
1274dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
1275{
1276	uint64_t result[2];
1277
1278	result[0] = addend1[0] + addend2[0];
1279	result[1] = addend1[1] + addend2[1] +
1280	    (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);
1281
1282	sum[0] = result[0];
1283	sum[1] = result[1];
1284}
1285
1286/*
1287 * Shift the 128-bit value in a by b. If b is positive, shift left.
1288 * If b is negative, shift right.
1289 */
1290static void
1291dtrace_shift_128(uint64_t *a, int b)
1292{
1293	uint64_t mask;
1294
1295	if (b == 0)
1296		return;
1297
1298	if (b < 0) {
1299		b = -b;
1300		if (b >= 64) {
1301			a[0] = a[1] >> (b - 64);
1302			a[1] = 0;
1303		} else {
1304			a[0] >>= b;
1305			mask = 1LL << (64 - b);
1306			mask -= 1;
1307			a[0] |= ((a[1] & mask) << (64 - b));
1308			a[1] >>= b;
1309		}
1310	} else {
1311		if (b >= 64) {
1312			a[1] = a[0] << (b - 64);
1313			a[0] = 0;
1314		} else {
1315			a[1] <<= b;
1316			mask = a[0] >> (64 - b);
1317			a[1] |= mask;
1318			a[0] <<= b;
1319		}
1320	}
1321}
1322
1323/*
1324 * The basic idea is to break the 2 64-bit values into 4 32-bit values,
1325 * use native multiplication on those, and then re-combine into the
1326 * resulting 128-bit value.
1327 *
1328 * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
1329 *     hi1 * hi2 << 64 +
1330 *     hi1 * lo2 << 32 +
1331 *     hi2 * lo1 << 32 +
1332 *     lo1 * lo2
1333 */
1334static void
1335dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
1336{
1337	uint64_t hi1, hi2, lo1, lo2;
1338	uint64_t tmp[2];
1339
1340	hi1 = factor1 >> 32;
1341	hi2 = factor2 >> 32;
1342
1343	lo1 = factor1 & DT_MASK_LO;
1344	lo2 = factor2 & DT_MASK_LO;
1345
1346	product[0] = lo1 * lo2;
1347	product[1] = hi1 * hi2;
1348
1349	tmp[0] = hi1 * lo2;
1350	tmp[1] = 0;
1351	dtrace_shift_128(tmp, 32);
1352	dtrace_add_128(product, tmp, product);
1353
1354	tmp[0] = hi2 * lo1;
1355	tmp[1] = 0;
1356	dtrace_shift_128(tmp, 32);
1357	dtrace_add_128(product, tmp, product);
1358}
1359
1360/*
1361 * This privilege check should be used by actions and subroutines to
1362 * verify that the user credentials of the process that enabled the
1363 * invoking ECB match the target credentials
1364 */
1365static int
1366dtrace_priv_proc_common_user(dtrace_state_t *state)
1367{
1368	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1369
1370	/*
1371	 * We should always have a non-NULL state cred here, since if cred
1372	 * is null (anonymous tracing), we fast-path bypass this routine.
1373	 */
1374	ASSERT(s_cr != NULL);
1375
1376	if ((cr = CRED()) != NULL &&
1377	    s_cr->cr_uid == cr->cr_uid &&
1378	    s_cr->cr_uid == cr->cr_ruid &&
1379	    s_cr->cr_uid == cr->cr_suid &&
1380	    s_cr->cr_gid == cr->cr_gid &&
1381	    s_cr->cr_gid == cr->cr_rgid &&
1382	    s_cr->cr_gid == cr->cr_sgid)
1383		return (1);
1384
1385	return (0);
1386}
1387
1388/*
1389 * This privilege check should be used by actions and subroutines to
1390 * verify that the zone of the process that enabled the invoking ECB
1391 * matches the target credentials
1392 */
1393static int
1394dtrace_priv_proc_common_zone(dtrace_state_t *state)
1395{
1396	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1397
1398	/*
1399	 * We should always have a non-NULL state cred here, since if cred
1400	 * is null (anonymous tracing), we fast-path bypass this routine.
1401	 */
1402	ASSERT(s_cr != NULL);
1403
1404	if ((cr = CRED()) != NULL && s_cr->cr_zone == cr->cr_zone)
1405		return (1);
1406
1407	return (0);
1408}
1409
1410/*
1411 * This privilege check should be used by actions and subroutines to
1412 * verify that the process has not setuid or changed credentials.
1413 */
1414static int
1415dtrace_priv_proc_common_nocd()
1416{
1417	proc_t *proc;
1418
1419	if ((proc = ttoproc(curthread)) != NULL &&
1420	    !(proc->p_flag & SNOCD))
1421		return (1);
1422
1423	return (0);
1424}
1425
1426static int
1427dtrace_priv_proc_destructive(dtrace_state_t *state, dtrace_mstate_t *mstate)
1428{
1429	int action = state->dts_cred.dcr_action;
1430
1431	if (!(mstate->dtms_access & DTRACE_ACCESS_PROC))
1432		goto bad;
1433
1434	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
1435	    dtrace_priv_proc_common_zone(state) == 0)
1436		goto bad;
1437
1438	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
1439	    dtrace_priv_proc_common_user(state) == 0)
1440		goto bad;
1441
1442	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
1443	    dtrace_priv_proc_common_nocd() == 0)
1444		goto bad;
1445
1446	return (1);
1447
1448bad:
1449	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1450
1451	return (0);
1452}
1453
1454static int
1455dtrace_priv_proc_control(dtrace_state_t *state, dtrace_mstate_t *mstate)
1456{
1457	if (mstate->dtms_access & DTRACE_ACCESS_PROC) {
1458		if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
1459			return (1);
1460
1461		if (dtrace_priv_proc_common_zone(state) &&
1462		    dtrace_priv_proc_common_user(state) &&
1463		    dtrace_priv_proc_common_nocd())
1464			return (1);
1465	}
1466
1467	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1468
1469	return (0);
1470}
1471
1472static int
1473dtrace_priv_proc(dtrace_state_t *state, dtrace_mstate_t *mstate)
1474{
1475	if ((mstate->dtms_access & DTRACE_ACCESS_PROC) &&
1476	    (state->dts_cred.dcr_action & DTRACE_CRA_PROC))
1477		return (1);
1478
1479	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1480
1481	return (0);
1482}
1483
1484static int
1485dtrace_priv_kernel(dtrace_state_t *state)
1486{
1487	if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
1488		return (1);
1489
1490	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1491
1492	return (0);
1493}
1494
1495static int
1496dtrace_priv_kernel_destructive(dtrace_state_t *state)
1497{
1498	if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
1499		return (1);
1500
1501	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1502
1503	return (0);
1504}
1505
1506/*
1507 * Determine if the dte_cond of the specified ECB allows for processing of
1508 * the current probe to continue.  Note that this routine may allow continued
1509 * processing, but with access(es) stripped from the mstate's dtms_access
1510 * field.
1511 */
1512static int
1513dtrace_priv_probe(dtrace_state_t *state, dtrace_mstate_t *mstate,
1514    dtrace_ecb_t *ecb)
1515{
1516	dtrace_probe_t *probe = ecb->dte_probe;
1517	dtrace_provider_t *prov = probe->dtpr_provider;
1518	dtrace_pops_t *pops = &prov->dtpv_pops;
1519	int mode = DTRACE_MODE_NOPRIV_DROP;
1520
1521	ASSERT(ecb->dte_cond);
1522
1523	if (pops->dtps_mode != NULL) {
1524		mode = pops->dtps_mode(prov->dtpv_arg,
1525		    probe->dtpr_id, probe->dtpr_arg);
1526
1527		ASSERT(mode & (DTRACE_MODE_USER | DTRACE_MODE_KERNEL));
1528		ASSERT(mode & (DTRACE_MODE_NOPRIV_RESTRICT |
1529		    DTRACE_MODE_NOPRIV_DROP));
1530	}
1531
1532	/*
1533	 * If the dte_cond bits indicate that this consumer is only allowed to
1534	 * see user-mode firings of this probe, check that the probe was fired
1535	 * while in a user context.  If that's not the case, use the policy
1536	 * specified by the provider to determine if we drop the probe or
1537	 * merely restrict operation.
1538	 */
1539	if (ecb->dte_cond & DTRACE_COND_USERMODE) {
1540		ASSERT(mode != DTRACE_MODE_NOPRIV_DROP);
1541
1542		if (!(mode & DTRACE_MODE_USER)) {
1543			if (mode & DTRACE_MODE_NOPRIV_DROP)
1544				return (0);
1545
1546			mstate->dtms_access &= ~DTRACE_ACCESS_ARGS;
1547		}
1548	}
1549
1550	/*
1551	 * This is more subtle than it looks. We have to be absolutely certain
1552	 * that CRED() isn't going to change out from under us so it's only
1553	 * legit to examine that structure if we're in constrained situations.
1554	 * Currently, the only times we'll this check is if a non-super-user
1555	 * has enabled the profile or syscall providers -- providers that
1556	 * allow visibility of all processes. For the profile case, the check
1557	 * above will ensure that we're examining a user context.
1558	 */
1559	if (ecb->dte_cond & DTRACE_COND_OWNER) {
1560		cred_t *cr;
1561		cred_t *s_cr = state->dts_cred.dcr_cred;
1562		proc_t *proc;
1563
1564		ASSERT(s_cr != NULL);
1565
1566		if ((cr = CRED()) == NULL ||
1567		    s_cr->cr_uid != cr->cr_uid ||
1568		    s_cr->cr_uid != cr->cr_ruid ||
1569		    s_cr->cr_uid != cr->cr_suid ||
1570		    s_cr->cr_gid != cr->cr_gid ||
1571		    s_cr->cr_gid != cr->cr_rgid ||
1572		    s_cr->cr_gid != cr->cr_sgid ||
1573		    (proc = ttoproc(curthread)) == NULL ||
1574		    (proc->p_flag & SNOCD)) {
1575			if (mode & DTRACE_MODE_NOPRIV_DROP)
1576				return (0);
1577
1578			mstate->dtms_access &= ~DTRACE_ACCESS_PROC;
1579		}
1580	}
1581
1582	/*
1583	 * If our dte_cond is set to DTRACE_COND_ZONEOWNER and we are not
1584	 * in our zone, check to see if our mode policy is to restrict rather
1585	 * than to drop; if to restrict, strip away both DTRACE_ACCESS_PROC
1586	 * and DTRACE_ACCESS_ARGS
1587	 */
1588	if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
1589		cred_t *cr;
1590		cred_t *s_cr = state->dts_cred.dcr_cred;
1591
1592		ASSERT(s_cr != NULL);
1593
1594		if ((cr = CRED()) == NULL ||
1595		    s_cr->cr_zone->zone_id != cr->cr_zone->zone_id) {
1596			if (mode & DTRACE_MODE_NOPRIV_DROP)
1597				return (0);
1598
1599			mstate->dtms_access &=
1600			    ~(DTRACE_ACCESS_PROC | DTRACE_ACCESS_ARGS);
1601		}
1602	}
1603
1604	/*
1605	 * By merits of being in this code path at all, we have limited
1606	 * privileges.  If the provider has indicated that limited privileges
1607	 * are to denote restricted operation, strip off the ability to access
1608	 * arguments.
1609	 */
1610	if (mode & DTRACE_MODE_LIMITEDPRIV_RESTRICT)
1611		mstate->dtms_access &= ~DTRACE_ACCESS_ARGS;
1612
1613	return (1);
1614}
1615
1616/*
1617 * Note:  not called from probe context.  This function is called
1618 * asynchronously (and at a regular interval) from outside of probe context to
1619 * clean the dirty dynamic variable lists on all CPUs.  Dynamic variable
1620 * cleaning is explained in detail in <sys/dtrace_impl.h>.
1621 */
1622void
1623dtrace_dynvar_clean(dtrace_dstate_t *dstate)
1624{
1625	dtrace_dynvar_t *dirty;
1626	dtrace_dstate_percpu_t *dcpu;
1627	dtrace_dynvar_t **rinsep;
1628	int i, j, work = 0;
1629
1630	for (i = 0; i < NCPU; i++) {
1631		dcpu = &dstate->dtds_percpu[i];
1632		rinsep = &dcpu->dtdsc_rinsing;
1633
1634		/*
1635		 * If the dirty list is NULL, there is no dirty work to do.
1636		 */
1637		if (dcpu->dtdsc_dirty == NULL)
1638			continue;
1639
1640		if (dcpu->dtdsc_rinsing != NULL) {
1641			/*
1642			 * If the rinsing list is non-NULL, then it is because
1643			 * this CPU was selected to accept another CPU's
1644			 * dirty list -- and since that time, dirty buffers
1645			 * have accumulated.  This is a highly unlikely
1646			 * condition, but we choose to ignore the dirty
1647			 * buffers -- they'll be picked up a future cleanse.
1648			 */
1649			continue;
1650		}
1651
1652		if (dcpu->dtdsc_clean != NULL) {
1653			/*
1654			 * If the clean list is non-NULL, then we're in a
1655			 * situation where a CPU has done deallocations (we
1656			 * have a non-NULL dirty list) but no allocations (we
1657			 * also have a non-NULL clean list).  We can't simply
1658			 * move the dirty list into the clean list on this
1659			 * CPU, yet we also don't want to allow this condition
1660			 * to persist, lest a short clean list prevent a
1661			 * massive dirty list from being cleaned (which in
1662			 * turn could lead to otherwise avoidable dynamic
1663			 * drops).  To deal with this, we look for some CPU
1664			 * with a NULL clean list, NULL dirty list, and NULL
1665			 * rinsing list -- and then we borrow this CPU to
1666			 * rinse our dirty list.
1667			 */
1668			for (j = 0; j < NCPU; j++) {
1669				dtrace_dstate_percpu_t *rinser;
1670
1671				rinser = &dstate->dtds_percpu[j];
1672
1673				if (rinser->dtdsc_rinsing != NULL)
1674					continue;
1675
1676				if (rinser->dtdsc_dirty != NULL)
1677					continue;
1678
1679				if (rinser->dtdsc_clean != NULL)
1680					continue;
1681
1682				rinsep = &rinser->dtdsc_rinsing;
1683				break;
1684			}
1685
1686			if (j == NCPU) {
1687				/*
1688				 * We were unable to find another CPU that
1689				 * could accept this dirty list -- we are
1690				 * therefore unable to clean it now.
1691				 */
1692				dtrace_dynvar_failclean++;
1693				continue;
1694			}
1695		}
1696
1697		work = 1;
1698
1699		/*
1700		 * Atomically move the dirty list aside.
1701		 */
1702		do {
1703			dirty = dcpu->dtdsc_dirty;
1704
1705			/*
1706			 * Before we zap the dirty list, set the rinsing list.
1707			 * (This allows for a potential assertion in
1708			 * dtrace_dynvar():  if a free dynamic variable appears
1709			 * on a hash chain, either the dirty list or the
1710			 * rinsing list for some CPU must be non-NULL.)
1711			 */
1712			*rinsep = dirty;
1713			dtrace_membar_producer();
1714		} while (dtrace_casptr(&dcpu->dtdsc_dirty,
1715		    dirty, NULL) != dirty);
1716	}
1717
1718	if (!work) {
1719		/*
1720		 * We have no work to do; we can simply return.
1721		 */
1722		return;
1723	}
1724
1725	dtrace_sync();
1726
1727	for (i = 0; i < NCPU; i++) {
1728		dcpu = &dstate->dtds_percpu[i];
1729
1730		if (dcpu->dtdsc_rinsing == NULL)
1731			continue;
1732
1733		/*
1734		 * We are now guaranteed that no hash chain contains a pointer
1735		 * into this dirty list; we can make it clean.
1736		 */
1737		ASSERT(dcpu->dtdsc_clean == NULL);
1738		dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
1739		dcpu->dtdsc_rinsing = NULL;
1740	}
1741
1742	/*
1743	 * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
1744	 * sure that all CPUs have seen all of the dtdsc_clean pointers.
1745	 * This prevents a race whereby a CPU incorrectly decides that
1746	 * the state should be something other than DTRACE_DSTATE_CLEAN
1747	 * after dtrace_dynvar_clean() has completed.
1748	 */
1749	dtrace_sync();
1750
1751	dstate->dtds_state = DTRACE_DSTATE_CLEAN;
1752}
1753
1754/*
1755 * Depending on the value of the op parameter, this function looks-up,
1756 * allocates or deallocates an arbitrarily-keyed dynamic variable.  If an
1757 * allocation is requested, this function will return a pointer to a
1758 * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
1759 * variable can be allocated.  If NULL is returned, the appropriate counter
1760 * will be incremented.
1761 */
1762dtrace_dynvar_t *
1763dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
1764    dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
1765    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1766{
1767	uint64_t hashval = DTRACE_DYNHASH_VALID;
1768	dtrace_dynhash_t *hash = dstate->dtds_hash;
1769	dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
1770	processorid_t me = CPU->cpu_id, cpu = me;
1771	dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
1772	size_t bucket, ksize;
1773	size_t chunksize = dstate->dtds_chunksize;
1774	uintptr_t kdata, lock, nstate;
1775	uint_t i;
1776
1777	ASSERT(nkeys != 0);
1778
1779	/*
1780	 * Hash the key.  As with aggregations, we use Jenkins' "One-at-a-time"
1781	 * algorithm.  For the by-value portions, we perform the algorithm in
1782	 * 16-bit chunks (as opposed to 8-bit chunks).  This speeds things up a
1783	 * bit, and seems to have only a minute effect on distribution.  For
1784	 * the by-reference data, we perform "One-at-a-time" iterating (safely)
1785	 * over each referenced byte.  It's painful to do this, but it's much
1786	 * better than pathological hash distribution.  The efficacy of the
1787	 * hashing algorithm (and a comparison with other algorithms) may be
1788	 * found by running the ::dtrace_dynstat MDB dcmd.
1789	 */
1790	for (i = 0; i < nkeys; i++) {
1791		if (key[i].dttk_size == 0) {
1792			uint64_t val = key[i].dttk_value;
1793
1794			hashval += (val >> 48) & 0xffff;
1795			hashval += (hashval << 10);
1796			hashval ^= (hashval >> 6);
1797
1798			hashval += (val >> 32) & 0xffff;
1799			hashval += (hashval << 10);
1800			hashval ^= (hashval >> 6);
1801
1802			hashval += (val >> 16) & 0xffff;
1803			hashval += (hashval << 10);
1804			hashval ^= (hashval >> 6);
1805
1806			hashval += val & 0xffff;
1807			hashval += (hashval << 10);
1808			hashval ^= (hashval >> 6);
1809		} else {
1810			/*
1811			 * This is incredibly painful, but it beats the hell
1812			 * out of the alternative.
1813			 */
1814			uint64_t j, size = key[i].dttk_size;
1815			uintptr_t base = (uintptr_t)key[i].dttk_value;
1816
1817			if (!dtrace_canload(base, size, mstate, vstate))
1818				break;
1819
1820			for (j = 0; j < size; j++) {
1821				hashval += dtrace_load8(base + j);
1822				hashval += (hashval << 10);
1823				hashval ^= (hashval >> 6);
1824			}
1825		}
1826	}
1827
1828	if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
1829		return (NULL);
1830
1831	hashval += (hashval << 3);
1832	hashval ^= (hashval >> 11);
1833	hashval += (hashval << 15);
1834
1835	/*
1836	 * There is a remote chance (ideally, 1 in 2^31) that our hashval
1837	 * comes out to be one of our two sentinel hash values.  If this
1838	 * actually happens, we set the hashval to be a value known to be a
1839	 * non-sentinel value.
1840	 */
1841	if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
1842		hashval = DTRACE_DYNHASH_VALID;
1843
1844	/*
1845	 * Yes, it's painful to do a divide here.  If the cycle count becomes
1846	 * important here, tricks can be pulled to reduce it.  (However, it's
1847	 * critical that hash collisions be kept to an absolute minimum;
1848	 * they're much more painful than a divide.)  It's better to have a
1849	 * solution that generates few collisions and still keeps things
1850	 * relatively simple.
1851	 */
1852	bucket = hashval % dstate->dtds_hashsize;
1853
1854	if (op == DTRACE_DYNVAR_DEALLOC) {
1855		volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
1856
1857		for (;;) {
1858			while ((lock = *lockp) & 1)
1859				continue;
1860
1861			if (dtrace_casptr((void *)lockp,
1862			    (void *)lock, (void *)(lock + 1)) == (void *)lock)
1863				break;
1864		}
1865
1866		dtrace_membar_producer();
1867	}
1868
1869top:
1870	prev = NULL;
1871	lock = hash[bucket].dtdh_lock;
1872
1873	dtrace_membar_consumer();
1874
1875	start = hash[bucket].dtdh_chain;
1876	ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
1877	    start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
1878	    op != DTRACE_DYNVAR_DEALLOC));
1879
1880	for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
1881		dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
1882		dtrace_key_t *dkey = &dtuple->dtt_key[0];
1883
1884		if (dvar->dtdv_hashval != hashval) {
1885			if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
1886				/*
1887				 * We've reached the sink, and therefore the
1888				 * end of the hash chain; we can kick out of
1889				 * the loop knowing that we have seen a valid
1890				 * snapshot of state.
1891				 */
1892				ASSERT(dvar->dtdv_next == NULL);
1893				ASSERT(dvar == &dtrace_dynhash_sink);
1894				break;
1895			}
1896
1897			if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
1898				/*
1899				 * We've gone off the rails:  somewhere along
1900				 * the line, one of the members of this hash
1901				 * chain was deleted.  Note that we could also
1902				 * detect this by simply letting this loop run
1903				 * to completion, as we would eventually hit
1904				 * the end of the dirty list.  However, we
1905				 * want to avoid running the length of the
1906				 * dirty list unnecessarily (it might be quite
1907				 * long), so we catch this as early as
1908				 * possible by detecting the hash marker.  In
1909				 * this case, we simply set dvar to NULL and
1910				 * break; the conditional after the loop will
1911				 * send us back to top.
1912				 */
1913				dvar = NULL;
1914				break;
1915			}
1916
1917			goto next;
1918		}
1919
1920		if (dtuple->dtt_nkeys != nkeys)
1921			goto next;
1922
1923		for (i = 0; i < nkeys; i++, dkey++) {
1924			if (dkey->dttk_size != key[i].dttk_size)
1925				goto next; /* size or type mismatch */
1926
1927			if (dkey->dttk_size != 0) {
1928				if (dtrace_bcmp(
1929				    (void *)(uintptr_t)key[i].dttk_value,
1930				    (void *)(uintptr_t)dkey->dttk_value,
1931				    dkey->dttk_size))
1932					goto next;
1933			} else {
1934				if (dkey->dttk_value != key[i].dttk_value)
1935					goto next;
1936			}
1937		}
1938
1939		if (op != DTRACE_DYNVAR_DEALLOC)
1940			return (dvar);
1941
1942		ASSERT(dvar->dtdv_next == NULL ||
1943		    dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
1944
1945		if (prev != NULL) {
1946			ASSERT(hash[bucket].dtdh_chain != dvar);
1947			ASSERT(start != dvar);
1948			ASSERT(prev->dtdv_next == dvar);
1949			prev->dtdv_next = dvar->dtdv_next;
1950		} else {
1951			if (dtrace_casptr(&hash[bucket].dtdh_chain,
1952			    start, dvar->dtdv_next) != start) {
1953				/*
1954				 * We have failed to atomically swing the
1955				 * hash table head pointer, presumably because
1956				 * of a conflicting allocation on another CPU.
1957				 * We need to reread the hash chain and try
1958				 * again.
1959				 */
1960				goto top;
1961			}
1962		}
1963
1964		dtrace_membar_producer();
1965
1966		/*
1967		 * Now set the hash value to indicate that it's free.
1968		 */
1969		ASSERT(hash[bucket].dtdh_chain != dvar);
1970		dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
1971
1972		dtrace_membar_producer();
1973
1974		/*
1975		 * Set the next pointer to point at the dirty list, and
1976		 * atomically swing the dirty pointer to the newly freed dvar.
1977		 */
1978		do {
1979			next = dcpu->dtdsc_dirty;
1980			dvar->dtdv_next = next;
1981		} while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
1982
1983		/*
1984		 * Finally, unlock this hash bucket.
1985		 */
1986		ASSERT(hash[bucket].dtdh_lock == lock);
1987		ASSERT(lock & 1);
1988		hash[bucket].dtdh_lock++;
1989
1990		return (NULL);
1991next:
1992		prev = dvar;
1993		continue;
1994	}
1995
1996	if (dvar == NULL) {
1997		/*
1998		 * If dvar is NULL, it is because we went off the rails:
1999		 * one of the elements that we traversed in the hash chain
2000		 * was deleted while we were traversing it.  In this case,
2001		 * we assert that we aren't doing a dealloc (deallocs lock
2002		 * the hash bucket to prevent themselves from racing with
2003		 * one another), and retry the hash chain traversal.
2004		 */
2005		ASSERT(op != DTRACE_DYNVAR_DEALLOC);
2006		goto top;
2007	}
2008
2009	if (op != DTRACE_DYNVAR_ALLOC) {
2010		/*
2011		 * If we are not to allocate a new variable, we want to
2012		 * return NULL now.  Before we return, check that the value
2013		 * of the lock word hasn't changed.  If it has, we may have
2014		 * seen an inconsistent snapshot.
2015		 */
2016		if (op == DTRACE_DYNVAR_NOALLOC) {
2017			if (hash[bucket].dtdh_lock != lock)
2018				goto top;
2019		} else {
2020			ASSERT(op == DTRACE_DYNVAR_DEALLOC);
2021			ASSERT(hash[bucket].dtdh_lock == lock);
2022			ASSERT(lock & 1);
2023			hash[bucket].dtdh_lock++;
2024		}
2025
2026		return (NULL);
2027	}
2028
2029	/*
2030	 * We need to allocate a new dynamic variable.  The size we need is the
2031	 * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
2032	 * size of any auxiliary key data (rounded up to 8-byte alignment) plus
2033	 * the size of any referred-to data (dsize).  We then round the final
2034	 * size up to the chunksize for allocation.
2035	 */
2036	for (ksize = 0, i = 0; i < nkeys; i++)
2037		ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
2038
2039	/*
2040	 * This should be pretty much impossible, but could happen if, say,
2041	 * strange DIF specified the tuple.  Ideally, this should be an
2042	 * assertion and not an error condition -- but that requires that the
2043	 * chunksize calculation in dtrace_difo_chunksize() be absolutely
2044	 * bullet-proof.  (That is, it must not be able to be fooled by
2045	 * malicious DIF.)  Given the lack of backwards branches in DIF,
2046	 * solving this would presumably not amount to solving the Halting
2047	 * Problem -- but it still seems awfully hard.
2048	 */
2049	if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
2050	    ksize + dsize > chunksize) {
2051		dcpu->dtdsc_drops++;
2052		return (NULL);
2053	}
2054
2055	nstate = DTRACE_DSTATE_EMPTY;
2056
2057	do {
2058retry:
2059		free = dcpu->dtdsc_free;
2060
2061		if (free == NULL) {
2062			dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
2063			void *rval;
2064
2065			if (clean == NULL) {
2066				/*
2067				 * We're out of dynamic variable space on
2068				 * this CPU.  Unless we have tried all CPUs,
2069				 * we'll try to allocate from a different
2070				 * CPU.
2071				 */
2072				switch (dstate->dtds_state) {
2073				case DTRACE_DSTATE_CLEAN: {
2074					void *sp = &dstate->dtds_state;
2075
2076					if (++cpu >= NCPU)
2077						cpu = 0;
2078
2079					if (dcpu->dtdsc_dirty != NULL &&
2080					    nstate == DTRACE_DSTATE_EMPTY)
2081						nstate = DTRACE_DSTATE_DIRTY;
2082
2083					if (dcpu->dtdsc_rinsing != NULL)
2084						nstate = DTRACE_DSTATE_RINSING;
2085
2086					dcpu = &dstate->dtds_percpu[cpu];
2087
2088					if (cpu != me)
2089						goto retry;
2090
2091					(void) dtrace_cas32(sp,
2092					    DTRACE_DSTATE_CLEAN, nstate);
2093
2094					/*
2095					 * To increment the correct bean
2096					 * counter, take another lap.
2097					 */
2098					goto retry;
2099				}
2100
2101				case DTRACE_DSTATE_DIRTY:
2102					dcpu->dtdsc_dirty_drops++;
2103					break;
2104
2105				case DTRACE_DSTATE_RINSING:
2106					dcpu->dtdsc_rinsing_drops++;
2107					break;
2108
2109				case DTRACE_DSTATE_EMPTY:
2110					dcpu->dtdsc_drops++;
2111					break;
2112				}
2113
2114				DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
2115				return (NULL);
2116			}
2117
2118			/*
2119			 * The clean list appears to be non-empty.  We want to
2120			 * move the clean list to the free list; we start by
2121			 * moving the clean pointer aside.
2122			 */
2123			if (dtrace_casptr(&dcpu->dtdsc_clean,
2124			    clean, NULL) != clean) {
2125				/*
2126				 * We are in one of two situations:
2127				 *
2128				 *  (a)	The clean list was switched to the
2129				 *	free list by another CPU.
2130				 *
2131				 *  (b)	The clean list was added to by the
2132				 *	cleansing cyclic.
2133				 *
2134				 * In either of these situations, we can
2135				 * just reattempt the free list allocation.
2136				 */
2137				goto retry;
2138			}
2139
2140			ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
2141
2142			/*
2143			 * Now we'll move the clean list to our free list.
2144			 * It's impossible for this to fail:  the only way
2145			 * the free list can be updated is through this
2146			 * code path, and only one CPU can own the clean list.
2147			 * Thus, it would only be possible for this to fail if
2148			 * this code were racing with dtrace_dynvar_clean().
2149			 * (That is, if dtrace_dynvar_clean() updated the clean
2150			 * list, and we ended up racing to update the free
2151			 * list.)  This race is prevented by the dtrace_sync()
2152			 * in dtrace_dynvar_clean() -- which flushes the
2153			 * owners of the clean lists out before resetting
2154			 * the clean lists.
2155			 */
2156			dcpu = &dstate->dtds_percpu[me];
2157			rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
2158			ASSERT(rval == NULL);
2159			goto retry;
2160		}
2161
2162		dvar = free;
2163		new_free = dvar->dtdv_next;
2164	} while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
2165
2166	/*
2167	 * We have now allocated a new chunk.  We copy the tuple keys into the
2168	 * tuple array and copy any referenced key data into the data space
2169	 * following the tuple array.  As we do this, we relocate dttk_value
2170	 * in the final tuple to point to the key data address in the chunk.
2171	 */
2172	kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
2173	dvar->dtdv_data = (void *)(kdata + ksize);
2174	dvar->dtdv_tuple.dtt_nkeys = nkeys;
2175
2176	for (i = 0; i < nkeys; i++) {
2177		dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
2178		size_t kesize = key[i].dttk_size;
2179
2180		if (kesize != 0) {
2181			dtrace_bcopy(
2182			    (const void *)(uintptr_t)key[i].dttk_value,
2183			    (void *)kdata, kesize);
2184			dkey->dttk_value = kdata;
2185			kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
2186		} else {
2187			dkey->dttk_value = key[i].dttk_value;
2188		}
2189
2190		dkey->dttk_size = kesize;
2191	}
2192
2193	ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
2194	dvar->dtdv_hashval = hashval;
2195	dvar->dtdv_next = start;
2196
2197	if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
2198		return (dvar);
2199
2200	/*
2201	 * The cas has failed.  Either another CPU is adding an element to
2202	 * this hash chain, or another CPU is deleting an element from this
2203	 * hash chain.  The simplest way to deal with both of these cases
2204	 * (though not necessarily the most efficient) is to free our
2205	 * allocated block and re-attempt it all.  Note that the free is
2206	 * to the dirty list and _not_ to the free list.  This is to prevent
2207	 * races with allocators, above.
2208	 */
2209	dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
2210
2211	dtrace_membar_producer();
2212
2213	do {
2214		free = dcpu->dtdsc_dirty;
2215		dvar->dtdv_next = free;
2216	} while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
2217
2218	goto top;
2219}
2220
2221/*ARGSUSED*/
2222static void
2223dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
2224{
2225	if ((int64_t)nval < (int64_t)*oval)
2226		*oval = nval;
2227}
2228
2229/*ARGSUSED*/
2230static void
2231dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
2232{
2233	if ((int64_t)nval > (int64_t)*oval)
2234		*oval = nval;
2235}
2236
2237static void
2238dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
2239{
2240	int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
2241	int64_t val = (int64_t)nval;
2242
2243	if (val < 0) {
2244		for (i = 0; i < zero; i++) {
2245			if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
2246				quanta[i] += incr;
2247				return;
2248			}
2249		}
2250	} else {
2251		for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
2252			if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
2253				quanta[i - 1] += incr;
2254				return;
2255			}
2256		}
2257
2258		quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
2259		return;
2260	}
2261
2262	ASSERT(0);
2263}
2264
2265static void
2266dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
2267{
2268	uint64_t arg = *lquanta++;
2269	int32_t base = DTRACE_LQUANTIZE_BASE(arg);
2270	uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
2271	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
2272	int32_t val = (int32_t)nval, level;
2273
2274	ASSERT(step != 0);
2275	ASSERT(levels != 0);
2276
2277	if (val < base) {
2278		/*
2279		 * This is an underflow.
2280		 */
2281		lquanta[0] += incr;
2282		return;
2283	}
2284
2285	level = (val - base) / step;
2286
2287	if (level < levels) {
2288		lquanta[level + 1] += incr;
2289		return;
2290	}
2291
2292	/*
2293	 * This is an overflow.
2294	 */
2295	lquanta[levels + 1] += incr;
2296}
2297
2298static int
2299dtrace_aggregate_llquantize_bucket(uint16_t factor, uint16_t low,
2300    uint16_t high, uint16_t nsteps, int64_t value)
2301{
2302	int64_t this = 1, last, next;
2303	int base = 1, order;
2304
2305	ASSERT(factor <= nsteps);
2306	ASSERT(nsteps % factor == 0);
2307
2308	for (order = 0; order < low; order++)
2309		this *= factor;
2310
2311	/*
2312	 * If our value is less than our factor taken to the power of the
2313	 * low order of magnitude, it goes into the zeroth bucket.
2314	 */
2315	if (value < (last = this))
2316		return (0);
2317
2318	for (this *= factor; order <= high; order++) {
2319		int nbuckets = this > nsteps ? nsteps : this;
2320
2321		if ((next = this * factor) < this) {
2322			/*
2323			 * We should not generally get log/linear quantizations
2324			 * with a high magnitude that allows 64-bits to
2325			 * overflow, but we nonetheless protect against this
2326			 * by explicitly checking for overflow, and clamping
2327			 * our value accordingly.
2328			 */
2329			value = this - 1;
2330		}
2331
2332		if (value < this) {
2333			/*
2334			 * If our value lies within this order of magnitude,
2335			 * determine its position by taking the offset within
2336			 * the order of magnitude, dividing by the bucket
2337			 * width, and adding to our (accumulated) base.
2338			 */
2339			return (base + (value - last) / (this / nbuckets));
2340		}
2341
2342		base += nbuckets - (nbuckets / factor);
2343		last = this;
2344		this = next;
2345	}
2346
2347	/*
2348	 * Our value is greater than or equal to our factor taken to the
2349	 * power of one plus the high magnitude -- return the top bucket.
2350	 */
2351	return (base);
2352}
2353
2354static void
2355dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
2356{
2357	uint64_t arg = *llquanta++;
2358	uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg);
2359	uint16_t low = DTRACE_LLQUANTIZE_LOW(arg);
2360	uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg);
2361	uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg);
2362
2363	llquanta[dtrace_aggregate_llquantize_bucket(factor,
2364	    low, high, nsteps, nval)] += incr;
2365}
2366
2367/*ARGSUSED*/
2368static void
2369dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
2370{
2371	data[0]++;
2372	data[1] += nval;
2373}
2374
2375/*ARGSUSED*/
2376static void
2377dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
2378{
2379	int64_t snval = (int64_t)nval;
2380	uint64_t tmp[2];
2381
2382	data[0]++;
2383	data[1] += nval;
2384
2385	/*
2386	 * What we want to say here is:
2387	 *
2388	 * data[2] += nval * nval;
2389	 *
2390	 * But given that nval is 64-bit, we could easily overflow, so
2391	 * we do this as 128-bit arithmetic.
2392	 */
2393	if (snval < 0)
2394		snval = -snval;
2395
2396	dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp);
2397	dtrace_add_128(data + 2, tmp, data + 2);
2398}
2399
2400/*ARGSUSED*/
2401static void
2402dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
2403{
2404	*oval = *oval + 1;
2405}
2406
2407/*ARGSUSED*/
2408static void
2409dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
2410{
2411	*oval += nval;
2412}
2413
2414/*
2415 * Aggregate given the tuple in the principal data buffer, and the aggregating
2416 * action denoted by the specified dtrace_aggregation_t.  The aggregation
2417 * buffer is specified as the buf parameter.  This routine does not return
2418 * failure; if there is no space in the aggregation buffer, the data will be
2419 * dropped, and a corresponding counter incremented.
2420 */
2421static void
2422dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
2423    intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
2424{
2425	dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
2426	uint32_t i, ndx, size, fsize;
2427	uint32_t align = sizeof (uint64_t) - 1;
2428	dtrace_aggbuffer_t *agb;
2429	dtrace_aggkey_t *key;
2430	uint32_t hashval = 0, limit, isstr;
2431	caddr_t tomax, data, kdata;
2432	dtrace_actkind_t action;
2433	dtrace_action_t *act;
2434	uintptr_t offs;
2435
2436	if (buf == NULL)
2437		return;
2438
2439	if (!agg->dtag_hasarg) {
2440		/*
2441		 * Currently, only quantize() and lquantize() take additional
2442		 * arguments, and they have the same semantics:  an increment
2443		 * value that defaults to 1 when not present.  If additional
2444		 * aggregating actions take arguments, the setting of the
2445		 * default argument value will presumably have to become more
2446		 * sophisticated...
2447		 */
2448		arg = 1;
2449	}
2450
2451	action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
2452	size = rec->dtrd_offset - agg->dtag_base;
2453	fsize = size + rec->dtrd_size;
2454
2455	ASSERT(dbuf->dtb_tomax != NULL);
2456	data = dbuf->dtb_tomax + offset + agg->dtag_base;
2457
2458	if ((tomax = buf->dtb_tomax) == NULL) {
2459		dtrace_buffer_drop(buf);
2460		return;
2461	}
2462
2463	/*
2464	 * The metastructure is always at the bottom of the buffer.
2465	 */
2466	agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
2467	    sizeof (dtrace_aggbuffer_t));
2468
2469	if (buf->dtb_offset == 0) {
2470		/*
2471		 * We just kludge up approximately 1/8th of the size to be
2472		 * buckets.  If this guess ends up being routinely
2473		 * off-the-mark, we may need to dynamically readjust this
2474		 * based on past performance.
2475		 */
2476		uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
2477
2478		if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
2479		    (uintptr_t)tomax || hashsize == 0) {
2480			/*
2481			 * We've been given a ludicrously small buffer;
2482			 * increment our drop count and leave.
2483			 */
2484			dtrace_buffer_drop(buf);
2485			return;
2486		}
2487
2488		/*
2489		 * And now, a pathetic attempt to try to get a an odd (or
2490		 * perchance, a prime) hash size for better hash distribution.
2491		 */
2492		if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
2493			hashsize -= DTRACE_AGGHASHSIZE_SLEW;
2494
2495		agb->dtagb_hashsize = hashsize;
2496		agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
2497		    agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
2498		agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
2499
2500		for (i = 0; i < agb->dtagb_hashsize; i++)
2501			agb->dtagb_hash[i] = NULL;
2502	}
2503
2504	ASSERT(agg->dtag_first != NULL);
2505	ASSERT(agg->dtag_first->dta_intuple);
2506
2507	/*
2508	 * Calculate the hash value based on the key.  Note that we _don't_
2509	 * include the aggid in the hashing (but we will store it as part of
2510	 * the key).  The hashing algorithm is Bob Jenkins' "One-at-a-time"
2511	 * algorithm: a simple, quick algorithm that has no known funnels, and
2512	 * gets good distribution in practice.  The efficacy of the hashing
2513	 * algorithm (and a comparison with other algorithms) may be found by
2514	 * running the ::dtrace_aggstat MDB dcmd.
2515	 */
2516	for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2517		i = act->dta_rec.dtrd_offset - agg->dtag_base;
2518		limit = i + act->dta_rec.dtrd_size;
2519		ASSERT(limit <= size);
2520		isstr = DTRACEACT_ISSTRING(act);
2521
2522		for (; i < limit; i++) {
2523			hashval += data[i];
2524			hashval += (hashval << 10);
2525			hashval ^= (hashval >> 6);
2526
2527			if (isstr && data[i] == '\0')
2528				break;
2529		}
2530	}
2531
2532	hashval += (hashval << 3);
2533	hashval ^= (hashval >> 11);
2534	hashval += (hashval << 15);
2535
2536	/*
2537	 * Yes, the divide here is expensive -- but it's generally the least
2538	 * of the performance issues given the amount of data that we iterate
2539	 * over to compute hash values, compare data, etc.
2540	 */
2541	ndx = hashval % agb->dtagb_hashsize;
2542
2543	for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
2544		ASSERT((caddr_t)key >= tomax);
2545		ASSERT((caddr_t)key < tomax + buf->dtb_size);
2546
2547		if (hashval != key->dtak_hashval || key->dtak_size != size)
2548			continue;
2549
2550		kdata = key->dtak_data;
2551		ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
2552
2553		for (act = agg->dtag_first; act->dta_intuple;
2554		    act = act->dta_next) {
2555			i = act->dta_rec.dtrd_offset - agg->dtag_base;
2556			limit = i + act->dta_rec.dtrd_size;
2557			ASSERT(limit <= size);
2558			isstr = DTRACEACT_ISSTRING(act);
2559
2560			for (; i < limit; i++) {
2561				if (kdata[i] != data[i])
2562					goto next;
2563
2564				if (isstr && data[i] == '\0')
2565					break;
2566			}
2567		}
2568
2569		if (action != key->dtak_action) {
2570			/*
2571			 * We are aggregating on the same value in the same
2572			 * aggregation with two different aggregating actions.
2573			 * (This should have been picked up in the compiler,
2574			 * so we may be dealing with errant or devious DIF.)
2575			 * This is an error condition; we indicate as much,
2576			 * and return.
2577			 */
2578			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2579			return;
2580		}
2581
2582		/*
2583		 * This is a hit:  we need to apply the aggregator to
2584		 * the value at this key.
2585		 */
2586		agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
2587		return;
2588next:
2589		continue;
2590	}
2591
2592	/*
2593	 * We didn't find it.  We need to allocate some zero-filled space,
2594	 * link it into the hash table appropriately, and apply the aggregator
2595	 * to the (zero-filled) value.
2596	 */
2597	offs = buf->dtb_offset;
2598	while (offs & (align - 1))
2599		offs += sizeof (uint32_t);
2600
2601	/*
2602	 * If we don't have enough room to both allocate a new key _and_
2603	 * its associated data, increment the drop count and return.
2604	 */
2605	if ((uintptr_t)tomax + offs + fsize >
2606	    agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
2607		dtrace_buffer_drop(buf);
2608		return;
2609	}
2610
2611	/*CONSTCOND*/
2612	ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
2613	key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
2614	agb->dtagb_free -= sizeof (dtrace_aggkey_t);
2615
2616	key->dtak_data = kdata = tomax + offs;
2617	buf->dtb_offset = offs + fsize;
2618
2619	/*
2620	 * Now copy the data across.
2621	 */
2622	*((dtrace_aggid_t *)kdata) = agg->dtag_id;
2623
2624	for (i = sizeof (dtrace_aggid_t); i < size; i++)
2625		kdata[i] = data[i];
2626
2627	/*
2628	 * Because strings are not zeroed out by default, we need to iterate
2629	 * looking for actions that store strings, and we need to explicitly
2630	 * pad these strings out with zeroes.
2631	 */
2632	for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2633		int nul;
2634
2635		if (!DTRACEACT_ISSTRING(act))
2636			continue;
2637
2638		i = act->dta_rec.dtrd_offset - agg->dtag_base;
2639		limit = i + act->dta_rec.dtrd_size;
2640		ASSERT(limit <= size);
2641
2642		for (nul = 0; i < limit; i++) {
2643			if (nul) {
2644				kdata[i] = '\0';
2645				continue;
2646			}
2647
2648			if (data[i] != '\0')
2649				continue;
2650
2651			nul = 1;
2652		}
2653	}
2654
2655	for (i = size; i < fsize; i++)
2656		kdata[i] = 0;
2657
2658	key->dtak_hashval = hashval;
2659	key->dtak_size = size;
2660	key->dtak_action = action;
2661	key->dtak_next = agb->dtagb_hash[ndx];
2662	agb->dtagb_hash[ndx] = key;
2663
2664	/*
2665	 * Finally, apply the aggregator.
2666	 */
2667	*((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
2668	agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
2669}
2670
2671/*
2672 * Given consumer state, this routine finds a speculation in the INACTIVE
2673 * state and transitions it into the ACTIVE state.  If there is no speculation
2674 * in the INACTIVE state, 0 is returned.  In this case, no error counter is
2675 * incremented -- it is up to the caller to take appropriate action.
2676 */
2677static int
2678dtrace_speculation(dtrace_state_t *state)
2679{
2680	int i = 0;
2681	dtrace_speculation_state_t current;
2682	uint32_t *stat = &state->dts_speculations_unavail, count;
2683
2684	while (i < state->dts_nspeculations) {
2685		dtrace_speculation_t *spec = &state->dts_speculations[i];
2686
2687		current = spec->dtsp_state;
2688
2689		if (current != DTRACESPEC_INACTIVE) {
2690			if (current == DTRACESPEC_COMMITTINGMANY ||
2691			    current == DTRACESPEC_COMMITTING ||
2692			    current == DTRACESPEC_DISCARDING)
2693				stat = &state->dts_speculations_busy;
2694			i++;
2695			continue;
2696		}
2697
2698		if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2699		    current, DTRACESPEC_ACTIVE) == current)
2700			return (i + 1);
2701	}
2702
2703	/*
2704	 * We couldn't find a speculation.  If we found as much as a single
2705	 * busy speculation buffer, we'll attribute this failure as "busy"
2706	 * instead of "unavail".
2707	 */
2708	do {
2709		count = *stat;
2710	} while (dtrace_cas32(stat, count, count + 1) != count);
2711
2712	return (0);
2713}
2714
2715/*
2716 * This routine commits an active speculation.  If the specified speculation
2717 * is not in a valid state to perform a commit(), this routine will silently do
2718 * nothing.  The state of the specified speculation is transitioned according
2719 * to the state transition diagram outlined in <sys/dtrace_impl.h>
2720 */
2721static void
2722dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
2723    dtrace_specid_t which)
2724{
2725	dtrace_speculation_t *spec;
2726	dtrace_buffer_t *src, *dest;
2727	uintptr_t daddr, saddr, dlimit, slimit;
2728	dtrace_speculation_state_t current, new;
2729	intptr_t offs;
2730	uint64_t timestamp;
2731
2732	if (which == 0)
2733		return;
2734
2735	if (which > state->dts_nspeculations) {
2736		cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2737		return;
2738	}
2739
2740	spec = &state->dts_speculations[which - 1];
2741	src = &spec->dtsp_buffer[cpu];
2742	dest = &state->dts_buffer[cpu];
2743
2744	do {
2745		current = spec->dtsp_state;
2746
2747		if (current == DTRACESPEC_COMMITTINGMANY)
2748			break;
2749
2750		switch (current) {
2751		case DTRACESPEC_INACTIVE:
2752		case DTRACESPEC_DISCARDING:
2753			return;
2754
2755		case DTRACESPEC_COMMITTING:
2756			/*
2757			 * This is only possible if we are (a) commit()'ing
2758			 * without having done a prior speculate() on this CPU
2759			 * and (b) racing with another commit() on a different
2760			 * CPU.  There's nothing to do -- we just assert that
2761			 * our offset is 0.
2762			 */
2763			ASSERT(src->dtb_offset == 0);
2764			return;
2765
2766		case DTRACESPEC_ACTIVE:
2767			new = DTRACESPEC_COMMITTING;
2768			break;
2769
2770		case DTRACESPEC_ACTIVEONE:
2771			/*
2772			 * This speculation is active on one CPU.  If our
2773			 * buffer offset is non-zero, we know that the one CPU
2774			 * must be us.  Otherwise, we are committing on a
2775			 * different CPU from the speculate(), and we must
2776			 * rely on being asynchronously cleaned.
2777			 */
2778			if (src->dtb_offset != 0) {
2779				new = DTRACESPEC_COMMITTING;
2780				break;
2781			}
2782			/*FALLTHROUGH*/
2783
2784		case DTRACESPEC_ACTIVEMANY:
2785			new = DTRACESPEC_COMMITTINGMANY;
2786			break;
2787
2788		default:
2789			ASSERT(0);
2790		}
2791	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2792	    current, new) != current);
2793
2794	/*
2795	 * We have set the state to indicate that we are committing this
2796	 * speculation.  Now reserve the necessary space in the destination
2797	 * buffer.
2798	 */
2799	if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
2800	    sizeof (uint64_t), state, NULL)) < 0) {
2801		dtrace_buffer_drop(dest);
2802		goto out;
2803	}
2804
2805	/*
2806	 * We have sufficient space to copy the speculative buffer into the
2807	 * primary buffer.  First, modify the speculative buffer, filling
2808	 * in the timestamp of all entries with the current time.  The data
2809	 * must have the commit() time rather than the time it was traced,
2810	 * so that all entries in the primary buffer are in timestamp order.
2811	 */
2812	timestamp = dtrace_gethrtime();
2813	saddr = (uintptr_t)src->dtb_tomax;
2814	slimit = saddr + src->dtb_offset;
2815	while (saddr < slimit) {
2816		size_t size;
2817		dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;
2818
2819		if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
2820			saddr += sizeof (dtrace_epid_t);
2821			continue;
2822		}
2823		ASSERT3U(dtrh->dtrh_epid, <=, state->dts_necbs);
2824		size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;
2825
2826		ASSERT3U(saddr + size, <=, slimit);
2827		ASSERT3U(size, >=, sizeof (dtrace_rechdr_t));
2828		ASSERT3U(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh), ==, UINT64_MAX);
2829
2830		DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);
2831
2832		saddr += size;
2833	}
2834
2835	/*
2836	 * Copy the buffer across.  (Note that this is a
2837	 * highly subobtimal bcopy(); in the unlikely event that this becomes
2838	 * a serious performance issue, a high-performance DTrace-specific
2839	 * bcopy() should obviously be invented.)
2840	 */
2841	daddr = (uintptr_t)dest->dtb_tomax + offs;
2842	dlimit = daddr + src->dtb_offset;
2843	saddr = (uintptr_t)src->dtb_tomax;
2844
2845	/*
2846	 * First, the aligned portion.
2847	 */
2848	while (dlimit - daddr >= sizeof (uint64_t)) {
2849		*((uint64_t *)daddr) = *((uint64_t *)saddr);
2850
2851		daddr += sizeof (uint64_t);
2852		saddr += sizeof (uint64_t);
2853	}
2854
2855	/*
2856	 * Now any left-over bit...
2857	 */
2858	while (dlimit - daddr)
2859		*((uint8_t *)daddr++) = *((uint8_t *)saddr++);
2860
2861	/*
2862	 * Finally, commit the reserved space in the destination buffer.
2863	 */
2864	dest->dtb_offset = offs + src->dtb_offset;
2865
2866out:
2867	/*
2868	 * If we're lucky enough to be the only active CPU on this speculation
2869	 * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
2870	 */
2871	if (current == DTRACESPEC_ACTIVE ||
2872	    (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
2873		uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
2874		    DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
2875
2876		ASSERT(rval == DTRACESPEC_COMMITTING);
2877	}
2878
2879	src->dtb_offset = 0;
2880	src->dtb_xamot_drops += src->dtb_drops;
2881	src->dtb_drops = 0;
2882}
2883
2884/*
2885 * This routine discards an active speculation.  If the specified speculation
2886 * is not in a valid state to perform a discard(), this routine will silently
2887 * do nothing.  The state of the specified speculation is transitioned
2888 * according to the state transition diagram outlined in <sys/dtrace_impl.h>
2889 */
2890static void
2891dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
2892    dtrace_specid_t which)
2893{
2894	dtrace_speculation_t *spec;
2895	dtrace_speculation_state_t current, new;
2896	dtrace_buffer_t *buf;
2897
2898	if (which == 0)
2899		return;
2900
2901	if (which > state->dts_nspeculations) {
2902		cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2903		return;
2904	}
2905
2906	spec = &state->dts_speculations[which - 1];
2907	buf = &spec->dtsp_buffer[cpu];
2908
2909	do {
2910		current = spec->dtsp_state;
2911
2912		switch (current) {
2913		case DTRACESPEC_INACTIVE:
2914		case DTRACESPEC_COMMITTINGMANY:
2915		case DTRACESPEC_COMMITTING:
2916		case DTRACESPEC_DISCARDING:
2917			return;
2918
2919		case DTRACESPEC_ACTIVE:
2920		case DTRACESPEC_ACTIVEMANY:
2921			new = DTRACESPEC_DISCARDING;
2922			break;
2923
2924		case DTRACESPEC_ACTIVEONE:
2925			if (buf->dtb_offset != 0) {
2926				new = DTRACESPEC_INACTIVE;
2927			} else {
2928				new = DTRACESPEC_DISCARDING;
2929			}
2930			break;
2931
2932		default:
2933			ASSERT(0);
2934		}
2935	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2936	    current, new) != current);
2937
2938	buf->dtb_offset = 0;
2939	buf->dtb_drops = 0;
2940}
2941
2942/*
2943 * Note:  not called from probe context.  This function is called
2944 * asynchronously from cross call context to clean any speculations that are
2945 * in the COMMITTINGMANY or DISCARDING states.  These speculations may not be
2946 * transitioned back to the INACTIVE state until all CPUs have cleaned the
2947 * speculation.
2948 */
2949static void
2950dtrace_speculation_clean_here(dtrace_state_t *state)
2951{
2952	dtrace_icookie_t cookie;
2953	processorid_t cpu = CPU->cpu_id;
2954	dtrace_buffer_t *dest = &state->dts_buffer[cpu];
2955	dtrace_specid_t i;
2956
2957	cookie = dtrace_interrupt_disable();
2958
2959	if (dest->dtb_tomax == NULL) {
2960		dtrace_interrupt_enable(cookie);
2961		return;
2962	}
2963
2964	for (i = 0; i < state->dts_nspeculations; i++) {
2965		dtrace_speculation_t *spec = &state->dts_speculations[i];
2966		dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
2967
2968		if (src->dtb_tomax == NULL)
2969			continue;
2970
2971		if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
2972			src->dtb_offset = 0;
2973			continue;
2974		}
2975
2976		if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2977			continue;
2978
2979		if (src->dtb_offset == 0)
2980			continue;
2981
2982		dtrace_speculation_commit(state, cpu, i + 1);
2983	}
2984
2985	dtrace_interrupt_enable(cookie);
2986}
2987
2988/*
2989 * Note:  not called from probe context.  This function is called
2990 * asynchronously (and at a regular interval) to clean any speculations that
2991 * are in the COMMITTINGMANY or DISCARDING states.  If it discovers that there
2992 * is work to be done, it cross calls all CPUs to perform that work;
2993 * COMMITMANY and DISCARDING speculations may not be transitioned back to the
2994 * INACTIVE state until they have been cleaned by all CPUs.
2995 */
2996static void
2997dtrace_speculation_clean(dtrace_state_t *state)
2998{
2999	int work = 0, rv;
3000	dtrace_specid_t i;
3001
3002	for (i = 0; i < state->dts_nspeculations; i++) {
3003		dtrace_speculation_t *spec = &state->dts_speculations[i];
3004
3005		ASSERT(!spec->dtsp_cleaning);
3006
3007		if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
3008		    spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
3009			continue;
3010
3011		work++;
3012		spec->dtsp_cleaning = 1;
3013	}
3014
3015	if (!work)
3016		return;
3017
3018	dtrace_xcall(DTRACE_CPUALL,
3019	    (dtrace_xcall_t)dtrace_speculation_clean_here, state);
3020
3021	/*
3022	 * We now know that all CPUs have committed or discarded their
3023	 * speculation buffers, as appropriate.  We can now set the state
3024	 * to inactive.
3025	 */
3026	for (i = 0; i < state->dts_nspeculations; i++) {
3027		dtrace_speculation_t *spec = &state->dts_speculations[i];
3028		dtrace_speculation_state_t current, new;
3029
3030		if (!spec->dtsp_cleaning)
3031			continue;
3032
3033		current = spec->dtsp_state;
3034		ASSERT(current == DTRACESPEC_DISCARDING ||
3035		    current == DTRACESPEC_COMMITTINGMANY);
3036
3037		new = DTRACESPEC_INACTIVE;
3038
3039		rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
3040		ASSERT(rv == current);
3041		spec->dtsp_cleaning = 0;
3042	}
3043}
3044
3045/*
3046 * Called as part of a speculate() to get the speculative buffer associated
3047 * with a given speculation.  Returns NULL if the specified speculation is not
3048 * in an ACTIVE state.  If the speculation is in the ACTIVEONE state -- and
3049 * the active CPU is not the specified CPU -- the speculation will be
3050 * atomically transitioned into the ACTIVEMANY state.
3051 */
3052static dtrace_buffer_t *
3053dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
3054    dtrace_specid_t which)
3055{
3056	dtrace_speculation_t *spec;
3057	dtrace_speculation_state_t current, new;
3058	dtrace_buffer_t *buf;
3059
3060	if (which == 0)
3061		return (NULL);
3062
3063	if (which > state->dts_nspeculations) {
3064		cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3065		return (NULL);
3066	}
3067
3068	spec = &state->dts_speculations[which - 1];
3069	buf = &spec->dtsp_buffer[cpuid];
3070
3071	do {
3072		current = spec->dtsp_state;
3073
3074		switch (current) {
3075		case DTRACESPEC_INACTIVE:
3076		case DTRACESPEC_COMMITTINGMANY:
3077		case DTRACESPEC_DISCARDING:
3078			return (NULL);
3079
3080		case DTRACESPEC_COMMITTING:
3081			ASSERT(buf->dtb_offset == 0);
3082			return (NULL);
3083
3084		case DTRACESPEC_ACTIVEONE:
3085			/*
3086			 * This speculation is currently active on one CPU.
3087			 * Check the offset in the buffer; if it's non-zero,
3088			 * that CPU must be us (and we leave the state alone).
3089			 * If it's zero, assume that we're starting on a new
3090			 * CPU -- and change the state to indicate that the
3091			 * speculation is active on more than one CPU.
3092			 */
3093			if (buf->dtb_offset != 0)
3094				return (buf);
3095
3096			new = DTRACESPEC_ACTIVEMANY;
3097			break;
3098
3099		case DTRACESPEC_ACTIVEMANY:
3100			return (buf);
3101
3102		case DTRACESPEC_ACTIVE:
3103			new = DTRACESPEC_ACTIVEONE;
3104			break;
3105
3106		default:
3107			ASSERT(0);
3108		}
3109	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
3110	    current, new) != current);
3111
3112	ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
3113	return (buf);
3114}
3115
3116/*
3117 * Return a string.  In the event that the user lacks the privilege to access
3118 * arbitrary kernel memory, we copy the string out to scratch memory so that we
3119 * don't fail access checking.
3120 *
3121 * dtrace_dif_variable() uses this routine as a helper for various
3122 * builtin values such as 'execname' and 'probefunc.'
3123 */
3124uintptr_t
3125dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
3126    dtrace_mstate_t *mstate)
3127{
3128	uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3129	uintptr_t ret;
3130	size_t strsz;
3131
3132	/*
3133	 * The easy case: this probe is allowed to read all of memory, so
3134	 * we can just return this as a vanilla pointer.
3135	 */
3136	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
3137		return (addr);
3138
3139	/*
3140	 * This is the tougher case: we copy the string in question from
3141	 * kernel memory into scratch memory and return it that way: this
3142	 * ensures that we won't trip up when access checking tests the
3143	 * BYREF return value.
3144	 */
3145	strsz = dtrace_strlen((char *)addr, size) + 1;
3146
3147	if (mstate->dtms_scratch_ptr + strsz >
3148	    mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
3149		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3150		return (0);
3151	}
3152
3153	dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
3154	    strsz);
3155	ret = mstate->dtms_scratch_ptr;
3156	mstate->dtms_scratch_ptr += strsz;
3157	return (ret);
3158}
3159
3160/*
3161 * This function implements the DIF emulator's variable lookups.  The emulator
3162 * passes a reserved variable identifier and optional built-in array index.
3163 */
3164static uint64_t
3165dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
3166    uint64_t ndx)
3167{
3168	/*
3169	 * If we're accessing one of the uncached arguments, we'll turn this
3170	 * into a reference in the args array.
3171	 */
3172	if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
3173		ndx = v - DIF_VAR_ARG0;
3174		v = DIF_VAR_ARGS;
3175	}
3176
3177	switch (v) {
3178	case DIF_VAR_ARGS:
3179		if (!(mstate->dtms_access & DTRACE_ACCESS_ARGS)) {
3180			cpu_core[CPU->cpu_id].cpuc_dtrace_flags |=
3181			    CPU_DTRACE_KPRIV;
3182			return (0);
3183		}
3184
3185		ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
3186		if (ndx >= sizeof (mstate->dtms_arg) /
3187		    sizeof (mstate->dtms_arg[0])) {
3188			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3189			dtrace_provider_t *pv;
3190			uint64_t val;
3191
3192			pv = mstate->dtms_probe->dtpr_provider;
3193			if (pv->dtpv_pops.dtps_getargval != NULL)
3194				val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
3195				    mstate->dtms_probe->dtpr_id,
3196				    mstate->dtms_probe->dtpr_arg, ndx, aframes);
3197			else
3198				val = dtrace_getarg(ndx, aframes);
3199
3200			/*
3201			 * This is regrettably required to keep the compiler
3202			 * from tail-optimizing the call to dtrace_getarg().
3203			 * The condition always evaluates to true, but the
3204			 * compiler has no way of figuring that out a priori.
3205			 * (None of this would be necessary if the compiler
3206			 * could be relied upon to _always_ tail-optimize
3207			 * the call to dtrace_getarg() -- but it can't.)
3208			 */
3209			if (mstate->dtms_probe != NULL)
3210				return (val);
3211
3212			ASSERT(0);
3213		}
3214
3215		return (mstate->dtms_arg[ndx]);
3216
3217	case DIF_VAR_UREGS: {
3218		klwp_t *lwp;
3219
3220		if (!dtrace_priv_proc(state, mstate))
3221			return (0);
3222
3223		if ((lwp = curthread->t_lwp) == NULL) {
3224			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
3225			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = 0;
3226			return (0);
3227		}
3228
3229		return (dtrace_getreg(lwp->lwp_regs, ndx));
3230	}
3231
3232	case DIF_VAR_VMREGS: {
3233		uint64_t rval;
3234
3235		if (!dtrace_priv_kernel(state))
3236			return (0);
3237
3238		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3239
3240		rval = dtrace_getvmreg(ndx,
3241		    &cpu_core[CPU->cpu_id].cpuc_dtrace_flags);
3242
3243		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3244
3245		return (rval);
3246	}
3247
3248	case DIF_VAR_CURTHREAD:
3249		if (!dtrace_priv_proc(state, mstate))
3250			return (0);
3251		return ((uint64_t)(uintptr_t)curthread);
3252
3253	case DIF_VAR_TIMESTAMP:
3254		if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
3255			mstate->dtms_timestamp = dtrace_gethrtime();
3256			mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
3257		}
3258		return (mstate->dtms_timestamp);
3259
3260	case DIF_VAR_VTIMESTAMP:
3261		ASSERT(dtrace_vtime_references != 0);
3262		return (curthread->t_dtrace_vtime);
3263
3264	case DIF_VAR_WALLTIMESTAMP:
3265		if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
3266			mstate->dtms_walltimestamp = dtrace_gethrestime();
3267			mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
3268		}
3269		return (mstate->dtms_walltimestamp);
3270
3271	case DIF_VAR_IPL:
3272		if (!dtrace_priv_kernel(state))
3273			return (0);
3274		if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
3275			mstate->dtms_ipl = dtrace_getipl();
3276			mstate->dtms_present |= DTRACE_MSTATE_IPL;
3277		}
3278		return (mstate->dtms_ipl);
3279
3280	case DIF_VAR_EPID:
3281		ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
3282		return (mstate->dtms_epid);
3283
3284	case DIF_VAR_ID:
3285		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3286		return (mstate->dtms_probe->dtpr_id);
3287
3288	case DIF_VAR_STACKDEPTH:
3289		if (!dtrace_priv_kernel(state))
3290			return (0);
3291		if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
3292			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3293
3294			mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
3295			mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
3296		}
3297		return (mstate->dtms_stackdepth);
3298
3299	case DIF_VAR_USTACKDEPTH:
3300		if (!dtrace_priv_proc(state, mstate))
3301			return (0);
3302		if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
3303			/*
3304			 * See comment in DIF_VAR_PID.
3305			 */
3306			if (DTRACE_ANCHORED(mstate->dtms_probe) &&
3307			    CPU_ON_INTR(CPU)) {
3308				mstate->dtms_ustackdepth = 0;
3309			} else {
3310				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3311				mstate->dtms_ustackdepth =
3312				    dtrace_getustackdepth();
3313				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3314			}
3315			mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
3316		}
3317		return (mstate->dtms_ustackdepth);
3318
3319	case DIF_VAR_CALLER:
3320		if (!dtrace_priv_kernel(state))
3321			return (0);
3322		if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
3323			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3324
3325			if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
3326				/*
3327				 * If this is an unanchored probe, we are
3328				 * required to go through the slow path:
3329				 * dtrace_caller() only guarantees correct
3330				 * results for anchored probes.
3331				 */
3332				pc_t caller[2];
3333
3334				dtrace_getpcstack(caller, 2, aframes,
3335				    (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
3336				mstate->dtms_caller = caller[1];
3337			} else if ((mstate->dtms_caller =
3338			    dtrace_caller(aframes)) == -1) {
3339				/*
3340				 * We have failed to do this the quick way;
3341				 * we must resort to the slower approach of
3342				 * calling dtrace_getpcstack().
3343				 */
3344				pc_t caller;
3345
3346				dtrace_getpcstack(&caller, 1, aframes, NULL);
3347				mstate->dtms_caller = caller;
3348			}
3349
3350			mstate->dtms_present |= DTRACE_MSTATE_CALLER;
3351		}
3352		return (mstate->dtms_caller);
3353
3354	case DIF_VAR_UCALLER:
3355		if (!dtrace_priv_proc(state, mstate))
3356			return (0);
3357
3358		if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
3359			uint64_t ustack[3];
3360
3361			/*
3362			 * dtrace_getupcstack() fills in the first uint64_t
3363			 * with the current PID.  The second uint64_t will
3364			 * be the program counter at user-level.  The third
3365			 * uint64_t will contain the caller, which is what
3366			 * we're after.
3367			 */
3368			ustack[2] = 0;
3369			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3370			dtrace_getupcstack(ustack, 3);
3371			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3372			mstate->dtms_ucaller = ustack[2];
3373			mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
3374		}
3375
3376		return (mstate->dtms_ucaller);
3377
3378	case DIF_VAR_PROBEPROV:
3379		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3380		return (dtrace_dif_varstr(
3381		    (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
3382		    state, mstate));
3383
3384	case DIF_VAR_PROBEMOD:
3385		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3386		return (dtrace_dif_varstr(
3387		    (uintptr_t)mstate->dtms_probe->dtpr_mod,
3388		    state, mstate));
3389
3390	case DIF_VAR_PROBEFUNC:
3391		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3392		return (dtrace_dif_varstr(
3393		    (uintptr_t)mstate->dtms_probe->dtpr_func,
3394		    state, mstate));
3395
3396	case DIF_VAR_PROBENAME:
3397		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3398		return (dtrace_dif_varstr(
3399		    (uintptr_t)mstate->dtms_probe->dtpr_name,
3400		    state, mstate));
3401
3402	case DIF_VAR_PID:
3403		if (!dtrace_priv_proc(state, mstate))
3404			return (0);
3405
3406		/*
3407		 * Note that we are assuming that an unanchored probe is
3408		 * always due to a high-level interrupt.  (And we're assuming
3409		 * that there is only a single high level interrupt.)
3410		 */
3411		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3412			return (pid0.pid_id);
3413
3414		/*
3415		 * It is always safe to dereference one's own t_procp pointer:
3416		 * it always points to a valid, allocated proc structure.
3417		 * Further, it is always safe to dereference the p_pidp member
3418		 * of one's own proc structure.  (These are truisms becuase
3419		 * threads and processes don't clean up their own state --
3420		 * they leave that task to whomever reaps them.)
3421		 */
3422		return ((uint64_t)curthread->t_procp->p_pidp->pid_id);
3423
3424	case DIF_VAR_PPID:
3425		if (!dtrace_priv_proc(state, mstate))
3426			return (0);
3427
3428		/*
3429		 * See comment in DIF_VAR_PID.
3430		 */
3431		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3432			return (pid0.pid_id);
3433
3434		/*
3435		 * It is always safe to dereference one's own t_procp pointer:
3436		 * it always points to a valid, allocated proc structure.
3437		 * (This is true because threads don't clean up their own
3438		 * state -- they leave that task to whomever reaps them.)
3439		 */
3440		return ((uint64_t)curthread->t_procp->p_ppid);
3441
3442	case DIF_VAR_TID:
3443		/*
3444		 * See comment in DIF_VAR_PID.
3445		 */
3446		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3447			return (0);
3448
3449		return ((uint64_t)curthread->t_tid);
3450
3451	case DIF_VAR_EXECNAME:
3452		if (!dtrace_priv_proc(state, mstate))
3453			return (0);
3454
3455		/*
3456		 * See comment in DIF_VAR_PID.
3457		 */
3458		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3459			return ((uint64_t)(uintptr_t)p0.p_user.u_comm);
3460
3461		/*
3462		 * It is always safe to dereference one's own t_procp pointer:
3463		 * it always points to a valid, allocated proc structure.
3464		 * (This is true because threads don't clean up their own
3465		 * state -- they leave that task to whomever reaps them.)
3466		 */
3467		return (dtrace_dif_varstr(
3468		    (uintptr_t)curthread->t_procp->p_user.u_comm,
3469		    state, mstate));
3470
3471	case DIF_VAR_ZONENAME:
3472		if (!dtrace_priv_proc(state, mstate))
3473			return (0);
3474
3475		/*
3476		 * See comment in DIF_VAR_PID.
3477		 */
3478		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3479			return ((uint64_t)(uintptr_t)p0.p_zone->zone_name);
3480
3481		/*
3482		 * It is always safe to dereference one's own t_procp pointer:
3483		 * it always points to a valid, allocated proc structure.
3484		 * (This is true because threads don't clean up their own
3485		 * state -- they leave that task to whomever reaps them.)
3486		 */
3487		return (dtrace_dif_varstr(
3488		    (uintptr_t)curthread->t_procp->p_zone->zone_name,
3489		    state, mstate));
3490
3491	case DIF_VAR_UID:
3492		if (!dtrace_priv_proc(state, mstate))
3493			return (0);
3494
3495		/*
3496		 * See comment in DIF_VAR_PID.
3497		 */
3498		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3499			return ((uint64_t)p0.p_cred->cr_uid);
3500
3501		/*
3502		 * It is always safe to dereference one's own t_procp pointer:
3503		 * it always points to a valid, allocated proc structure.
3504		 * (This is true because threads don't clean up their own
3505		 * state -- they leave that task to whomever reaps them.)
3506		 *
3507		 * Additionally, it is safe to dereference one's own process
3508		 * credential, since this is never NULL after process birth.
3509		 */
3510		return ((uint64_t)curthread->t_procp->p_cred->cr_uid);
3511
3512	case DIF_VAR_GID:
3513		if (!dtrace_priv_proc(state, mstate))
3514			return (0);
3515
3516		/*
3517		 * See comment in DIF_VAR_PID.
3518		 */
3519		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3520			return ((uint64_t)p0.p_cred->cr_gid);
3521
3522		/*
3523		 * It is always safe to dereference one's own t_procp pointer:
3524		 * it always points to a valid, allocated proc structure.
3525		 * (This is true because threads don't clean up their own
3526		 * state -- they leave that task to whomever reaps them.)
3527		 *
3528		 * Additionally, it is safe to dereference one's own process
3529		 * credential, since this is never NULL after process birth.
3530		 */
3531		return ((uint64_t)curthread->t_procp->p_cred->cr_gid);
3532
3533	case DIF_VAR_ERRNO: {
3534		klwp_t *lwp;
3535		if (!dtrace_priv_proc(state, mstate))
3536			return (0);
3537
3538		/*
3539		 * See comment in DIF_VAR_PID.
3540		 */
3541		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3542			return (0);
3543
3544		/*
3545		 * It is always safe to dereference one's own t_lwp pointer in
3546		 * the event that this pointer is non-NULL.  (This is true
3547		 * because threads and lwps don't clean up their own state --
3548		 * they leave that task to whomever reaps them.)
3549		 */
3550		if ((lwp = curthread->t_lwp) == NULL)
3551			return (0);
3552
3553		return ((uint64_t)lwp->lwp_errno);
3554	}
3555
3556	case DIF_VAR_THREADNAME:
3557		/*
3558		 * See comment in DIF_VAR_PID.
3559		 */
3560		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3561			return (0);
3562
3563		if (curthread->t_name == NULL)
3564			return (0);
3565
3566		/*
3567		 * Once set, ->t_name itself is never changed: any updates are
3568		 * made to the same buffer that we are pointing out.  So we are
3569		 * safe to dereference it here.
3570		 */
3571		return (dtrace_dif_varstr((uintptr_t)curthread->t_name,
3572		    state, mstate));
3573
3574	default:
3575		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3576		return (0);
3577	}
3578}
3579
3580static void
3581dtrace_dif_variable_write(dtrace_mstate_t *mstate, dtrace_state_t *state,
3582    uint64_t v, uint64_t ndx, uint64_t data)
3583{
3584	switch (v) {
3585	case DIF_VAR_UREGS: {
3586		klwp_t *lwp;
3587
3588		if (dtrace_destructive_disallow ||
3589		    !dtrace_priv_proc_control(state, mstate)) {
3590			return;
3591		}
3592
3593		if ((lwp = curthread->t_lwp) == NULL) {
3594			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
3595			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = 0;
3596			return;
3597		}
3598
3599		dtrace_setreg(lwp->lwp_regs, ndx, data);
3600		return;
3601	}
3602
3603	default:
3604		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3605		return;
3606	}
3607}
3608
3609typedef enum dtrace_json_state {
3610	DTRACE_JSON_REST = 1,
3611	DTRACE_JSON_OBJECT,
3612	DTRACE_JSON_STRING,
3613	DTRACE_JSON_STRING_ESCAPE,
3614	DTRACE_JSON_STRING_ESCAPE_UNICODE,
3615	DTRACE_JSON_COLON,
3616	DTRACE_JSON_COMMA,
3617	DTRACE_JSON_VALUE,
3618	DTRACE_JSON_IDENTIFIER,
3619	DTRACE_JSON_NUMBER,
3620	DTRACE_JSON_NUMBER_FRAC,
3621	DTRACE_JSON_NUMBER_EXP,
3622	DTRACE_JSON_COLLECT_OBJECT
3623} dtrace_json_state_t;
3624
3625/*
3626 * This function possesses just enough knowledge about JSON to extract a single
3627 * value from a JSON string and store it in the scratch buffer.  It is able
3628 * to extract nested object values, and members of arrays by index.
3629 *
3630 * elemlist is a list of JSON keys, stored as packed NUL-terminated strings, to
3631 * be looked up as we descend into the object tree.  e.g.
3632 *
3633 *    foo[0].bar.baz[32] --> "foo" NUL "0" NUL "bar" NUL "baz" NUL "32" NUL
3634 *       with nelems = 5.
3635 *
3636 * The run time of this function must be bounded above by strsize to limit the
3637 * amount of work done in probe context.  As such, it is implemented as a
3638 * simple state machine, reading one character at a time using safe loads
3639 * until we find the requested element, hit a parsing error or run off the
3640 * end of the object or string.
3641 *
3642 * As there is no way for a subroutine to return an error without interrupting
3643 * clause execution, we simply return NULL in the event of a missing key or any
3644 * other error condition.  Each NULL return in this function is commented with
3645 * the error condition it represents -- parsing or otherwise.
3646 *
3647 * The set of states for the state machine closely matches the JSON
3648 * specification (http://json.org/).  Briefly:
3649 *
3650 *   DTRACE_JSON_REST:
3651 *     Skip whitespace until we find either a top-level Object, moving
3652 *     to DTRACE_JSON_OBJECT; or an Array, moving to DTRACE_JSON_VALUE.
3653 *
3654 *   DTRACE_JSON_OBJECT:
3655 *     Locate the next key String in an Object.  Sets a flag to denote
3656 *     the next String as a key string and moves to DTRACE_JSON_STRING.
3657 *
3658 *   DTRACE_JSON_COLON:
3659 *     Skip whitespace until we find the colon that separates key Strings
3660 *     from their values.  Once found, move to DTRACE_JSON_VALUE.
3661 *
3662 *   DTRACE_JSON_VALUE:
3663 *     Detects the type of the next value (String, Number, Identifier, Object
3664 *     or Array) and routes to the states that process that type.  Here we also
3665 *     deal with the element selector list if we are requested to traverse down
3666 *     into the object tree.
3667 *
3668 *   DTRACE_JSON_COMMA:
3669 *     Skip whitespace until we find the comma that separates key-value pairs
3670 *     in Objects (returning to DTRACE_JSON_OBJECT) or values in Arrays
3671 *     (similarly DTRACE_JSON_VALUE).  All following literal value processing
3672 *     states return to this state at the end of their value, unless otherwise
3673 *     noted.
3674 *
3675 *   DTRACE_JSON_NUMBER, DTRACE_JSON_NUMBER_FRAC, DTRACE_JSON_NUMBER_EXP:
3676 *     Processes a Number literal from the JSON, including any exponent
3677 *     component that may be present.  Numbers are returned as strings, which
3678 *     may be passed to strtoll() if an integer is required.
3679 *
3680 *   DTRACE_JSON_IDENTIFIER:
3681 *     Processes a "true", "false" or "null" literal in the JSON.
3682 *
3683 *   DTRACE_JSON_STRING, DTRACE_JSON_STRING_ESCAPE,
3684 *   DTRACE_JSON_STRING_ESCAPE_UNICODE:
3685 *     Processes a String literal from the JSON, whether the String denotes
3686 *     a key, a value or part of a larger Object.  Handles all escape sequences
3687 *     present in the specification, including four-digit unicode characters,
3688 *     but merely includes the escape sequence without converting it to the
3689 *     actual escaped character.  If the String is flagged as a key, we
3690 *     move to DTRACE_JSON_COLON rather than DTRACE_JSON_COMMA.
3691 *
3692 *   DTRACE_JSON_COLLECT_OBJECT:
3693 *     This state collects an entire Object (or Array), correctly handling
3694 *     embedded strings.  If the full element selector list matches this nested
3695 *     object, we return the Object in full as a string.  If not, we use this
3696 *     state to skip to the next value at this level and continue processing.
3697 *
3698 * NOTE: This function uses various macros from strtolctype.h to manipulate
3699 * digit values, etc -- these have all been checked to ensure they make
3700 * no additional function calls.
3701 */
3702static char *
3703dtrace_json(uint64_t size, uintptr_t json, char *elemlist, int nelems,
3704    char *dest)
3705{
3706	dtrace_json_state_t state = DTRACE_JSON_REST;
3707	int64_t array_elem = INT64_MIN;
3708	int64_t array_pos = 0;
3709	uint8_t escape_unicount = 0;
3710	boolean_t string_is_key = B_FALSE;
3711	boolean_t collect_object = B_FALSE;
3712	boolean_t found_key = B_FALSE;
3713	boolean_t in_array = B_FALSE;
3714	uint32_t braces = 0, brackets = 0;
3715	char *elem = elemlist;
3716	char *dd = dest;
3717	uintptr_t cur;
3718
3719	for (cur = json; cur < json + size; cur++) {
3720		char cc = dtrace_load8(cur);
3721		if (cc == '\0')
3722			return (NULL);
3723
3724		switch (state) {
3725		case DTRACE_JSON_REST:
3726			if (isspace(cc))
3727				break;
3728
3729			if (cc == '{') {
3730				state = DTRACE_JSON_OBJECT;
3731				break;
3732			}
3733
3734			if (cc == '[') {
3735				in_array = B_TRUE;
3736				array_pos = 0;
3737				array_elem = dtrace_strtoll(elem, 10, size);
3738				found_key = array_elem == 0 ? B_TRUE : B_FALSE;
3739				state = DTRACE_JSON_VALUE;
3740				break;
3741			}
3742
3743			/*
3744			 * ERROR: expected to find a top-level object or array.
3745			 */
3746			return (NULL);
3747		case DTRACE_JSON_OBJECT:
3748			if (isspace(cc))
3749				break;
3750
3751			if (cc == '"') {
3752				state = DTRACE_JSON_STRING;
3753				string_is_key = B_TRUE;
3754				break;
3755			}
3756
3757			/*
3758			 * ERROR: either the object did not start with a key
3759			 * string, or we've run off the end of the object
3760			 * without finding the requested key.
3761			 */
3762			return (NULL);
3763		case DTRACE_JSON_STRING:
3764			if (cc == '\\') {
3765				*dd++ = '\\';
3766				state = DTRACE_JSON_STRING_ESCAPE;
3767				break;
3768			}
3769
3770			if (cc == '"') {
3771				if (collect_object) {
3772					/*
3773					 * We don't reset the dest here, as
3774					 * the string is part of a larger
3775					 * object being collected.
3776					 */
3777					*dd++ = cc;
3778					collect_object = B_FALSE;
3779					state = DTRACE_JSON_COLLECT_OBJECT;
3780					break;
3781				}
3782				*dd = '\0';
3783				dd = dest; /* reset string buffer */
3784				if (string_is_key) {
3785					if (dtrace_strncmp(dest, elem,
3786					    size) == 0)
3787						found_key = B_TRUE;
3788				} else if (found_key) {
3789					if (nelems > 1) {
3790						/*
3791						 * We expected an object, not
3792						 * this string.
3793						 */
3794						return (NULL);
3795					}
3796					return (dest);
3797				}
3798				state = string_is_key ? DTRACE_JSON_COLON :
3799				    DTRACE_JSON_COMMA;
3800				string_is_key = B_FALSE;
3801				break;
3802			}
3803
3804			*dd++ = cc;
3805			break;
3806		case DTRACE_JSON_STRING_ESCAPE:
3807			*dd++ = cc;
3808			if (cc == 'u') {
3809				escape_unicount = 0;
3810				state = DTRACE_JSON_STRING_ESCAPE_UNICODE;
3811			} else {
3812				state = DTRACE_JSON_STRING;
3813			}
3814			break;
3815		case DTRACE_JSON_STRING_ESCAPE_UNICODE:
3816			if (!isxdigit(cc)) {
3817				/*
3818				 * ERROR: invalid unicode escape, expected
3819				 * four valid hexidecimal digits.
3820				 */
3821				return (NULL);
3822			}
3823
3824			*dd++ = cc;
3825			if (++escape_unicount == 4)
3826				state = DTRACE_JSON_STRING;
3827			break;
3828		case DTRACE_JSON_COLON:
3829			if (isspace(cc))
3830				break;
3831
3832			if (cc == ':') {
3833				state = DTRACE_JSON_VALUE;
3834				break;
3835			}
3836
3837			/*
3838			 * ERROR: expected a colon.
3839			 */
3840			return (NULL);
3841		case DTRACE_JSON_COMMA:
3842			if (isspace(cc))
3843				break;
3844
3845			if (cc == ',') {
3846				if (in_array) {
3847					state = DTRACE_JSON_VALUE;
3848					if (++array_pos == array_elem)
3849						found_key = B_TRUE;
3850				} else {
3851					state = DTRACE_JSON_OBJECT;
3852				}
3853				break;
3854			}
3855
3856			/*
3857			 * ERROR: either we hit an unexpected character, or
3858			 * we reached the end of the object or array without
3859			 * finding the requested key.
3860			 */
3861			return (NULL);
3862		case DTRACE_JSON_IDENTIFIER:
3863			if (islower(cc)) {
3864				*dd++ = cc;
3865				break;
3866			}
3867
3868			*dd = '\0';
3869			dd = dest; /* reset string buffer */
3870
3871			if (dtrace_strncmp(dest, "true", 5) == 0 ||
3872			    dtrace_strncmp(dest, "false", 6) == 0 ||
3873			    dtrace_strncmp(dest, "null", 5) == 0) {
3874				if (found_key) {
3875					if (nelems > 1) {
3876						/*
3877						 * ERROR: We expected an object,
3878						 * not this identifier.
3879						 */
3880						return (NULL);
3881					}
3882					return (dest);
3883				} else {
3884					cur--;
3885					state = DTRACE_JSON_COMMA;
3886					break;
3887				}
3888			}
3889
3890			/*
3891			 * ERROR: we did not recognise the identifier as one
3892			 * of those in the JSON specification.
3893			 */
3894			return (NULL);
3895		case DTRACE_JSON_NUMBER:
3896			if (cc == '.') {
3897				*dd++ = cc;
3898				state = DTRACE_JSON_NUMBER_FRAC;
3899				break;
3900			}
3901
3902			if (cc == 'x' || cc == 'X') {
3903				/*
3904				 * ERROR: specification explicitly excludes
3905				 * hexidecimal or octal numbers.
3906				 */
3907				return (NULL);
3908			}
3909
3910			/* FALLTHRU */
3911		case DTRACE_JSON_NUMBER_FRAC:
3912			if (cc == 'e' || cc == 'E') {
3913				*dd++ = cc;
3914				state = DTRACE_JSON_NUMBER_EXP;
3915				break;
3916			}
3917
3918			if (cc == '+' || cc == '-') {
3919				/*
3920				 * ERROR: expect sign as part of exponent only.
3921				 */
3922				return (NULL);
3923			}
3924			/* FALLTHRU */
3925		case DTRACE_JSON_NUMBER_EXP:
3926			if (isdigit(cc) || cc == '+' || cc == '-') {
3927				*dd++ = cc;
3928				break;
3929			}
3930
3931			*dd = '\0';
3932			dd = dest; /* reset string buffer */
3933			if (found_key) {
3934				if (nelems > 1) {
3935					/*
3936					 * ERROR: We expected an object, not
3937					 * this number.
3938					 */
3939					return (NULL);
3940				}
3941				return (dest);
3942			}
3943
3944			cur--;
3945			state = DTRACE_JSON_COMMA;
3946			break;
3947		case DTRACE_JSON_VALUE:
3948			if (isspace(cc))
3949				break;
3950
3951			if (cc == '{' || cc == '[') {
3952				if (nelems > 1 && found_key) {
3953					in_array = cc == '[' ? B_TRUE : B_FALSE;
3954					/*
3955					 * If our element selector directs us
3956					 * to descend into this nested object,
3957					 * then move to the next selector
3958					 * element in the list and restart the
3959					 * state machine.
3960					 */
3961					while (*elem != '\0')
3962						elem++;
3963					elem++; /* skip the inter-element NUL */
3964					nelems--;
3965					dd = dest;
3966					if (in_array) {
3967						state = DTRACE_JSON_VALUE;
3968						array_pos = 0;
3969						array_elem = dtrace_strtoll(
3970						    elem, 10, size);
3971						found_key = array_elem == 0 ?
3972						    B_TRUE : B_FALSE;
3973					} else {
3974						found_key = B_FALSE;
3975						state = DTRACE_JSON_OBJECT;
3976					}
3977					break;
3978				}
3979
3980				/*
3981				 * Otherwise, we wish to either skip this
3982				 * nested object or return it in full.
3983				 */
3984				if (cc == '[')
3985					brackets = 1;
3986				else
3987					braces = 1;
3988				*dd++ = cc;
3989				state = DTRACE_JSON_COLLECT_OBJECT;
3990				break;
3991			}
3992
3993			if (cc == '"') {
3994				state = DTRACE_JSON_STRING;
3995				break;
3996			}
3997
3998			if (islower(cc)) {
3999				/*
4000				 * Here we deal with true, false and null.
4001				 */
4002				*dd++ = cc;
4003				state = DTRACE_JSON_IDENTIFIER;
4004				break;
4005			}
4006
4007			if (cc == '-' || isdigit(cc)) {
4008				*dd++ = cc;
4009				state = DTRACE_JSON_NUMBER;
4010				break;
4011			}
4012
4013			/*
4014			 * ERROR: unexpected character at start of value.
4015			 */
4016			return (NULL);
4017		case DTRACE_JSON_COLLECT_OBJECT:
4018			if (cc == '\0')
4019				/*
4020				 * ERROR: unexpected end of input.
4021				 */
4022				return (NULL);
4023
4024			*dd++ = cc;
4025			if (cc == '"') {
4026				collect_object = B_TRUE;
4027				state = DTRACE_JSON_STRING;
4028				break;
4029			}
4030
4031			if (cc == ']') {
4032				if (brackets-- == 0) {
4033					/*
4034					 * ERROR: unbalanced brackets.
4035					 */
4036					return (NULL);
4037				}
4038			} else if (cc == '}') {
4039				if (braces-- == 0) {
4040					/*
4041					 * ERROR: unbalanced braces.
4042					 */
4043					return (NULL);
4044				}
4045			} else if (cc == '{') {
4046				braces++;
4047			} else if (cc == '[') {
4048				brackets++;
4049			}
4050
4051			if (brackets == 0 && braces == 0) {
4052				if (found_key) {
4053					*dd = '\0';
4054					return (dest);
4055				}
4056				dd = dest; /* reset string buffer */
4057				state = DTRACE_JSON_COMMA;
4058			}
4059			break;
4060		}
4061	}
4062	return (NULL);
4063}
4064
4065/*
4066 * Emulate the execution of DTrace ID subroutines invoked by the call opcode.
4067 * Notice that we don't bother validating the proper number of arguments or
4068 * their types in the tuple stack.  This isn't needed because all argument
4069 * interpretation is safe because of our load safety -- the worst that can
4070 * happen is that a bogus program can obtain bogus results.
4071 */
4072static void
4073dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
4074    dtrace_key_t *tupregs, int nargs,
4075    dtrace_mstate_t *mstate, dtrace_state_t *state)
4076{
4077	volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
4078	volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
4079	dtrace_vstate_t *vstate = &state->dts_vstate;
4080
4081	union {
4082		mutex_impl_t mi;
4083		uint64_t mx;
4084	} m;
4085
4086	union {
4087		krwlock_t ri;
4088		uintptr_t rw;
4089	} r;
4090
4091	switch (subr) {
4092	case DIF_SUBR_RAND:
4093		regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875;
4094		break;
4095
4096	case DIF_SUBR_MUTEX_OWNED:
4097		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4098		    mstate, vstate)) {
4099			regs[rd] = 0;
4100			break;
4101		}
4102
4103		m.mx = dtrace_load64(tupregs[0].dttk_value);
4104		if (MUTEX_TYPE_ADAPTIVE(&m.mi))
4105			regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
4106		else
4107			regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
4108		break;
4109
4110	case DIF_SUBR_MUTEX_OWNER:
4111		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4112		    mstate, vstate)) {
4113			regs[rd] = 0;
4114			break;
4115		}
4116
4117		m.mx = dtrace_load64(tupregs[0].dttk_value);
4118		if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
4119		    MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
4120			regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
4121		else
4122			regs[rd] = 0;
4123		break;
4124
4125	case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
4126		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4127		    mstate, vstate)) {
4128			regs[rd] = 0;
4129			break;
4130		}
4131
4132		m.mx = dtrace_load64(tupregs[0].dttk_value);
4133		regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
4134		break;
4135
4136	case DIF_SUBR_MUTEX_TYPE_SPIN:
4137		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4138		    mstate, vstate)) {
4139			regs[rd] = 0;
4140			break;
4141		}
4142
4143		m.mx = dtrace_load64(tupregs[0].dttk_value);
4144		regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
4145		break;
4146
4147	case DIF_SUBR_RW_READ_HELD: {
4148		uintptr_t tmp;
4149
4150		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4151		    mstate, vstate)) {
4152			regs[rd] = 0;
4153			break;
4154		}
4155
4156		r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4157		regs[rd] = _RW_READ_HELD(&r.ri, tmp);
4158		break;
4159	}
4160
4161	case DIF_SUBR_RW_WRITE_HELD:
4162		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
4163		    mstate, vstate)) {
4164			regs[rd] = 0;
4165			break;
4166		}
4167
4168		r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4169		regs[rd] = _RW_WRITE_HELD(&r.ri);
4170		break;
4171
4172	case DIF_SUBR_RW_ISWRITER:
4173		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
4174		    mstate, vstate)) {
4175			regs[rd] = 0;
4176			break;
4177		}
4178
4179		r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4180		regs[rd] = _RW_ISWRITER(&r.ri);
4181		break;
4182
4183	case DIF_SUBR_BCOPY: {
4184		/*
4185		 * We need to be sure that the destination is in the scratch
4186		 * region -- no other region is allowed.
4187		 */
4188		uintptr_t src = tupregs[0].dttk_value;
4189		uintptr_t dest = tupregs[1].dttk_value;
4190		size_t size = tupregs[2].dttk_value;
4191
4192		if (!dtrace_inscratch(dest, size, mstate)) {
4193			*flags |= CPU_DTRACE_BADADDR;
4194			*illval = regs[rd];
4195			break;
4196		}
4197
4198		if (!dtrace_canload(src, size, mstate, vstate)) {
4199			regs[rd] = 0;
4200			break;
4201		}
4202
4203		dtrace_bcopy((void *)src, (void *)dest, size);
4204		break;
4205	}
4206
4207	case DIF_SUBR_ALLOCA:
4208	case DIF_SUBR_COPYIN: {
4209		uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
4210		uint64_t size =
4211		    tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
4212		size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
4213
4214		/*
4215		 * This action doesn't require any credential checks since
4216		 * probes will not activate in user contexts to which the
4217		 * enabling user does not have permissions.
4218		 */
4219
4220		/*
4221		 * Rounding up the user allocation size could have overflowed
4222		 * a large, bogus allocation (like -1ULL) to 0.
4223		 */
4224		if (scratch_size < size ||
4225		    !DTRACE_INSCRATCH(mstate, scratch_size)) {
4226			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4227			regs[rd] = 0;
4228			break;
4229		}
4230
4231		if (subr == DIF_SUBR_COPYIN) {
4232			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4233			dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
4234			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4235		}
4236
4237		mstate->dtms_scratch_ptr += scratch_size;
4238		regs[rd] = dest;
4239		break;
4240	}
4241
4242	case DIF_SUBR_COPYINTO: {
4243		uint64_t size = tupregs[1].dttk_value;
4244		uintptr_t dest = tupregs[2].dttk_value;
4245
4246		/*
4247		 * This action doesn't require any credential checks since
4248		 * probes will not activate in user contexts to which the
4249		 * enabling user does not have permissions.
4250		 */
4251		if (!dtrace_inscratch(dest, size, mstate)) {
4252			*flags |= CPU_DTRACE_BADADDR;
4253			*illval = regs[rd];
4254			break;
4255		}
4256
4257		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4258		dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
4259		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4260		break;
4261	}
4262
4263	case DIF_SUBR_COPYINSTR: {
4264		uintptr_t dest = mstate->dtms_scratch_ptr;
4265		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4266
4267		if (nargs > 1 && tupregs[1].dttk_value < size)
4268			size = tupregs[1].dttk_value + 1;
4269
4270		/*
4271		 * This action doesn't require any credential checks since
4272		 * probes will not activate in user contexts to which the
4273		 * enabling user does not have permissions.
4274		 */
4275		if (!DTRACE_INSCRATCH(mstate, size)) {
4276			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4277			regs[rd] = 0;
4278			break;
4279		}
4280
4281		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4282		dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
4283		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4284
4285		((char *)dest)[size - 1] = '\0';
4286		mstate->dtms_scratch_ptr += size;
4287		regs[rd] = dest;
4288		break;
4289	}
4290
4291	case DIF_SUBR_MSGSIZE:
4292	case DIF_SUBR_MSGDSIZE: {
4293		uintptr_t baddr = tupregs[0].dttk_value, daddr;
4294		uintptr_t wptr, rptr;
4295		size_t count = 0;
4296		int cont = 0;
4297
4298		while (baddr != 0 && !(*flags & CPU_DTRACE_FAULT)) {
4299
4300			if (!dtrace_canload(baddr, sizeof (mblk_t), mstate,
4301			    vstate)) {
4302				regs[rd] = 0;
4303				break;
4304			}
4305
4306			wptr = dtrace_loadptr(baddr +
4307			    offsetof(mblk_t, b_wptr));
4308
4309			rptr = dtrace_loadptr(baddr +
4310			    offsetof(mblk_t, b_rptr));
4311
4312			if (wptr < rptr) {
4313				*flags |= CPU_DTRACE_BADADDR;
4314				*illval = tupregs[0].dttk_value;
4315				break;
4316			}
4317
4318			daddr = dtrace_loadptr(baddr +
4319			    offsetof(mblk_t, b_datap));
4320
4321			baddr = dtrace_loadptr(baddr +
4322			    offsetof(mblk_t, b_cont));
4323
4324			/*
4325			 * We want to prevent against denial-of-service here,
4326			 * so we're only going to search the list for
4327			 * dtrace_msgdsize_max mblks.
4328			 */
4329			if (cont++ > dtrace_msgdsize_max) {
4330				*flags |= CPU_DTRACE_ILLOP;
4331				break;
4332			}
4333
4334			if (subr == DIF_SUBR_MSGDSIZE) {
4335				if (dtrace_load8(daddr +
4336				    offsetof(dblk_t, db_type)) != M_DATA)
4337					continue;
4338			}
4339
4340			count += wptr - rptr;
4341		}
4342
4343		if (!(*flags & CPU_DTRACE_FAULT))
4344			regs[rd] = count;
4345
4346		break;
4347	}
4348
4349	case DIF_SUBR_PROGENYOF: {
4350		pid_t pid = tupregs[0].dttk_value;
4351		proc_t *p;
4352		int rval = 0;
4353
4354		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4355
4356		for (p = curthread->t_procp; p != NULL; p = p->p_parent) {
4357			if (p->p_pidp->pid_id == pid) {
4358				rval = 1;
4359				break;
4360			}
4361		}
4362
4363		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4364
4365		regs[rd] = rval;
4366		break;
4367	}
4368
4369	case DIF_SUBR_SPECULATION:
4370		regs[rd] = dtrace_speculation(state);
4371		break;
4372
4373	case DIF_SUBR_COPYOUT: {
4374		uintptr_t kaddr = tupregs[0].dttk_value;
4375		uintptr_t uaddr = tupregs[1].dttk_value;
4376		uint64_t size = tupregs[2].dttk_value;
4377
4378		if (!dtrace_destructive_disallow &&
4379		    dtrace_priv_proc_control(state, mstate) &&
4380		    !dtrace_istoxic(kaddr, size) &&
4381		    dtrace_canload(kaddr, size, mstate, vstate)) {
4382			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4383			dtrace_copyout(kaddr, uaddr, size, flags);
4384			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4385		}
4386		break;
4387	}
4388
4389	case DIF_SUBR_COPYOUTSTR: {
4390		uintptr_t kaddr = tupregs[0].dttk_value;
4391		uintptr_t uaddr = tupregs[1].dttk_value;
4392		uint64_t size = tupregs[2].dttk_value;
4393		size_t lim;
4394
4395		if (!dtrace_destructive_disallow &&
4396		    dtrace_priv_proc_control(state, mstate) &&
4397		    !dtrace_istoxic(kaddr, size) &&
4398		    dtrace_strcanload(kaddr, size, &lim, mstate, vstate)) {
4399			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4400			dtrace_copyoutstr(kaddr, uaddr, lim, flags);
4401			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4402		}
4403		break;
4404	}
4405
4406	case DIF_SUBR_STRLEN: {
4407		size_t size = state->dts_options[DTRACEOPT_STRSIZE];
4408		uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
4409		size_t lim;
4410
4411		if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) {
4412			regs[rd] = 0;
4413			break;
4414		}
4415		regs[rd] = dtrace_strlen((char *)addr, lim);
4416
4417		break;
4418	}
4419
4420	case DIF_SUBR_STRCHR:
4421	case DIF_SUBR_STRRCHR: {
4422		/*
4423		 * We're going to iterate over the string looking for the
4424		 * specified character.  We will iterate until we have reached
4425		 * the string length or we have found the character.  If this
4426		 * is DIF_SUBR_STRRCHR, we will look for the last occurrence
4427		 * of the specified character instead of the first.
4428		 */
4429		uintptr_t addr = tupregs[0].dttk_value;
4430		uintptr_t addr_limit;
4431		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4432		size_t lim;
4433		char c, target = (char)tupregs[1].dttk_value;
4434
4435		if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) {
4436			regs[rd] = 0;
4437			break;
4438		}
4439		addr_limit = addr + lim;
4440
4441		for (regs[rd] = 0; addr < addr_limit; addr++) {
4442			if ((c = dtrace_load8(addr)) == target) {
4443				regs[rd] = addr;
4444
4445				if (subr == DIF_SUBR_STRCHR)
4446					break;
4447			}
4448			if (c == '\0')
4449				break;
4450		}
4451
4452		break;
4453	}
4454
4455	case DIF_SUBR_STRSTR:
4456	case DIF_SUBR_INDEX:
4457	case DIF_SUBR_RINDEX: {
4458		/*
4459		 * We're going to iterate over the string looking for the
4460		 * specified string.  We will iterate until we have reached
4461		 * the string length or we have found the string.  (Yes, this
4462		 * is done in the most naive way possible -- but considering
4463		 * that the string we're searching for is likely to be
4464		 * relatively short, the complexity of Rabin-Karp or similar
4465		 * hardly seems merited.)
4466		 */
4467		char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
4468		char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
4469		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4470		size_t len = dtrace_strlen(addr, size);
4471		size_t sublen = dtrace_strlen(substr, size);
4472		char *limit = addr + len, *orig = addr;
4473		int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
4474		int inc = 1;
4475
4476		regs[rd] = notfound;
4477
4478		if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
4479			regs[rd] = 0;
4480			break;
4481		}
4482
4483		if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
4484		    vstate)) {
4485			regs[rd] = 0;
4486			break;
4487		}
4488
4489		/*
4490		 * strstr() and index()/rindex() have similar semantics if
4491		 * both strings are the empty string: strstr() returns a
4492		 * pointer to the (empty) string, and index() and rindex()
4493		 * both return index 0 (regardless of any position argument).
4494		 */
4495		if (sublen == 0 && len == 0) {
4496			if (subr == DIF_SUBR_STRSTR)
4497				regs[rd] = (uintptr_t)addr;
4498			else
4499				regs[rd] = 0;
4500			break;
4501		}
4502
4503		if (subr != DIF_SUBR_STRSTR) {
4504			if (subr == DIF_SUBR_RINDEX) {
4505				limit = orig - 1;
4506				addr += len;
4507				inc = -1;
4508			}
4509
4510			/*
4511			 * Both index() and rindex() take an optional position
4512			 * argument that denotes the starting position.
4513			 */
4514			if (nargs == 3) {
4515				int64_t pos = (int64_t)tupregs[2].dttk_value;
4516
4517				/*
4518				 * If the position argument to index() is
4519				 * negative, Perl implicitly clamps it at
4520				 * zero.  This semantic is a little surprising
4521				 * given the special meaning of negative
4522				 * positions to similar Perl functions like
4523				 * substr(), but it appears to reflect a
4524				 * notion that index() can start from a
4525				 * negative index and increment its way up to
4526				 * the string.  Given this notion, Perl's
4527				 * rindex() is at least self-consistent in
4528				 * that it implicitly clamps positions greater
4529				 * than the string length to be the string
4530				 * length.  Where Perl completely loses
4531				 * coherence, however, is when the specified
4532				 * substring is the empty string ("").  In
4533				 * this case, even if the position is
4534				 * negative, rindex() returns 0 -- and even if
4535				 * the position is greater than the length,
4536				 * index() returns the string length.  These
4537				 * semantics violate the notion that index()
4538				 * should never return a value less than the
4539				 * specified position and that rindex() should
4540				 * never return a value greater than the
4541				 * specified position.  (One assumes that
4542				 * these semantics are artifacts of Perl's
4543				 * implementation and not the results of
4544				 * deliberate design -- it beggars belief that
4545				 * even Larry Wall could desire such oddness.)
4546				 * While in the abstract one would wish for
4547				 * consistent position semantics across
4548				 * substr(), index() and rindex() -- or at the
4549				 * very least self-consistent position
4550				 * semantics for index() and rindex() -- we
4551				 * instead opt to keep with the extant Perl
4552				 * semantics, in all their broken glory.  (Do
4553				 * we have more desire to maintain Perl's
4554				 * semantics than Perl does?  Probably.)
4555				 */
4556				if (subr == DIF_SUBR_RINDEX) {
4557					if (pos < 0) {
4558						if (sublen == 0)
4559							regs[rd] = 0;
4560						break;
4561					}
4562
4563					if (pos > len)
4564						pos = len;
4565				} else {
4566					if (pos < 0)
4567						pos = 0;
4568
4569					if (pos >= len) {
4570						if (sublen == 0)
4571							regs[rd] = len;
4572						break;
4573					}
4574				}
4575
4576				addr = orig + pos;
4577			}
4578		}
4579
4580		for (regs[rd] = notfound; addr != limit; addr += inc) {
4581			if (dtrace_strncmp(addr, substr, sublen) == 0) {
4582				if (subr != DIF_SUBR_STRSTR) {
4583					/*
4584					 * As D index() and rindex() are
4585					 * modeled on Perl (and not on awk),
4586					 * we return a zero-based (and not a
4587					 * one-based) index.  (For you Perl
4588					 * weenies: no, we're not going to add
4589					 * $[ -- and shouldn't you be at a con
4590					 * or something?)
4591					 */
4592					regs[rd] = (uintptr_t)(addr - orig);
4593					break;
4594				}
4595
4596				ASSERT(subr == DIF_SUBR_STRSTR);
4597				regs[rd] = (uintptr_t)addr;
4598				break;
4599			}
4600		}
4601
4602		break;
4603	}
4604
4605	case DIF_SUBR_STRTOK: {
4606		uintptr_t addr = tupregs[0].dttk_value;
4607		uintptr_t tokaddr = tupregs[1].dttk_value;
4608		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4609		uintptr_t limit, toklimit;
4610		size_t clim;
4611		uint8_t c, tokmap[32];	 /* 256 / 8 */
4612		char *dest = (char *)mstate->dtms_scratch_ptr;
4613		int i;
4614
4615		/*
4616		 * Check both the token buffer and (later) the input buffer,
4617		 * since both could be non-scratch addresses.
4618		 */
4619		if (!dtrace_strcanload(tokaddr, size, &clim, mstate, vstate)) {
4620			regs[rd] = 0;
4621			break;
4622		}
4623		toklimit = tokaddr + clim;
4624
4625		if (!DTRACE_INSCRATCH(mstate, size)) {
4626			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4627			regs[rd] = 0;
4628			break;
4629		}
4630
4631		if (addr == 0) {
4632			/*
4633			 * If the address specified is NULL, we use our saved
4634			 * strtok pointer from the mstate.  Note that this
4635			 * means that the saved strtok pointer is _only_
4636			 * valid within multiple enablings of the same probe --
4637			 * it behaves like an implicit clause-local variable.
4638			 */
4639			addr = mstate->dtms_strtok;
4640			limit = mstate->dtms_strtok_limit;
4641		} else {
4642			/*
4643			 * If the user-specified address is non-NULL we must
4644			 * access check it.  This is the only time we have
4645			 * a chance to do so, since this address may reside
4646			 * in the string table of this clause-- future calls
4647			 * (when we fetch addr from mstate->dtms_strtok)
4648			 * would fail this access check.
4649			 */
4650			if (!dtrace_strcanload(addr, size, &clim, mstate,
4651			    vstate)) {
4652				regs[rd] = 0;
4653				break;
4654			}
4655			limit = addr + clim;
4656		}
4657
4658		/*
4659		 * First, zero the token map, and then process the token
4660		 * string -- setting a bit in the map for every character
4661		 * found in the token string.
4662		 */
4663		for (i = 0; i < sizeof (tokmap); i++)
4664			tokmap[i] = 0;
4665
4666		for (; tokaddr < toklimit; tokaddr++) {
4667			if ((c = dtrace_load8(tokaddr)) == '\0')
4668				break;
4669
4670			ASSERT((c >> 3) < sizeof (tokmap));
4671			tokmap[c >> 3] |= (1 << (c & 0x7));
4672		}
4673
4674		for (; addr < limit; addr++) {
4675			/*
4676			 * We're looking for a character that is _not_
4677			 * contained in the token string.
4678			 */
4679			if ((c = dtrace_load8(addr)) == '\0')
4680				break;
4681
4682			if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
4683				break;
4684		}
4685
4686		if (c == '\0') {
4687			/*
4688			 * We reached the end of the string without finding
4689			 * any character that was not in the token string.
4690			 * We return NULL in this case, and we set the saved
4691			 * address to NULL as well.
4692			 */
4693			regs[rd] = 0;
4694			mstate->dtms_strtok = 0;
4695			mstate->dtms_strtok_limit = 0;
4696			break;
4697		}
4698
4699		/*
4700		 * From here on, we're copying into the destination string.
4701		 */
4702		for (i = 0; addr < limit && i < size - 1; addr++) {
4703			if ((c = dtrace_load8(addr)) == '\0')
4704				break;
4705
4706			if (tokmap[c >> 3] & (1 << (c & 0x7)))
4707				break;
4708
4709			ASSERT(i < size);
4710			dest[i++] = c;
4711		}
4712
4713		ASSERT(i < size);
4714		dest[i] = '\0';
4715		regs[rd] = (uintptr_t)dest;
4716		mstate->dtms_scratch_ptr += size;
4717		mstate->dtms_strtok = addr;
4718		mstate->dtms_strtok_limit = limit;
4719		break;
4720	}
4721
4722	case DIF_SUBR_SUBSTR: {
4723		uintptr_t s = tupregs[0].dttk_value;
4724		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4725		char *d = (char *)mstate->dtms_scratch_ptr;
4726		int64_t index = (int64_t)tupregs[1].dttk_value;
4727		int64_t remaining = (int64_t)tupregs[2].dttk_value;
4728		size_t len = dtrace_strlen((char *)s, size);
4729		int64_t i;
4730
4731		if (!dtrace_canload(s, len + 1, mstate, vstate)) {
4732			regs[rd] = 0;
4733			break;
4734		}
4735
4736		if (!DTRACE_INSCRATCH(mstate, size)) {
4737			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4738			regs[rd] = 0;
4739			break;
4740		}
4741
4742		if (nargs <= 2)
4743			remaining = (int64_t)size;
4744
4745		if (index < 0) {
4746			index += len;
4747
4748			if (index < 0 && index + remaining > 0) {
4749				remaining += index;
4750				index = 0;
4751			}
4752		}
4753
4754		if (index >= len || index < 0) {
4755			remaining = 0;
4756		} else if (remaining < 0) {
4757			remaining += len - index;
4758		} else if (index + remaining > size) {
4759			remaining = size - index;
4760		}
4761
4762		for (i = 0; i < remaining; i++) {
4763			if ((d[i] = dtrace_load8(s + index + i)) == '\0')
4764				break;
4765		}
4766
4767		d[i] = '\0';
4768
4769		mstate->dtms_scratch_ptr += size;
4770		regs[rd] = (uintptr_t)d;
4771		break;
4772	}
4773
4774	case DIF_SUBR_JSON: {
4775		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4776		uintptr_t json = tupregs[0].dttk_value;
4777		size_t jsonlen = dtrace_strlen((char *)json, size);
4778		uintptr_t elem = tupregs[1].dttk_value;
4779		size_t elemlen = dtrace_strlen((char *)elem, size);
4780
4781		char *dest = (char *)mstate->dtms_scratch_ptr;
4782		char *elemlist = (char *)mstate->dtms_scratch_ptr + jsonlen + 1;
4783		char *ee = elemlist;
4784		int nelems = 1;
4785		uintptr_t cur;
4786
4787		if (!dtrace_canload(json, jsonlen + 1, mstate, vstate) ||
4788		    !dtrace_canload(elem, elemlen + 1, mstate, vstate)) {
4789			regs[rd] = 0;
4790			break;
4791		}
4792
4793		if (!DTRACE_INSCRATCH(mstate, jsonlen + 1 + elemlen + 1)) {
4794			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4795			regs[rd] = 0;
4796			break;
4797		}
4798
4799		/*
4800		 * Read the element selector and split it up into a packed list
4801		 * of strings.
4802		 */
4803		for (cur = elem; cur < elem + elemlen; cur++) {
4804			char cc = dtrace_load8(cur);
4805
4806			if (cur == elem && cc == '[') {
4807				/*
4808				 * If the first element selector key is
4809				 * actually an array index then ignore the
4810				 * bracket.
4811				 */
4812				continue;
4813			}
4814
4815			if (cc == ']')
4816				continue;
4817
4818			if (cc == '.' || cc == '[') {
4819				nelems++;
4820				cc = '\0';
4821			}
4822
4823			*ee++ = cc;
4824		}
4825		*ee++ = '\0';
4826
4827		if ((regs[rd] = (uintptr_t)dtrace_json(size, json, elemlist,
4828		    nelems, dest)) != 0)
4829			mstate->dtms_scratch_ptr += jsonlen + 1;
4830		break;
4831	}
4832
4833	case DIF_SUBR_TOUPPER:
4834	case DIF_SUBR_TOLOWER: {
4835		uintptr_t s = tupregs[0].dttk_value;
4836		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4837		char *dest = (char *)mstate->dtms_scratch_ptr, c;
4838		size_t len = dtrace_strlen((char *)s, size);
4839		char lower, upper, convert;
4840		int64_t i;
4841
4842		if (subr == DIF_SUBR_TOUPPER) {
4843			lower = 'a';
4844			upper = 'z';
4845			convert = 'A';
4846		} else {
4847			lower = 'A';
4848			upper = 'Z';
4849			convert = 'a';
4850		}
4851
4852		if (!dtrace_canload(s, len + 1, mstate, vstate)) {
4853			regs[rd] = 0;
4854			break;
4855		}
4856
4857		if (!DTRACE_INSCRATCH(mstate, size)) {
4858			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4859			regs[rd] = 0;
4860			break;
4861		}
4862
4863		for (i = 0; i < size - 1; i++) {
4864			if ((c = dtrace_load8(s + i)) == '\0')
4865				break;
4866
4867			if (c >= lower && c <= upper)
4868				c = convert + (c - lower);
4869
4870			dest[i] = c;
4871		}
4872
4873		ASSERT(i < size);
4874		dest[i] = '\0';
4875		regs[rd] = (uintptr_t)dest;
4876		mstate->dtms_scratch_ptr += size;
4877		break;
4878	}
4879
4880case DIF_SUBR_GETMAJOR:
4881#ifdef _LP64
4882		regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR64) & MAXMAJ64;
4883#else
4884		regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR) & MAXMAJ;
4885#endif
4886		break;
4887
4888	case DIF_SUBR_GETMINOR:
4889#ifdef _LP64
4890		regs[rd] = tupregs[0].dttk_value & MAXMIN64;
4891#else
4892		regs[rd] = tupregs[0].dttk_value & MAXMIN;
4893#endif
4894		break;
4895
4896	case DIF_SUBR_DDI_PATHNAME: {
4897		/*
4898		 * This one is a galactic mess.  We are going to roughly
4899		 * emulate ddi_pathname(), but it's made more complicated
4900		 * by the fact that we (a) want to include the minor name and
4901		 * (b) must proceed iteratively instead of recursively.
4902		 */
4903		uintptr_t dest = mstate->dtms_scratch_ptr;
4904		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4905		char *start = (char *)dest, *end = start + size - 1;
4906		uintptr_t daddr = tupregs[0].dttk_value;
4907		int64_t minor = (int64_t)tupregs[1].dttk_value;
4908		char *s;
4909		int i, len, depth = 0;
4910
4911		/*
4912		 * Due to all the pointer jumping we do and context we must
4913		 * rely upon, we just mandate that the user must have kernel
4914		 * read privileges to use this routine.
4915		 */
4916		if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) == 0) {
4917			*flags |= CPU_DTRACE_KPRIV;
4918			*illval = daddr;
4919			regs[rd] = 0;
4920		}
4921
4922		if (!DTRACE_INSCRATCH(mstate, size)) {
4923			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4924			regs[rd] = 0;
4925			break;
4926		}
4927
4928		*end = '\0';
4929
4930		/*
4931		 * We want to have a name for the minor.  In order to do this,
4932		 * we need to walk the minor list from the devinfo.  We want
4933		 * to be sure that we don't infinitely walk a circular list,
4934		 * so we check for circularity by sending a scout pointer
4935		 * ahead two elements for every element that we iterate over;
4936		 * if the list is circular, these will ultimately point to the
4937		 * same element.  You may recognize this little trick as the
4938		 * answer to a stupid interview question -- one that always
4939		 * seems to be asked by those who had to have it laboriously
4940		 * explained to them, and who can't even concisely describe
4941		 * the conditions under which one would be forced to resort to
4942		 * this technique.  Needless to say, those conditions are
4943		 * found here -- and probably only here.  Is this the only use
4944		 * of this infamous trick in shipping, production code?  If it
4945		 * isn't, it probably should be...
4946		 */
4947		if (minor != -1) {
4948			uintptr_t maddr = dtrace_loadptr(daddr +
4949			    offsetof(struct dev_info, devi_minor));
4950
4951			uintptr_t next = offsetof(struct ddi_minor_data, next);
4952			uintptr_t name = offsetof(struct ddi_minor_data,
4953			    d_minor) + offsetof(struct ddi_minor, name);
4954			uintptr_t dev = offsetof(struct ddi_minor_data,
4955			    d_minor) + offsetof(struct ddi_minor, dev);
4956			uintptr_t scout;
4957
4958			if (maddr != 0)
4959				scout = dtrace_loadptr(maddr + next);
4960
4961			while (maddr != 0 && !(*flags & CPU_DTRACE_FAULT)) {
4962				uint64_t m;
4963#ifdef _LP64
4964				m = dtrace_load64(maddr + dev) & MAXMIN64;
4965#else
4966				m = dtrace_load32(maddr + dev) & MAXMIN;
4967#endif
4968				if (m != minor) {
4969					maddr = dtrace_loadptr(maddr + next);
4970
4971					if (scout == 0)
4972						continue;
4973
4974					scout = dtrace_loadptr(scout + next);
4975
4976					if (scout == 0)
4977						continue;
4978
4979					scout = dtrace_loadptr(scout + next);
4980
4981					if (scout == 0)
4982						continue;
4983
4984					if (scout == maddr) {
4985						*flags |= CPU_DTRACE_ILLOP;
4986						break;
4987					}
4988
4989					continue;
4990				}
4991
4992				/*
4993				 * We have the minor data.  Now we need to
4994				 * copy the minor's name into the end of the
4995				 * pathname.
4996				 */
4997				s = (char *)dtrace_loadptr(maddr + name);
4998				len = dtrace_strlen(s, size);
4999
5000				if (*flags & CPU_DTRACE_FAULT)
5001					break;
5002
5003				if (len != 0) {
5004					if ((end -= (len + 1)) < start)
5005						break;
5006
5007					*end = ':';
5008				}
5009
5010				for (i = 1; i <= len; i++)
5011					end[i] = dtrace_load8((uintptr_t)s++);
5012				break;
5013			}
5014		}
5015
5016		while (daddr != 0 && !(*flags & CPU_DTRACE_FAULT)) {
5017			ddi_node_state_t devi_state;
5018
5019			devi_state = dtrace_load32(daddr +
5020			    offsetof(struct dev_info, devi_node_state));
5021
5022			if (*flags & CPU_DTRACE_FAULT)
5023				break;
5024
5025			if (devi_state >= DS_INITIALIZED) {
5026				s = (char *)dtrace_loadptr(daddr +
5027				    offsetof(struct dev_info, devi_addr));
5028				len = dtrace_strlen(s, size);
5029
5030				if (*flags & CPU_DTRACE_FAULT)
5031					break;
5032
5033				if (len != 0) {
5034					if ((end -= (len + 1)) < start)
5035						break;
5036
5037					*end = '@';
5038				}
5039
5040				for (i = 1; i <= len; i++)
5041					end[i] = dtrace_load8((uintptr_t)s++);
5042			}
5043
5044			/*
5045			 * Now for the node name...
5046			 */
5047			s = (char *)dtrace_loadptr(daddr +
5048			    offsetof(struct dev_info, devi_node_name));
5049
5050			daddr = dtrace_loadptr(daddr +
5051			    offsetof(struct dev_info, devi_parent));
5052
5053			/*
5054			 * If our parent is NULL (that is, if we're the root
5055			 * node), we're going to use the special path
5056			 * "devices".
5057			 */
5058			if (daddr == 0)
5059				s = "devices";
5060
5061			len = dtrace_strlen(s, size);
5062			if (*flags & CPU_DTRACE_FAULT)
5063				break;
5064
5065			if ((end -= (len + 1)) < start)
5066				break;
5067
5068			for (i = 1; i <= len; i++)
5069				end[i] = dtrace_load8((uintptr_t)s++);
5070			*end = '/';
5071
5072			if (depth++ > dtrace_devdepth_max) {
5073				*flags |= CPU_DTRACE_ILLOP;
5074				break;
5075			}
5076		}
5077
5078		if (end < start)
5079			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5080
5081		if (daddr == 0) {
5082			regs[rd] = (uintptr_t)end;
5083			mstate->dtms_scratch_ptr += size;
5084		}
5085
5086		break;
5087	}
5088
5089	case DIF_SUBR_STRJOIN: {
5090		char *d = (char *)mstate->dtms_scratch_ptr;
5091		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5092		uintptr_t s1 = tupregs[0].dttk_value;
5093		uintptr_t s2 = tupregs[1].dttk_value;
5094		int i = 0, j = 0;
5095		size_t lim1, lim2;
5096		char c;
5097
5098		if (!dtrace_strcanload(s1, size, &lim1, mstate, vstate) ||
5099		    !dtrace_strcanload(s2, size, &lim2, mstate, vstate)) {
5100			regs[rd] = 0;
5101			break;
5102		}
5103
5104		if (!DTRACE_INSCRATCH(mstate, size)) {
5105			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5106			regs[rd] = 0;
5107			break;
5108		}
5109
5110		for (;;) {
5111			if (i >= size) {
5112				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5113				regs[rd] = 0;
5114				break;
5115			}
5116			c = (i >= lim1) ? '\0' : dtrace_load8(s1++);
5117			if ((d[i++] = c) == '\0') {
5118				i--;
5119				break;
5120			}
5121		}
5122
5123		for (;;) {
5124			if (i >= size) {
5125				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5126				regs[rd] = 0;
5127				break;
5128			}
5129
5130			c = (j++ >= lim2) ? '\0' : dtrace_load8(s2++);
5131			if ((d[i++] = c) == '\0')
5132				break;
5133		}
5134
5135		if (i < size) {
5136			mstate->dtms_scratch_ptr += i;
5137			regs[rd] = (uintptr_t)d;
5138		}
5139
5140		break;
5141	}
5142
5143	case DIF_SUBR_STRTOLL: {
5144		uintptr_t s = tupregs[0].dttk_value;
5145		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5146		size_t lim;
5147		int base = 10;
5148
5149		if (nargs > 1) {
5150			if ((base = tupregs[1].dttk_value) <= 1 ||
5151			    base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
5152				*flags |= CPU_DTRACE_ILLOP;
5153				break;
5154			}
5155		}
5156
5157		if (!dtrace_strcanload(s, size, &lim, mstate, vstate)) {
5158			regs[rd] = INT64_MIN;
5159			break;
5160		}
5161
5162		regs[rd] = dtrace_strtoll((char *)s, base, lim);
5163		break;
5164	}
5165
5166	case DIF_SUBR_LLTOSTR: {
5167		int64_t i = (int64_t)tupregs[0].dttk_value;
5168		uint64_t val, digit;
5169		uint64_t size = 65;	/* enough room for 2^64 in binary */
5170		char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
5171		int base = 10;
5172
5173		if (nargs > 1) {
5174			if ((base = tupregs[1].dttk_value) <= 1 ||
5175			    base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
5176				*flags |= CPU_DTRACE_ILLOP;
5177				break;
5178			}
5179		}
5180
5181		val = (base == 10 && i < 0) ? i * -1 : i;
5182
5183		if (!DTRACE_INSCRATCH(mstate, size)) {
5184			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5185			regs[rd] = 0;
5186			break;
5187		}
5188
5189		for (*end-- = '\0'; val; val /= base) {
5190			if ((digit = val % base) <= '9' - '0') {
5191				*end-- = '0' + digit;
5192			} else {
5193				*end-- = 'a' + (digit - ('9' - '0') - 1);
5194			}
5195		}
5196
5197		if (i == 0 && base == 16)
5198			*end-- = '0';
5199
5200		if (base == 16)
5201			*end-- = 'x';
5202
5203		if (i == 0 || base == 8 || base == 16)
5204			*end-- = '0';
5205
5206		if (i < 0 && base == 10)
5207			*end-- = '-';
5208
5209		regs[rd] = (uintptr_t)end + 1;
5210		mstate->dtms_scratch_ptr += size;
5211		break;
5212	}
5213
5214	case DIF_SUBR_HTONS:
5215	case DIF_SUBR_NTOHS:
5216#ifdef _BIG_ENDIAN
5217		regs[rd] = (uint16_t)tupregs[0].dttk_value;
5218#else
5219		regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
5220#endif
5221		break;
5222
5223
5224	case DIF_SUBR_HTONL:
5225	case DIF_SUBR_NTOHL:
5226#ifdef _BIG_ENDIAN
5227		regs[rd] = (uint32_t)tupregs[0].dttk_value;
5228#else
5229		regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
5230#endif
5231		break;
5232
5233
5234	case DIF_SUBR_HTONLL:
5235	case DIF_SUBR_NTOHLL:
5236#ifdef _BIG_ENDIAN
5237		regs[rd] = (uint64_t)tupregs[0].dttk_value;
5238#else
5239		regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
5240#endif
5241		break;
5242
5243
5244	case DIF_SUBR_DIRNAME:
5245	case DIF_SUBR_BASENAME: {
5246		char *dest = (char *)mstate->dtms_scratch_ptr;
5247		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5248		uintptr_t src = tupregs[0].dttk_value;
5249		int i, j, len = dtrace_strlen((char *)src, size);
5250		int lastbase = -1, firstbase = -1, lastdir = -1;
5251		int start, end;
5252
5253		if (!dtrace_canload(src, len + 1, mstate, vstate)) {
5254			regs[rd] = 0;
5255			break;
5256		}
5257
5258		if (!DTRACE_INSCRATCH(mstate, size)) {
5259			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5260			regs[rd] = 0;
5261			break;
5262		}
5263
5264		/*
5265		 * The basename and dirname for a zero-length string is
5266		 * defined to be "."
5267		 */
5268		if (len == 0) {
5269			len = 1;
5270			src = (uintptr_t)".";
5271		}
5272
5273		/*
5274		 * Start from the back of the string, moving back toward the
5275		 * front until we see a character that isn't a slash.  That
5276		 * character is the last character in the basename.
5277		 */
5278		for (i = len - 1; i >= 0; i--) {
5279			if (dtrace_load8(src + i) != '/')
5280				break;
5281		}
5282
5283		if (i >= 0)
5284			lastbase = i;
5285
5286		/*
5287		 * Starting from the last character in the basename, move
5288		 * towards the front until we find a slash.  The character
5289		 * that we processed immediately before that is the first
5290		 * character in the basename.
5291		 */
5292		for (; i >= 0; i--) {
5293			if (dtrace_load8(src + i) == '/')
5294				break;
5295		}
5296
5297		if (i >= 0)
5298			firstbase = i + 1;
5299
5300		/*
5301		 * Now keep going until we find a non-slash character.  That
5302		 * character is the last character in the dirname.
5303		 */
5304		for (; i >= 0; i--) {
5305			if (dtrace_load8(src + i) != '/')
5306				break;
5307		}
5308
5309		if (i >= 0)
5310			lastdir = i;
5311
5312		ASSERT(!(lastbase == -1 && firstbase != -1));
5313		ASSERT(!(firstbase == -1 && lastdir != -1));
5314
5315		if (lastbase == -1) {
5316			/*
5317			 * We didn't find a non-slash character.  We know that
5318			 * the length is non-zero, so the whole string must be
5319			 * slashes.  In either the dirname or the basename
5320			 * case, we return '/'.
5321			 */
5322			ASSERT(firstbase == -1);
5323			firstbase = lastbase = lastdir = 0;
5324		}
5325
5326		if (firstbase == -1) {
5327			/*
5328			 * The entire string consists only of a basename
5329			 * component.  If we're looking for dirname, we need
5330			 * to change our string to be just "."; if we're
5331			 * looking for a basename, we'll just set the first
5332			 * character of the basename to be 0.
5333			 */
5334			if (subr == DIF_SUBR_DIRNAME) {
5335				ASSERT(lastdir == -1);
5336				src = (uintptr_t)".";
5337				lastdir = 0;
5338			} else {
5339				firstbase = 0;
5340			}
5341		}
5342
5343		if (subr == DIF_SUBR_DIRNAME) {
5344			if (lastdir == -1) {
5345				/*
5346				 * We know that we have a slash in the name --
5347				 * or lastdir would be set to 0, above.  And
5348				 * because lastdir is -1, we know that this
5349				 * slash must be the first character.  (That
5350				 * is, the full string must be of the form
5351				 * "/basename".)  In this case, the last
5352				 * character of the directory name is 0.
5353				 */
5354				lastdir = 0;
5355			}
5356
5357			start = 0;
5358			end = lastdir;
5359		} else {
5360			ASSERT(subr == DIF_SUBR_BASENAME);
5361			ASSERT(firstbase != -1 && lastbase != -1);
5362			start = firstbase;
5363			end = lastbase;
5364		}
5365
5366		for (i = start, j = 0; i <= end && j < size - 1; i++, j++)
5367			dest[j] = dtrace_load8(src + i);
5368
5369		dest[j] = '\0';
5370		regs[rd] = (uintptr_t)dest;
5371		mstate->dtms_scratch_ptr += size;
5372		break;
5373	}
5374
5375	case DIF_SUBR_GETF: {
5376		uintptr_t fd = tupregs[0].dttk_value;
5377		uf_info_t *finfo = &curthread->t_procp->p_user.u_finfo;
5378		file_t *fp;
5379
5380		if (!dtrace_priv_proc(state, mstate)) {
5381			regs[rd] = 0;
5382			break;
5383		}
5384
5385		/*
5386		 * This is safe because fi_nfiles only increases, and the
5387		 * fi_list array is not freed when the array size doubles.
5388		 * (See the comment in flist_grow() for details on the
5389		 * management of the u_finfo structure.)
5390		 */
5391		fp = fd < finfo->fi_nfiles ? finfo->fi_list[fd].uf_file : NULL;
5392
5393		mstate->dtms_getf = fp;
5394		regs[rd] = (uintptr_t)fp;
5395		break;
5396	}
5397
5398	case DIF_SUBR_CLEANPATH: {
5399		char *dest = (char *)mstate->dtms_scratch_ptr, c;
5400		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5401		uintptr_t src = tupregs[0].dttk_value;
5402		size_t lim;
5403		int i = 0, j = 0;
5404		zone_t *z;
5405
5406		if (!dtrace_strcanload(src, size, &lim, mstate, vstate)) {
5407			regs[rd] = 0;
5408			break;
5409		}
5410
5411		if (!DTRACE_INSCRATCH(mstate, size)) {
5412			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5413			regs[rd] = 0;
5414			break;
5415		}
5416
5417		/*
5418		 * Move forward, loading each character.
5419		 */
5420		do {
5421			c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5422next:
5423			if (j + 5 >= size)	/* 5 = strlen("/..c\0") */
5424				break;
5425
5426			if (c != '/') {
5427				dest[j++] = c;
5428				continue;
5429			}
5430
5431			c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5432
5433			if (c == '/') {
5434				/*
5435				 * We have two slashes -- we can just advance
5436				 * to the next character.
5437				 */
5438				goto next;
5439			}
5440
5441			if (c != '.') {
5442				/*
5443				 * This is not "." and it's not ".." -- we can
5444				 * just store the "/" and this character and
5445				 * drive on.
5446				 */
5447				dest[j++] = '/';
5448				dest[j++] = c;
5449				continue;
5450			}
5451
5452			c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5453
5454			if (c == '/') {
5455				/*
5456				 * This is a "/./" component.  We're not going
5457				 * to store anything in the destination buffer;
5458				 * we're just going to go to the next component.
5459				 */
5460				goto next;
5461			}
5462
5463			if (c != '.') {
5464				/*
5465				 * This is not ".." -- we can just store the
5466				 * "/." and this character and continue
5467				 * processing.
5468				 */
5469				dest[j++] = '/';
5470				dest[j++] = '.';
5471				dest[j++] = c;
5472				continue;
5473			}
5474
5475			c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5476
5477			if (c != '/' && c != '\0') {
5478				/*
5479				 * This is not ".." -- it's "..[mumble]".
5480				 * We'll store the "/.." and this character
5481				 * and continue processing.
5482				 */
5483				dest[j++] = '/';
5484				dest[j++] = '.';
5485				dest[j++] = '.';
5486				dest[j++] = c;
5487				continue;
5488			}
5489
5490			/*
5491			 * This is "/../" or "/..\0".  We need to back up
5492			 * our destination pointer until we find a "/".
5493			 */
5494			i--;
5495			while (j != 0 && dest[--j] != '/')
5496				continue;
5497
5498			if (c == '\0')
5499				dest[++j] = '/';
5500		} while (c != '\0');
5501
5502		dest[j] = '\0';
5503
5504		if (mstate->dtms_getf != NULL &&
5505		    !(mstate->dtms_access & DTRACE_ACCESS_KERNEL) &&
5506		    (z = state->dts_cred.dcr_cred->cr_zone) != kcred->cr_zone) {
5507			/*
5508			 * If we've done a getf() as a part of this ECB and we
5509			 * don't have kernel access (and we're not in the global
5510			 * zone), check if the path we cleaned up begins with
5511			 * the zone's root path, and trim it off if so.  Note
5512			 * that this is an output cleanliness issue, not a
5513			 * security issue: knowing one's zone root path does
5514			 * not enable privilege escalation.
5515			 */
5516			if (strstr(dest, z->zone_rootpath) == dest)
5517				dest += strlen(z->zone_rootpath) - 1;
5518		}
5519
5520		regs[rd] = (uintptr_t)dest;
5521		mstate->dtms_scratch_ptr += size;
5522		break;
5523	}
5524
5525	case DIF_SUBR_INET_NTOA:
5526	case DIF_SUBR_INET_NTOA6:
5527	case DIF_SUBR_INET_NTOP: {
5528		size_t size;
5529		int af, argi, i;
5530		char *base, *end;
5531
5532		if (subr == DIF_SUBR_INET_NTOP) {
5533			af = (int)tupregs[0].dttk_value;
5534			argi = 1;
5535		} else {
5536			af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;
5537			argi = 0;
5538		}
5539
5540		if (af == AF_INET) {
5541			ipaddr_t ip4;
5542			uint8_t *ptr8, val;
5543
5544			if (!dtrace_canload(tupregs[argi].dttk_value,
5545			    sizeof (ipaddr_t), mstate, vstate)) {
5546				regs[rd] = 0;
5547				break;
5548			}
5549
5550			/*
5551			 * Safely load the IPv4 address.
5552			 */
5553			ip4 = dtrace_load32(tupregs[argi].dttk_value);
5554
5555			/*
5556			 * Check an IPv4 string will fit in scratch.
5557			 */
5558			size = INET_ADDRSTRLEN;
5559			if (!DTRACE_INSCRATCH(mstate, size)) {
5560				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5561				regs[rd] = 0;
5562				break;
5563			}
5564			base = (char *)mstate->dtms_scratch_ptr;
5565			end = (char *)mstate->dtms_scratch_ptr + size - 1;
5566
5567			/*
5568			 * Stringify as a dotted decimal quad.
5569			 */
5570			*end-- = '\0';
5571			ptr8 = (uint8_t *)&ip4;
5572			for (i = 3; i >= 0; i--) {
5573				val = ptr8[i];
5574
5575				if (val == 0) {
5576					*end-- = '0';
5577				} else {
5578					for (; val; val /= 10) {
5579						*end-- = '0' + (val % 10);
5580					}
5581				}
5582
5583				if (i > 0)
5584					*end-- = '.';
5585			}
5586			ASSERT(end + 1 >= base);
5587
5588		} else if (af == AF_INET6) {
5589			struct in6_addr ip6;
5590			int firstzero, tryzero, numzero, v6end;
5591			uint16_t val;
5592			const char digits[] = "0123456789abcdef";
5593
5594			/*
5595			 * Stringify using RFC 1884 convention 2 - 16 bit
5596			 * hexadecimal values with a zero-run compression.
5597			 * Lower case hexadecimal digits are used.
5598			 *	eg, fe80::214:4fff:fe0b:76c8.
5599			 * The IPv4 embedded form is returned for inet_ntop,
5600			 * just the IPv4 string is returned for inet_ntoa6.
5601			 */
5602
5603			if (!dtrace_canload(tupregs[argi].dttk_value,
5604			    sizeof (struct in6_addr), mstate, vstate)) {
5605				regs[rd] = 0;
5606				break;
5607			}
5608
5609			/*
5610			 * Safely load the IPv6 address.
5611			 */
5612			dtrace_bcopy(
5613			    (void *)(uintptr_t)tupregs[argi].dttk_value,
5614			    (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
5615
5616			/*
5617			 * Check an IPv6 string will fit in scratch.
5618			 */
5619			size = INET6_ADDRSTRLEN;
5620			if (!DTRACE_INSCRATCH(mstate, size)) {
5621				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5622				regs[rd] = 0;
5623				break;
5624			}
5625			base = (char *)mstate->dtms_scratch_ptr;
5626			end = (char *)mstate->dtms_scratch_ptr + size - 1;
5627			*end-- = '\0';
5628
5629			/*
5630			 * Find the longest run of 16 bit zero values
5631			 * for the single allowed zero compression - "::".
5632			 */
5633			firstzero = -1;
5634			tryzero = -1;
5635			numzero = 1;
5636			for (i = 0; i < sizeof (struct in6_addr); i++) {
5637				if (ip6._S6_un._S6_u8[i] == 0 &&
5638				    tryzero == -1 && i % 2 == 0) {
5639					tryzero = i;
5640					continue;
5641				}
5642
5643				if (tryzero != -1 &&
5644				    (ip6._S6_un._S6_u8[i] != 0 ||
5645				    i == sizeof (struct in6_addr) - 1)) {
5646
5647					if (i - tryzero <= numzero) {
5648						tryzero = -1;
5649						continue;
5650					}
5651
5652					firstzero = tryzero;
5653					numzero = i - i % 2 - tryzero;
5654					tryzero = -1;
5655
5656					if (ip6._S6_un._S6_u8[i] == 0 &&
5657					    i == sizeof (struct in6_addr) - 1)
5658						numzero += 2;
5659				}
5660			}
5661			ASSERT(firstzero + numzero <= sizeof (struct in6_addr));
5662
5663			/*
5664			 * Check for an IPv4 embedded address.
5665			 */
5666			v6end = sizeof (struct in6_addr) - 2;
5667			if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
5668			    IN6_IS_ADDR_V4COMPAT(&ip6)) {
5669				for (i = sizeof (struct in6_addr) - 1;
5670				    i >= DTRACE_V4MAPPED_OFFSET; i--) {
5671					ASSERT(end >= base);
5672
5673					val = ip6._S6_un._S6_u8[i];
5674
5675					if (val == 0) {
5676						*end-- = '0';
5677					} else {
5678						for (; val; val /= 10) {
5679							*end-- = '0' + val % 10;
5680						}
5681					}
5682
5683					if (i > DTRACE_V4MAPPED_OFFSET)
5684						*end-- = '.';
5685				}
5686
5687				if (subr == DIF_SUBR_INET_NTOA6)
5688					goto inetout;
5689
5690				/*
5691				 * Set v6end to skip the IPv4 address that
5692				 * we have already stringified.
5693				 */
5694				v6end = 10;
5695			}
5696
5697			/*
5698			 * Build the IPv6 string by working through the
5699			 * address in reverse.
5700			 */
5701			for (i = v6end; i >= 0; i -= 2) {
5702				ASSERT(end >= base);
5703
5704				if (i == firstzero + numzero - 2) {
5705					*end-- = ':';
5706					*end-- = ':';
5707					i -= numzero - 2;
5708					continue;
5709				}
5710
5711				if (i < 14 && i != firstzero - 2)
5712					*end-- = ':';
5713
5714				val = (ip6._S6_un._S6_u8[i] << 8) +
5715				    ip6._S6_un._S6_u8[i + 1];
5716
5717				if (val == 0) {
5718					*end-- = '0';
5719				} else {
5720					for (; val; val /= 16) {
5721						*end-- = digits[val % 16];
5722					}
5723				}
5724			}
5725			ASSERT(end + 1 >= base);
5726
5727		} else {
5728			/*
5729			 * The user didn't use AH_INET or AH_INET6.
5730			 */
5731			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5732			regs[rd] = 0;
5733			break;
5734		}
5735
5736inetout:	regs[rd] = (uintptr_t)end + 1;
5737		mstate->dtms_scratch_ptr += size;
5738		break;
5739	}
5740
5741	}
5742}
5743
5744/*
5745 * Emulate the execution of DTrace IR instructions specified by the given
5746 * DIF object.  This function is deliberately void of assertions as all of
5747 * the necessary checks are handled by a call to dtrace_difo_validate().
5748 */
5749static uint64_t
5750dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
5751    dtrace_vstate_t *vstate, dtrace_state_t *state)
5752{
5753	const dif_instr_t *text = difo->dtdo_buf;
5754	const uint_t textlen = difo->dtdo_len;
5755	const char *strtab = difo->dtdo_strtab;
5756	const uint64_t *inttab = difo->dtdo_inttab;
5757
5758	uint64_t rval = 0;
5759	dtrace_statvar_t *svar;
5760	dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
5761	dtrace_difv_t *v;
5762	volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
5763	volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
5764
5765	dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
5766	uint64_t regs[DIF_DIR_NREGS];
5767	uint64_t *tmp;
5768
5769	uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
5770	int64_t cc_r;
5771	uint_t pc = 0, id, opc;
5772	uint8_t ttop = 0;
5773	dif_instr_t instr;
5774	uint_t r1, r2, rd;
5775
5776	/*
5777	 * We stash the current DIF object into the machine state: we need it
5778	 * for subsequent access checking.
5779	 */
5780	mstate->dtms_difo = difo;
5781
5782	regs[DIF_REG_R0] = 0;		/* %r0 is fixed at zero */
5783
5784	while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
5785		opc = pc;
5786
5787		instr = text[pc++];
5788		r1 = DIF_INSTR_R1(instr);
5789		r2 = DIF_INSTR_R2(instr);
5790		rd = DIF_INSTR_RD(instr);
5791
5792		switch (DIF_INSTR_OP(instr)) {
5793		case DIF_OP_OR:
5794			regs[rd] = regs[r1] | regs[r2];
5795			break;
5796		case DIF_OP_XOR:
5797			regs[rd] = regs[r1] ^ regs[r2];
5798			break;
5799		case DIF_OP_AND:
5800			regs[rd] = regs[r1] & regs[r2];
5801			break;
5802		case DIF_OP_SLL:
5803			regs[rd] = regs[r1] << regs[r2];
5804			break;
5805		case DIF_OP_SRL:
5806			regs[rd] = regs[r1] >> regs[r2];
5807			break;
5808		case DIF_OP_SUB:
5809			regs[rd] = regs[r1] - regs[r2];
5810			break;
5811		case DIF_OP_ADD:
5812			regs[rd] = regs[r1] + regs[r2];
5813			break;
5814		case DIF_OP_MUL:
5815			regs[rd] = regs[r1] * regs[r2];
5816			break;
5817		case DIF_OP_SDIV:
5818			if (regs[r2] == 0) {
5819				regs[rd] = 0;
5820				*flags |= CPU_DTRACE_DIVZERO;
5821			} else {
5822				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5823				regs[rd] = (int64_t)regs[r1] /
5824				    (int64_t)regs[r2];
5825				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5826			}
5827			break;
5828
5829		case DIF_OP_UDIV:
5830			if (regs[r2] == 0) {
5831				regs[rd] = 0;
5832				*flags |= CPU_DTRACE_DIVZERO;
5833			} else {
5834				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5835				regs[rd] = regs[r1] / regs[r2];
5836				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5837			}
5838			break;
5839
5840		case DIF_OP_SREM:
5841			if (regs[r2] == 0) {
5842				regs[rd] = 0;
5843				*flags |= CPU_DTRACE_DIVZERO;
5844			} else {
5845				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5846				regs[rd] = (int64_t)regs[r1] %
5847				    (int64_t)regs[r2];
5848				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5849			}
5850			break;
5851
5852		case DIF_OP_UREM:
5853			if (regs[r2] == 0) {
5854				regs[rd] = 0;
5855				*flags |= CPU_DTRACE_DIVZERO;
5856			} else {
5857				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5858				regs[rd] = regs[r1] % regs[r2];
5859				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5860			}
5861			break;
5862
5863		case DIF_OP_NOT:
5864			regs[rd] = ~regs[r1];
5865			break;
5866		case DIF_OP_MOV:
5867			regs[rd] = regs[r1];
5868			break;
5869		case DIF_OP_CMP:
5870			cc_r = regs[r1] - regs[r2];
5871			cc_n = cc_r < 0;
5872			cc_z = cc_r == 0;
5873			cc_v = 0;
5874			cc_c = regs[r1] < regs[r2];
5875			break;
5876		case DIF_OP_TST:
5877			cc_n = cc_v = cc_c = 0;
5878			cc_z = regs[r1] == 0;
5879			break;
5880		case DIF_OP_BA:
5881			pc = DIF_INSTR_LABEL(instr);
5882			break;
5883		case DIF_OP_BE:
5884			if (cc_z)
5885				pc = DIF_INSTR_LABEL(instr);
5886			break;
5887		case DIF_OP_BNE:
5888			if (cc_z == 0)
5889				pc = DIF_INSTR_LABEL(instr);
5890			break;
5891		case DIF_OP_BG:
5892			if ((cc_z | (cc_n ^ cc_v)) == 0)
5893				pc = DIF_INSTR_LABEL(instr);
5894			break;
5895		case DIF_OP_BGU:
5896			if ((cc_c | cc_z) == 0)
5897				pc = DIF_INSTR_LABEL(instr);
5898			break;
5899		case DIF_OP_BGE:
5900			if ((cc_n ^ cc_v) == 0)
5901				pc = DIF_INSTR_LABEL(instr);
5902			break;
5903		case DIF_OP_BGEU:
5904			if (cc_c == 0)
5905				pc = DIF_INSTR_LABEL(instr);
5906			break;
5907		case DIF_OP_BL:
5908			if (cc_n ^ cc_v)
5909				pc = DIF_INSTR_LABEL(instr);
5910			break;
5911		case DIF_OP_BLU:
5912			if (cc_c)
5913				pc = DIF_INSTR_LABEL(instr);
5914			break;
5915		case DIF_OP_BLE:
5916			if (cc_z | (cc_n ^ cc_v))
5917				pc = DIF_INSTR_LABEL(instr);
5918			break;
5919		case DIF_OP_BLEU:
5920			if (cc_c | cc_z)
5921				pc = DIF_INSTR_LABEL(instr);
5922			break;
5923		case DIF_OP_RLDSB:
5924			if (!dtrace_canload(regs[r1], 1, mstate, vstate))
5925				break;
5926			/*FALLTHROUGH*/
5927		case DIF_OP_LDSB:
5928			regs[rd] = (int8_t)dtrace_load8(regs[r1]);
5929			break;
5930		case DIF_OP_RLDSH:
5931			if (!dtrace_canload(regs[r1], 2, mstate, vstate))
5932				break;
5933			/*FALLTHROUGH*/
5934		case DIF_OP_LDSH:
5935			regs[rd] = (int16_t)dtrace_load16(regs[r1]);
5936			break;
5937		case DIF_OP_RLDSW:
5938			if (!dtrace_canload(regs[r1], 4, mstate, vstate))
5939				break;
5940			/*FALLTHROUGH*/
5941		case DIF_OP_LDSW:
5942			regs[rd] = (int32_t)dtrace_load32(regs[r1]);
5943			break;
5944		case DIF_OP_RLDUB:
5945			if (!dtrace_canload(regs[r1], 1, mstate, vstate))
5946				break;
5947			/*FALLTHROUGH*/
5948		case DIF_OP_LDUB:
5949			regs[rd] = dtrace_load8(regs[r1]);
5950			break;
5951		case DIF_OP_RLDUH:
5952			if (!dtrace_canload(regs[r1], 2, mstate, vstate))
5953				break;
5954			/*FALLTHROUGH*/
5955		case DIF_OP_LDUH:
5956			regs[rd] = dtrace_load16(regs[r1]);
5957			break;
5958		case DIF_OP_RLDUW:
5959			if (!dtrace_canload(regs[r1], 4, mstate, vstate))
5960				break;
5961			/*FALLTHROUGH*/
5962		case DIF_OP_LDUW:
5963			regs[rd] = dtrace_load32(regs[r1]);
5964			break;
5965		case DIF_OP_RLDX:
5966			if (!dtrace_canload(regs[r1], 8, mstate, vstate))
5967				break;
5968			/*FALLTHROUGH*/
5969		case DIF_OP_LDX:
5970			regs[rd] = dtrace_load64(regs[r1]);
5971			break;
5972		case DIF_OP_ULDSB:
5973			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5974			regs[rd] = (int8_t)
5975			    dtrace_fuword8((void *)(uintptr_t)regs[r1]);
5976			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5977			break;
5978		case DIF_OP_ULDSH:
5979			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5980			regs[rd] = (int16_t)
5981			    dtrace_fuword16((void *)(uintptr_t)regs[r1]);
5982			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5983			break;
5984		case DIF_OP_ULDSW:
5985			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5986			regs[rd] = (int32_t)
5987			    dtrace_fuword32((void *)(uintptr_t)regs[r1]);
5988			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5989			break;
5990		case DIF_OP_ULDUB:
5991			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5992			regs[rd] =
5993			    dtrace_fuword8((void *)(uintptr_t)regs[r1]);
5994			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5995			break;
5996		case DIF_OP_ULDUH:
5997			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5998			regs[rd] =
5999			    dtrace_fuword16((void *)(uintptr_t)regs[r1]);
6000			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6001			break;
6002		case DIF_OP_ULDUW:
6003			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6004			regs[rd] =
6005			    dtrace_fuword32((void *)(uintptr_t)regs[r1]);
6006			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6007			break;
6008		case DIF_OP_ULDX:
6009			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6010			regs[rd] =
6011			    dtrace_fuword64((void *)(uintptr_t)regs[r1]);
6012			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6013			break;
6014		case DIF_OP_RET:
6015			rval = regs[rd];
6016			pc = textlen;
6017			break;
6018		case DIF_OP_NOP:
6019			break;
6020		case DIF_OP_SETX:
6021			regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
6022			break;
6023		case DIF_OP_SETS:
6024			regs[rd] = (uint64_t)(uintptr_t)
6025			    (strtab + DIF_INSTR_STRING(instr));
6026			break;
6027		case DIF_OP_SCMP: {
6028			size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
6029			uintptr_t s1 = regs[r1];
6030			uintptr_t s2 = regs[r2];
6031			size_t lim1, lim2;
6032
6033			if (s1 != 0 &&
6034			    !dtrace_strcanload(s1, sz, &lim1, mstate, vstate))
6035				break;
6036			if (s2 != 0 &&
6037			    !dtrace_strcanload(s2, sz, &lim2, mstate, vstate))
6038				break;
6039
6040			cc_r = dtrace_strncmp((char *)s1, (char *)s2,
6041			    MIN(lim1, lim2));
6042
6043			cc_n = cc_r < 0;
6044			cc_z = cc_r == 0;
6045			cc_v = cc_c = 0;
6046			break;
6047		}
6048		case DIF_OP_LDGA:
6049			regs[rd] = dtrace_dif_variable(mstate, state,
6050			    r1, regs[r2]);
6051			break;
6052		case DIF_OP_LDGS:
6053			id = DIF_INSTR_VAR(instr);
6054
6055			if (id >= DIF_VAR_OTHER_UBASE) {
6056				uintptr_t a;
6057
6058				id -= DIF_VAR_OTHER_UBASE;
6059				svar = vstate->dtvs_globals[id];
6060				ASSERT(svar != NULL);
6061				v = &svar->dtsv_var;
6062
6063				if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
6064					regs[rd] = svar->dtsv_data;
6065					break;
6066				}
6067
6068				a = (uintptr_t)svar->dtsv_data;
6069
6070				if (*(uint8_t *)a == UINT8_MAX) {
6071					/*
6072					 * If the 0th byte is set to UINT8_MAX
6073					 * then this is to be treated as a
6074					 * reference to a NULL variable.
6075					 */
6076					regs[rd] = 0;
6077				} else {
6078					regs[rd] = a + sizeof (uint64_t);
6079				}
6080
6081				break;
6082			}
6083
6084			regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
6085			break;
6086
6087		case DIF_OP_STGA:
6088			dtrace_dif_variable_write(mstate, state, r1, regs[r2],
6089			    regs[rd]);
6090			break;
6091
6092		case DIF_OP_STGS:
6093			id = DIF_INSTR_VAR(instr);
6094
6095			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6096			id -= DIF_VAR_OTHER_UBASE;
6097
6098			VERIFY(id < vstate->dtvs_nglobals);
6099			svar = vstate->dtvs_globals[id];
6100			ASSERT(svar != NULL);
6101			v = &svar->dtsv_var;
6102
6103			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6104				uintptr_t a = (uintptr_t)svar->dtsv_data;
6105				size_t lim;
6106
6107				ASSERT(a != (uintptr_t)NULL);
6108				ASSERT(svar->dtsv_size != 0);
6109
6110				if (regs[rd] == 0) {
6111					*(uint8_t *)a = UINT8_MAX;
6112					break;
6113				} else {
6114					*(uint8_t *)a = 0;
6115					a += sizeof (uint64_t);
6116				}
6117				if (!dtrace_vcanload(
6118				    (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6119				    &lim, mstate, vstate))
6120					break;
6121
6122				dtrace_vcopy((void *)(uintptr_t)regs[rd],
6123				    (void *)a, &v->dtdv_type, lim);
6124				break;
6125			}
6126
6127			svar->dtsv_data = regs[rd];
6128			break;
6129
6130		case DIF_OP_LDTA:
6131			/*
6132			 * There are no DTrace built-in thread-local arrays at
6133			 * present.  This opcode is saved for future work.
6134			 */
6135			*flags |= CPU_DTRACE_ILLOP;
6136			regs[rd] = 0;
6137			break;
6138
6139		case DIF_OP_LDLS:
6140			id = DIF_INSTR_VAR(instr);
6141
6142			if (id < DIF_VAR_OTHER_UBASE) {
6143				/*
6144				 * For now, this has no meaning.
6145				 */
6146				regs[rd] = 0;
6147				break;
6148			}
6149
6150			id -= DIF_VAR_OTHER_UBASE;
6151
6152			ASSERT(id < vstate->dtvs_nlocals);
6153			ASSERT(vstate->dtvs_locals != NULL);
6154
6155			svar = vstate->dtvs_locals[id];
6156			ASSERT(svar != NULL);
6157			v = &svar->dtsv_var;
6158
6159			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6160				uintptr_t a = (uintptr_t)svar->dtsv_data;
6161				size_t sz = v->dtdv_type.dtdt_size;
6162
6163				sz += sizeof (uint64_t);
6164				ASSERT(svar->dtsv_size == NCPU * sz);
6165				a += CPU->cpu_id * sz;
6166
6167				if (*(uint8_t *)a == UINT8_MAX) {
6168					/*
6169					 * If the 0th byte is set to UINT8_MAX
6170					 * then this is to be treated as a
6171					 * reference to a NULL variable.
6172					 */
6173					regs[rd] = 0;
6174				} else {
6175					regs[rd] = a + sizeof (uint64_t);
6176				}
6177
6178				break;
6179			}
6180
6181			ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
6182			tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6183			regs[rd] = tmp[CPU->cpu_id];
6184			break;
6185
6186		case DIF_OP_STLS:
6187			id = DIF_INSTR_VAR(instr);
6188
6189			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6190			id -= DIF_VAR_OTHER_UBASE;
6191			VERIFY(id < vstate->dtvs_nlocals);
6192
6193			ASSERT(vstate->dtvs_locals != NULL);
6194			svar = vstate->dtvs_locals[id];
6195			ASSERT(svar != NULL);
6196			v = &svar->dtsv_var;
6197
6198			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6199				uintptr_t a = (uintptr_t)svar->dtsv_data;
6200				size_t sz = v->dtdv_type.dtdt_size;
6201				size_t lim;
6202
6203				sz += sizeof (uint64_t);
6204				ASSERT(svar->dtsv_size == NCPU * sz);
6205				a += CPU->cpu_id * sz;
6206
6207				if (regs[rd] == 0) {
6208					*(uint8_t *)a = UINT8_MAX;
6209					break;
6210				} else {
6211					*(uint8_t *)a = 0;
6212					a += sizeof (uint64_t);
6213				}
6214
6215				if (!dtrace_vcanload(
6216				    (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6217				    &lim, mstate, vstate))
6218					break;
6219
6220				dtrace_vcopy((void *)(uintptr_t)regs[rd],
6221				    (void *)a, &v->dtdv_type, lim);
6222				break;
6223			}
6224
6225			ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
6226			tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6227			tmp[CPU->cpu_id] = regs[rd];
6228			break;
6229
6230		case DIF_OP_LDTS: {
6231			dtrace_dynvar_t *dvar;
6232			dtrace_key_t *key;
6233
6234			id = DIF_INSTR_VAR(instr);
6235			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6236			id -= DIF_VAR_OTHER_UBASE;
6237			v = &vstate->dtvs_tlocals[id];
6238
6239			key = &tupregs[DIF_DTR_NREGS];
6240			key[0].dttk_value = (uint64_t)id;
6241			key[0].dttk_size = 0;
6242			DTRACE_TLS_THRKEY(key[1].dttk_value);
6243			key[1].dttk_size = 0;
6244
6245			dvar = dtrace_dynvar(dstate, 2, key,
6246			    sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
6247			    mstate, vstate);
6248
6249			if (dvar == NULL) {
6250				regs[rd] = 0;
6251				break;
6252			}
6253
6254			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6255				regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6256			} else {
6257				regs[rd] = *((uint64_t *)dvar->dtdv_data);
6258			}
6259
6260			break;
6261		}
6262
6263		case DIF_OP_STTS: {
6264			dtrace_dynvar_t *dvar;
6265			dtrace_key_t *key;
6266
6267			id = DIF_INSTR_VAR(instr);
6268			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6269			id -= DIF_VAR_OTHER_UBASE;
6270			VERIFY(id < vstate->dtvs_ntlocals);
6271
6272			key = &tupregs[DIF_DTR_NREGS];
6273			key[0].dttk_value = (uint64_t)id;
6274			key[0].dttk_size = 0;
6275			DTRACE_TLS_THRKEY(key[1].dttk_value);
6276			key[1].dttk_size = 0;
6277			v = &vstate->dtvs_tlocals[id];
6278
6279			dvar = dtrace_dynvar(dstate, 2, key,
6280			    v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6281			    v->dtdv_type.dtdt_size : sizeof (uint64_t),
6282			    regs[rd] ? DTRACE_DYNVAR_ALLOC :
6283			    DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6284
6285			/*
6286			 * Given that we're storing to thread-local data,
6287			 * we need to flush our predicate cache.
6288			 */
6289			curthread->t_predcache = DTRACE_CACHEIDNONE;
6290
6291			if (dvar == NULL)
6292				break;
6293
6294			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6295				size_t lim;
6296
6297				if (!dtrace_vcanload(
6298				    (void *)(uintptr_t)regs[rd],
6299				    &v->dtdv_type, &lim, mstate, vstate))
6300					break;
6301
6302				dtrace_vcopy((void *)(uintptr_t)regs[rd],
6303				    dvar->dtdv_data, &v->dtdv_type, lim);
6304			} else {
6305				*((uint64_t *)dvar->dtdv_data) = regs[rd];
6306			}
6307
6308			break;
6309		}
6310
6311		case DIF_OP_SRA:
6312			regs[rd] = (int64_t)regs[r1] >> regs[r2];
6313			break;
6314
6315		case DIF_OP_CALL:
6316			dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
6317			    regs, tupregs, ttop, mstate, state);
6318			break;
6319
6320		case DIF_OP_PUSHTR:
6321			if (ttop == DIF_DTR_NREGS) {
6322				*flags |= CPU_DTRACE_TUPOFLOW;
6323				break;
6324			}
6325
6326			if (r1 == DIF_TYPE_STRING) {
6327				/*
6328				 * If this is a string type and the size is 0,
6329				 * we'll use the system-wide default string
6330				 * size.  Note that we are _not_ looking at
6331				 * the value of the DTRACEOPT_STRSIZE option;
6332				 * had this been set, we would expect to have
6333				 * a non-zero size value in the "pushtr".
6334				 */
6335				tupregs[ttop].dttk_size =
6336				    dtrace_strlen((char *)(uintptr_t)regs[rd],
6337				    regs[r2] ? regs[r2] :
6338				    dtrace_strsize_default) + 1;
6339			} else {
6340				if (regs[r2] > LONG_MAX) {
6341					*flags |= CPU_DTRACE_ILLOP;
6342					break;
6343				}
6344
6345				tupregs[ttop].dttk_size = regs[r2];
6346			}
6347
6348			tupregs[ttop++].dttk_value = regs[rd];
6349			break;
6350
6351		case DIF_OP_PUSHTV:
6352			if (ttop == DIF_DTR_NREGS) {
6353				*flags |= CPU_DTRACE_TUPOFLOW;
6354				break;
6355			}
6356
6357			tupregs[ttop].dttk_value = regs[rd];
6358			tupregs[ttop++].dttk_size = 0;
6359			break;
6360
6361		case DIF_OP_POPTS:
6362			if (ttop != 0)
6363				ttop--;
6364			break;
6365
6366		case DIF_OP_FLUSHTS:
6367			ttop = 0;
6368			break;
6369
6370		case DIF_OP_LDGAA:
6371		case DIF_OP_LDTAA: {
6372			dtrace_dynvar_t *dvar;
6373			dtrace_key_t *key = tupregs;
6374			uint_t nkeys = ttop;
6375
6376			id = DIF_INSTR_VAR(instr);
6377			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6378			id -= DIF_VAR_OTHER_UBASE;
6379
6380			key[nkeys].dttk_value = (uint64_t)id;
6381			key[nkeys++].dttk_size = 0;
6382
6383			if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
6384				DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6385				key[nkeys++].dttk_size = 0;
6386				VERIFY(id < vstate->dtvs_ntlocals);
6387				v = &vstate->dtvs_tlocals[id];
6388			} else {
6389				VERIFY(id < vstate->dtvs_nglobals);
6390				v = &vstate->dtvs_globals[id]->dtsv_var;
6391			}
6392
6393			dvar = dtrace_dynvar(dstate, nkeys, key,
6394			    v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6395			    v->dtdv_type.dtdt_size : sizeof (uint64_t),
6396			    DTRACE_DYNVAR_NOALLOC, mstate, vstate);
6397
6398			if (dvar == NULL) {
6399				regs[rd] = 0;
6400				break;
6401			}
6402
6403			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6404				regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6405			} else {
6406				regs[rd] = *((uint64_t *)dvar->dtdv_data);
6407			}
6408
6409			break;
6410		}
6411
6412		case DIF_OP_STGAA:
6413		case DIF_OP_STTAA: {
6414			dtrace_dynvar_t *dvar;
6415			dtrace_key_t *key = tupregs;
6416			uint_t nkeys = ttop;
6417
6418			id = DIF_INSTR_VAR(instr);
6419			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6420			id -= DIF_VAR_OTHER_UBASE;
6421
6422			key[nkeys].dttk_value = (uint64_t)id;
6423			key[nkeys++].dttk_size = 0;
6424
6425			if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
6426				DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6427				key[nkeys++].dttk_size = 0;
6428				VERIFY(id < vstate->dtvs_ntlocals);
6429				v = &vstate->dtvs_tlocals[id];
6430			} else {
6431				VERIFY(id < vstate->dtvs_nglobals);
6432				v = &vstate->dtvs_globals[id]->dtsv_var;
6433			}
6434
6435			dvar = dtrace_dynvar(dstate, nkeys, key,
6436			    v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6437			    v->dtdv_type.dtdt_size : sizeof (uint64_t),
6438			    regs[rd] ? DTRACE_DYNVAR_ALLOC :
6439			    DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6440
6441			if (dvar == NULL)
6442				break;
6443
6444			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6445				size_t lim;
6446
6447				if (!dtrace_vcanload(
6448				    (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6449				    &lim, mstate, vstate))
6450					break;
6451
6452				dtrace_vcopy((void *)(uintptr_t)regs[rd],
6453				    dvar->dtdv_data, &v->dtdv_type, lim);
6454			} else {
6455				*((uint64_t *)dvar->dtdv_data) = regs[rd];
6456			}
6457
6458			break;
6459		}
6460
6461		case DIF_OP_ALLOCS: {
6462			uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6463			size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
6464
6465			/*
6466			 * Rounding up the user allocation size could have
6467			 * overflowed large, bogus allocations (like -1ULL) to
6468			 * 0.
6469			 */
6470			if (size < regs[r1] ||
6471			    !DTRACE_INSCRATCH(mstate, size)) {
6472				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6473				regs[rd] = 0;
6474				break;
6475			}
6476
6477			dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
6478			mstate->dtms_scratch_ptr += size;
6479			regs[rd] = ptr;
6480			break;
6481		}
6482
6483		case DIF_OP_COPYS:
6484			if (!dtrace_canstore(regs[rd], regs[r2],
6485			    mstate, vstate)) {
6486				*flags |= CPU_DTRACE_BADADDR;
6487				*illval = regs[rd];
6488				break;
6489			}
6490
6491			if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
6492				break;
6493
6494			dtrace_bcopy((void *)(uintptr_t)regs[r1],
6495			    (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
6496			break;
6497
6498		case DIF_OP_STB:
6499			if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
6500				*flags |= CPU_DTRACE_BADADDR;
6501				*illval = regs[rd];
6502				break;
6503			}
6504			*((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
6505			break;
6506
6507		case DIF_OP_STH:
6508			if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
6509				*flags |= CPU_DTRACE_BADADDR;
6510				*illval = regs[rd];
6511				break;
6512			}
6513			if (regs[rd] & 1) {
6514				*flags |= CPU_DTRACE_BADALIGN;
6515				*illval = regs[rd];
6516				break;
6517			}
6518			*((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
6519			break;
6520
6521		case DIF_OP_STW:
6522			if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
6523				*flags |= CPU_DTRACE_BADADDR;
6524				*illval = regs[rd];
6525				break;
6526			}
6527			if (regs[rd] & 3) {
6528				*flags |= CPU_DTRACE_BADALIGN;
6529				*illval = regs[rd];
6530				break;
6531			}
6532			*((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
6533			break;
6534
6535		case DIF_OP_STX:
6536			if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
6537				*flags |= CPU_DTRACE_BADADDR;
6538				*illval = regs[rd];
6539				break;
6540			}
6541			if (regs[rd] & 7) {
6542				*flags |= CPU_DTRACE_BADALIGN;
6543				*illval = regs[rd];
6544				break;
6545			}
6546			*((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
6547			break;
6548		}
6549	}
6550
6551	if (!(*flags & CPU_DTRACE_FAULT))
6552		return (rval);
6553
6554	mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
6555	mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
6556
6557	return (0);
6558}
6559
6560static void
6561dtrace_action_breakpoint(dtrace_ecb_t *ecb)
6562{
6563	dtrace_probe_t *probe = ecb->dte_probe;
6564	dtrace_provider_t *prov = probe->dtpr_provider;
6565	char c[DTRACE_FULLNAMELEN + 80], *str;
6566	char *msg = "dtrace: breakpoint action at probe ";
6567	char *ecbmsg = " (ecb ";
6568	uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
6569	uintptr_t val = (uintptr_t)ecb;
6570	int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
6571
6572	if (dtrace_destructive_disallow)
6573		return;
6574
6575	/*
6576	 * It's impossible to be taking action on the NULL probe.
6577	 */
6578	ASSERT(probe != NULL);
6579
6580	/*
6581	 * This is a poor man's (destitute man's?) sprintf():  we want to
6582	 * print the provider name, module name, function name and name of
6583	 * the probe, along with the hex address of the ECB with the breakpoint
6584	 * action -- all of which we must place in the character buffer by
6585	 * hand.
6586	 */
6587	while (*msg != '\0')
6588		c[i++] = *msg++;
6589
6590	for (str = prov->dtpv_name; *str != '\0'; str++)
6591		c[i++] = *str;
6592	c[i++] = ':';
6593
6594	for (str = probe->dtpr_mod; *str != '\0'; str++)
6595		c[i++] = *str;
6596	c[i++] = ':';
6597
6598	for (str = probe->dtpr_func; *str != '\0'; str++)
6599		c[i++] = *str;
6600	c[i++] = ':';
6601
6602	for (str = probe->dtpr_name; *str != '\0'; str++)
6603		c[i++] = *str;
6604
6605	while (*ecbmsg != '\0')
6606		c[i++] = *ecbmsg++;
6607
6608	while (shift >= 0) {
6609		mask = (uintptr_t)0xf << shift;
6610
6611		if (val >= ((uintptr_t)1 << shift))
6612			c[i++] = "0123456789abcdef"[(val & mask) >> shift];
6613		shift -= 4;
6614	}
6615
6616	c[i++] = ')';
6617	c[i] = '\0';
6618
6619	debug_enter(c);
6620}
6621
6622static void
6623dtrace_action_panic(dtrace_ecb_t *ecb)
6624{
6625	dtrace_probe_t *probe = ecb->dte_probe;
6626
6627	/*
6628	 * It's impossible to be taking action on the NULL probe.
6629	 */
6630	ASSERT(probe != NULL);
6631
6632	if (dtrace_destructive_disallow)
6633		return;
6634
6635	if (dtrace_panicked != NULL)
6636		return;
6637
6638	if (dtrace_casptr(&dtrace_panicked, NULL, curthread) != NULL)
6639		return;
6640
6641	/*
6642	 * We won the right to panic.  (We want to be sure that only one
6643	 * thread calls panic() from dtrace_probe(), and that panic() is
6644	 * called exactly once.)
6645	 */
6646	dtrace_panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
6647	    probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
6648	    probe->dtpr_func, probe->dtpr_name, (void *)ecb);
6649}
6650
6651static void
6652dtrace_action_raise(uint64_t sig)
6653{
6654	if (dtrace_destructive_disallow)
6655		return;
6656
6657	if (sig >= NSIG) {
6658		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6659		return;
6660	}
6661
6662	/*
6663	 * raise() has a queue depth of 1 -- we ignore all subsequent
6664	 * invocations of the raise() action.
6665	 */
6666	if (curthread->t_dtrace_sig == 0)
6667		curthread->t_dtrace_sig = (uint8_t)sig;
6668
6669	curthread->t_sig_check = 1;
6670	aston(curthread);
6671}
6672
6673static void
6674dtrace_action_stop(void)
6675{
6676	if (dtrace_destructive_disallow)
6677		return;
6678
6679	if (!curthread->t_dtrace_stop) {
6680		curthread->t_dtrace_stop = 1;
6681		curthread->t_sig_check = 1;
6682		aston(curthread);
6683	}
6684}
6685
6686static void
6687dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
6688{
6689	hrtime_t now;
6690	volatile uint16_t *flags;
6691	cpu_t *cpu = CPU;
6692
6693	if (dtrace_destructive_disallow)
6694		return;
6695
6696	flags = (volatile uint16_t *)&cpu_core[cpu->cpu_id].cpuc_dtrace_flags;
6697
6698	now = dtrace_gethrtime();
6699
6700	if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
6701		/*
6702		 * We need to advance the mark to the current time.
6703		 */
6704		cpu->cpu_dtrace_chillmark = now;
6705		cpu->cpu_dtrace_chilled = 0;
6706	}
6707
6708	/*
6709	 * Now check to see if the requested chill time would take us over
6710	 * the maximum amount of time allowed in the chill interval.  (Or
6711	 * worse, if the calculation itself induces overflow.)
6712	 */
6713	if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
6714	    cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
6715		*flags |= CPU_DTRACE_ILLOP;
6716		return;
6717	}
6718
6719	while (dtrace_gethrtime() - now < val)
6720		continue;
6721
6722	/*
6723	 * Normally, we assure that the value of the variable "timestamp" does
6724	 * not change within an ECB.  The presence of chill() represents an
6725	 * exception to this rule, however.
6726	 */
6727	mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
6728	cpu->cpu_dtrace_chilled += val;
6729}
6730
6731static void
6732dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
6733    uint64_t *buf, uint64_t arg)
6734{
6735	int nframes = DTRACE_USTACK_NFRAMES(arg);
6736	int strsize = DTRACE_USTACK_STRSIZE(arg);
6737	uint64_t *pcs = &buf[1], *fps;
6738	char *str = (char *)&pcs[nframes];
6739	int size, offs = 0, i, j;
6740	size_t rem;
6741	uintptr_t old = mstate->dtms_scratch_ptr, saved;
6742	uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
6743	char *sym;
6744
6745	/*
6746	 * Should be taking a faster path if string space has not been
6747	 * allocated.
6748	 */
6749	ASSERT(strsize != 0);
6750
6751	/*
6752	 * We will first allocate some temporary space for the frame pointers.
6753	 */
6754	fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6755	size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
6756	    (nframes * sizeof (uint64_t));
6757
6758	if (!DTRACE_INSCRATCH(mstate, size)) {
6759		/*
6760		 * Not enough room for our frame pointers -- need to indicate
6761		 * that we ran out of scratch space.
6762		 */
6763		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6764		return;
6765	}
6766
6767	mstate->dtms_scratch_ptr += size;
6768	saved = mstate->dtms_scratch_ptr;
6769
6770	/*
6771	 * Now get a stack with both program counters and frame pointers.
6772	 */
6773	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6774	dtrace_getufpstack(buf, fps, nframes + 1);
6775	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6776
6777	/*
6778	 * If that faulted, we're cooked.
6779	 */
6780	if (*