1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2016, Joyent, Inc. All rights reserved.
25 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
26 */
27
28/*
29 * DTrace - Dynamic Tracing for Solaris
30 *
31 * This is the implementation of the Solaris Dynamic Tracing framework
32 * (DTrace).  The user-visible interface to DTrace is described at length in
33 * the "Solaris Dynamic Tracing Guide".  The interfaces between the libdtrace
34 * library, the in-kernel DTrace framework, and the DTrace providers are
35 * described in the block comments in the <sys/dtrace.h> header file.  The
36 * internal architecture of DTrace is described in the block comments in the
37 * <sys/dtrace_impl.h> header file.  The comments contained within the DTrace
38 * implementation very much assume mastery of all of these sources; if one has
39 * an unanswered question about the implementation, one should consult them
40 * first.
41 *
42 * The functions here are ordered roughly as follows:
43 *
44 *   - Probe context functions
45 *   - Probe hashing functions
46 *   - Non-probe context utility functions
47 *   - Matching functions
48 *   - Provider-to-Framework API functions
49 *   - Probe management functions
50 *   - DIF object functions
51 *   - Format functions
52 *   - Predicate functions
53 *   - ECB functions
54 *   - Buffer functions
55 *   - Enabling functions
56 *   - DOF functions
57 *   - Anonymous enabling functions
58 *   - Consumer state functions
59 *   - Helper functions
60 *   - Hook functions
61 *   - Driver cookbook functions
62 *
63 * Each group of functions begins with a block comment labelled the "DTrace
64 * [Group] Functions", allowing one to find each block by searching forward
65 * on capital-f functions.
66 */
67#include <sys/errno.h>
68#include <sys/stat.h>
69#include <sys/modctl.h>
70#include <sys/conf.h>
71#include <sys/systm.h>
72#include <sys/ddi.h>
73#include <sys/sunddi.h>
74#include <sys/cpuvar.h>
75#include <sys/kmem.h>
76#include <sys/strsubr.h>
77#include <sys/sysmacros.h>
78#include <sys/dtrace_impl.h>
79#include <sys/atomic.h>
80#include <sys/cmn_err.h>
81#include <sys/mutex_impl.h>
82#include <sys/rwlock_impl.h>
83#include <sys/ctf_api.h>
84#include <sys/panic.h>
85#include <sys/priv_impl.h>
86#include <sys/policy.h>
87#include <sys/cred_impl.h>
88#include <sys/procfs_isa.h>
89#include <sys/taskq.h>
90#include <sys/mkdev.h>
91#include <sys/kdi.h>
92#include <sys/zone.h>
93#include <sys/socket.h>
94#include <netinet/in.h>
95#include "strtolctype.h"
96
97/*
98 * DTrace Tunable Variables
99 *
100 * The following variables may be tuned by adding a line to /etc/system that
101 * includes both the name of the DTrace module ("dtrace") and the name of the
102 * variable.  For example:
103 *
104 *   set dtrace:dtrace_destructive_disallow = 1
105 *
106 * In general, the only variables that one should be tuning this way are those
107 * that affect system-wide DTrace behavior, and for which the default behavior
108 * is undesirable.  Most of these variables are tunable on a per-consumer
109 * basis using DTrace options, and need not be tuned on a system-wide basis.
110 * When tuning these variables, avoid pathological values; while some attempt
111 * is made to verify the integrity of these variables, they are not considered
112 * part of the supported interface to DTrace, and they are therefore not
113 * checked comprehensively.  Further, these variables should not be tuned
114 * dynamically via "mdb -kw" or other means; they should only be tuned via
115 * /etc/system.
116 */
117int		dtrace_destructive_disallow = 0;
118dtrace_optval_t	dtrace_nonroot_maxsize = (16 * 1024 * 1024);
119size_t		dtrace_difo_maxsize = (256 * 1024);
120dtrace_optval_t	dtrace_dof_maxsize = (8 * 1024 * 1024);
121size_t		dtrace_statvar_maxsize = (16 * 1024);
122size_t		dtrace_actions_max = (16 * 1024);
123size_t		dtrace_retain_max = 1024;
124dtrace_optval_t	dtrace_helper_actions_max = 1024;
125dtrace_optval_t	dtrace_helper_providers_max = 32;
126dtrace_optval_t	dtrace_dstate_defsize = (1 * 1024 * 1024);
127size_t		dtrace_strsize_default = 256;
128dtrace_optval_t	dtrace_cleanrate_default = 9900990;		/* 101 hz */
129dtrace_optval_t	dtrace_cleanrate_min = 200000;			/* 5000 hz */
130dtrace_optval_t	dtrace_cleanrate_max = (uint64_t)60 * NANOSEC;	/* 1/minute */
131dtrace_optval_t	dtrace_aggrate_default = NANOSEC;		/* 1 hz */
132dtrace_optval_t	dtrace_statusrate_default = NANOSEC;		/* 1 hz */
133dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC;	 /* 6/minute */
134dtrace_optval_t	dtrace_switchrate_default = NANOSEC;		/* 1 hz */
135dtrace_optval_t	dtrace_nspec_default = 1;
136dtrace_optval_t	dtrace_specsize_default = 32 * 1024;
137dtrace_optval_t dtrace_stackframes_default = 20;
138dtrace_optval_t dtrace_ustackframes_default = 20;
139dtrace_optval_t dtrace_jstackframes_default = 50;
140dtrace_optval_t dtrace_jstackstrsize_default = 512;
141int		dtrace_msgdsize_max = 128;
142hrtime_t	dtrace_chill_max = MSEC2NSEC(500);		/* 500 ms */
143hrtime_t	dtrace_chill_interval = NANOSEC;		/* 1000 ms */
144int		dtrace_devdepth_max = 32;
145int		dtrace_err_verbose;
146hrtime_t	dtrace_deadman_interval = NANOSEC;
147hrtime_t	dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
148hrtime_t	dtrace_deadman_user = (hrtime_t)30 * NANOSEC;
149hrtime_t	dtrace_unregister_defunct_reap = (hrtime_t)60 * NANOSEC;
150
151/*
152 * DTrace External Variables
153 *
154 * As dtrace(7D) is a kernel module, any DTrace variables are obviously
155 * available to DTrace consumers via the backtick (`) syntax.  One of these,
156 * dtrace_zero, is made deliberately so:  it is provided as a source of
157 * well-known, zero-filled memory.  While this variable is not documented,
158 * it is used by some translators as an implementation detail.
159 */
160const char	dtrace_zero[256] = { 0 };	/* zero-filled memory */
161
162/*
163 * DTrace Internal Variables
164 */
165static dev_info_t	*dtrace_devi;		/* device info */
166static vmem_t		*dtrace_arena;		/* probe ID arena */
167static vmem_t		*dtrace_minor;		/* minor number arena */
168static taskq_t		*dtrace_taskq;		/* task queue */
169static dtrace_probe_t	**dtrace_probes;	/* array of all probes */
170static int		dtrace_nprobes;		/* number of probes */
171static dtrace_provider_t *dtrace_provider;	/* provider list */
172static dtrace_meta_t	*dtrace_meta_pid;	/* user-land meta provider */
173static int		dtrace_opens;		/* number of opens */
174static int		dtrace_helpers;		/* number of helpers */
175static int		dtrace_getf;		/* number of unpriv getf()s */
176static void		*dtrace_softstate;	/* softstate pointer */
177static dtrace_hash_t	*dtrace_bymod;		/* probes hashed by module */
178static dtrace_hash_t	*dtrace_byfunc;		/* probes hashed by function */
179static dtrace_hash_t	*dtrace_byname;		/* probes hashed by name */
180static dtrace_toxrange_t *dtrace_toxrange;	/* toxic range array */
181static int		dtrace_toxranges;	/* number of toxic ranges */
182static int		dtrace_toxranges_max;	/* size of toxic range array */
183static dtrace_anon_t	dtrace_anon;		/* anonymous enabling */
184static kmem_cache_t	*dtrace_state_cache;	/* cache for dynamic state */
185static uint64_t		dtrace_vtime_references; /* number of vtimestamp refs */
186static kthread_t	*dtrace_panicked;	/* panicking thread */
187static dtrace_ecb_t	*dtrace_ecb_create_cache; /* cached created ECB */
188static dtrace_genid_t	dtrace_probegen;	/* current probe generation */
189static dtrace_helpers_t *dtrace_deferred_pid;	/* deferred helper list */
190static dtrace_enabling_t *dtrace_retained;	/* list of retained enablings */
191static dtrace_genid_t	dtrace_retained_gen;	/* current retained enab gen */
192static dtrace_dynvar_t	dtrace_dynhash_sink;	/* end of dynamic hash chains */
193static int		dtrace_dynvar_failclean; /* dynvars failed to clean */
194
195/*
196 * DTrace Locking
197 * DTrace is protected by three (relatively coarse-grained) locks:
198 *
199 * (1) dtrace_lock is required to manipulate essentially any DTrace state,
200 *     including enabling state, probes, ECBs, consumer state, helper state,
201 *     etc.  Importantly, dtrace_lock is _not_ required when in probe context;
202 *     probe context is lock-free -- synchronization is handled via the
203 *     dtrace_sync() cross call mechanism.
204 *
205 * (2) dtrace_provider_lock is required when manipulating provider state, or
206 *     when provider state must be held constant.
207 *
208 * (3) dtrace_meta_lock is required when manipulating meta provider state, or
209 *     when meta provider state must be held constant.
210 *
211 * The lock ordering between these three locks is dtrace_meta_lock before
212 * dtrace_provider_lock before dtrace_lock.  (In particular, there are
213 * several places where dtrace_provider_lock is held by the framework as it
214 * calls into the providers -- which then call back into the framework,
215 * grabbing dtrace_lock.)
216 *
217 * There are two other locks in the mix:  mod_lock and cpu_lock.  With respect
218 * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
219 * role as a coarse-grained lock; it is acquired before both of these locks.
220 * With respect to dtrace_meta_lock, its behavior is stranger:  cpu_lock must
221 * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
222 * mod_lock is similar with respect to dtrace_provider_lock in that it must be
223 * acquired _between_ dtrace_provider_lock and dtrace_lock.
224 */
225static kmutex_t		dtrace_lock;		/* probe state lock */
226static kmutex_t		dtrace_provider_lock;	/* provider state lock */
227static kmutex_t		dtrace_meta_lock;	/* meta-provider state lock */
228
229/*
230 * DTrace Provider Variables
231 *
232 * These are the variables relating to DTrace as a provider (that is, the
233 * provider of the BEGIN, END, and ERROR probes).
234 */
235static dtrace_pattr_t	dtrace_provider_attr = {
236{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
237{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
238{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
239{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
240{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
241};
242
243static void
244dtrace_nullop(void)
245{}
246
247static int
248dtrace_enable_nullop(void)
249{
250	return (0);
251}
252
253static dtrace_pops_t	dtrace_provider_ops = {
254	(void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop,
255	(void (*)(void *, struct modctl *))dtrace_nullop,
256	(int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop,
257	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
258	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
259	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
260	NULL,
261	NULL,
262	NULL,
263	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop
264};
265
266static dtrace_id_t	dtrace_probeid_begin;	/* special BEGIN probe */
267static dtrace_id_t	dtrace_probeid_end;	/* special END probe */
268dtrace_id_t		dtrace_probeid_error;	/* special ERROR probe */
269
270/*
271 * DTrace Helper Tracing Variables
272 *
273 * These variables should be set dynamically to enable helper tracing.  The
274 * only variables that should be set are dtrace_helptrace_enable (which should
275 * be set to a non-zero value to allocate helper tracing buffers on the next
276 * open of /dev/dtrace) and dtrace_helptrace_disable (which should be set to a
277 * non-zero value to deallocate helper tracing buffers on the next close of
278 * /dev/dtrace).  When (and only when) helper tracing is disabled, the
279 * buffer size may also be set via dtrace_helptrace_bufsize.
280 */
281int			dtrace_helptrace_enable = 0;
282int			dtrace_helptrace_disable = 0;
283int			dtrace_helptrace_bufsize = 16 * 1024 * 1024;
284uint32_t		dtrace_helptrace_nlocals;
285static dtrace_helptrace_t *dtrace_helptrace_buffer;
286static uint32_t		dtrace_helptrace_next = 0;
287static int		dtrace_helptrace_wrapped = 0;
288
289/*
290 * DTrace Error Hashing
291 *
292 * On DEBUG kernels, DTrace will track the errors that has seen in a hash
293 * table.  This is very useful for checking coverage of tests that are
294 * expected to induce DIF or DOF processing errors, and may be useful for
295 * debugging problems in the DIF code generator or in DOF generation .  The
296 * error hash may be examined with the ::dtrace_errhash MDB dcmd.
297 */
298#ifdef DEBUG
299static dtrace_errhash_t	dtrace_errhash[DTRACE_ERRHASHSZ];
300static const char *dtrace_errlast;
301static kthread_t *dtrace_errthread;
302static kmutex_t dtrace_errlock;
303#endif
304
305/*
306 * DTrace Macros and Constants
307 *
308 * These are various macros that are useful in various spots in the
309 * implementation, along with a few random constants that have no meaning
310 * outside of the implementation.  There is no real structure to this cpp
311 * mishmash -- but is there ever?
312 */
313#define	DTRACE_HASHSTR(hash, probe)	\
314	dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs)))
315
316#define	DTRACE_HASHNEXT(hash, probe)	\
317	(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs)
318
319#define	DTRACE_HASHPREV(hash, probe)	\
320	(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs)
321
322#define	DTRACE_HASHEQ(hash, lhs, rhs)	\
323	(strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \
324	    *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)
325
326#define	DTRACE_AGGHASHSIZE_SLEW		17
327
328#define	DTRACE_V4MAPPED_OFFSET		(sizeof (uint32_t) * 3)
329
330/*
331 * The key for a thread-local variable consists of the lower 61 bits of the
332 * t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
333 * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
334 * equal to a variable identifier.  This is necessary (but not sufficient) to
335 * assure that global associative arrays never collide with thread-local
336 * variables.  To guarantee that they cannot collide, we must also define the
337 * order for keying dynamic variables.  That order is:
338 *
339 *   [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
340 *
341 * Because the variable-key and the tls-key are in orthogonal spaces, there is
342 * no way for a global variable key signature to match a thread-local key
343 * signature.
344 */
345#define	DTRACE_TLS_THRKEY(where) { \
346	uint_t intr = 0; \
347	uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \
348	for (; actv; actv >>= 1) \
349		intr++; \
350	ASSERT(intr < (1 << 3)); \
351	(where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \
352	    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
353}
354
355#define	DT_BSWAP_8(x)	((x) & 0xff)
356#define	DT_BSWAP_16(x)	((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
357#define	DT_BSWAP_32(x)	((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
358#define	DT_BSWAP_64(x)	((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))
359
360#define	DT_MASK_LO 0x00000000FFFFFFFFULL
361
362#define	DTRACE_STORE(type, tomax, offset, what) \
363	*((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);
364
365#ifndef __x86
366#define	DTRACE_ALIGNCHECK(addr, size, flags)				\
367	if (addr & (size - 1)) {					\
368		*flags |= CPU_DTRACE_BADALIGN;				\
369		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;	\
370		return (0);						\
371	}
372#else
373#define	DTRACE_ALIGNCHECK(addr, size, flags)
374#endif
375
376/*
377 * Test whether a range of memory starting at testaddr of size testsz falls
378 * within the range of memory described by addr, sz.  We take care to avoid
379 * problems with overflow and underflow of the unsigned quantities, and
380 * disallow all negative sizes.  Ranges of size 0 are allowed.
381 */
382#define	DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
383	((testaddr) - (uintptr_t)(baseaddr) < (basesz) && \
384	(testaddr) + (testsz) - (uintptr_t)(baseaddr) <= (basesz) && \
385	(testaddr) + (testsz) >= (testaddr))
386
387#define	DTRACE_RANGE_REMAIN(remp, addr, baseaddr, basesz)		\
388do {									\
389	if ((remp) != NULL) {						\
390		*(remp) = (uintptr_t)(baseaddr) + (basesz) - (addr);	\
391	}								\
392_NOTE(CONSTCOND) } while (0)
393
394
395/*
396 * Test whether alloc_sz bytes will fit in the scratch region.  We isolate
397 * alloc_sz on the righthand side of the comparison in order to avoid overflow
398 * or underflow in the comparison with it.  This is simpler than the INRANGE
399 * check above, because we know that the dtms_scratch_ptr is valid in the
400 * range.  Allocations of size zero are allowed.
401 */
402#define	DTRACE_INSCRATCH(mstate, alloc_sz) \
403	((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
404	(mstate)->dtms_scratch_ptr >= (alloc_sz))
405
406#define	DTRACE_LOADFUNC(bits)						\
407/*CSTYLED*/								\
408uint##bits##_t								\
409dtrace_load##bits(uintptr_t addr)					\
410{									\
411	size_t size = bits / NBBY;					\
412	/*CSTYLED*/							\
413	uint##bits##_t rval;						\
414	int i;								\
415	volatile uint16_t *flags = (volatile uint16_t *)		\
416	    &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;			\
417									\
418	DTRACE_ALIGNCHECK(addr, size, flags);				\
419									\
420	for (i = 0; i < dtrace_toxranges; i++) {			\
421		if (addr >= dtrace_toxrange[i].dtt_limit)		\
422			continue;					\
423									\
424		if (addr + size <= dtrace_toxrange[i].dtt_base)		\
425			continue;					\
426									\
427		/*							\
428		 * This address falls within a toxic region; return 0.	\
429		 */							\
430		*flags |= CPU_DTRACE_BADADDR;				\
431		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;	\
432		return (0);						\
433	}								\
434									\
435	*flags |= CPU_DTRACE_NOFAULT;					\
436	/*CSTYLED*/							\
437	rval = *((volatile uint##bits##_t *)addr);			\
438	*flags &= ~CPU_DTRACE_NOFAULT;					\
439									\
440	return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0);		\
441}
442
443#ifdef _LP64
444#define	dtrace_loadptr	dtrace_load64
445#else
446#define	dtrace_loadptr	dtrace_load32
447#endif
448
449#define	DTRACE_DYNHASH_FREE	0
450#define	DTRACE_DYNHASH_SINK	1
451#define	DTRACE_DYNHASH_VALID	2
452
453#define	DTRACE_MATCH_FAIL	-1
454#define	DTRACE_MATCH_NEXT	0
455#define	DTRACE_MATCH_DONE	1
456#define	DTRACE_ANCHORED(probe)	((probe)->dtpr_func[0] != '\0')
457#define	DTRACE_STATE_ALIGN	64
458
459#define	DTRACE_FLAGS2FLT(flags)						\
460	(((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR :		\
461	((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP :		\
462	((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO :		\
463	((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV :		\
464	((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV :		\
465	((flags) & CPU_DTRACE_TUPOFLOW) ?  DTRACEFLT_TUPOFLOW :		\
466	((flags) & CPU_DTRACE_BADALIGN) ?  DTRACEFLT_BADALIGN :		\
467	((flags) & CPU_DTRACE_NOSCRATCH) ?  DTRACEFLT_NOSCRATCH :	\
468	((flags) & CPU_DTRACE_BADSTACK) ?  DTRACEFLT_BADSTACK :		\
469	DTRACEFLT_UNKNOWN)
470
471#define	DTRACEACT_ISSTRING(act)						\
472	((act)->dta_kind == DTRACEACT_DIFEXPR &&			\
473	(act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)
474
475static size_t dtrace_strlen(const char *, size_t);
476static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
477static void dtrace_enabling_provide(dtrace_provider_t *);
478static int dtrace_enabling_match(dtrace_enabling_t *, int *);
479static void dtrace_enabling_matchall(void);
480static void dtrace_enabling_reap(void);
481static dtrace_state_t *dtrace_anon_grab(void);
482static uint64_t dtrace_helper(int, dtrace_mstate_t *,
483    dtrace_state_t *, uint64_t, uint64_t);
484static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
485static void dtrace_buffer_drop(dtrace_buffer_t *);
486static int dtrace_buffer_consumed(dtrace_buffer_t *, hrtime_t when);
487static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
488    dtrace_state_t *, dtrace_mstate_t *);
489static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
490    dtrace_optval_t);
491static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
492static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
493static int dtrace_priv_proc(dtrace_state_t *, dtrace_mstate_t *);
494static void dtrace_getf_barrier(void);
495static int dtrace_canload_remains(uint64_t, size_t, size_t *,
496    dtrace_mstate_t *, dtrace_vstate_t *);
497static int dtrace_canstore_remains(uint64_t, size_t, size_t *,
498    dtrace_mstate_t *, dtrace_vstate_t *);
499
500/*
501 * DTrace Probe Context Functions
502 *
503 * These functions are called from probe context.  Because probe context is
504 * any context in which C may be called, arbitrarily locks may be held,
505 * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
506 * As a result, functions called from probe context may only call other DTrace
507 * support functions -- they may not interact at all with the system at large.
508 * (Note that the ASSERT macro is made probe-context safe by redefining it in
509 * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
510 * loads are to be performed from probe context, they _must_ be in terms of
511 * the safe dtrace_load*() variants.
512 *
513 * Some functions in this block are not actually called from probe context;
514 * for these functions, there will be a comment above the function reading
515 * "Note:  not called from probe context."
516 */
517void
518dtrace_panic(const char *format, ...)
519{
520	va_list alist;
521
522	va_start(alist, format);
523	dtrace_vpanic(format, alist);
524	va_end(alist);
525}
526
527int
528dtrace_assfail(const char *a, const char *f, int l)
529{
530	dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l);
531
532	/*
533	 * We just need something here that even the most clever compiler
534	 * cannot optimize away.
535	 */
536	return (a[(uintptr_t)f]);
537}
538
539/*
540 * Atomically increment a specified error counter from probe context.
541 */
542static void
543dtrace_error(uint32_t *counter)
544{
545	/*
546	 * Most counters stored to in probe context are per-CPU counters.
547	 * However, there are some error conditions that are sufficiently
548	 * arcane that they don't merit per-CPU storage.  If these counters
549	 * are incremented concurrently on different CPUs, scalability will be
550	 * adversely affected -- but we don't expect them to be white-hot in a
551	 * correctly constructed enabling...
552	 */
553	uint32_t oval, nval;
554
555	do {
556		oval = *counter;
557
558		if ((nval = oval + 1) == 0) {
559			/*
560			 * If the counter would wrap, set it to 1 -- assuring
561			 * that the counter is never zero when we have seen
562			 * errors.  (The counter must be 32-bits because we
563			 * aren't guaranteed a 64-bit compare&swap operation.)
564			 * To save this code both the infamy of being fingered
565			 * by a priggish news story and the indignity of being
566			 * the target of a neo-puritan witch trial, we're
567			 * carefully avoiding any colorful description of the
568			 * likelihood of this condition -- but suffice it to
569			 * say that it is only slightly more likely than the
570			 * overflow of predicate cache IDs, as discussed in
571			 * dtrace_predicate_create().
572			 */
573			nval = 1;
574		}
575	} while (dtrace_cas32(counter, oval, nval) != oval);
576}
577
578/*
579 * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
580 * uint8_t, a uint16_t, a uint32_t and a uint64_t.
581 */
582/* BEGIN CSTYLED */
583DTRACE_LOADFUNC(8)
584DTRACE_LOADFUNC(16)
585DTRACE_LOADFUNC(32)
586DTRACE_LOADFUNC(64)
587/* END CSTYLED */
588
589static int
590dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
591{
592	if (dest < mstate->dtms_scratch_base)
593		return (0);
594
595	if (dest + size < dest)
596		return (0);
597
598	if (dest + size > mstate->dtms_scratch_ptr)
599		return (0);
600
601	return (1);
602}
603
604static int
605dtrace_canstore_statvar(uint64_t addr, size_t sz, size_t *remain,
606    dtrace_statvar_t **svars, int nsvars)
607{
608	int i;
609	size_t maxglobalsize, maxlocalsize;
610
611	if (nsvars == 0)
612		return (0);
613
614	maxglobalsize = dtrace_statvar_maxsize + sizeof (uint64_t);
615	maxlocalsize = maxglobalsize * NCPU;
616
617	for (i = 0; i < nsvars; i++) {
618		dtrace_statvar_t *svar = svars[i];
619		uint8_t scope;
620		size_t size;
621
622		if (svar == NULL || (size = svar->dtsv_size) == 0)
623			continue;
624
625		scope = svar->dtsv_var.dtdv_scope;
626
627		/*
628		 * We verify that our size is valid in the spirit of providing
629		 * defense in depth:  we want to prevent attackers from using
630		 * DTrace to escalate an orthogonal kernel heap corruption bug
631		 * into the ability to store to arbitrary locations in memory.
632		 */
633		VERIFY((scope == DIFV_SCOPE_GLOBAL && size <= maxglobalsize) ||
634		    (scope == DIFV_SCOPE_LOCAL && size <= maxlocalsize));
635
636		if (DTRACE_INRANGE(addr, sz, svar->dtsv_data,
637		    svar->dtsv_size)) {
638			DTRACE_RANGE_REMAIN(remain, addr, svar->dtsv_data,
639			    svar->dtsv_size);
640			return (1);
641		}
642	}
643
644	return (0);
645}
646
647/*
648 * Check to see if the address is within a memory region to which a store may
649 * be issued.  This includes the DTrace scratch areas, and any DTrace variable
650 * region.  The caller of dtrace_canstore() is responsible for performing any
651 * alignment checks that are needed before stores are actually executed.
652 */
653static int
654dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
655    dtrace_vstate_t *vstate)
656{
657	return (dtrace_canstore_remains(addr, sz, NULL, mstate, vstate));
658}
659
660/*
661 * Implementation of dtrace_canstore which communicates the upper bound of the
662 * allowed memory region.
663 */
664static int
665dtrace_canstore_remains(uint64_t addr, size_t sz, size_t *remain,
666    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
667{
668	/*
669	 * First, check to see if the address is in scratch space...
670	 */
671	if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
672	    mstate->dtms_scratch_size)) {
673		DTRACE_RANGE_REMAIN(remain, addr, mstate->dtms_scratch_base,
674		    mstate->dtms_scratch_size);
675		return (1);
676	}
677
678	/*
679	 * Now check to see if it's a dynamic variable.  This check will pick
680	 * up both thread-local variables and any global dynamically-allocated
681	 * variables.
682	 */
683	if (DTRACE_INRANGE(addr, sz, vstate->dtvs_dynvars.dtds_base,
684	    vstate->dtvs_dynvars.dtds_size)) {
685		dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
686		uintptr_t base = (uintptr_t)dstate->dtds_base +
687		    (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
688		uintptr_t chunkoffs;
689		dtrace_dynvar_t *dvar;
690
691		/*
692		 * Before we assume that we can store here, we need to make
693		 * sure that it isn't in our metadata -- storing to our
694		 * dynamic variable metadata would corrupt our state.  For
695		 * the range to not include any dynamic variable metadata,
696		 * it must:
697		 *
698		 *	(1) Start above the hash table that is at the base of
699		 *	the dynamic variable space
700		 *
701		 *	(2) Have a starting chunk offset that is beyond the
702		 *	dtrace_dynvar_t that is at the base of every chunk
703		 *
704		 *	(3) Not span a chunk boundary
705		 *
706		 *	(4) Not be in the tuple space of a dynamic variable
707		 *
708		 */
709		if (addr < base)
710			return (0);
711
712		chunkoffs = (addr - base) % dstate->dtds_chunksize;
713
714		if (chunkoffs < sizeof (dtrace_dynvar_t))
715			return (0);
716
717		if (chunkoffs + sz > dstate->dtds_chunksize)
718			return (0);
719
720		dvar = (dtrace_dynvar_t *)((uintptr_t)addr - chunkoffs);
721
722		if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE)
723			return (0);
724
725		if (chunkoffs < sizeof (dtrace_dynvar_t) +
726		    ((dvar->dtdv_tuple.dtt_nkeys - 1) * sizeof (dtrace_key_t)))
727			return (0);
728
729		DTRACE_RANGE_REMAIN(remain, addr, dvar, dstate->dtds_chunksize);
730		return (1);
731	}
732
733	/*
734	 * Finally, check the static local and global variables.  These checks
735	 * take the longest, so we perform them last.
736	 */
737	if (dtrace_canstore_statvar(addr, sz, remain,
738	    vstate->dtvs_locals, vstate->dtvs_nlocals))
739		return (1);
740
741	if (dtrace_canstore_statvar(addr, sz, remain,
742	    vstate->dtvs_globals, vstate->dtvs_nglobals))
743		return (1);
744
745	return (0);
746}
747
748
749/*
750 * Convenience routine to check to see if the address is within a memory
751 * region in which a load may be issued given the user's privilege level;
752 * if not, it sets the appropriate error flags and loads 'addr' into the
753 * illegal value slot.
754 *
755 * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
756 * appropriate memory access protection.
757 */
758static int
759dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
760    dtrace_vstate_t *vstate)
761{
762	return (dtrace_canload_remains(addr, sz, NULL, mstate, vstate));
763}
764
765/*
766 * Implementation of dtrace_canload which communicates the upper bound of the
767 * allowed memory region.
768 */
769static int
770dtrace_canload_remains(uint64_t addr, size_t sz, size_t *remain,
771    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
772{
773	volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
774	file_t *fp;
775
776	/*
777	 * If we hold the privilege to read from kernel memory, then
778	 * everything is readable.
779	 */
780	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
781		DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
782		return (1);
783	}
784
785	/*
786	 * You can obviously read that which you can store.
787	 */
788	if (dtrace_canstore_remains(addr, sz, remain, mstate, vstate))
789		return (1);
790
791	/*
792	 * We're allowed to read from our own string table.
793	 */
794	if (DTRACE_INRANGE(addr, sz, mstate->dtms_difo->dtdo_strtab,
795	    mstate->dtms_difo->dtdo_strlen)) {
796		DTRACE_RANGE_REMAIN(remain, addr,
797		    mstate->dtms_difo->dtdo_strtab,
798		    mstate->dtms_difo->dtdo_strlen);
799		return (1);
800	}
801
802	if (vstate->dtvs_state != NULL &&
803	    dtrace_priv_proc(vstate->dtvs_state, mstate)) {
804		proc_t *p;
805
806		/*
807		 * When we have privileges to the current process, there are
808		 * several context-related kernel structures that are safe to
809		 * read, even absent the privilege to read from kernel memory.
810		 * These reads are safe because these structures contain only
811		 * state that (1) we're permitted to read, (2) is harmless or
812		 * (3) contains pointers to additional kernel state that we're
813		 * not permitted to read (and as such, do not present an
814		 * opportunity for privilege escalation).  Finally (and
815		 * critically), because of the nature of their relation with
816		 * the current thread context, the memory associated with these
817		 * structures cannot change over the duration of probe context,
818		 * and it is therefore impossible for this memory to be
819		 * deallocated and reallocated as something else while it's
820		 * being operated upon.
821		 */
822		if (DTRACE_INRANGE(addr, sz, curthread, sizeof (kthread_t))) {
823			DTRACE_RANGE_REMAIN(remain, addr, curthread,
824			    sizeof (kthread_t));
825			return (1);
826		}
827
828		if ((p = curthread->t_procp) != NULL && DTRACE_INRANGE(addr,
829		    sz, curthread->t_procp, sizeof (proc_t))) {
830			DTRACE_RANGE_REMAIN(remain, addr, curthread->t_procp,
831			    sizeof (proc_t));
832			return (1);
833		}
834
835		if (curthread->t_cred != NULL && DTRACE_INRANGE(addr, sz,
836		    curthread->t_cred, sizeof (cred_t))) {
837			DTRACE_RANGE_REMAIN(remain, addr, curthread->t_cred,
838			    sizeof (cred_t));
839			return (1);
840		}
841
842		if (p != NULL && p->p_pidp != NULL && DTRACE_INRANGE(addr, sz,
843		    &(p->p_pidp->pid_id), sizeof (pid_t))) {
844			DTRACE_RANGE_REMAIN(remain, addr, &(p->p_pidp->pid_id),
845			    sizeof (pid_t));
846			return (1);
847		}
848
849		if (curthread->t_cpu != NULL && DTRACE_INRANGE(addr, sz,
850		    curthread->t_cpu, offsetof(cpu_t, cpu_pause_thread))) {
851			DTRACE_RANGE_REMAIN(remain, addr, curthread->t_cpu,
852			    offsetof(cpu_t, cpu_pause_thread));
853			return (1);
854		}
855	}
856
857	if ((fp = mstate->dtms_getf) != NULL) {
858		uintptr_t psz = sizeof (void *);
859		vnode_t *vp;
860		vnodeops_t *op;
861
862		/*
863		 * When getf() returns a file_t, the enabling is implicitly
864		 * granted the (transient) right to read the returned file_t
865		 * as well as the v_path and v_op->vnop_name of the underlying
866		 * vnode.  These accesses are allowed after a successful
867		 * getf() because the members that they refer to cannot change
868		 * once set -- and the barrier logic in the kernel's closef()
869		 * path assures that the file_t and its referenced vode_t
870		 * cannot themselves be stale (that is, it impossible for
871		 * either dtms_getf itself or its f_vnode member to reference
872		 * freed memory).
873		 */
874		if (DTRACE_INRANGE(addr, sz, fp, sizeof (file_t))) {
875			DTRACE_RANGE_REMAIN(remain, addr, fp, sizeof (file_t));
876			return (1);
877		}
878
879		if ((vp = fp->f_vnode) != NULL) {
880			size_t slen;
881
882			if (DTRACE_INRANGE(addr, sz, &vp->v_path, psz)) {
883				DTRACE_RANGE_REMAIN(remain, addr, &vp->v_path,
884				    psz);
885				return (1);
886			}
887
888			slen = strlen(vp->v_path) + 1;
889			if (DTRACE_INRANGE(addr, sz, vp->v_path, slen)) {
890				DTRACE_RANGE_REMAIN(remain, addr, vp->v_path,
891				    slen);
892				return (1);
893			}
894
895			if (DTRACE_INRANGE(addr, sz, &vp->v_op, psz)) {
896				DTRACE_RANGE_REMAIN(remain, addr, &vp->v_op,
897				    psz);
898				return (1);
899			}
900
901			if ((op = vp->v_op) != NULL &&
902			    DTRACE_INRANGE(addr, sz, &op->vnop_name, psz)) {
903				DTRACE_RANGE_REMAIN(remain, addr,
904				    &op->vnop_name, psz);
905				return (1);
906			}
907
908			if (op != NULL && op->vnop_name != NULL &&
909			    DTRACE_INRANGE(addr, sz, op->vnop_name,
910			    (slen = strlen(op->vnop_name) + 1))) {
911				DTRACE_RANGE_REMAIN(remain, addr,
912				    op->vnop_name, slen);
913				return (1);
914			}
915		}
916	}
917
918	DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
919	*illval = addr;
920	return (0);
921}
922
923/*
924 * Convenience routine to check to see if a given string is within a memory
925 * region in which a load may be issued given the user's privilege level;
926 * this exists so that we don't need to issue unnecessary dtrace_strlen()
927 * calls in the event that the user has all privileges.
928 */
929static int
930dtrace_strcanload(uint64_t addr, size_t sz, size_t *remain,
931    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
932{
933	size_t rsize;
934
935	/*
936	 * If we hold the privilege to read from kernel memory, then
937	 * everything is readable.
938	 */
939	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
940		DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
941		return (1);
942	}
943
944	/*
945	 * Even if the caller is uninterested in querying the remaining valid
946	 * range, it is required to ensure that the access is allowed.
947	 */
948	if (remain == NULL) {
949		remain = &rsize;
950	}
951	if (dtrace_canload_remains(addr, 0, remain, mstate, vstate)) {
952		size_t strsz;
953		/*
954		 * Perform the strlen after determining the length of the
955		 * memory region which is accessible.  This prevents timing
956		 * information from being used to find NULs in memory which is
957		 * not accessible to the caller.
958		 */
959		strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr,
960		    MIN(sz, *remain));
961		if (strsz <= *remain) {
962			return (1);
963		}
964	}
965
966	return (0);
967}
968
969/*
970 * Convenience routine to check to see if a given variable is within a memory
971 * region in which a load may be issued given the user's privilege level.
972 */
973static int
974dtrace_vcanload(void *src, dtrace_diftype_t *type, size_t *remain,
975    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
976{
977	size_t sz;
978	ASSERT(type->dtdt_flags & DIF_TF_BYREF);
979
980	/*
981	 * Calculate the max size before performing any checks since even
982	 * DTRACE_ACCESS_KERNEL-credentialed callers expect that this function
983	 * return the max length via 'remain'.
984	 */
985	if (type->dtdt_kind == DIF_TYPE_STRING) {
986		dtrace_state_t *state = vstate->dtvs_state;
987
988		if (state != NULL) {
989			sz = state->dts_options[DTRACEOPT_STRSIZE];
990		} else {
991			/*
992			 * In helper context, we have a NULL state; fall back
993			 * to using the system-wide default for the string size
994			 * in this case.
995			 */
996			sz = dtrace_strsize_default;
997		}
998	} else {
999		sz = type->dtdt_size;
1000	}
1001
1002	/*
1003	 * If we hold the privilege to read from kernel memory, then
1004	 * everything is readable.
1005	 */
1006	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
1007		DTRACE_RANGE_REMAIN(remain, (uintptr_t)src, src, sz);
1008		return (1);
1009	}
1010
1011	if (type->dtdt_kind == DIF_TYPE_STRING) {
1012		return (dtrace_strcanload((uintptr_t)src, sz, remain, mstate,
1013		    vstate));
1014	}
1015	return (dtrace_canload_remains((uintptr_t)src, sz, remain, mstate,
1016	    vstate));
1017}
1018
1019/*
1020 * Convert a string to a signed integer using safe loads.
1021 *
1022 * NOTE: This function uses various macros from strtolctype.h to manipulate
1023 * digit values, etc -- these have all been checked to ensure they make
1024 * no additional function calls.
1025 */
1026static int64_t
1027dtrace_strtoll(char *input, int base, size_t limit)
1028{
1029	uintptr_t pos = (uintptr_t)input;
1030	int64_t val = 0;
1031	int x;
1032	boolean_t neg = B_FALSE;
1033	char c, cc, ccc;
1034	uintptr_t end = pos + limit;
1035
1036	/*
1037	 * Consume any whitespace preceding digits.
1038	 */
1039	while ((c = dtrace_load8(pos)) == ' ' || c == '\t')
1040		pos++;
1041
1042	/*
1043	 * Handle an explicit sign if one is present.
1044	 */
1045	if (c == '-' || c == '+') {
1046		if (c == '-')
1047			neg = B_TRUE;
1048		c = dtrace_load8(++pos);
1049	}
1050
1051	/*
1052	 * Check for an explicit hexadecimal prefix ("0x" or "0X") and skip it
1053	 * if present.
1054	 */
1055	if (base == 16 && c == '0' && ((cc = dtrace_load8(pos + 1)) == 'x' ||
1056	    cc == 'X') && isxdigit(ccc = dtrace_load8(pos + 2))) {
1057		pos += 2;
1058		c = ccc;
1059	}
1060
1061	/*
1062	 * Read in contiguous digits until the first non-digit character.
1063	 */
1064	for (; pos < end && c != '\0' && lisalnum(c) && (x = DIGIT(c)) < base;
1065	    c = dtrace_load8(++pos))
1066		val = val * base + x;
1067
1068	return (neg ? -val : val);
1069}
1070
1071/*
1072 * Compare two strings using safe loads.
1073 */
1074static int
1075dtrace_strncmp(char *s1, char *s2, size_t limit)
1076{
1077	uint8_t c1, c2;
1078	volatile uint16_t *flags;
1079
1080	if (s1 == s2 || limit == 0)
1081		return (0);
1082
1083	flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
1084
1085	do {
1086		if (s1 == NULL) {
1087			c1 = '\0';
1088		} else {
1089			c1 = dtrace_load8((uintptr_t)s1++);
1090		}
1091
1092		if (s2 == NULL) {
1093			c2 = '\0';
1094		} else {
1095			c2 = dtrace_load8((uintptr_t)s2++);
1096		}
1097
1098		if (c1 != c2)
1099			return (c1 - c2);
1100	} while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));
1101
1102	return (0);
1103}
1104
1105/*
1106 * Compute strlen(s) for a string using safe memory accesses.  The additional
1107 * len parameter is used to specify a maximum length to ensure completion.
1108 */
1109static size_t
1110dtrace_strlen(const char *s, size_t lim)
1111{
1112	uint_t len;
1113
1114	for (len = 0; len != lim; len++) {
1115		if (dtrace_load8((uintptr_t)s++) == '\0')
1116			break;
1117	}
1118
1119	return (len);
1120}
1121
1122/*
1123 * Check if an address falls within a toxic region.
1124 */
1125static int
1126dtrace_istoxic(uintptr_t kaddr, size_t size)
1127{
1128	uintptr_t taddr, tsize;
1129	int i;
1130
1131	for (i = 0; i < dtrace_toxranges; i++) {
1132		taddr = dtrace_toxrange[i].dtt_base;
1133		tsize = dtrace_toxrange[i].dtt_limit - taddr;
1134
1135		if (kaddr - taddr < tsize) {
1136			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1137			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = kaddr;
1138			return (1);
1139		}
1140
1141		if (taddr - kaddr < size) {
1142			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1143			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = taddr;
1144			return (1);
1145		}
1146	}
1147
1148	return (0);
1149}
1150
1151/*
1152 * Copy src to dst using safe memory accesses.  The src is assumed to be unsafe
1153 * memory specified by the DIF program.  The dst is assumed to be safe memory
1154 * that we can store to directly because it is managed by DTrace.  As with
1155 * standard bcopy, overlapping copies are handled properly.
1156 */
1157static void
1158dtrace_bcopy(const void *src, void *dst, size_t len)
1159{
1160	if (len != 0) {
1161		uint8_t *s1 = dst;
1162		const uint8_t *s2 = src;
1163
1164		if (s1 <= s2) {
1165			do {
1166				*s1++ = dtrace_load8((uintptr_t)s2++);
1167			} while (--len != 0);
1168		} else {
1169			s2 += len;
1170			s1 += len;
1171
1172			do {
1173				*--s1 = dtrace_load8((uintptr_t)--s2);
1174			} while (--len != 0);
1175		}
1176	}
1177}
1178
1179/*
1180 * Copy src to dst using safe memory accesses, up to either the specified
1181 * length, or the point that a nul byte is encountered.  The src is assumed to
1182 * be unsafe memory specified by the DIF program.  The dst is assumed to be
1183 * safe memory that we can store to directly because it is managed by DTrace.
1184 * Unlike dtrace_bcopy(), overlapping regions are not handled.
1185 */
1186static void
1187dtrace_strcpy(const void *src, void *dst, size_t len)
1188{
1189	if (len != 0) {
1190		uint8_t *s1 = dst, c;
1191		const uint8_t *s2 = src;
1192
1193		do {
1194			*s1++ = c = dtrace_load8((uintptr_t)s2++);
1195		} while (--len != 0 && c != '\0');
1196	}
1197}
1198
1199/*
1200 * Copy src to dst, deriving the size and type from the specified (BYREF)
1201 * variable type.  The src is assumed to be unsafe memory specified by the DIF
1202 * program.  The dst is assumed to be DTrace variable memory that is of the
1203 * specified type; we assume that we can store to directly.
1204 */
1205static void
1206dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type, size_t limit)
1207{
1208	ASSERT(type->dtdt_flags & DIF_TF_BYREF);
1209
1210	if (type->dtdt_kind == DIF_TYPE_STRING) {
1211		dtrace_strcpy(src, dst, MIN(type->dtdt_size, limit));
1212	} else {
1213		dtrace_bcopy(src, dst, MIN(type->dtdt_size, limit));
1214	}
1215}
1216
1217/*
1218 * Compare s1 to s2 using safe memory accesses.  The s1 data is assumed to be
1219 * unsafe memory specified by the DIF program.  The s2 data is assumed to be
1220 * safe memory that we can access directly because it is managed by DTrace.
1221 */
1222static int
1223dtrace_bcmp(const void *s1, const void *s2, size_t len)
1224{
1225	volatile uint16_t *flags;
1226
1227	flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
1228
1229	if (s1 == s2)
1230		return (0);
1231
1232	if (s1 == NULL || s2 == NULL)
1233		return (1);
1234
1235	if (s1 != s2 && len != 0) {
1236		const uint8_t *ps1 = s1;
1237		const uint8_t *ps2 = s2;
1238
1239		do {
1240			if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
1241				return (1);
1242		} while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
1243	}
1244	return (0);
1245}
1246
1247/*
1248 * Zero the specified region using a simple byte-by-byte loop.  Note that this
1249 * is for safe DTrace-managed memory only.
1250 */
1251static void
1252dtrace_bzero(void *dst, size_t len)
1253{
1254	uchar_t *cp;
1255
1256	for (cp = dst; len != 0; len--)
1257		*cp++ = 0;
1258}
1259
1260static void
1261dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
1262{
1263	uint64_t result[2];
1264
1265	result[0] = addend1[0] + addend2[0];
1266	result[1] = addend1[1] + addend2[1] +
1267	    (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);
1268
1269	sum[0] = result[0];
1270	sum[1] = result[1];
1271}
1272
1273/*
1274 * Shift the 128-bit value in a by b. If b is positive, shift left.
1275 * If b is negative, shift right.
1276 */
1277static void
1278dtrace_shift_128(uint64_t *a, int b)
1279{
1280	uint64_t mask;
1281
1282	if (b == 0)
1283		return;
1284
1285	if (b < 0) {
1286		b = -b;
1287		if (b >= 64) {
1288			a[0] = a[1] >> (b - 64);
1289			a[1] = 0;
1290		} else {
1291			a[0] >>= b;
1292			mask = 1LL << (64 - b);
1293			mask -= 1;
1294			a[0] |= ((a[1] & mask) << (64 - b));
1295			a[1] >>= b;
1296		}
1297	} else {
1298		if (b >= 64) {
1299			a[1] = a[0] << (b - 64);
1300			a[0] = 0;
1301		} else {
1302			a[1] <<= b;
1303			mask = a[0] >> (64 - b);
1304			a[1] |= mask;
1305			a[0] <<= b;
1306		}
1307	}
1308}
1309
1310/*
1311 * The basic idea is to break the 2 64-bit values into 4 32-bit values,
1312 * use native multiplication on those, and then re-combine into the
1313 * resulting 128-bit value.
1314 *
1315 * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
1316 *     hi1 * hi2 << 64 +
1317 *     hi1 * lo2 << 32 +
1318 *     hi2 * lo1 << 32 +
1319 *     lo1 * lo2
1320 */
1321static void
1322dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
1323{
1324	uint64_t hi1, hi2, lo1, lo2;
1325	uint64_t tmp[2];
1326
1327	hi1 = factor1 >> 32;
1328	hi2 = factor2 >> 32;
1329
1330	lo1 = factor1 & DT_MASK_LO;
1331	lo2 = factor2 & DT_MASK_LO;
1332
1333	product[0] = lo1 * lo2;
1334	product[1] = hi1 * hi2;
1335
1336	tmp[0] = hi1 * lo2;
1337	tmp[1] = 0;
1338	dtrace_shift_128(tmp, 32);
1339	dtrace_add_128(product, tmp, product);
1340
1341	tmp[0] = hi2 * lo1;
1342	tmp[1] = 0;
1343	dtrace_shift_128(tmp, 32);
1344	dtrace_add_128(product, tmp, product);
1345}
1346
1347/*
1348 * This privilege check should be used by actions and subroutines to
1349 * verify that the user credentials of the process that enabled the
1350 * invoking ECB match the target credentials
1351 */
1352static int
1353dtrace_priv_proc_common_user(dtrace_state_t *state)
1354{
1355	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1356
1357	/*
1358	 * We should always have a non-NULL state cred here, since if cred
1359	 * is null (anonymous tracing), we fast-path bypass this routine.
1360	 */
1361	ASSERT(s_cr != NULL);
1362
1363	if ((cr = CRED()) != NULL &&
1364	    s_cr->cr_uid == cr->cr_uid &&
1365	    s_cr->cr_uid == cr->cr_ruid &&
1366	    s_cr->cr_uid == cr->cr_suid &&
1367	    s_cr->cr_gid == cr->cr_gid &&
1368	    s_cr->cr_gid == cr->cr_rgid &&
1369	    s_cr->cr_gid == cr->cr_sgid)
1370		return (1);
1371
1372	return (0);
1373}
1374
1375/*
1376 * This privilege check should be used by actions and subroutines to
1377 * verify that the zone of the process that enabled the invoking ECB
1378 * matches the target credentials
1379 */
1380static int
1381dtrace_priv_proc_common_zone(dtrace_state_t *state)
1382{
1383	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1384
1385	/*
1386	 * We should always have a non-NULL state cred here, since if cred
1387	 * is null (anonymous tracing), we fast-path bypass this routine.
1388	 */
1389	ASSERT(s_cr != NULL);
1390
1391	if ((cr = CRED()) != NULL && s_cr->cr_zone == cr->cr_zone)
1392		return (1);
1393
1394	return (0);
1395}
1396
1397/*
1398 * This privilege check should be used by actions and subroutines to
1399 * verify that the process has not setuid or changed credentials.
1400 */
1401static int
1402dtrace_priv_proc_common_nocd()
1403{
1404	proc_t *proc;
1405
1406	if ((proc = ttoproc(curthread)) != NULL &&
1407	    !(proc->p_flag & SNOCD))
1408		return (1);
1409
1410	return (0);
1411}
1412
1413static int
1414dtrace_priv_proc_destructive(dtrace_state_t *state, dtrace_mstate_t *mstate)
1415{
1416	int action = state->dts_cred.dcr_action;
1417
1418	if (!(mstate->dtms_access & DTRACE_ACCESS_PROC))
1419		goto bad;
1420
1421	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
1422	    dtrace_priv_proc_common_zone(state) == 0)
1423		goto bad;
1424
1425	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
1426	    dtrace_priv_proc_common_user(state) == 0)
1427		goto bad;
1428
1429	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
1430	    dtrace_priv_proc_common_nocd() == 0)
1431		goto bad;
1432
1433	return (1);
1434
1435bad:
1436	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1437
1438	return (0);
1439}
1440
1441static int
1442dtrace_priv_proc_control(dtrace_state_t *state, dtrace_mstate_t *mstate)
1443{
1444	if (mstate->dtms_access & DTRACE_ACCESS_PROC) {
1445		if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
1446			return (1);
1447
1448		if (dtrace_priv_proc_common_zone(state) &&
1449		    dtrace_priv_proc_common_user(state) &&
1450		    dtrace_priv_proc_common_nocd())
1451			return (1);
1452	}
1453
1454	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1455
1456	return (0);
1457}
1458
1459static int
1460dtrace_priv_proc(dtrace_state_t *state, dtrace_mstate_t *mstate)
1461{
1462	if ((mstate->dtms_access & DTRACE_ACCESS_PROC) &&
1463	    (state->dts_cred.dcr_action & DTRACE_CRA_PROC))
1464		return (1);
1465
1466	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1467
1468	return (0);
1469}
1470
1471static int
1472dtrace_priv_kernel(dtrace_state_t *state)
1473{
1474	if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
1475		return (1);
1476
1477	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1478
1479	return (0);
1480}
1481
1482static int
1483dtrace_priv_kernel_destructive(dtrace_state_t *state)
1484{
1485	if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
1486		return (1);
1487
1488	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1489
1490	return (0);
1491}
1492
1493/*
1494 * Determine if the dte_cond of the specified ECB allows for processing of
1495 * the current probe to continue.  Note that this routine may allow continued
1496 * processing, but with access(es) stripped from the mstate's dtms_access
1497 * field.
1498 */
1499static int
1500dtrace_priv_probe(dtrace_state_t *state, dtrace_mstate_t *mstate,
1501    dtrace_ecb_t *ecb)
1502{
1503	dtrace_probe_t *probe = ecb->dte_probe;
1504	dtrace_provider_t *prov = probe->dtpr_provider;
1505	dtrace_pops_t *pops = &prov->dtpv_pops;
1506	int mode = DTRACE_MODE_NOPRIV_DROP;
1507
1508	ASSERT(ecb->dte_cond);
1509
1510	if (pops->dtps_mode != NULL) {
1511		mode = pops->dtps_mode(prov->dtpv_arg,
1512		    probe->dtpr_id, probe->dtpr_arg);
1513
1514		ASSERT(mode & (DTRACE_MODE_USER | DTRACE_MODE_KERNEL));
1515		ASSERT(mode & (DTRACE_MODE_NOPRIV_RESTRICT |
1516		    DTRACE_MODE_NOPRIV_DROP));
1517	}
1518
1519	/*
1520	 * If the dte_cond bits indicate that this consumer is only allowed to
1521	 * see user-mode firings of this probe, check that the probe was fired
1522	 * while in a user context.  If that's not the case, use the policy
1523	 * specified by the provider to determine if we drop the probe or
1524	 * merely restrict operation.
1525	 */
1526	if (ecb->dte_cond & DTRACE_COND_USERMODE) {
1527		ASSERT(mode != DTRACE_MODE_NOPRIV_DROP);
1528
1529		if (!(mode & DTRACE_MODE_USER)) {
1530			if (mode & DTRACE_MODE_NOPRIV_DROP)
1531				return (0);
1532
1533			mstate->dtms_access &= ~DTRACE_ACCESS_ARGS;
1534		}
1535	}
1536
1537	/*
1538	 * This is more subtle than it looks. We have to be absolutely certain
1539	 * that CRED() isn't going to change out from under us so it's only
1540	 * legit to examine that structure if we're in constrained situations.
1541	 * Currently, the only times we'll this check is if a non-super-user
1542	 * has enabled the profile or syscall providers -- providers that
1543	 * allow visibility of all processes. For the profile case, the check
1544	 * above will ensure that we're examining a user context.
1545	 */
1546	if (ecb->dte_cond & DTRACE_COND_OWNER) {
1547		cred_t *cr;
1548		cred_t *s_cr = state->dts_cred.dcr_cred;
1549		proc_t *proc;
1550
1551		ASSERT(s_cr != NULL);
1552
1553		if ((cr = CRED()) == NULL ||
1554		    s_cr->cr_uid != cr->cr_uid ||
1555		    s_cr->cr_uid != cr->cr_ruid ||
1556		    s_cr->cr_uid != cr->cr_suid ||
1557		    s_cr->cr_gid != cr->cr_gid ||
1558		    s_cr->cr_gid != cr->cr_rgid ||
1559		    s_cr->cr_gid != cr->cr_sgid ||
1560		    (proc = ttoproc(curthread)) == NULL ||
1561		    (proc->p_flag & SNOCD)) {
1562			if (mode & DTRACE_MODE_NOPRIV_DROP)
1563				return (0);
1564
1565			mstate->dtms_access &= ~DTRACE_ACCESS_PROC;
1566		}
1567	}
1568
1569	/*
1570	 * If our dte_cond is set to DTRACE_COND_ZONEOWNER and we are not
1571	 * in our zone, check to see if our mode policy is to restrict rather
1572	 * than to drop; if to restrict, strip away both DTRACE_ACCESS_PROC
1573	 * and DTRACE_ACCESS_ARGS
1574	 */
1575	if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
1576		cred_t *cr;
1577		cred_t *s_cr = state->dts_cred.dcr_cred;
1578
1579		ASSERT(s_cr != NULL);
1580
1581		if ((cr = CRED()) == NULL ||
1582		    s_cr->cr_zone->zone_id != cr->cr_zone->zone_id) {
1583			if (mode & DTRACE_MODE_NOPRIV_DROP)
1584				return (0);
1585
1586			mstate->dtms_access &=
1587			    ~(DTRACE_ACCESS_PROC | DTRACE_ACCESS_ARGS);
1588		}
1589	}
1590
1591	/*
1592	 * By merits of being in this code path at all, we have limited
1593	 * privileges.  If the provider has indicated that limited privileges
1594	 * are to denote restricted operation, strip off the ability to access
1595	 * arguments.
1596	 */
1597	if (mode & DTRACE_MODE_LIMITEDPRIV_RESTRICT)
1598		mstate->dtms_access &= ~DTRACE_ACCESS_ARGS;
1599
1600	return (1);
1601}
1602
1603/*
1604 * Note:  not called from probe context.  This function is called
1605 * asynchronously (and at a regular interval) from outside of probe context to
1606 * clean the dirty dynamic variable lists on all CPUs.  Dynamic variable
1607 * cleaning is explained in detail in <sys/dtrace_impl.h>.
1608 */
1609void
1610dtrace_dynvar_clean(dtrace_dstate_t *dstate)
1611{
1612	dtrace_dynvar_t *dirty;
1613	dtrace_dstate_percpu_t *dcpu;
1614	dtrace_dynvar_t **rinsep;
1615	int i, j, work = 0;
1616
1617	for (i = 0; i < NCPU; i++) {
1618		dcpu = &dstate->dtds_percpu[i];
1619		rinsep = &dcpu->dtdsc_rinsing;
1620
1621		/*
1622		 * If the dirty list is NULL, there is no dirty work to do.
1623		 */
1624		if (dcpu->dtdsc_dirty == NULL)
1625			continue;
1626
1627		if (dcpu->dtdsc_rinsing != NULL) {
1628			/*
1629			 * If the rinsing list is non-NULL, then it is because
1630			 * this CPU was selected to accept another CPU's
1631			 * dirty list -- and since that time, dirty buffers
1632			 * have accumulated.  This is a highly unlikely
1633			 * condition, but we choose to ignore the dirty
1634			 * buffers -- they'll be picked up a future cleanse.
1635			 */
1636			continue;
1637		}
1638
1639		if (dcpu->dtdsc_clean != NULL) {
1640			/*
1641			 * If the clean list is non-NULL, then we're in a
1642			 * situation where a CPU has done deallocations (we
1643			 * have a non-NULL dirty list) but no allocations (we
1644			 * also have a non-NULL clean list).  We can't simply
1645			 * move the dirty list into the clean list on this
1646			 * CPU, yet we also don't want to allow this condition
1647			 * to persist, lest a short clean list prevent a
1648			 * massive dirty list from being cleaned (which in
1649			 * turn could lead to otherwise avoidable dynamic
1650			 * drops).  To deal with this, we look for some CPU
1651			 * with a NULL clean list, NULL dirty list, and NULL
1652			 * rinsing list -- and then we borrow this CPU to
1653			 * rinse our dirty list.
1654			 */
1655			for (j = 0; j < NCPU; j++) {
1656				dtrace_dstate_percpu_t *rinser;
1657
1658				rinser = &dstate->dtds_percpu[j];
1659
1660				if (rinser->dtdsc_rinsing != NULL)
1661					continue;
1662
1663				if (rinser->dtdsc_dirty != NULL)
1664					continue;
1665
1666				if (rinser->dtdsc_clean != NULL)
1667					continue;
1668
1669				rinsep = &rinser->dtdsc_rinsing;
1670				break;
1671			}
1672
1673			if (j == NCPU) {
1674				/*
1675				 * We were unable to find another CPU that
1676				 * could accept this dirty list -- we are
1677				 * therefore unable to clean it now.
1678				 */
1679				dtrace_dynvar_failclean++;
1680				continue;
1681			}
1682		}
1683
1684		work = 1;
1685
1686		/*
1687		 * Atomically move the dirty list aside.
1688		 */
1689		do {
1690			dirty = dcpu->dtdsc_dirty;
1691
1692			/*
1693			 * Before we zap the dirty list, set the rinsing list.
1694			 * (This allows for a potential assertion in
1695			 * dtrace_dynvar():  if a free dynamic variable appears
1696			 * on a hash chain, either the dirty list or the
1697			 * rinsing list for some CPU must be non-NULL.)
1698			 */
1699			*rinsep = dirty;
1700			dtrace_membar_producer();
1701		} while (dtrace_casptr(&dcpu->dtdsc_dirty,
1702		    dirty, NULL) != dirty);
1703	}
1704
1705	if (!work) {
1706		/*
1707		 * We have no work to do; we can simply return.
1708		 */
1709		return;
1710	}
1711
1712	dtrace_sync();
1713
1714	for (i = 0; i < NCPU; i++) {
1715		dcpu = &dstate->dtds_percpu[i];
1716
1717		if (dcpu->dtdsc_rinsing == NULL)
1718			continue;
1719
1720		/*
1721		 * We are now guaranteed that no hash chain contains a pointer
1722		 * into this dirty list; we can make it clean.
1723		 */
1724		ASSERT(dcpu->dtdsc_clean == NULL);
1725		dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
1726		dcpu->dtdsc_rinsing = NULL;
1727	}
1728
1729	/*
1730	 * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
1731	 * sure that all CPUs have seen all of the dtdsc_clean pointers.
1732	 * This prevents a race whereby a CPU incorrectly decides that
1733	 * the state should be something other than DTRACE_DSTATE_CLEAN
1734	 * after dtrace_dynvar_clean() has completed.
1735	 */
1736	dtrace_sync();
1737
1738	dstate->dtds_state = DTRACE_DSTATE_CLEAN;
1739}
1740
1741/*
1742 * Depending on the value of the op parameter, this function looks-up,
1743 * allocates or deallocates an arbitrarily-keyed dynamic variable.  If an
1744 * allocation is requested, this function will return a pointer to a
1745 * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
1746 * variable can be allocated.  If NULL is returned, the appropriate counter
1747 * will be incremented.
1748 */
1749dtrace_dynvar_t *
1750dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
1751    dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
1752    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1753{
1754	uint64_t hashval = DTRACE_DYNHASH_VALID;
1755	dtrace_dynhash_t *hash = dstate->dtds_hash;
1756	dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
1757	processorid_t me = CPU->cpu_id, cpu = me;
1758	dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
1759	size_t bucket, ksize;
1760	size_t chunksize = dstate->dtds_chunksize;
1761	uintptr_t kdata, lock, nstate;
1762	uint_t i;
1763
1764	ASSERT(nkeys != 0);
1765
1766	/*
1767	 * Hash the key.  As with aggregations, we use Jenkins' "One-at-a-time"
1768	 * algorithm.  For the by-value portions, we perform the algorithm in
1769	 * 16-bit chunks (as opposed to 8-bit chunks).  This speeds things up a
1770	 * bit, and seems to have only a minute effect on distribution.  For
1771	 * the by-reference data, we perform "One-at-a-time" iterating (safely)
1772	 * over each referenced byte.  It's painful to do this, but it's much
1773	 * better than pathological hash distribution.  The efficacy of the
1774	 * hashing algorithm (and a comparison with other algorithms) may be
1775	 * found by running the ::dtrace_dynstat MDB dcmd.
1776	 */
1777	for (i = 0; i < nkeys; i++) {
1778		if (key[i].dttk_size == 0) {
1779			uint64_t val = key[i].dttk_value;
1780
1781			hashval += (val >> 48) & 0xffff;
1782			hashval += (hashval << 10);
1783			hashval ^= (hashval >> 6);
1784
1785			hashval += (val >> 32) & 0xffff;
1786			hashval += (hashval << 10);
1787			hashval ^= (hashval >> 6);
1788
1789			hashval += (val >> 16) & 0xffff;
1790			hashval += (hashval << 10);
1791			hashval ^= (hashval >> 6);
1792
1793			hashval += val & 0xffff;
1794			hashval += (hashval << 10);
1795			hashval ^= (hashval >> 6);
1796		} else {
1797			/*
1798			 * This is incredibly painful, but it beats the hell
1799			 * out of the alternative.
1800			 */
1801			uint64_t j, size = key[i].dttk_size;
1802			uintptr_t base = (uintptr_t)key[i].dttk_value;
1803
1804			if (!dtrace_canload(base, size, mstate, vstate))
1805				break;
1806
1807			for (j = 0; j < size; j++) {
1808				hashval += dtrace_load8(base + j);
1809				hashval += (hashval << 10);
1810				hashval ^= (hashval >> 6);
1811			}
1812		}
1813	}
1814
1815	if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
1816		return (NULL);
1817
1818	hashval += (hashval << 3);
1819	hashval ^= (hashval >> 11);
1820	hashval += (hashval << 15);
1821
1822	/*
1823	 * There is a remote chance (ideally, 1 in 2^31) that our hashval
1824	 * comes out to be one of our two sentinel hash values.  If this
1825	 * actually happens, we set the hashval to be a value known to be a
1826	 * non-sentinel value.
1827	 */
1828	if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
1829		hashval = DTRACE_DYNHASH_VALID;
1830
1831	/*
1832	 * Yes, it's painful to do a divide here.  If the cycle count becomes
1833	 * important here, tricks can be pulled to reduce it.  (However, it's
1834	 * critical that hash collisions be kept to an absolute minimum;
1835	 * they're much more painful than a divide.)  It's better to have a
1836	 * solution that generates few collisions and still keeps things
1837	 * relatively simple.
1838	 */
1839	bucket = hashval % dstate->dtds_hashsize;
1840
1841	if (op == DTRACE_DYNVAR_DEALLOC) {
1842		volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
1843
1844		for (;;) {
1845			while ((lock = *lockp) & 1)
1846				continue;
1847
1848			if (dtrace_casptr((void *)lockp,
1849			    (void *)lock, (void *)(lock + 1)) == (void *)lock)
1850				break;
1851		}
1852
1853		dtrace_membar_producer();
1854	}
1855
1856top:
1857	prev = NULL;
1858	lock = hash[bucket].dtdh_lock;
1859
1860	dtrace_membar_consumer();
1861
1862	start = hash[bucket].dtdh_chain;
1863	ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
1864	    start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
1865	    op != DTRACE_DYNVAR_DEALLOC));
1866
1867	for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
1868		dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
1869		dtrace_key_t *dkey = &dtuple->dtt_key[0];
1870
1871		if (dvar->dtdv_hashval != hashval) {
1872			if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
1873				/*
1874				 * We've reached the sink, and therefore the
1875				 * end of the hash chain; we can kick out of
1876				 * the loop knowing that we have seen a valid
1877				 * snapshot of state.
1878				 */
1879				ASSERT(dvar->dtdv_next == NULL);
1880				ASSERT(dvar == &dtrace_dynhash_sink);
1881				break;
1882			}
1883
1884			if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
1885				/*
1886				 * We've gone off the rails:  somewhere along
1887				 * the line, one of the members of this hash
1888				 * chain was deleted.  Note that we could also
1889				 * detect this by simply letting this loop run
1890				 * to completion, as we would eventually hit
1891				 * the end of the dirty list.  However, we
1892				 * want to avoid running the length of the
1893				 * dirty list unnecessarily (it might be quite
1894				 * long), so we catch this as early as
1895				 * possible by detecting the hash marker.  In
1896				 * this case, we simply set dvar to NULL and
1897				 * break; the conditional after the loop will
1898				 * send us back to top.
1899				 */
1900				dvar = NULL;
1901				break;
1902			}
1903
1904			goto next;
1905		}
1906
1907		if (dtuple->dtt_nkeys != nkeys)
1908			goto next;
1909
1910		for (i = 0; i < nkeys; i++, dkey++) {
1911			if (dkey->dttk_size != key[i].dttk_size)
1912				goto next; /* size or type mismatch */
1913
1914			if (dkey->dttk_size != 0) {
1915				if (dtrace_bcmp(
1916				    (void *)(uintptr_t)key[i].dttk_value,
1917				    (void *)(uintptr_t)dkey->dttk_value,
1918				    dkey->dttk_size))
1919					goto next;
1920			} else {
1921				if (dkey->dttk_value != key[i].dttk_value)
1922					goto next;
1923			}
1924		}
1925
1926		if (op != DTRACE_DYNVAR_DEALLOC)
1927			return (dvar);
1928
1929		ASSERT(dvar->dtdv_next == NULL ||
1930		    dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
1931
1932		if (prev != NULL) {
1933			ASSERT(hash[bucket].dtdh_chain != dvar);
1934			ASSERT(start != dvar);
1935			ASSERT(prev->dtdv_next == dvar);
1936			prev->dtdv_next = dvar->dtdv_next;
1937		} else {
1938			if (dtrace_casptr(&hash[bucket].dtdh_chain,
1939			    start, dvar->dtdv_next) != start) {
1940				/*
1941				 * We have failed to atomically swing the
1942				 * hash table head pointer, presumably because
1943				 * of a conflicting allocation on another CPU.
1944				 * We need to reread the hash chain and try
1945				 * again.
1946				 */
1947				goto top;
1948			}
1949		}
1950
1951		dtrace_membar_producer();
1952
1953		/*
1954		 * Now set the hash value to indicate that it's free.
1955		 */
1956		ASSERT(hash[bucket].dtdh_chain != dvar);
1957		dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
1958
1959		dtrace_membar_producer();
1960
1961		/*
1962		 * Set the next pointer to point at the dirty list, and
1963		 * atomically swing the dirty pointer to the newly freed dvar.
1964		 */
1965		do {
1966			next = dcpu->dtdsc_dirty;
1967			dvar->dtdv_next = next;
1968		} while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
1969
1970		/*
1971		 * Finally, unlock this hash bucket.
1972		 */
1973		ASSERT(hash[bucket].dtdh_lock == lock);
1974		ASSERT(lock & 1);
1975		hash[bucket].dtdh_lock++;
1976
1977		return (NULL);
1978next:
1979		prev = dvar;
1980		continue;
1981	}
1982
1983	if (dvar == NULL) {
1984		/*
1985		 * If dvar is NULL, it is because we went off the rails:
1986		 * one of the elements that we traversed in the hash chain
1987		 * was deleted while we were traversing it.  In this case,
1988		 * we assert that we aren't doing a dealloc (deallocs lock
1989		 * the hash bucket to prevent themselves from racing with
1990		 * one another), and retry the hash chain traversal.
1991		 */
1992		ASSERT(op != DTRACE_DYNVAR_DEALLOC);
1993		goto top;
1994	}
1995
1996	if (op != DTRACE_DYNVAR_ALLOC) {
1997		/*
1998		 * If we are not to allocate a new variable, we want to
1999		 * return NULL now.  Before we return, check that the value
2000		 * of the lock word hasn't changed.  If it has, we may have
2001		 * seen an inconsistent snapshot.
2002		 */
2003		if (op == DTRACE_DYNVAR_NOALLOC) {
2004			if (hash[bucket].dtdh_lock != lock)
2005				goto top;
2006		} else {
2007			ASSERT(op == DTRACE_DYNVAR_DEALLOC);
2008			ASSERT(hash[bucket].dtdh_lock == lock);
2009			ASSERT(lock & 1);
2010			hash[bucket].dtdh_lock++;
2011		}
2012
2013		return (NULL);
2014	}
2015
2016	/*
2017	 * We need to allocate a new dynamic variable.  The size we need is the
2018	 * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
2019	 * size of any auxiliary key data (rounded up to 8-byte alignment) plus
2020	 * the size of any referred-to data (dsize).  We then round the final
2021	 * size up to the chunksize for allocation.
2022	 */
2023	for (ksize = 0, i = 0; i < nkeys; i++)
2024		ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
2025
2026	/*
2027	 * This should be pretty much impossible, but could happen if, say,
2028	 * strange DIF specified the tuple.  Ideally, this should be an
2029	 * assertion and not an error condition -- but that requires that the
2030	 * chunksize calculation in dtrace_difo_chunksize() be absolutely
2031	 * bullet-proof.  (That is, it must not be able to be fooled by
2032	 * malicious DIF.)  Given the lack of backwards branches in DIF,
2033	 * solving this would presumably not amount to solving the Halting
2034	 * Problem -- but it still seems awfully hard.
2035	 */
2036	if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
2037	    ksize + dsize > chunksize) {
2038		dcpu->dtdsc_drops++;
2039		return (NULL);
2040	}
2041
2042	nstate = DTRACE_DSTATE_EMPTY;
2043
2044	do {
2045retry:
2046		free = dcpu->dtdsc_free;
2047
2048		if (free == NULL) {
2049			dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
2050			void *rval;
2051
2052			if (clean == NULL) {
2053				/*
2054				 * We're out of dynamic variable space on
2055				 * this CPU.  Unless we have tried all CPUs,
2056				 * we'll try to allocate from a different
2057				 * CPU.
2058				 */
2059				switch (dstate->dtds_state) {
2060				case DTRACE_DSTATE_CLEAN: {
2061					void *sp = &dstate->dtds_state;
2062
2063					if (++cpu >= NCPU)
2064						cpu = 0;
2065
2066					if (dcpu->dtdsc_dirty != NULL &&
2067					    nstate == DTRACE_DSTATE_EMPTY)
2068						nstate = DTRACE_DSTATE_DIRTY;
2069
2070					if (dcpu->dtdsc_rinsing != NULL)
2071						nstate = DTRACE_DSTATE_RINSING;
2072
2073					dcpu = &dstate->dtds_percpu[cpu];
2074
2075					if (cpu != me)
2076						goto retry;
2077
2078					(void) dtrace_cas32(sp,
2079					    DTRACE_DSTATE_CLEAN, nstate);
2080
2081					/*
2082					 * To increment the correct bean
2083					 * counter, take another lap.
2084					 */
2085					goto retry;
2086				}
2087
2088				case DTRACE_DSTATE_DIRTY:
2089					dcpu->dtdsc_dirty_drops++;
2090					break;
2091
2092				case DTRACE_DSTATE_RINSING:
2093					dcpu->dtdsc_rinsing_drops++;
2094					break;
2095
2096				case DTRACE_DSTATE_EMPTY:
2097					dcpu->dtdsc_drops++;
2098					break;
2099				}
2100
2101				DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
2102				return (NULL);
2103			}
2104
2105			/*
2106			 * The clean list appears to be non-empty.  We want to
2107			 * move the clean list to the free list; we start by
2108			 * moving the clean pointer aside.
2109			 */
2110			if (dtrace_casptr(&dcpu->dtdsc_clean,
2111			    clean, NULL) != clean) {
2112				/*
2113				 * We are in one of two situations:
2114				 *
2115				 *  (a)	The clean list was switched to the
2116				 *	free list by another CPU.
2117				 *
2118				 *  (b)	The clean list was added to by the
2119				 *	cleansing cyclic.
2120				 *
2121				 * In either of these situations, we can
2122				 * just reattempt the free list allocation.
2123				 */
2124				goto retry;
2125			}
2126
2127			ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
2128
2129			/*
2130			 * Now we'll move the clean list to our free list.
2131			 * It's impossible for this to fail:  the only way
2132			 * the free list can be updated is through this
2133			 * code path, and only one CPU can own the clean list.
2134			 * Thus, it would only be possible for this to fail if
2135			 * this code were racing with dtrace_dynvar_clean().
2136			 * (That is, if dtrace_dynvar_clean() updated the clean
2137			 * list, and we ended up racing to update the free
2138			 * list.)  This race is prevented by the dtrace_sync()
2139			 * in dtrace_dynvar_clean() -- which flushes the
2140			 * owners of the clean lists out before resetting
2141			 * the clean lists.
2142			 */
2143			dcpu = &dstate->dtds_percpu[me];
2144			rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
2145			ASSERT(rval == NULL);
2146			goto retry;
2147		}
2148
2149		dvar = free;
2150		new_free = dvar->dtdv_next;
2151	} while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
2152
2153	/*
2154	 * We have now allocated a new chunk.  We copy the tuple keys into the
2155	 * tuple array and copy any referenced key data into the data space
2156	 * following the tuple array.  As we do this, we relocate dttk_value
2157	 * in the final tuple to point to the key data address in the chunk.
2158	 */
2159	kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
2160	dvar->dtdv_data = (void *)(kdata + ksize);
2161	dvar->dtdv_tuple.dtt_nkeys = nkeys;
2162
2163	for (i = 0; i < nkeys; i++) {
2164		dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
2165		size_t kesize = key[i].dttk_size;
2166
2167		if (kesize != 0) {
2168			dtrace_bcopy(
2169			    (const void *)(uintptr_t)key[i].dttk_value,
2170			    (void *)kdata, kesize);
2171			dkey->dttk_value = kdata;
2172			kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
2173		} else {
2174			dkey->dttk_value = key[i].dttk_value;
2175		}
2176
2177		dkey->dttk_size = kesize;
2178	}
2179
2180	ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
2181	dvar->dtdv_hashval = hashval;
2182	dvar->dtdv_next = start;
2183
2184	if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
2185		return (dvar);
2186
2187	/*
2188	 * The cas has failed.  Either another CPU is adding an element to
2189	 * this hash chain, or another CPU is deleting an element from this
2190	 * hash chain.  The simplest way to deal with both of these cases
2191	 * (though not necessarily the most efficient) is to free our
2192	 * allocated block and re-attempt it all.  Note that the free is
2193	 * to the dirty list and _not_ to the free list.  This is to prevent
2194	 * races with allocators, above.
2195	 */
2196	dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
2197
2198	dtrace_membar_producer();
2199
2200	do {
2201		free = dcpu->dtdsc_dirty;
2202		dvar->dtdv_next = free;
2203	} while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
2204
2205	goto top;
2206}
2207
2208/*ARGSUSED*/
2209static void
2210dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
2211{
2212	if ((int64_t)nval < (int64_t)*oval)
2213		*oval = nval;
2214}
2215
2216/*ARGSUSED*/
2217static void
2218dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
2219{
2220	if ((int64_t)nval > (int64_t)*oval)
2221		*oval = nval;
2222}
2223
2224static void
2225dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
2226{
2227	int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
2228	int64_t val = (int64_t)nval;
2229
2230	if (val < 0) {
2231		for (i = 0; i < zero; i++) {
2232			if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
2233				quanta[i] += incr;
2234				return;
2235			}
2236		}
2237	} else {
2238		for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
2239			if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
2240				quanta[i - 1] += incr;
2241				return;
2242			}
2243		}
2244
2245		quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
2246		return;
2247	}
2248
2249	ASSERT(0);
2250}
2251
2252static void
2253dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
2254{
2255	uint64_t arg = *lquanta++;
2256	int32_t base = DTRACE_LQUANTIZE_BASE(arg);
2257	uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
2258	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
2259	int32_t val = (int32_t)nval, level;
2260
2261	ASSERT(step != 0);
2262	ASSERT(levels != 0);
2263
2264	if (val < base) {
2265		/*
2266		 * This is an underflow.
2267		 */
2268		lquanta[0] += incr;
2269		return;
2270	}
2271
2272	level = (val - base) / step;
2273
2274	if (level < levels) {
2275		lquanta[level + 1] += incr;
2276		return;
2277	}
2278
2279	/*
2280	 * This is an overflow.
2281	 */
2282	lquanta[levels + 1] += incr;
2283}
2284
2285static int
2286dtrace_aggregate_llquantize_bucket(uint16_t factor, uint16_t low,
2287    uint16_t high, uint16_t nsteps, int64_t value)
2288{
2289	int64_t this = 1, last, next;
2290	int base = 1, order;
2291
2292	ASSERT(factor <= nsteps);
2293	ASSERT(nsteps % factor == 0);
2294
2295	for (order = 0; order < low; order++)
2296		this *= factor;
2297
2298	/*
2299	 * If our value is less than our factor taken to the power of the
2300	 * low order of magnitude, it goes into the zeroth bucket.
2301	 */
2302	if (value < (last = this))
2303		return (0);
2304
2305	for (this *= factor; order <= high; order++) {
2306		int nbuckets = this > nsteps ? nsteps : this;
2307
2308		if ((next = this * factor) < this) {
2309			/*
2310			 * We should not generally get log/linear quantizations
2311			 * with a high magnitude that allows 64-bits to
2312			 * overflow, but we nonetheless protect against this
2313			 * by explicitly checking for overflow, and clamping
2314			 * our value accordingly.
2315			 */
2316			value = this - 1;
2317		}
2318
2319		if (value < this) {
2320			/*
2321			 * If our value lies within this order of magnitude,
2322			 * determine its position by taking the offset within
2323			 * the order of magnitude, dividing by the bucket
2324			 * width, and adding to our (accumulated) base.
2325			 */
2326			return (base + (value - last) / (this / nbuckets));
2327		}
2328
2329		base += nbuckets - (nbuckets / factor);
2330		last = this;
2331		this = next;
2332	}
2333
2334	/*
2335	 * Our value is greater than or equal to our factor taken to the
2336	 * power of one plus the high magnitude -- return the top bucket.
2337	 */
2338	return (base);
2339}
2340
2341static void
2342dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
2343{
2344	uint64_t arg = *llquanta++;
2345	uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg);
2346	uint16_t low = DTRACE_LLQUANTIZE_LOW(arg);
2347	uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg);
2348	uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg);
2349
2350	llquanta[dtrace_aggregate_llquantize_bucket(factor,
2351	    low, high, nsteps, nval)] += incr;
2352}
2353
2354/*ARGSUSED*/
2355static void
2356dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
2357{
2358	data[0]++;
2359	data[1] += nval;
2360}
2361
2362/*ARGSUSED*/
2363static void
2364dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
2365{
2366	int64_t snval = (int64_t)nval;
2367	uint64_t tmp[2];
2368
2369	data[0]++;
2370	data[1] += nval;
2371
2372	/*
2373	 * What we want to say here is:
2374	 *
2375	 * data[2] += nval * nval;
2376	 *
2377	 * But given that nval is 64-bit, we could easily overflow, so
2378	 * we do this as 128-bit arithmetic.
2379	 */
2380	if (snval < 0)
2381		snval = -snval;
2382
2383	dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp);
2384	dtrace_add_128(data + 2, tmp, data + 2);
2385}
2386
2387/*ARGSUSED*/
2388static void
2389dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
2390{
2391	*oval = *oval + 1;
2392}
2393
2394/*ARGSUSED*/
2395static void
2396dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
2397{
2398	*oval += nval;
2399}
2400
2401/*
2402 * Aggregate given the tuple in the principal data buffer, and the aggregating
2403 * action denoted by the specified dtrace_aggregation_t.  The aggregation
2404 * buffer is specified as the buf parameter.  This routine does not return
2405 * failure; if there is no space in the aggregation buffer, the data will be
2406 * dropped, and a corresponding counter incremented.
2407 */
2408static void
2409dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
2410    intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
2411{
2412	dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
2413	uint32_t i, ndx, size, fsize;
2414	uint32_t align = sizeof (uint64_t) - 1;
2415	dtrace_aggbuffer_t *agb;
2416	dtrace_aggkey_t *key;
2417	uint32_t hashval = 0, limit, isstr;
2418	caddr_t tomax, data, kdata;
2419	dtrace_actkind_t action;
2420	dtrace_action_t *act;
2421	uintptr_t offs;
2422
2423	if (buf == NULL)
2424		return;
2425
2426	if (!agg->dtag_hasarg) {
2427		/*
2428		 * Currently, only quantize() and lquantize() take additional
2429		 * arguments, and they have the same semantics:  an increment
2430		 * value that defaults to 1 when not present.  If additional
2431		 * aggregating actions take arguments, the setting of the
2432		 * default argument value will presumably have to become more
2433		 * sophisticated...
2434		 */
2435		arg = 1;
2436	}
2437
2438	action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
2439	size = rec->dtrd_offset - agg->dtag_base;
2440	fsize = size + rec->dtrd_size;
2441
2442	ASSERT(dbuf->dtb_tomax != NULL);
2443	data = dbuf->dtb_tomax + offset + agg->dtag_base;
2444
2445	if ((tomax = buf->dtb_tomax) == NULL) {
2446		dtrace_buffer_drop(buf);
2447		return;
2448	}
2449
2450	/*
2451	 * The metastructure is always at the bottom of the buffer.
2452	 */
2453	agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
2454	    sizeof (dtrace_aggbuffer_t));
2455
2456	if (buf->dtb_offset == 0) {
2457		/*
2458		 * We just kludge up approximately 1/8th of the size to be
2459		 * buckets.  If this guess ends up being routinely
2460		 * off-the-mark, we may need to dynamically readjust this
2461		 * based on past performance.
2462		 */
2463		uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
2464
2465		if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
2466		    (uintptr_t)tomax || hashsize == 0) {
2467			/*
2468			 * We've been given a ludicrously small buffer;
2469			 * increment our drop count and leave.
2470			 */
2471			dtrace_buffer_drop(buf);
2472			return;
2473		}
2474
2475		/*
2476		 * And now, a pathetic attempt to try to get a an odd (or
2477		 * perchance, a prime) hash size for better hash distribution.
2478		 */
2479		if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
2480			hashsize -= DTRACE_AGGHASHSIZE_SLEW;
2481
2482		agb->dtagb_hashsize = hashsize;
2483		agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
2484		    agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
2485		agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
2486
2487		for (i = 0; i < agb->dtagb_hashsize; i++)
2488			agb->dtagb_hash[i] = NULL;
2489	}
2490
2491	ASSERT(agg->dtag_first != NULL);
2492	ASSERT(agg->dtag_first->dta_intuple);
2493
2494	/*
2495	 * Calculate the hash value based on the key.  Note that we _don't_
2496	 * include the aggid in the hashing (but we will store it as part of
2497	 * the key).  The hashing algorithm is Bob Jenkins' "One-at-a-time"
2498	 * algorithm: a simple, quick algorithm that has no known funnels, and
2499	 * gets good distribution in practice.  The efficacy of the hashing
2500	 * algorithm (and a comparison with other algorithms) may be found by
2501	 * running the ::dtrace_aggstat MDB dcmd.
2502	 */
2503	for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2504		i = act->dta_rec.dtrd_offset - agg->dtag_base;
2505		limit = i + act->dta_rec.dtrd_size;
2506		ASSERT(limit <= size);
2507		isstr = DTRACEACT_ISSTRING(act);
2508
2509		for (; i < limit; i++) {
2510			hashval += data[i];
2511			hashval += (hashval << 10);
2512			hashval ^= (hashval >> 6);
2513
2514			if (isstr && data[i] == '\0')
2515				break;
2516		}
2517	}
2518
2519	hashval += (hashval << 3);
2520	hashval ^= (hashval >> 11);
2521	hashval += (hashval << 15);
2522
2523	/*
2524	 * Yes, the divide here is expensive -- but it's generally the least
2525	 * of the performance issues given the amount of data that we iterate
2526	 * over to compute hash values, compare data, etc.
2527	 */
2528	ndx = hashval % agb->dtagb_hashsize;
2529
2530	for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
2531		ASSERT((caddr_t)key >= tomax);
2532		ASSERT((caddr_t)key < tomax + buf->dtb_size);
2533
2534		if (hashval != key->dtak_hashval || key->dtak_size != size)
2535			continue;
2536
2537		kdata = key->dtak_data;
2538		ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
2539
2540		for (act = agg->dtag_first; act->dta_intuple;
2541		    act = act->dta_next) {
2542			i = act->dta_rec.dtrd_offset - agg->dtag_base;
2543			limit = i + act->dta_rec.dtrd_size;
2544			ASSERT(limit <= size);
2545			isstr = DTRACEACT_ISSTRING(act);
2546
2547			for (; i < limit; i++) {
2548				if (kdata[i] != data[i])
2549					goto next;
2550
2551				if (isstr && data[i] == '\0')
2552					break;
2553			}
2554		}
2555
2556		if (action != key->dtak_action) {
2557			/*
2558			 * We are aggregating on the same value in the same
2559			 * aggregation with two different aggregating actions.
2560			 * (This should have been picked up in the compiler,
2561			 * so we may be dealing with errant or devious DIF.)
2562			 * This is an error condition; we indicate as much,
2563			 * and return.
2564			 */
2565			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2566			return;
2567		}
2568
2569		/*
2570		 * This is a hit:  we need to apply the aggregator to
2571		 * the value at this key.
2572		 */
2573		agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
2574		return;
2575next:
2576		continue;
2577	}
2578
2579	/*
2580	 * We didn't find it.  We need to allocate some zero-filled space,
2581	 * link it into the hash table appropriately, and apply the aggregator
2582	 * to the (zero-filled) value.
2583	 */
2584	offs = buf->dtb_offset;
2585	while (offs & (align - 1))
2586		offs += sizeof (uint32_t);
2587
2588	/*
2589	 * If we don't have enough room to both allocate a new key _and_
2590	 * its associated data, increment the drop count and return.
2591	 */
2592	if ((uintptr_t)tomax + offs + fsize >
2593	    agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
2594		dtrace_buffer_drop(buf);
2595		return;
2596	}
2597
2598	/*CONSTCOND*/
2599	ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
2600	key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
2601	agb->dtagb_free -= sizeof (dtrace_aggkey_t);
2602
2603	key->dtak_data = kdata = tomax + offs;
2604	buf->dtb_offset = offs + fsize;
2605
2606	/*
2607	 * Now copy the data across.
2608	 */
2609	*((dtrace_aggid_t *)kdata) = agg->dtag_id;
2610
2611	for (i = sizeof (dtrace_aggid_t); i < size; i++)
2612		kdata[i] = data[i];
2613
2614	/*
2615	 * Because strings are not zeroed out by default, we need to iterate
2616	 * looking for actions that store strings, and we need to explicitly
2617	 * pad these strings out with zeroes.
2618	 */
2619	for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2620		int nul;
2621
2622		if (!DTRACEACT_ISSTRING(act))
2623			continue;
2624
2625		i = act->dta_rec.dtrd_offset - agg->dtag_base;
2626		limit = i + act->dta_rec.dtrd_size;
2627		ASSERT(limit <= size);
2628
2629		for (nul = 0; i < limit; i++) {
2630			if (nul) {
2631				kdata[i] = '\0';
2632				continue;
2633			}
2634
2635			if (data[i] != '\0')
2636				continue;
2637
2638			nul = 1;
2639		}
2640	}
2641
2642	for (i = size; i < fsize; i++)
2643		kdata[i] = 0;
2644
2645	key->dtak_hashval = hashval;
2646	key->dtak_size = size;
2647	key->dtak_action = action;
2648	key->dtak_next = agb->dtagb_hash[ndx];
2649	agb->dtagb_hash[ndx] = key;
2650
2651	/*
2652	 * Finally, apply the aggregator.
2653	 */
2654	*((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
2655	agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
2656}
2657
2658/*
2659 * Given consumer state, this routine finds a speculation in the INACTIVE
2660 * state and transitions it into the ACTIVE state.  If there is no speculation
2661 * in the INACTIVE state, 0 is returned.  In this case, no error counter is
2662 * incremented -- it is up to the caller to take appropriate action.
2663 */
2664static int
2665dtrace_speculation(dtrace_state_t *state)
2666{
2667	int i = 0;
2668	dtrace_speculation_state_t current;
2669	uint32_t *stat = &state->dts_speculations_unavail, count;
2670
2671	while (i < state->dts_nspeculations) {
2672		dtrace_speculation_t *spec = &state->dts_speculations[i];
2673
2674		current = spec->dtsp_state;
2675
2676		if (current != DTRACESPEC_INACTIVE) {
2677			if (current == DTRACESPEC_COMMITTINGMANY ||
2678			    current == DTRACESPEC_COMMITTING ||
2679			    current == DTRACESPEC_DISCARDING)
2680				stat = &state->dts_speculations_busy;
2681			i++;
2682			continue;
2683		}
2684
2685		if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2686		    current, DTRACESPEC_ACTIVE) == current)
2687			return (i + 1);
2688	}
2689
2690	/*
2691	 * We couldn't find a speculation.  If we found as much as a single
2692	 * busy speculation buffer, we'll attribute this failure as "busy"
2693	 * instead of "unavail".
2694	 */
2695	do {
2696		count = *stat;
2697	} while (dtrace_cas32(stat, count, count + 1) != count);
2698
2699	return (0);
2700}
2701
2702/*
2703 * This routine commits an active speculation.  If the specified speculation
2704 * is not in a valid state to perform a commit(), this routine will silently do
2705 * nothing.  The state of the specified speculation is transitioned according
2706 * to the state transition diagram outlined in <sys/dtrace_impl.h>
2707 */
2708static void
2709dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
2710    dtrace_specid_t which)
2711{
2712	dtrace_speculation_t *spec;
2713	dtrace_buffer_t *src, *dest;
2714	uintptr_t daddr, saddr, dlimit, slimit;
2715	dtrace_speculation_state_t current, new;
2716	intptr_t offs;
2717	uint64_t timestamp;
2718
2719	if (which == 0)
2720		return;
2721
2722	if (which > state->dts_nspeculations) {
2723		cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2724		return;
2725	}
2726
2727	spec = &state->dts_speculations[which - 1];
2728	src = &spec->dtsp_buffer[cpu];
2729	dest = &state->dts_buffer[cpu];
2730
2731	do {
2732		current = spec->dtsp_state;
2733
2734		if (current == DTRACESPEC_COMMITTINGMANY)
2735			break;
2736
2737		switch (current) {
2738		case DTRACESPEC_INACTIVE:
2739		case DTRACESPEC_DISCARDING:
2740			return;
2741
2742		case DTRACESPEC_COMMITTING:
2743			/*
2744			 * This is only possible if we are (a) commit()'ing
2745			 * without having done a prior speculate() on this CPU
2746			 * and (b) racing with another commit() on a different
2747			 * CPU.  There's nothing to do -- we just assert that
2748			 * our offset is 0.
2749			 */
2750			ASSERT(src->dtb_offset == 0);
2751			return;
2752
2753		case DTRACESPEC_ACTIVE:
2754			new = DTRACESPEC_COMMITTING;
2755			break;
2756
2757		case DTRACESPEC_ACTIVEONE:
2758			/*
2759			 * This speculation is active on one CPU.  If our
2760			 * buffer offset is non-zero, we know that the one CPU
2761			 * must be us.  Otherwise, we are committing on a
2762			 * different CPU from the speculate(), and we must
2763			 * rely on being asynchronously cleaned.
2764			 */
2765			if (src->dtb_offset != 0) {
2766				new = DTRACESPEC_COMMITTING;
2767				break;
2768			}
2769			/*FALLTHROUGH*/
2770
2771		case DTRACESPEC_ACTIVEMANY:
2772			new = DTRACESPEC_COMMITTINGMANY;
2773			break;
2774
2775		default:
2776			ASSERT(0);
2777		}
2778	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2779	    current, new) != current);
2780
2781	/*
2782	 * We have set the state to indicate that we are committing this
2783	 * speculation.  Now reserve the necessary space in the destination
2784	 * buffer.
2785	 */
2786	if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
2787	    sizeof (uint64_t), state, NULL)) < 0) {
2788		dtrace_buffer_drop(dest);
2789		goto out;
2790	}
2791
2792	/*
2793	 * We have sufficient space to copy the speculative buffer into the
2794	 * primary buffer.  First, modify the speculative buffer, filling
2795	 * in the timestamp of all entries with the current time.  The data
2796	 * must have the commit() time rather than the time it was traced,
2797	 * so that all entries in the primary buffer are in timestamp order.
2798	 */
2799	timestamp = dtrace_gethrtime();
2800	saddr = (uintptr_t)src->dtb_tomax;
2801	slimit = saddr + src->dtb_offset;
2802	while (saddr < slimit) {
2803		size_t size;
2804		dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;
2805
2806		if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
2807			saddr += sizeof (dtrace_epid_t);
2808			continue;
2809		}
2810		ASSERT3U(dtrh->dtrh_epid, <=, state->dts_necbs);
2811		size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;
2812
2813		ASSERT3U(saddr + size, <=, slimit);
2814		ASSERT3U(size, >=, sizeof (dtrace_rechdr_t));
2815		ASSERT3U(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh), ==, UINT64_MAX);
2816
2817		DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);
2818
2819		saddr += size;
2820	}
2821
2822	/*
2823	 * Copy the buffer across.  (Note that this is a
2824	 * highly subobtimal bcopy(); in the unlikely event that this becomes
2825	 * a serious performance issue, a high-performance DTrace-specific
2826	 * bcopy() should obviously be invented.)
2827	 */
2828	daddr = (uintptr_t)dest->dtb_tomax + offs;
2829	dlimit = daddr + src->dtb_offset;
2830	saddr = (uintptr_t)src->dtb_tomax;
2831
2832	/*
2833	 * First, the aligned portion.
2834	 */
2835	while (dlimit - daddr >= sizeof (uint64_t)) {
2836		*((uint64_t *)daddr) = *((uint64_t *)saddr);
2837
2838		daddr += sizeof (uint64_t);
2839		saddr += sizeof (uint64_t);
2840	}
2841
2842	/*
2843	 * Now any left-over bit...
2844	 */
2845	while (dlimit - daddr)
2846		*((uint8_t *)daddr++) = *((uint8_t *)saddr++);
2847
2848	/*
2849	 * Finally, commit the reserved space in the destination buffer.
2850	 */
2851	dest->dtb_offset = offs + src->dtb_offset;
2852
2853out:
2854	/*
2855	 * If we're lucky enough to be the only active CPU on this speculation
2856	 * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
2857	 */
2858	if (current == DTRACESPEC_ACTIVE ||
2859	    (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
2860		uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
2861		    DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
2862
2863		ASSERT(rval == DTRACESPEC_COMMITTING);
2864	}
2865
2866	src->dtb_offset = 0;
2867	src->dtb_xamot_drops += src->dtb_drops;
2868	src->dtb_drops = 0;
2869}
2870
2871/*
2872 * This routine discards an active speculation.  If the specified speculation
2873 * is not in a valid state to perform a discard(), this routine will silently
2874 * do nothing.  The state of the specified speculation is transitioned
2875 * according to the state transition diagram outlined in <sys/dtrace_impl.h>
2876 */
2877static void
2878dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
2879    dtrace_specid_t which)
2880{
2881	dtrace_speculation_t *spec;
2882	dtrace_speculation_state_t current, new;
2883	dtrace_buffer_t *buf;
2884
2885	if (which == 0)
2886		return;
2887
2888	if (which > state->dts_nspeculations) {
2889		cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2890		return;
2891	}
2892
2893	spec = &state->dts_speculations[which - 1];
2894	buf = &spec->dtsp_buffer[cpu];
2895
2896	do {
2897		current = spec->dtsp_state;
2898
2899		switch (current) {
2900		case DTRACESPEC_INACTIVE:
2901		case DTRACESPEC_COMMITTINGMANY:
2902		case DTRACESPEC_COMMITTING:
2903		case DTRACESPEC_DISCARDING:
2904			return;
2905
2906		case DTRACESPEC_ACTIVE:
2907		case DTRACESPEC_ACTIVEMANY:
2908			new = DTRACESPEC_DISCARDING;
2909			break;
2910
2911		case DTRACESPEC_ACTIVEONE:
2912			if (buf->dtb_offset != 0) {
2913				new = DTRACESPEC_INACTIVE;
2914			} else {
2915				new = DTRACESPEC_DISCARDING;
2916			}
2917			break;
2918
2919		default:
2920			ASSERT(0);
2921		}
2922	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2923	    current, new) != current);
2924
2925	buf->dtb_offset = 0;
2926	buf->dtb_drops = 0;
2927}
2928
2929/*
2930 * Note:  not called from probe context.  This function is called
2931 * asynchronously from cross call context to clean any speculations that are
2932 * in the COMMITTINGMANY or DISCARDING states.  These speculations may not be
2933 * transitioned back to the INACTIVE state until all CPUs have cleaned the
2934 * speculation.
2935 */
2936static void
2937dtrace_speculation_clean_here(dtrace_state_t *state)
2938{
2939	dtrace_icookie_t cookie;
2940	processorid_t cpu = CPU->cpu_id;
2941	dtrace_buffer_t *dest = &state->dts_buffer[cpu];
2942	dtrace_specid_t i;
2943
2944	cookie = dtrace_interrupt_disable();
2945
2946	if (dest->dtb_tomax == NULL) {
2947		dtrace_interrupt_enable(cookie);
2948		return;
2949	}
2950
2951	for (i = 0; i < state->dts_nspeculations; i++) {
2952		dtrace_speculation_t *spec = &state->dts_speculations[i];
2953		dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
2954
2955		if (src->dtb_tomax == NULL)
2956			continue;
2957
2958		if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
2959			src->dtb_offset = 0;
2960			continue;
2961		}
2962
2963		if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2964			continue;
2965
2966		if (src->dtb_offset == 0)
2967			continue;
2968
2969		dtrace_speculation_commit(state, cpu, i + 1);
2970	}
2971
2972	dtrace_interrupt_enable(cookie);
2973}
2974
2975/*
2976 * Note:  not called from probe context.  This function is called
2977 * asynchronously (and at a regular interval) to clean any speculations that
2978 * are in the COMMITTINGMANY or DISCARDING states.  If it discovers that there
2979 * is work to be done, it cross calls all CPUs to perform that work;
2980 * COMMITMANY and DISCARDING speculations may not be transitioned back to the
2981 * INACTIVE state until they have been cleaned by all CPUs.
2982 */
2983static void
2984dtrace_speculation_clean(dtrace_state_t *state)
2985{
2986	int work = 0, rv;
2987	dtrace_specid_t i;
2988
2989	for (i = 0; i < state->dts_nspeculations; i++) {
2990		dtrace_speculation_t *spec = &state->dts_speculations[i];
2991
2992		ASSERT(!spec->dtsp_cleaning);
2993
2994		if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
2995		    spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2996			continue;
2997
2998		work++;
2999		spec->dtsp_cleaning = 1;
3000	}
3001
3002	if (!work)
3003		return;
3004
3005	dtrace_xcall(DTRACE_CPUALL,
3006	    (dtrace_xcall_t)dtrace_speculation_clean_here, state);
3007
3008	/*
3009	 * We now know that all CPUs have committed or discarded their
3010	 * speculation buffers, as appropriate.  We can now set the state
3011	 * to inactive.
3012	 */
3013	for (i = 0; i < state->dts_nspeculations; i++) {
3014		dtrace_speculation_t *spec = &state->dts_speculations[i];
3015		dtrace_speculation_state_t current, new;
3016
3017		if (!spec->dtsp_cleaning)
3018			continue;
3019
3020		current = spec->dtsp_state;
3021		ASSERT(current == DTRACESPEC_DISCARDING ||
3022		    current == DTRACESPEC_COMMITTINGMANY);
3023
3024		new = DTRACESPEC_INACTIVE;
3025
3026		rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
3027		ASSERT(rv == current);
3028		spec->dtsp_cleaning = 0;
3029	}
3030}
3031
3032/*
3033 * Called as part of a speculate() to get the speculative buffer associated
3034 * with a given speculation.  Returns NULL if the specified speculation is not
3035 * in an ACTIVE state.  If the speculation is in the ACTIVEONE state -- and
3036 * the active CPU is not the specified CPU -- the speculation will be
3037 * atomically transitioned into the ACTIVEMANY state.
3038 */
3039static dtrace_buffer_t *
3040dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
3041    dtrace_specid_t which)
3042{
3043	dtrace_speculation_t *spec;
3044	dtrace_speculation_state_t current, new;
3045	dtrace_buffer_t *buf;
3046
3047	if (which == 0)
3048		return (NULL);
3049
3050	if (which > state->dts_nspeculations) {
3051		cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3052		return (NULL);
3053	}
3054
3055	spec = &state->dts_speculations[which - 1];
3056	buf = &spec->dtsp_buffer[cpuid];
3057
3058	do {
3059		current = spec->dtsp_state;
3060
3061		switch (current) {
3062		case DTRACESPEC_INACTIVE:
3063		case DTRACESPEC_COMMITTINGMANY:
3064		case DTRACESPEC_DISCARDING:
3065			return (NULL);
3066
3067		case DTRACESPEC_COMMITTING:
3068			ASSERT(buf->dtb_offset == 0);
3069			return (NULL);
3070
3071		case DTRACESPEC_ACTIVEONE:
3072			/*
3073			 * This speculation is currently active on one CPU.
3074			 * Check the offset in the buffer; if it's non-zero,
3075			 * that CPU must be us (and we leave the state alone).
3076			 * If it's zero, assume that we're starting on a new
3077			 * CPU -- and change the state to indicate that the
3078			 * speculation is active on more than one CPU.
3079			 */
3080			if (buf->dtb_offset != 0)
3081				return (buf);
3082
3083			new = DTRACESPEC_ACTIVEMANY;
3084			break;
3085
3086		case DTRACESPEC_ACTIVEMANY:
3087			return (buf);
3088
3089		case DTRACESPEC_ACTIVE:
3090			new = DTRACESPEC_ACTIVEONE;
3091			break;
3092
3093		default:
3094			ASSERT(0);
3095		}
3096	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
3097	    current, new) != current);
3098
3099	ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
3100	return (buf);
3101}
3102
3103/*
3104 * Return a string.  In the event that the user lacks the privilege to access
3105 * arbitrary kernel memory, we copy the string out to scratch memory so that we
3106 * don't fail access checking.
3107 *
3108 * dtrace_dif_variable() uses this routine as a helper for various
3109 * builtin values such as 'execname' and 'probefunc.'
3110 */
3111uintptr_t
3112dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
3113    dtrace_mstate_t *mstate)
3114{
3115	uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3116	uintptr_t ret;
3117	size_t strsz;
3118
3119	/*
3120	 * The easy case: this probe is allowed to read all of memory, so
3121	 * we can just return this as a vanilla pointer.
3122	 */
3123	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
3124		return (addr);
3125
3126	/*
3127	 * This is the tougher case: we copy the string in question from
3128	 * kernel memory into scratch memory and return it that way: this
3129	 * ensures that we won't trip up when access checking tests the
3130	 * BYREF return value.
3131	 */
3132	strsz = dtrace_strlen((char *)addr, size) + 1;
3133
3134	if (mstate->dtms_scratch_ptr + strsz >
3135	    mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
3136		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3137		return (NULL);
3138	}
3139
3140	dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
3141	    strsz);
3142	ret = mstate->dtms_scratch_ptr;
3143	mstate->dtms_scratch_ptr += strsz;
3144	return (ret);
3145}
3146
3147/*
3148 * This function implements the DIF emulator's variable lookups.  The emulator
3149 * passes a reserved variable identifier and optional built-in array index.
3150 */
3151static uint64_t
3152dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
3153    uint64_t ndx)
3154{
3155	/*
3156	 * If we're accessing one of the uncached arguments, we'll turn this
3157	 * into a reference in the args array.
3158	 */
3159	if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
3160		ndx = v - DIF_VAR_ARG0;
3161		v = DIF_VAR_ARGS;
3162	}
3163
3164	switch (v) {
3165	case DIF_VAR_ARGS:
3166		if (!(mstate->dtms_access & DTRACE_ACCESS_ARGS)) {
3167			cpu_core[CPU->cpu_id].cpuc_dtrace_flags |=
3168			    CPU_DTRACE_KPRIV;
3169			return (0);
3170		}
3171
3172		ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
3173		if (ndx >= sizeof (mstate->dtms_arg) /
3174		    sizeof (mstate->dtms_arg[0])) {
3175			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3176			dtrace_provider_t *pv;
3177			uint64_t val;
3178
3179			pv = mstate->dtms_probe->dtpr_provider;
3180			if (pv->dtpv_pops.dtps_getargval != NULL)
3181				val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
3182				    mstate->dtms_probe->dtpr_id,
3183				    mstate->dtms_probe->dtpr_arg, ndx, aframes);
3184			else
3185				val = dtrace_getarg(ndx, aframes);
3186
3187			/*
3188			 * This is regrettably required to keep the compiler
3189			 * from tail-optimizing the call to dtrace_getarg().
3190			 * The condition always evaluates to true, but the
3191			 * compiler has no way of figuring that out a priori.
3192			 * (None of this would be necessary if the compiler
3193			 * could be relied upon to _always_ tail-optimize
3194			 * the call to dtrace_getarg() -- but it can't.)
3195			 */
3196			if (mstate->dtms_probe != NULL)
3197				return (val);
3198
3199			ASSERT(0);
3200		}
3201
3202		return (mstate->dtms_arg[ndx]);
3203
3204	case DIF_VAR_UREGS: {
3205		klwp_t *lwp;
3206
3207		if (!dtrace_priv_proc(state, mstate))
3208			return (0);
3209
3210		if ((lwp = curthread->t_lwp) == NULL) {
3211			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
3212			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = NULL;
3213			return (0);
3214		}
3215
3216		return (dtrace_getreg(lwp->lwp_regs, ndx));
3217	}
3218
3219	case DIF_VAR_VMREGS: {
3220		uint64_t rval;
3221
3222		if (!dtrace_priv_kernel(state))
3223			return (0);
3224
3225		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3226
3227		rval = dtrace_getvmreg(ndx,
3228		    &cpu_core[CPU->cpu_id].cpuc_dtrace_flags);
3229
3230		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3231
3232		return (rval);
3233	}
3234
3235	case DIF_VAR_CURTHREAD:
3236		if (!dtrace_priv_proc(state, mstate))
3237			return (0);
3238		return ((uint64_t)(uintptr_t)curthread);
3239
3240	case DIF_VAR_TIMESTAMP:
3241		if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
3242			mstate->dtms_timestamp = dtrace_gethrtime();
3243			mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
3244		}
3245		return (mstate->dtms_timestamp);
3246
3247	case DIF_VAR_VTIMESTAMP:
3248		ASSERT(dtrace_vtime_references != 0);
3249		return (curthread->t_dtrace_vtime);
3250
3251	case DIF_VAR_WALLTIMESTAMP:
3252		if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
3253			mstate->dtms_walltimestamp = dtrace_gethrestime();
3254			mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
3255		}
3256		return (mstate->dtms_walltimestamp);
3257
3258	case DIF_VAR_IPL:
3259		if (!dtrace_priv_kernel(state))
3260			return (0);
3261		if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
3262			mstate->dtms_ipl = dtrace_getipl();
3263			mstate->dtms_present |= DTRACE_MSTATE_IPL;
3264		}
3265		return (mstate->dtms_ipl);
3266
3267	case DIF_VAR_EPID:
3268		ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
3269		return (mstate->dtms_epid);
3270
3271	case DIF_VAR_ID:
3272		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3273		return (mstate->dtms_probe->dtpr_id);
3274
3275	case DIF_VAR_STACKDEPTH:
3276		if (!dtrace_priv_kernel(state))
3277			return (0);
3278		if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
3279			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3280
3281			mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
3282			mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
3283		}
3284		return (mstate->dtms_stackdepth);
3285
3286	case DIF_VAR_USTACKDEPTH:
3287		if (!dtrace_priv_proc(state, mstate))
3288			return (0);
3289		if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
3290			/*
3291			 * See comment in DIF_VAR_PID.
3292			 */
3293			if (DTRACE_ANCHORED(mstate->dtms_probe) &&
3294			    CPU_ON_INTR(CPU)) {
3295				mstate->dtms_ustackdepth = 0;
3296			} else {
3297				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3298				mstate->dtms_ustackdepth =
3299				    dtrace_getustackdepth();
3300				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3301			}
3302			mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
3303		}
3304		return (mstate->dtms_ustackdepth);
3305
3306	case DIF_VAR_CALLER:
3307		if (!dtrace_priv_kernel(state))
3308			return (0);
3309		if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
3310			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3311
3312			if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
3313				/*
3314				 * If this is an unanchored probe, we are
3315				 * required to go through the slow path:
3316				 * dtrace_caller() only guarantees correct
3317				 * results for anchored probes.
3318				 */
3319				pc_t caller[2];
3320
3321				dtrace_getpcstack(caller, 2, aframes,
3322				    (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
3323				mstate->dtms_caller = caller[1];
3324			} else if ((mstate->dtms_caller =
3325			    dtrace_caller(aframes)) == -1) {
3326				/*
3327				 * We have failed to do this the quick way;
3328				 * we must resort to the slower approach of
3329				 * calling dtrace_getpcstack().
3330				 */
3331				pc_t caller;
3332
3333				dtrace_getpcstack(&caller, 1, aframes, NULL);
3334				mstate->dtms_caller = caller;
3335			}
3336
3337			mstate->dtms_present |= DTRACE_MSTATE_CALLER;
3338		}
3339		return (mstate->dtms_caller);
3340
3341	case DIF_VAR_UCALLER:
3342		if (!dtrace_priv_proc(state, mstate))
3343			return (0);
3344
3345		if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
3346			uint64_t ustack[3];
3347
3348			/*
3349			 * dtrace_getupcstack() fills in the first uint64_t
3350			 * with the current PID.  The second uint64_t will
3351			 * be the program counter at user-level.  The third
3352			 * uint64_t will contain the caller, which is what
3353			 * we're after.
3354			 */
3355			ustack[2] = NULL;
3356			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3357			dtrace_getupcstack(ustack, 3);
3358			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3359			mstate->dtms_ucaller = ustack[2];
3360			mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
3361		}
3362
3363		return (mstate->dtms_ucaller);
3364
3365	case DIF_VAR_PROBEPROV:
3366		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3367		return (dtrace_dif_varstr(
3368		    (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
3369		    state, mstate));
3370
3371	case DIF_VAR_PROBEMOD:
3372		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3373		return (dtrace_dif_varstr(
3374		    (uintptr_t)mstate->dtms_probe->dtpr_mod,
3375		    state, mstate));
3376
3377	case DIF_VAR_PROBEFUNC:
3378		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3379		return (dtrace_dif_varstr(
3380		    (uintptr_t)mstate->dtms_probe->dtpr_func,
3381		    state, mstate));
3382
3383	case DIF_VAR_PROBENAME:
3384		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3385		return (dtrace_dif_varstr(
3386		    (uintptr_t)mstate->dtms_probe->dtpr_name,
3387		    state, mstate));
3388
3389	case DIF_VAR_PID:
3390		if (!dtrace_priv_proc(state, mstate))
3391			return (0);
3392
3393		/*
3394		 * Note that we are assuming that an unanchored probe is
3395		 * always due to a high-level interrupt.  (And we're assuming
3396		 * that there is only a single high level interrupt.)
3397		 */
3398		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3399			return (pid0.pid_id);
3400
3401		/*
3402		 * It is always safe to dereference one's own t_procp pointer:
3403		 * it always points to a valid, allocated proc structure.
3404		 * Further, it is always safe to dereference the p_pidp member
3405		 * of one's own proc structure.  (These are truisms becuase
3406		 * threads and processes don't clean up their own state --
3407		 * they leave that task to whomever reaps them.)
3408		 */
3409		return ((uint64_t)curthread->t_procp->p_pidp->pid_id);
3410
3411	case DIF_VAR_PPID:
3412		if (!dtrace_priv_proc(state, mstate))
3413			return (0);
3414
3415		/*
3416		 * See comment in DIF_VAR_PID.
3417		 */
3418		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3419			return (pid0.pid_id);
3420
3421		/*
3422		 * It is always safe to dereference one's own t_procp pointer:
3423		 * it always points to a valid, allocated proc structure.
3424		 * (This is true because threads don't clean up their own
3425		 * state -- they leave that task to whomever reaps them.)
3426		 */
3427		return ((uint64_t)curthread->t_procp->p_ppid);
3428
3429	case DIF_VAR_TID:
3430		/*
3431		 * See comment in DIF_VAR_PID.
3432		 */
3433		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3434			return (0);
3435
3436		return ((uint64_t)curthread->t_tid);
3437
3438	case DIF_VAR_EXECNAME:
3439		if (!dtrace_priv_proc(state, mstate))
3440			return (0);
3441
3442		/*
3443		 * See comment in DIF_VAR_PID.
3444		 */
3445		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3446			return ((uint64_t)(uintptr_t)p0.p_user.u_comm);
3447
3448		/*
3449		 * It is always safe to dereference one's own t_procp pointer:
3450		 * it always points to a valid, allocated proc structure.
3451		 * (This is true because threads don't clean up their own
3452		 * state -- they leave that task to whomever reaps them.)
3453		 */
3454		return (dtrace_dif_varstr(
3455		    (uintptr_t)curthread->t_procp->p_user.u_comm,
3456		    state, mstate));
3457
3458	case DIF_VAR_ZONENAME:
3459		if (!dtrace_priv_proc(state, mstate))
3460			return (0);
3461
3462		/*
3463		 * See comment in DIF_VAR_PID.
3464		 */
3465		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3466			return ((uint64_t)(uintptr_t)p0.p_zone->zone_name);
3467
3468		/*
3469		 * It is always safe to dereference one's own t_procp pointer:
3470		 * it always points to a valid, allocated proc structure.
3471		 * (This is true because threads don't clean up their own
3472		 * state -- they leave that task to whomever reaps them.)
3473		 */
3474		return (dtrace_dif_varstr(
3475		    (uintptr_t)curthread->t_procp->p_zone->zone_name,
3476		    state, mstate));
3477
3478	case DIF_VAR_UID:
3479		if (!dtrace_priv_proc(state, mstate))
3480			return (0);
3481
3482		/*
3483		 * See comment in DIF_VAR_PID.
3484		 */
3485		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3486			return ((uint64_t)p0.p_cred->cr_uid);
3487
3488		/*
3489		 * It is always safe to dereference one's own t_procp pointer:
3490		 * it always points to a valid, allocated proc structure.
3491		 * (This is true because threads don't clean up their own
3492		 * state -- they leave that task to whomever reaps them.)
3493		 *
3494		 * Additionally, it is safe to dereference one's own process
3495		 * credential, since this is never NULL after process birth.
3496		 */
3497		return ((uint64_t)curthread->t_procp->p_cred->cr_uid);
3498
3499	case DIF_VAR_GID:
3500		if (!dtrace_priv_proc(state, mstate))
3501			return (0);
3502
3503		/*
3504		 * See comment in DIF_VAR_PID.
3505		 */
3506		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3507			return ((uint64_t)p0.p_cred->cr_gid);
3508
3509		/*
3510		 * It is always safe to dereference one's own t_procp pointer:
3511		 * it always points to a valid, allocated proc structure.
3512		 * (This is true because threads don't clean up their own
3513		 * state -- they leave that task to whomever reaps them.)
3514		 *
3515		 * Additionally, it is safe to dereference one's own process
3516		 * credential, since this is never NULL after process birth.
3517		 */
3518		return ((uint64_t)curthread->t_procp->p_cred->cr_gid);
3519
3520	case DIF_VAR_ERRNO: {
3521		klwp_t *lwp;
3522		if (!dtrace_priv_proc(state, mstate))
3523			return (0);
3524
3525		/*
3526		 * See comment in DIF_VAR_PID.
3527		 */
3528		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3529			return (0);
3530
3531		/*
3532		 * It is always safe to dereference one's own t_lwp pointer in
3533		 * the event that this pointer is non-NULL.  (This is true
3534		 * because threads and lwps don't clean up their own state --
3535		 * they leave that task to whomever reaps them.)
3536		 */
3537		if ((lwp = curthread->t_lwp) == NULL)
3538			return (0);
3539
3540		return ((uint64_t)lwp->lwp_errno);
3541	}
3542	default:
3543		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3544		return (0);
3545	}
3546}
3547
3548
3549typedef enum dtrace_json_state {
3550	DTRACE_JSON_REST = 1,
3551	DTRACE_JSON_OBJECT,
3552	DTRACE_JSON_STRING,
3553	DTRACE_JSON_STRING_ESCAPE,
3554	DTRACE_JSON_STRING_ESCAPE_UNICODE,
3555	DTRACE_JSON_COLON,
3556	DTRACE_JSON_COMMA,
3557	DTRACE_JSON_VALUE,
3558	DTRACE_JSON_IDENTIFIER,
3559	DTRACE_JSON_NUMBER,
3560	DTRACE_JSON_NUMBER_FRAC,
3561	DTRACE_JSON_NUMBER_EXP,
3562	DTRACE_JSON_COLLECT_OBJECT
3563} dtrace_json_state_t;
3564
3565/*
3566 * This function possesses just enough knowledge about JSON to extract a single
3567 * value from a JSON string and store it in the scratch buffer.  It is able
3568 * to extract nested object values, and members of arrays by index.
3569 *
3570 * elemlist is a list of JSON keys, stored as packed NUL-terminated strings, to
3571 * be looked up as we descend into the object tree.  e.g.
3572 *
3573 *    foo[0].bar.baz[32] --> "foo" NUL "0" NUL "bar" NUL "baz" NUL "32" NUL
3574 *       with nelems = 5.
3575 *
3576 * The run time of this function must be bounded above by strsize to limit the
3577 * amount of work done in probe context.  As such, it is implemented as a
3578 * simple state machine, reading one character at a time using safe loads
3579 * until we find the requested element, hit a parsing error or run off the
3580 * end of the object or string.
3581 *
3582 * As there is no way for a subroutine to return an error without interrupting
3583 * clause execution, we simply return NULL in the event of a missing key or any
3584 * other error condition.  Each NULL return in this function is commented with
3585 * the error condition it represents -- parsing or otherwise.
3586 *
3587 * The set of states for the state machine closely matches the JSON
3588 * specification (http://json.org/).  Briefly:
3589 *
3590 *   DTRACE_JSON_REST:
3591 *     Skip whitespace until we find either a top-level Object, moving
3592 *     to DTRACE_JSON_OBJECT; or an Array, moving to DTRACE_JSON_VALUE.
3593 *
3594 *   DTRACE_JSON_OBJECT:
3595 *     Locate the next key String in an Object.  Sets a flag to denote
3596 *     the next String as a key string and moves to DTRACE_JSON_STRING.
3597 *
3598 *   DTRACE_JSON_COLON:
3599 *     Skip whitespace until we find the colon that separates key Strings
3600 *     from their values.  Once found, move to DTRACE_JSON_VALUE.
3601 *
3602 *   DTRACE_JSON_VALUE:
3603 *     Detects the type of the next value (String, Number, Identifier, Object
3604 *     or Array) and routes to the states that process that type.  Here we also
3605 *     deal with the element selector list if we are requested to traverse down
3606 *     into the object tree.
3607 *
3608 *   DTRACE_JSON_COMMA:
3609 *     Skip whitespace until we find the comma that separates key-value pairs
3610 *     in Objects (returning to DTRACE_JSON_OBJECT) or values in Arrays
3611 *     (similarly DTRACE_JSON_VALUE).  All following literal value processing
3612 *     states return to this state at the end of their value, unless otherwise
3613 *     noted.
3614 *
3615 *   DTRACE_JSON_NUMBER, DTRACE_JSON_NUMBER_FRAC, DTRACE_JSON_NUMBER_EXP:
3616 *     Processes a Number literal from the JSON, including any exponent
3617 *     component that may be present.  Numbers are returned as strings, which
3618 *     may be passed to strtoll() if an integer is required.
3619 *
3620 *   DTRACE_JSON_IDENTIFIER:
3621 *     Processes a "true", "false" or "null" literal in the JSON.
3622 *
3623 *   DTRACE_JSON_STRING, DTRACE_JSON_STRING_ESCAPE,
3624 *   DTRACE_JSON_STRING_ESCAPE_UNICODE:
3625 *     Processes a String literal from the JSON, whether the String denotes
3626 *     a key, a value or part of a larger Object.  Handles all escape sequences
3627 *     present in the specification, including four-digit unicode characters,
3628 *     but merely includes the escape sequence without converting it to the
3629 *     actual escaped character.  If the String is flagged as a key, we
3630 *     move to DTRACE_JSON_COLON rather than DTRACE_JSON_COMMA.
3631 *
3632 *   DTRACE_JSON_COLLECT_OBJECT:
3633 *     This state collects an entire Object (or Array), correctly handling
3634 *     embedded strings.  If the full element selector list matches this nested
3635 *     object, we return the Object in full as a string.  If not, we use this
3636 *     state to skip to the next value at this level and continue processing.
3637 *
3638 * NOTE: This function uses various macros from strtolctype.h to manipulate
3639 * digit values, etc -- these have all been checked to ensure they make
3640 * no additional function calls.
3641 */
3642static char *
3643dtrace_json(uint64_t size, uintptr_t json, char *elemlist, int nelems,
3644    char *dest)
3645{
3646	dtrace_json_state_t state = DTRACE_JSON_REST;
3647	int64_t array_elem = INT64_MIN;
3648	int64_t array_pos = 0;
3649	uint8_t escape_unicount = 0;
3650	boolean_t string_is_key = B_FALSE;
3651	boolean_t collect_object = B_FALSE;
3652	boolean_t found_key = B_FALSE;
3653	boolean_t in_array = B_FALSE;
3654	uint32_t braces = 0, brackets = 0;
3655	char *elem = elemlist;
3656	char *dd = dest;
3657	uintptr_t cur;
3658
3659	for (cur = json; cur < json + size; cur++) {
3660		char cc = dtrace_load8(cur);
3661		if (cc == '\0')
3662			return (NULL);
3663
3664		switch (state) {
3665		case DTRACE_JSON_REST:
3666			if (isspace(cc))
3667				break;
3668
3669			if (cc == '{') {
3670				state = DTRACE_JSON_OBJECT;
3671				break;
3672			}
3673
3674			if (cc == '[') {
3675				in_array = B_TRUE;
3676				array_pos = 0;
3677				array_elem = dtrace_strtoll(elem, 10, size);
3678				found_key = array_elem == 0 ? B_TRUE : B_FALSE;
3679				state = DTRACE_JSON_VALUE;
3680				break;
3681			}
3682
3683			/*
3684			 * ERROR: expected to find a top-level object or array.
3685			 */
3686			return (NULL);
3687		case DTRACE_JSON_OBJECT:
3688			if (isspace(cc))
3689				break;
3690
3691			if (cc == '"') {
3692				state = DTRACE_JSON_STRING;
3693				string_is_key = B_TRUE;
3694				break;
3695			}
3696
3697			/*
3698			 * ERROR: either the object did not start with a key
3699			 * string, or we've run off the end of the object
3700			 * without finding the requested key.
3701			 */
3702			return (NULL);
3703		case DTRACE_JSON_STRING:
3704			if (cc == '\\') {
3705				*dd++ = '\\';
3706				state = DTRACE_JSON_STRING_ESCAPE;
3707				break;
3708			}
3709
3710			if (cc == '"') {
3711				if (collect_object) {
3712					/*
3713					 * We don't reset the dest here, as
3714					 * the string is part of a larger
3715					 * object being collected.
3716					 */
3717					*dd++ = cc;
3718					collect_object = B_FALSE;
3719					state = DTRACE_JSON_COLLECT_OBJECT;
3720					break;
3721				}
3722				*dd = '\0';
3723				dd = dest; /* reset string buffer */
3724				if (string_is_key) {
3725					if (dtrace_strncmp(dest, elem,
3726					    size) == 0)
3727						found_key = B_TRUE;
3728				} else if (found_key) {
3729					if (nelems > 1) {
3730						/*
3731						 * We expected an object, not
3732						 * this string.
3733						 */
3734						return (NULL);
3735					}
3736					return (dest);
3737				}
3738				state = string_is_key ? DTRACE_JSON_COLON :
3739				    DTRACE_JSON_COMMA;
3740				string_is_key = B_FALSE;
3741				break;
3742			}
3743
3744			*dd++ = cc;
3745			break;
3746		case DTRACE_JSON_STRING_ESCAPE:
3747			*dd++ = cc;
3748			if (cc == 'u') {
3749				escape_unicount = 0;
3750				state = DTRACE_JSON_STRING_ESCAPE_UNICODE;
3751			} else {
3752				state = DTRACE_JSON_STRING;
3753			}
3754			break;
3755		case DTRACE_JSON_STRING_ESCAPE_UNICODE:
3756			if (!isxdigit(cc)) {
3757				/*
3758				 * ERROR: invalid unicode escape, expected
3759				 * four valid hexidecimal digits.
3760				 */
3761				return (NULL);
3762			}
3763
3764			*dd++ = cc;
3765			if (++escape_unicount == 4)
3766				state = DTRACE_JSON_STRING;
3767			break;
3768		case DTRACE_JSON_COLON:
3769			if (isspace(cc))
3770				break;
3771
3772			if (cc == ':') {
3773				state = DTRACE_JSON_VALUE;
3774				break;
3775			}
3776
3777			/*
3778			 * ERROR: expected a colon.
3779			 */
3780			return (NULL);
3781		case DTRACE_JSON_COMMA:
3782			if (isspace(cc))
3783				break;
3784
3785			if (cc == ',') {
3786				if (in_array) {
3787					state = DTRACE_JSON_VALUE;
3788					if (++array_pos == array_elem)
3789						found_key = B_TRUE;
3790				} else {
3791					state = DTRACE_JSON_OBJECT;
3792				}
3793				break;
3794			}
3795
3796			/*
3797			 * ERROR: either we hit an unexpected character, or
3798			 * we reached the end of the object or array without
3799			 * finding the requested key.
3800			 */
3801			return (NULL);
3802		case DTRACE_JSON_IDENTIFIER:
3803			if (islower(cc)) {
3804				*dd++ = cc;
3805				break;
3806			}
3807
3808			*dd = '\0';
3809			dd = dest; /* reset string buffer */
3810
3811			if (dtrace_strncmp(dest, "true", 5) == 0 ||
3812			    dtrace_strncmp(dest, "false", 6) == 0 ||
3813			    dtrace_strncmp(dest, "null", 5) == 0) {
3814				if (found_key) {
3815					if (nelems > 1) {
3816						/*
3817						 * ERROR: We expected an object,
3818						 * not this identifier.
3819						 */
3820						return (NULL);
3821					}
3822					return (dest);
3823				} else {
3824					cur--;
3825					state = DTRACE_JSON_COMMA;
3826					break;
3827				}
3828			}
3829
3830			/*
3831			 * ERROR: we did not recognise the identifier as one
3832			 * of those in the JSON specification.
3833			 */
3834			return (NULL);
3835		case DTRACE_JSON_NUMBER:
3836			if (cc == '.') {
3837				*dd++ = cc;
3838				state = DTRACE_JSON_NUMBER_FRAC;
3839				break;
3840			}
3841
3842			if (cc == 'x' || cc == 'X') {
3843				/*
3844				 * ERROR: specification explicitly excludes
3845				 * hexidecimal or octal numbers.
3846				 */
3847				return (NULL);
3848			}
3849
3850			/* FALLTHRU */
3851		case DTRACE_JSON_NUMBER_FRAC:
3852			if (cc == 'e' || cc == 'E') {
3853				*dd++ = cc;
3854				state = DTRACE_JSON_NUMBER_EXP;
3855				break;
3856			}
3857
3858			if (cc == '+' || cc == '-') {
3859				/*
3860				 * ERROR: expect sign as part of exponent only.
3861				 */
3862				return (NULL);
3863			}
3864			/* FALLTHRU */
3865		case DTRACE_JSON_NUMBER_EXP:
3866			if (isdigit(cc) || cc == '+' || cc == '-') {
3867				*dd++ = cc;
3868				break;
3869			}
3870
3871			*dd = '\0';
3872			dd = dest; /* reset string buffer */
3873			if (found_key) {
3874				if (nelems > 1) {
3875					/*
3876					 * ERROR: We expected an object, not
3877					 * this number.
3878					 */
3879					return (NULL);
3880				}
3881				return (dest);
3882			}
3883
3884			cur--;
3885			state = DTRACE_JSON_COMMA;
3886			break;
3887		case DTRACE_JSON_VALUE:
3888			if (isspace(cc))
3889				break;
3890
3891			if (cc == '{' || cc == '[') {
3892				if (nelems > 1 && found_key) {
3893					in_array = cc == '[' ? B_TRUE : B_FALSE;
3894					/*
3895					 * If our element selector directs us
3896					 * to descend into this nested object,
3897					 * then move to the next selector
3898					 * element in the list and restart the
3899					 * state machine.
3900					 */
3901					while (*elem != '\0')
3902						elem++;
3903					elem++; /* skip the inter-element NUL */
3904					nelems--;
3905					dd = dest;
3906					if (in_array) {
3907						state = DTRACE_JSON_VALUE;
3908						array_pos = 0;
3909						array_elem = dtrace_strtoll(
3910						    elem, 10, size);
3911						found_key = array_elem == 0 ?
3912						    B_TRUE : B_FALSE;
3913					} else {
3914						found_key = B_FALSE;
3915						state = DTRACE_JSON_OBJECT;
3916					}
3917					break;
3918				}
3919
3920				/*
3921				 * Otherwise, we wish to either skip this
3922				 * nested object or return it in full.
3923				 */
3924				if (cc == '[')
3925					brackets = 1;
3926				else
3927					braces = 1;
3928				*dd++ = cc;
3929				state = DTRACE_JSON_COLLECT_OBJECT;
3930				break;
3931			}
3932
3933			if (cc == '"') {
3934				state = DTRACE_JSON_STRING;
3935				break;
3936			}
3937
3938			if (islower(cc)) {
3939				/*
3940				 * Here we deal with true, false and null.
3941				 */
3942				*dd++ = cc;
3943				state = DTRACE_JSON_IDENTIFIER;
3944				break;
3945			}
3946
3947			if (cc == '-' || isdigit(cc)) {
3948				*dd++ = cc;
3949				state = DTRACE_JSON_NUMBER;
3950				break;
3951			}
3952
3953			/*
3954			 * ERROR: unexpected character at start of value.
3955			 */
3956			return (NULL);
3957		case DTRACE_JSON_COLLECT_OBJECT:
3958			if (cc == '\0')
3959				/*
3960				 * ERROR: unexpected end of input.
3961				 */
3962				return (NULL);
3963
3964			*dd++ = cc;
3965			if (cc == '"') {
3966				collect_object = B_TRUE;
3967				state = DTRACE_JSON_STRING;
3968				break;
3969			}
3970
3971			if (cc == ']') {
3972				if (brackets-- == 0) {
3973					/*
3974					 * ERROR: unbalanced brackets.
3975					 */
3976					return (NULL);
3977				}
3978			} else if (cc == '}') {
3979				if (braces-- == 0) {
3980					/*
3981					 * ERROR: unbalanced braces.
3982					 */
3983					return (NULL);
3984				}
3985			} else if (cc == '{') {
3986				braces++;
3987			} else if (cc == '[') {
3988				brackets++;
3989			}
3990
3991			if (brackets == 0 && braces == 0) {
3992				if (found_key) {
3993					*dd = '\0';
3994					return (dest);
3995				}
3996				dd = dest; /* reset string buffer */
3997				state = DTRACE_JSON_COMMA;
3998			}
3999			break;
4000		}
4001	}
4002	return (NULL);
4003}
4004
4005/*
4006 * Emulate the execution of DTrace ID subroutines invoked by the call opcode.
4007 * Notice that we don't bother validating the proper number of arguments or
4008 * their types in the tuple stack.  This isn't needed because all argument
4009 * interpretation is safe because of our load safety -- the worst that can
4010 * happen is that a bogus program can obtain bogus results.
4011 */
4012static void
4013dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
4014    dtrace_key_t *tupregs, int nargs,
4015    dtrace_mstate_t *mstate, dtrace_state_t *state)
4016{
4017	volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
4018	volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
4019	dtrace_vstate_t *vstate = &state->dts_vstate;
4020
4021	union {
4022		mutex_impl_t mi;
4023		uint64_t mx;
4024	} m;
4025
4026	union {
4027		krwlock_t ri;
4028		uintptr_t rw;
4029	} r;
4030
4031	switch (subr) {
4032	case DIF_SUBR_RAND:
4033		regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875;
4034		break;
4035
4036	case DIF_SUBR_MUTEX_OWNED:
4037		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4038		    mstate, vstate)) {
4039			regs[rd] = NULL;
4040			break;
4041		}
4042
4043		m.mx = dtrace_load64(tupregs[0].dttk_value);
4044		if (MUTEX_TYPE_ADAPTIVE(&m.mi))
4045			regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
4046		else
4047			regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
4048		break;
4049
4050	case DIF_SUBR_MUTEX_OWNER:
4051		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4052		    mstate, vstate)) {
4053			regs[rd] = NULL;
4054			break;
4055		}
4056
4057		m.mx = dtrace_load64(tupregs[0].dttk_value);
4058		if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
4059		    MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
4060			regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
4061		else
4062			regs[rd] = 0;
4063		break;
4064
4065	case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
4066		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4067		    mstate, vstate)) {
4068			regs[rd] = NULL;
4069			break;
4070		}
4071
4072		m.mx = dtrace_load64(tupregs[0].dttk_value);
4073		regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
4074		break;
4075
4076	case DIF_SUBR_MUTEX_TYPE_SPIN:
4077		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4078		    mstate, vstate)) {
4079			regs[rd] = NULL;
4080			break;
4081		}
4082
4083		m.mx = dtrace_load64(tupregs[0].dttk_value);
4084		regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
4085		break;
4086
4087	case DIF_SUBR_RW_READ_HELD: {
4088		uintptr_t tmp;
4089
4090		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4091		    mstate, vstate)) {
4092			regs[rd] = NULL;
4093			break;
4094		}
4095
4096		r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4097		regs[rd] = _RW_READ_HELD(&r.ri, tmp);
4098		break;
4099	}
4100
4101	case DIF_SUBR_RW_WRITE_HELD:
4102		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
4103		    mstate, vstate)) {
4104			regs[rd] = NULL;
4105			break;
4106		}
4107
4108		r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4109		regs[rd] = _RW_WRITE_HELD(&r.ri);
4110		break;
4111
4112	case DIF_SUBR_RW_ISWRITER:
4113		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
4114		    mstate, vstate)) {
4115			regs[rd] = NULL;
4116			break;
4117		}
4118
4119		r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4120		regs[rd] = _RW_ISWRITER(&r.ri);
4121		break;
4122
4123	case DIF_SUBR_BCOPY: {
4124		/*
4125		 * We need to be sure that the destination is in the scratch
4126		 * region -- no other region is allowed.
4127		 */
4128		uintptr_t src = tupregs[0].dttk_value;
4129		uintptr_t dest = tupregs[1].dttk_value;
4130		size_t size = tupregs[2].dttk_value;
4131
4132		if (!dtrace_inscratch(dest, size, mstate)) {
4133			*flags |= CPU_DTRACE_BADADDR;
4134			*illval = regs[rd];
4135			break;
4136		}
4137
4138		if (!dtrace_canload(src, size, mstate, vstate)) {
4139			regs[rd] = NULL;
4140			break;
4141		}
4142
4143		dtrace_bcopy((void *)src, (void *)dest, size);
4144		break;
4145	}
4146
4147	case DIF_SUBR_ALLOCA:
4148	case DIF_SUBR_COPYIN: {
4149		uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
4150		uint64_t size =
4151		    tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
4152		size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
4153
4154		/*
4155		 * This action doesn't require any credential checks since
4156		 * probes will not activate in user contexts to which the
4157		 * enabling user does not have permissions.
4158		 */
4159
4160		/*
4161		 * Rounding up the user allocation size could have overflowed
4162		 * a large, bogus allocation (like -1ULL) to 0.
4163		 */
4164		if (scratch_size < size ||
4165		    !DTRACE_INSCRATCH(mstate, scratch_size)) {
4166			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4167			regs[rd] = NULL;
4168			break;
4169		}
4170
4171		if (subr == DIF_SUBR_COPYIN) {
4172			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4173			dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
4174			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4175		}
4176
4177		mstate->dtms_scratch_ptr += scratch_size;
4178		regs[rd] = dest;
4179		break;
4180	}
4181
4182	case DIF_SUBR_COPYINTO: {
4183		uint64_t size = tupregs[1].dttk_value;
4184		uintptr_t dest = tupregs[2].dttk_value;
4185
4186		/*
4187		 * This action doesn't require any credential checks since
4188		 * probes will not activate in user contexts to which the
4189		 * enabling user does not have permissions.
4190		 */
4191		if (!dtrace_inscratch(dest, size, mstate)) {
4192			*flags |= CPU_DTRACE_BADADDR;
4193			*illval = regs[rd];
4194			break;
4195		}
4196
4197		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4198		dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
4199		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4200		break;
4201	}
4202
4203	case DIF_SUBR_COPYINSTR: {
4204		uintptr_t dest = mstate->dtms_scratch_ptr;
4205		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4206
4207		if (nargs > 1 && tupregs[1].dttk_value < size)
4208			size = tupregs[1].dttk_value + 1;
4209
4210		/*
4211		 * This action doesn't require any credential checks since
4212		 * probes will not activate in user contexts to which the
4213		 * enabling user does not have permissions.
4214		 */
4215		if (!DTRACE_INSCRATCH(mstate, size)) {
4216			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4217			regs[rd] = NULL;
4218			break;
4219		}
4220
4221		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4222		dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
4223		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4224
4225		((char *)dest)[size - 1] = '\0';
4226		mstate->dtms_scratch_ptr += size;
4227		regs[rd] = dest;
4228		break;
4229	}
4230
4231	case DIF_SUBR_MSGSIZE:
4232	case DIF_SUBR_MSGDSIZE: {
4233		uintptr_t baddr = tupregs[0].dttk_value, daddr;
4234		uintptr_t wptr, rptr;
4235		size_t count = 0;
4236		int cont = 0;
4237
4238		while (baddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
4239
4240			if (!dtrace_canload(baddr, sizeof (mblk_t), mstate,
4241			    vstate)) {
4242				regs[rd] = NULL;
4243				break;
4244			}
4245
4246			wptr = dtrace_loadptr(baddr +
4247			    offsetof(mblk_t, b_wptr));
4248
4249			rptr = dtrace_loadptr(baddr +
4250			    offsetof(mblk_t, b_rptr));
4251
4252			if (wptr < rptr) {
4253				*flags |= CPU_DTRACE_BADADDR;
4254				*illval = tupregs[0].dttk_value;
4255				break;
4256			}
4257
4258			daddr = dtrace_loadptr(baddr +
4259			    offsetof(mblk_t, b_datap));
4260
4261			baddr = dtrace_loadptr(baddr +
4262			    offsetof(mblk_t, b_cont));
4263
4264			/*
4265			 * We want to prevent against denial-of-service here,
4266			 * so we're only going to search the list for
4267			 * dtrace_msgdsize_max mblks.
4268			 */
4269			if (cont++ > dtrace_msgdsize_max) {
4270				*flags |= CPU_DTRACE_ILLOP;
4271				break;
4272			}
4273
4274			if (subr == DIF_SUBR_MSGDSIZE) {
4275				if (dtrace_load8(daddr +
4276				    offsetof(dblk_t, db_type)) != M_DATA)
4277					continue;
4278			}
4279
4280			count += wptr - rptr;
4281		}
4282
4283		if (!(*flags & CPU_DTRACE_FAULT))
4284			regs[rd] = count;
4285
4286		break;
4287	}
4288
4289	case DIF_SUBR_PROGENYOF: {
4290		pid_t pid = tupregs[0].dttk_value;
4291		proc_t *p;
4292		int rval = 0;
4293
4294		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4295
4296		for (p = curthread->t_procp; p != NULL; p = p->p_parent) {
4297			if (p->p_pidp->pid_id == pid) {
4298				rval = 1;
4299				break;
4300			}
4301		}
4302
4303		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4304
4305		regs[rd] = rval;
4306		break;
4307	}
4308
4309	case DIF_SUBR_SPECULATION:
4310		regs[rd] = dtrace_speculation(state);
4311		break;
4312
4313	case DIF_SUBR_COPYOUT: {
4314		uintptr_t kaddr = tupregs[0].dttk_value;
4315		uintptr_t uaddr = tupregs[1].dttk_value;
4316		uint64_t size = tupregs[2].dttk_value;
4317
4318		if (!dtrace_destructive_disallow &&
4319		    dtrace_priv_proc_control(state, mstate) &&
4320		    !dtrace_istoxic(kaddr, size) &&
4321		    dtrace_canload(kaddr, size, mstate, vstate)) {
4322			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4323			dtrace_copyout(kaddr, uaddr, size, flags);
4324			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4325		}
4326		break;
4327	}
4328
4329	case DIF_SUBR_COPYOUTSTR: {
4330		uintptr_t kaddr = tupregs[0].dttk_value;
4331		uintptr_t uaddr = tupregs[1].dttk_value;
4332		uint64_t size = tupregs[2].dttk_value;
4333		size_t lim;
4334
4335		if (!dtrace_destructive_disallow &&
4336		    dtrace_priv_proc_control(state, mstate) &&
4337		    !dtrace_istoxic(kaddr, size) &&
4338		    dtrace_strcanload(kaddr, size, &lim, mstate, vstate)) {
4339			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4340			dtrace_copyoutstr(kaddr, uaddr, lim, flags);
4341			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4342		}
4343		break;
4344	}
4345
4346	case DIF_SUBR_STRLEN: {
4347		size_t size = state->dts_options[DTRACEOPT_STRSIZE];
4348		uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
4349		size_t lim;
4350
4351		if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) {
4352			regs[rd] = NULL;
4353			break;
4354		}
4355		regs[rd] = dtrace_strlen((char *)addr, lim);
4356
4357		break;
4358	}
4359
4360	case DIF_SUBR_STRCHR:
4361	case DIF_SUBR_STRRCHR: {
4362		/*
4363		 * We're going to iterate over the string looking for the
4364		 * specified character.  We will iterate until we have reached
4365		 * the string length or we have found the character.  If this
4366		 * is DIF_SUBR_STRRCHR, we will look for the last occurrence
4367		 * of the specified character instead of the first.
4368		 */
4369		uintptr_t addr = tupregs[0].dttk_value;
4370		uintptr_t addr_limit;
4371		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4372		size_t lim;
4373		char c, target = (char)tupregs[1].dttk_value;
4374
4375		if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) {
4376			regs[rd] = NULL;
4377			break;
4378		}
4379		addr_limit = addr + lim;
4380
4381		for (regs[rd] = NULL; addr < addr_limit; addr++) {
4382			if ((c = dtrace_load8(addr)) == target) {
4383				regs[rd] = addr;
4384
4385				if (subr == DIF_SUBR_STRCHR)
4386					break;
4387			}
4388			if (c == '\0')
4389				break;
4390		}
4391
4392		break;
4393	}
4394
4395	case DIF_SUBR_STRSTR:
4396	case DIF_SUBR_INDEX:
4397	case DIF_SUBR_RINDEX: {
4398		/*
4399		 * We're going to iterate over the string looking for the
4400		 * specified string.  We will iterate until we have reached
4401		 * the string length or we have found the string.  (Yes, this
4402		 * is done in the most naive way possible -- but considering
4403		 * that the string we're searching for is likely to be
4404		 * relatively short, the complexity of Rabin-Karp or similar
4405		 * hardly seems merited.)
4406		 */
4407		char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
4408		char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
4409		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4410		size_t len = dtrace_strlen(addr, size);
4411		size_t sublen = dtrace_strlen(substr, size);
4412		char *limit = addr + len, *orig = addr;
4413		int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
4414		int inc = 1;
4415
4416		regs[rd] = notfound;
4417
4418		if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
4419			regs[rd] = NULL;
4420			break;
4421		}
4422
4423		if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
4424		    vstate)) {
4425			regs[rd] = NULL;
4426			break;
4427		}
4428
4429		/*
4430		 * strstr() and index()/rindex() have similar semantics if
4431		 * both strings are the empty string: strstr() returns a
4432		 * pointer to the (empty) string, and index() and rindex()
4433		 * both return index 0 (regardless of any position argument).
4434		 */
4435		if (sublen == 0 && len == 0) {
4436			if (subr == DIF_SUBR_STRSTR)
4437				regs[rd] = (uintptr_t)addr;
4438			else
4439				regs[rd] = 0;
4440			break;
4441		}
4442
4443		if (subr != DIF_SUBR_STRSTR) {
4444			if (subr == DIF_SUBR_RINDEX) {
4445				limit = orig - 1;
4446				addr += len;
4447				inc = -1;
4448			}
4449
4450			/*
4451			 * Both index() and rindex() take an optional position
4452			 * argument that denotes the starting position.
4453			 */
4454			if (nargs == 3) {
4455				int64_t pos = (int64_t)tupregs[2].dttk_value;
4456
4457				/*
4458				 * If the position argument to index() is
4459				 * negative, Perl implicitly clamps it at
4460				 * zero.  This semantic is a little surprising
4461				 * given the special meaning of negative
4462				 * positions to similar Perl functions like
4463				 * substr(), but it appears to reflect a
4464				 * notion that index() can start from a
4465				 * negative index and increment its way up to
4466				 * the string.  Given this notion, Perl's
4467				 * rindex() is at least self-consistent in
4468				 * that it implicitly clamps positions greater
4469				 * than the string length to be the string
4470				 * length.  Where Perl completely loses
4471				 * coherence, however, is when the specified
4472				 * substring is the empty string ("").  In
4473				 * this case, even if the position is
4474				 * negative, rindex() returns 0 -- and even if
4475				 * the position is greater than the length,
4476				 * index() returns the string length.  These
4477				 * semantics violate the notion that index()
4478				 * should never return a value less than the
4479				 * specified position and that rindex() should
4480				 * never return a value greater than the
4481				 * specified position.  (One assumes that
4482				 * these semantics are artifacts of Perl's
4483				 * implementation and not the results of
4484				 * deliberate design -- it beggars belief that
4485				 * even Larry Wall could desire such oddness.)
4486				 * While in the abstract one would wish for
4487				 * consistent position semantics across
4488				 * substr(), index() and rindex() -- or at the
4489				 * very least self-consistent position
4490				 * semantics for index() and rindex() -- we
4491				 * instead opt to keep with the extant Perl
4492				 * semantics, in all their broken glory.  (Do
4493				 * we have more desire to maintain Perl's
4494				 * semantics than Perl does?  Probably.)
4495				 */
4496				if (subr == DIF_SUBR_RINDEX) {
4497					if (pos < 0) {
4498						if (sublen == 0)
4499							regs[rd] = 0;
4500						break;
4501					}
4502
4503					if (pos > len)
4504						pos = len;
4505				} else {
4506					if (pos < 0)
4507						pos = 0;
4508
4509					if (pos >= len) {
4510						if (sublen == 0)
4511							regs[rd] = len;
4512						break;
4513					}
4514				}
4515
4516				addr = orig + pos;
4517			}
4518		}
4519
4520		for (regs[rd] = notfound; addr != limit; addr += inc) {
4521			if (dtrace_strncmp(addr, substr, sublen) == 0) {
4522				if (subr != DIF_SUBR_STRSTR) {
4523					/*
4524					 * As D index() and rindex() are
4525					 * modeled on Perl (and not on awk),
4526					 * we return a zero-based (and not a
4527					 * one-based) index.  (For you Perl
4528					 * weenies: no, we're not going to add
4529					 * $[ -- and shouldn't you be at a con
4530					 * or something?)
4531					 */
4532					regs[rd] = (uintptr_t)(addr - orig);
4533					break;
4534				}
4535
4536				ASSERT(subr == DIF_SUBR_STRSTR);
4537				regs[rd] = (uintptr_t)addr;
4538				break;
4539			}
4540		}
4541
4542		break;
4543	}
4544
4545	case DIF_SUBR_STRTOK: {
4546		uintptr_t addr = tupregs[0].dttk_value;
4547		uintptr_t tokaddr = tupregs[1].dttk_value;
4548		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4549		uintptr_t limit, toklimit;
4550		size_t clim;
4551		uint8_t c, tokmap[32];	 /* 256 / 8 */
4552		char *dest = (char *)mstate->dtms_scratch_ptr;
4553		int i;
4554
4555		/*
4556		 * Check both the token buffer and (later) the input buffer,
4557		 * since both could be non-scratch addresses.
4558		 */
4559		if (!dtrace_strcanload(tokaddr, size, &clim, mstate, vstate)) {
4560			regs[rd] = NULL;
4561			break;
4562		}
4563		toklimit = tokaddr + clim;
4564
4565		if (!DTRACE_INSCRATCH(mstate, size)) {
4566			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4567			regs[rd] = NULL;
4568			break;
4569		}
4570
4571		if (addr == NULL) {
4572			/*
4573			 * If the address specified is NULL, we use our saved
4574			 * strtok pointer from the mstate.  Note that this
4575			 * means that the saved strtok pointer is _only_
4576			 * valid within multiple enablings of the same probe --
4577			 * it behaves like an implicit clause-local variable.
4578			 */
4579			addr = mstate->dtms_strtok;
4580			limit = mstate->dtms_strtok_limit;
4581		} else {
4582			/*
4583			 * If the user-specified address is non-NULL we must
4584			 * access check it.  This is the only time we have
4585			 * a chance to do so, since this address may reside
4586			 * in the string table of this clause-- future calls
4587			 * (when we fetch addr from mstate->dtms_strtok)
4588			 * would fail this access check.
4589			 */
4590			if (!dtrace_strcanload(addr, size, &clim, mstate,
4591			    vstate)) {
4592				regs[rd] = NULL;
4593				break;
4594			}
4595			limit = addr + clim;
4596		}
4597
4598		/*
4599		 * First, zero the token map, and then process the token
4600		 * string -- setting a bit in the map for every character
4601		 * found in the token string.
4602		 */
4603		for (i = 0; i < sizeof (tokmap); i++)
4604			tokmap[i] = 0;
4605
4606		for (; tokaddr < toklimit; tokaddr++) {
4607			if ((c = dtrace_load8(tokaddr)) == '\0')
4608				break;
4609
4610			ASSERT((c >> 3) < sizeof (tokmap));
4611			tokmap[c >> 3] |= (1 << (c & 0x7));
4612		}
4613
4614		for (; addr < limit; addr++) {
4615			/*
4616			 * We're looking for a character that is _not_
4617			 * contained in the token string.
4618			 */
4619			if ((c = dtrace_load8(addr)) == '\0')
4620				break;
4621
4622			if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
4623				break;
4624		}
4625
4626		if (c == '\0') {
4627			/*
4628			 * We reached the end of the string without finding
4629			 * any character that was not in the token string.
4630			 * We return NULL in this case, and we set the saved
4631			 * address to NULL as well.
4632			 */
4633			regs[rd] = NULL;
4634			mstate->dtms_strtok = NULL;
4635			mstate->dtms_strtok_limit = NULL;
4636			break;
4637		}
4638
4639		/*
4640		 * From here on, we're copying into the destination string.
4641		 */
4642		for (i = 0; addr < limit && i < size - 1; addr++) {
4643			if ((c = dtrace_load8(addr)) == '\0')
4644				break;
4645
4646			if (tokmap[c >> 3] & (1 << (c & 0x7)))
4647				break;
4648
4649			ASSERT(i < size);
4650			dest[i++] = c;
4651		}
4652
4653		ASSERT(i < size);
4654		dest[i] = '\0';
4655		regs[rd] = (uintptr_t)dest;
4656		mstate->dtms_scratch_ptr += size;
4657		mstate->dtms_strtok = addr;
4658		mstate->dtms_strtok_limit = limit;
4659		break;
4660	}
4661
4662	case DIF_SUBR_SUBSTR: {
4663		uintptr_t s = tupregs[0].dttk_value;
4664		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4665		char *d = (char *)mstate->dtms_scratch_ptr;
4666		int64_t index = (int64_t)tupregs[1].dttk_value;
4667		int64_t remaining = (int64_t)tupregs[2].dttk_value;
4668		size_t len = dtrace_strlen((char *)s, size);
4669		int64_t i;
4670
4671		if (!dtrace_canload(s, len + 1, mstate, vstate)) {
4672			regs[rd] = NULL;
4673			break;
4674		}
4675
4676		if (!DTRACE_INSCRATCH(mstate, size)) {
4677			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4678			regs[rd] = NULL;
4679			break;
4680		}
4681
4682		if (nargs <= 2)
4683			remaining = (int64_t)size;
4684
4685		if (index < 0) {
4686			index += len;
4687
4688			if (index < 0 && index + remaining > 0) {
4689				remaining += index;
4690				index = 0;
4691			}
4692		}
4693
4694		if (index >= len || index < 0) {
4695			remaining = 0;
4696		} else if (remaining < 0) {
4697			remaining += len - index;
4698		} else if (index + remaining > size) {
4699			remaining = size - index;
4700		}
4701
4702		for (i = 0; i < remaining; i++) {
4703			if ((d[i] = dtrace_load8(s + index + i)) == '\0')
4704				break;
4705		}
4706
4707		d[i] = '\0';
4708
4709		mstate->dtms_scratch_ptr += size;
4710		regs[rd] = (uintptr_t)d;
4711		break;
4712	}
4713
4714	case DIF_SUBR_JSON: {
4715		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4716		uintptr_t json = tupregs[0].dttk_value;
4717		size_t jsonlen = dtrace_strlen((char *)json, size);
4718		uintptr_t elem = tupregs[1].dttk_value;
4719		size_t elemlen = dtrace_strlen((char *)elem, size);
4720
4721		char *dest = (char *)mstate->dtms_scratch_ptr;
4722		char *elemlist = (char *)mstate->dtms_scratch_ptr + jsonlen + 1;
4723		char *ee = elemlist;
4724		int nelems = 1;
4725		uintptr_t cur;
4726
4727		if (!dtrace_canload(json, jsonlen + 1, mstate, vstate) ||
4728		    !dtrace_canload(elem, elemlen + 1, mstate, vstate)) {
4729			regs[rd] = NULL;
4730			break;
4731		}
4732
4733		if (!DTRACE_INSCRATCH(mstate, jsonlen + 1 + elemlen + 1)) {
4734			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4735			regs[rd] = NULL;
4736			break;
4737		}
4738
4739		/*
4740		 * Read the element selector and split it up into a packed list
4741		 * of strings.
4742		 */
4743		for (cur = elem; cur < elem + elemlen; cur++) {
4744			char cc = dtrace_load8(cur);
4745
4746			if (cur == elem && cc == '[') {
4747				/*
4748				 * If the first element selector key is
4749				 * actually an array index then ignore the
4750				 * bracket.
4751				 */
4752				continue;
4753			}
4754
4755			if (cc == ']')
4756				continue;
4757
4758			if (cc == '.' || cc == '[') {
4759				nelems++;
4760				cc = '\0';
4761			}
4762
4763			*ee++ = cc;
4764		}
4765		*ee++ = '\0';
4766
4767		if ((regs[rd] = (uintptr_t)dtrace_json(size, json, elemlist,
4768		    nelems, dest)) != NULL)
4769			mstate->dtms_scratch_ptr += jsonlen + 1;
4770		break;
4771	}
4772
4773	case DIF_SUBR_TOUPPER:
4774	case DIF_SUBR_TOLOWER: {
4775		uintptr_t s = tupregs[0].dttk_value;
4776		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4777		char *dest = (char *)mstate->dtms_scratch_ptr, c;
4778		size_t len = dtrace_strlen((char *)s, size);
4779		char lower, upper, convert;
4780		int64_t i;
4781
4782		if (subr == DIF_SUBR_TOUPPER) {
4783			lower = 'a';
4784			upper = 'z';
4785			convert = 'A';
4786		} else {
4787			lower = 'A';
4788			upper = 'Z';
4789			convert = 'a';
4790		}
4791
4792		if (!dtrace_canload(s, len + 1, mstate, vstate)) {
4793			regs[rd] = NULL;
4794			break;
4795		}
4796
4797		if (!DTRACE_INSCRATCH(mstate, size)) {
4798			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4799			regs[rd] = NULL;
4800			break;
4801		}
4802
4803		for (i = 0; i < size - 1; i++) {
4804			if ((c = dtrace_load8(s + i)) == '\0')
4805				break;
4806
4807			if (c >= lower && c <= upper)
4808				c = convert + (c - lower);
4809
4810			dest[i] = c;
4811		}
4812
4813		ASSERT(i < size);
4814		dest[i] = '\0';
4815		regs[rd] = (uintptr_t)dest;
4816		mstate->dtms_scratch_ptr += size;
4817		break;
4818	}
4819
4820case DIF_SUBR_GETMAJOR:
4821#ifdef _LP64
4822		regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR64) & MAXMAJ64;
4823#else
4824		regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR) & MAXMAJ;
4825#endif
4826		break;
4827
4828	case DIF_SUBR_GETMINOR:
4829#ifdef _LP64
4830		regs[rd] = tupregs[0].dttk_value & MAXMIN64;
4831#else
4832		regs[rd] = tupregs[0].dttk_value & MAXMIN;
4833#endif
4834		break;
4835
4836	case DIF_SUBR_DDI_PATHNAME: {
4837		/*
4838		 * This one is a galactic mess.  We are going to roughly
4839		 * emulate ddi_pathname(), but it's made more complicated
4840		 * by the fact that we (a) want to include the minor name and
4841		 * (b) must proceed iteratively instead of recursively.
4842		 */
4843		uintptr_t dest = mstate->dtms_scratch_ptr;
4844		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4845		char *start = (char *)dest, *end = start + size - 1;
4846		uintptr_t daddr = tupregs[0].dttk_value;
4847		int64_t minor = (int64_t)tupregs[1].dttk_value;
4848		char *s;
4849		int i, len, depth = 0;
4850
4851		/*
4852		 * Due to all the pointer jumping we do and context we must
4853		 * rely upon, we just mandate that the user must have kernel
4854		 * read privileges to use this routine.
4855		 */
4856		if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) == 0) {
4857			*flags |= CPU_DTRACE_KPRIV;
4858			*illval = daddr;
4859			regs[rd] = NULL;
4860		}
4861
4862		if (!DTRACE_INSCRATCH(mstate, size)) {
4863			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4864			regs[rd] = NULL;
4865			break;
4866		}
4867
4868		*end = '\0';
4869
4870		/*
4871		 * We want to have a name for the minor.  In order to do this,
4872		 * we need to walk the minor list from the devinfo.  We want
4873		 * to be sure that we don't infinitely walk a circular list,
4874		 * so we check for circularity by sending a scout pointer
4875		 * ahead two elements for every element that we iterate over;
4876		 * if the list is circular, these will ultimately point to the
4877		 * same element.  You may recognize this little trick as the
4878		 * answer to a stupid interview question -- one that always
4879		 * seems to be asked by those who had to have it laboriously
4880		 * explained to them, and who can't even concisely describe
4881		 * the conditions under which one would be forced to resort to
4882		 * this technique.  Needless to say, those conditions are
4883		 * found here -- and probably only here.  Is this the only use
4884		 * of this infamous trick in shipping, production code?  If it
4885		 * isn't, it probably should be...
4886		 */
4887		if (minor != -1) {
4888			uintptr_t maddr = dtrace_loadptr(daddr +
4889			    offsetof(struct dev_info, devi_minor));
4890
4891			uintptr_t next = offsetof(struct ddi_minor_data, next);
4892			uintptr_t name = offsetof(struct ddi_minor_data,
4893			    d_minor) + offsetof(struct ddi_minor, name);
4894			uintptr_t dev = offsetof(struct ddi_minor_data,
4895			    d_minor) + offsetof(struct ddi_minor, dev);
4896			uintptr_t scout;
4897
4898			if (maddr != NULL)
4899				scout = dtrace_loadptr(maddr + next);
4900
4901			while (maddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
4902				uint64_t m;
4903#ifdef _LP64
4904				m = dtrace_load64(maddr + dev) & MAXMIN64;
4905#else
4906				m = dtrace_load32(maddr + dev) & MAXMIN;
4907#endif
4908				if (m != minor) {
4909					maddr = dtrace_loadptr(maddr + next);
4910
4911					if (scout == NULL)
4912						continue;
4913
4914					scout = dtrace_loadptr(scout + next);
4915
4916					if (scout == NULL)
4917						continue;
4918
4919					scout = dtrace_loadptr(scout + next);
4920
4921					if (scout == NULL)
4922						continue;
4923
4924					if (scout == maddr) {
4925						*flags |= CPU_DTRACE_ILLOP;
4926						break;
4927					}
4928
4929					continue;
4930				}
4931
4932				/*
4933				 * We have the minor data.  Now we need to
4934				 * copy the minor's name into the end of the
4935				 * pathname.
4936				 */
4937				s = (char *)dtrace_loadptr(maddr + name);
4938				len = dtrace_strlen(s, size);
4939
4940				if (*flags & CPU_DTRACE_FAULT)
4941					break;
4942
4943				if (len != 0) {
4944					if ((end -= (len + 1)) < start)
4945						break;
4946
4947					*end = ':';
4948				}
4949
4950				for (i = 1; i <= len; i++)
4951					end[i] = dtrace_load8((uintptr_t)s++);
4952				break;
4953			}
4954		}
4955
4956		while (daddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
4957			ddi_node_state_t devi_state;
4958
4959			devi_state = dtrace_load32(daddr +
4960			    offsetof(struct dev_info, devi_node_state));
4961
4962			if (*flags & CPU_DTRACE_FAULT)
4963				break;
4964
4965			if (devi_state >= DS_INITIALIZED) {
4966				s = (char *)dtrace_loadptr(daddr +
4967				    offsetof(struct dev_info, devi_addr));
4968				len = dtrace_strlen(s, size);
4969
4970				if (*flags & CPU_DTRACE_FAULT)
4971					break;
4972
4973				if (len != 0) {
4974					if ((end -= (len + 1)) < start)
4975						break;
4976
4977					*end = '@';
4978				}
4979
4980				for (i = 1; i <= len; i++)
4981					end[i] = dtrace_load8((uintptr_t)s++);
4982			}
4983
4984			/*
4985			 * Now for the node name...
4986			 */
4987			s = (char *)dtrace_loadptr(daddr +
4988			    offsetof(struct dev_info, devi_node_name));
4989
4990			daddr = dtrace_loadptr(daddr +
4991			    offsetof(struct dev_info, devi_parent));
4992
4993			/*
4994			 * If our parent is NULL (that is, if we're the root
4995			 * node), we're going to use the special path
4996			 * "devices".
4997			 */
4998			if (daddr == NULL)
4999				s = "devices";
5000
5001			len = dtrace_strlen(s, size);
5002			if (*flags & CPU_DTRACE_FAULT)
5003				break;
5004
5005			if ((end -= (len + 1)) < start)
5006				break;
5007
5008			for (i = 1; i <= len; i++)
5009				end[i] = dtrace_load8((uintptr_t)s++);
5010			*end = '/';
5011
5012			if (depth++ > dtrace_devdepth_max) {
5013				*flags |= CPU_DTRACE_ILLOP;
5014				break;
5015			}
5016		}
5017
5018		if (end < start)
5019			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5020
5021		if (daddr == NULL) {
5022			regs[rd] = (uintptr_t)end;
5023			mstate->dtms_scratch_ptr += size;
5024		}
5025
5026		break;
5027	}
5028
5029	case DIF_SUBR_STRJOIN: {
5030		char *d = (char *)mstate->dtms_scratch_ptr;
5031		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5032		uintptr_t s1 = tupregs[0].dttk_value;
5033		uintptr_t s2 = tupregs[1].dttk_value;
5034		int i = 0, j = 0;
5035		size_t lim1, lim2;
5036		char c;
5037
5038		if (!dtrace_strcanload(s1, size, &lim1, mstate, vstate) ||
5039		    !dtrace_strcanload(s2, size, &lim2, mstate, vstate)) {
5040			regs[rd] = NULL;
5041			break;
5042		}
5043
5044		if (!DTRACE_INSCRATCH(mstate, size)) {
5045			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5046			regs[rd] = NULL;
5047			break;
5048		}
5049
5050		for (;;) {
5051			if (i >= size) {
5052				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5053				regs[rd] = NULL;
5054				break;
5055			}
5056			c = (i >= lim1) ? '\0' : dtrace_load8(s1++);
5057			if ((d[i++] = c) == '\0') {
5058				i--;
5059				break;
5060			}
5061		}
5062
5063		for (;;) {
5064			if (i >= size) {
5065				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5066				regs[rd] = NULL;
5067				break;
5068			}
5069
5070			c = (j++ >= lim2) ? '\0' : dtrace_load8(s2++);
5071			if ((d[i++] = c) == '\0')
5072				break;
5073		}
5074
5075		if (i < size) {
5076			mstate->dtms_scratch_ptr += i;
5077			regs[rd] = (uintptr_t)d;
5078		}
5079
5080		break;
5081	}
5082
5083	case DIF_SUBR_STRTOLL: {
5084		uintptr_t s = tupregs[0].dttk_value;
5085		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5086		size_t lim;
5087		int base = 10;
5088
5089		if (nargs > 1) {
5090			if ((base = tupregs[1].dttk_value) <= 1 ||
5091			    base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
5092				*flags |= CPU_DTRACE_ILLOP;
5093				break;
5094			}
5095		}
5096
5097		if (!dtrace_strcanload(s, size, &lim, mstate, vstate)) {
5098			regs[rd] = INT64_MIN;
5099			break;
5100		}
5101
5102		regs[rd] = dtrace_strtoll((char *)s, base, lim);
5103		break;
5104	}
5105
5106	case DIF_SUBR_LLTOSTR: {
5107		int64_t i = (int64_t)tupregs[0].dttk_value;
5108		uint64_t val, digit;
5109		uint64_t size = 65;	/* enough room for 2^64 in binary */
5110		char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
5111		int base = 10;
5112
5113		if (nargs > 1) {
5114			if ((base = tupregs[1].dttk_value) <= 1 ||
5115			    base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
5116				*flags |= CPU_DTRACE_ILLOP;
5117				break;
5118			}
5119		}
5120
5121		val = (base == 10 && i < 0) ? i * -1 : i;
5122
5123		if (!DTRACE_INSCRATCH(mstate, size)) {
5124			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5125			regs[rd] = NULL;
5126			break;
5127		}
5128
5129		for (*end-- = '\0'; val; val /= base) {
5130			if ((digit = val % base) <= '9' - '0') {
5131				*end-- = '0' + digit;
5132			} else {
5133				*end-- = 'a' + (digit - ('9' - '0') - 1);
5134			}
5135		}
5136
5137		if (i == 0 && base == 16)
5138			*end-- = '0';
5139
5140		if (base == 16)
5141			*end-- = 'x';
5142
5143		if (i == 0 || base == 8 || base == 16)
5144			*end-- = '0';
5145
5146		if (i < 0 && base == 10)
5147			*end-- = '-';
5148
5149		regs[rd] = (uintptr_t)end + 1;
5150		mstate->dtms_scratch_ptr += size;
5151		break;
5152	}
5153
5154	case DIF_SUBR_HTONS:
5155	case DIF_SUBR_NTOHS:
5156#ifdef _BIG_ENDIAN
5157		regs[rd] = (uint16_t)tupregs[0].dttk_value;
5158#else
5159		regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
5160#endif
5161		break;
5162
5163
5164	case DIF_SUBR_HTONL:
5165	case DIF_SUBR_NTOHL:
5166#ifdef _BIG_ENDIAN
5167		regs[rd] = (uint32_t)tupregs[0].dttk_value;
5168#else
5169		regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
5170#endif
5171		break;
5172
5173
5174	case DIF_SUBR_HTONLL:
5175	case DIF_SUBR_NTOHLL:
5176#ifdef _BIG_ENDIAN
5177		regs[rd] = (uint64_t)tupregs[0].dttk_value;
5178#else
5179		regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
5180#endif
5181		break;
5182
5183
5184	case DIF_SUBR_DIRNAME:
5185	case DIF_SUBR_BASENAME: {
5186		char *dest = (char *)mstate->dtms_scratch_ptr;
5187		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5188		uintptr_t src = tupregs[0].dttk_value;
5189		int i, j, len = dtrace_strlen((char *)src, size);
5190		int lastbase = -1, firstbase = -1, lastdir = -1;
5191		int start, end;
5192
5193		if (!dtrace_canload(src, len + 1, mstate, vstate)) {
5194			regs[rd] = NULL;
5195			break;
5196		}
5197
5198		if (!DTRACE_INSCRATCH(mstate, size)) {
5199			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5200			regs[rd] = NULL;
5201			break;
5202		}
5203
5204		/*
5205		 * The basename and dirname for a zero-length string is
5206		 * defined to be "."
5207		 */
5208		if (len == 0) {
5209			len = 1;
5210			src = (uintptr_t)".";
5211		}
5212
5213		/*
5214		 * Start from the back of the string, moving back toward the
5215		 * front until we see a character that isn't a slash.  That
5216		 * character is the last character in the basename.
5217		 */
5218		for (i = len - 1; i >= 0; i--) {
5219			if (dtrace_load8(src + i) != '/')
5220				break;
5221		}
5222
5223		if (i >= 0)
5224			lastbase = i;
5225
5226		/*
5227		 * Starting from the last character in the basename, move
5228		 * towards the front until we find a slash.  The character
5229		 * that we processed immediately before that is the first
5230		 * character in the basename.
5231		 */
5232		for (; i >= 0; i--) {
5233			if (dtrace_load8(src + i) == '/')
5234				break;
5235		}
5236
5237		if (i >= 0)
5238			firstbase = i + 1;
5239
5240		/*
5241		 * Now keep going until we find a non-slash character.  That
5242		 * character is the last character in the dirname.
5243		 */
5244		for (; i >= 0; i--) {
5245			if (dtrace_load8(src + i) != '/')
5246				break;
5247		}
5248
5249		if (i >= 0)
5250			lastdir = i;
5251
5252		ASSERT(!(lastbase == -1 && firstbase != -1));
5253		ASSERT(!(firstbase == -1 && lastdir != -1));
5254
5255		if (lastbase == -1) {
5256			/*
5257			 * We didn't find a non-slash character.  We know that
5258			 * the length is non-zero, so the whole string must be
5259			 * slashes.  In either the dirname or the basename
5260			 * case, we return '/'.
5261			 */
5262			ASSERT(firstbase == -1);
5263			firstbase = lastbase = lastdir = 0;
5264		}
5265
5266		if (firstbase == -1) {
5267			/*
5268			 * The entire string consists only of a basename
5269			 * component.  If we're looking for dirname, we need
5270			 * to change our string to be just "."; if we're
5271			 * looking for a basename, we'll just set the first
5272			 * character of the basename to be 0.
5273			 */
5274			if (subr == DIF_SUBR_DIRNAME) {
5275				ASSERT(lastdir == -1);
5276				src = (uintptr_t)".";
5277				lastdir = 0;
5278			} else {
5279				firstbase = 0;
5280			}
5281		}
5282
5283		if (subr == DIF_SUBR_DIRNAME) {
5284			if (lastdir == -1) {
5285				/*
5286				 * We know that we have a slash in the name --
5287				 * or lastdir would be set to 0, above.  And
5288				 * because lastdir is -1, we know that this
5289				 * slash must be the first character.  (That
5290				 * is, the full string must be of the form
5291				 * "/basename".)  In this case, the last
5292				 * character of the directory name is 0.
5293				 */
5294				lastdir = 0;
5295			}
5296
5297			start = 0;
5298			end = lastdir;
5299		} else {
5300			ASSERT(subr == DIF_SUBR_BASENAME);
5301			ASSERT(firstbase != -1 && lastbase != -1);
5302			start = firstbase;
5303			end = lastbase;
5304		}
5305
5306		for (i = start, j = 0; i <= end && j < size - 1; i++, j++)
5307			dest[j] = dtrace_load8(src + i);
5308
5309		dest[j] = '\0';
5310		regs[rd] = (uintptr_t)dest;
5311		mstate->dtms_scratch_ptr += size;
5312		break;
5313	}
5314
5315	case DIF_SUBR_GETF: {
5316		uintptr_t fd = tupregs[0].dttk_value;
5317		uf_info_t *finfo = &curthread->t_procp->p_user.u_finfo;
5318		file_t *fp;
5319
5320		if (!dtrace_priv_proc(state, mstate)) {
5321			regs[rd] = NULL;
5322			break;
5323		}
5324
5325		/*
5326		 * This is safe because fi_nfiles only increases, and the
5327		 * fi_list array is not freed when the array size doubles.
5328		 * (See the comment in flist_grow() for details on the
5329		 * management of the u_finfo structure.)
5330		 */
5331		fp = fd < finfo->fi_nfiles ? finfo->fi_list[fd].uf_file : NULL;
5332
5333		mstate->dtms_getf = fp;
5334		regs[rd] = (uintptr_t)fp;
5335		break;
5336	}
5337
5338	case DIF_SUBR_CLEANPATH: {
5339		char *dest = (char *)mstate->dtms_scratch_ptr, c;
5340		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5341		uintptr_t src = tupregs[0].dttk_value;
5342		size_t lim;
5343		int i = 0, j = 0;
5344		zone_t *z;
5345
5346		if (!dtrace_strcanload(src, size, &lim, mstate, vstate)) {
5347			regs[rd] = NULL;
5348			break;
5349		}
5350
5351		if (!DTRACE_INSCRATCH(mstate, size)) {
5352			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5353			regs[rd] = NULL;
5354			break;
5355		}
5356
5357		/*
5358		 * Move forward, loading each character.
5359		 */
5360		do {
5361			c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5362next:
5363			if (j + 5 >= size)	/* 5 = strlen("/..c\0") */
5364				break;
5365
5366			if (c != '/') {
5367				dest[j++] = c;
5368				continue;
5369			}
5370
5371			c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5372
5373			if (c == '/') {
5374				/*
5375				 * We have two slashes -- we can just advance
5376				 * to the next character.
5377				 */
5378				goto next;
5379			}
5380
5381			if (c != '.') {
5382				/*
5383				 * This is not "." and it's not ".." -- we can
5384				 * just store the "/" and this character and
5385				 * drive on.
5386				 */
5387				dest[j++] = '/';
5388				dest[j++] = c;
5389				continue;
5390			}
5391
5392			c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5393
5394			if (c == '/') {
5395				/*
5396				 * This is a "/./" component.  We're not going
5397				 * to store anything in the destination buffer;
5398				 * we're just going to go to the next component.
5399				 */
5400				goto next;
5401			}
5402
5403			if (c != '.') {
5404				/*
5405				 * This is not ".." -- we can just store the
5406				 * "/." and this character and continue
5407				 * processing.
5408				 */
5409				dest[j++] = '/';
5410				dest[j++] = '.';
5411				dest[j++] = c;
5412				continue;
5413			}
5414
5415			c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5416
5417			if (c != '/' && c != '\0') {
5418				/*
5419				 * This is not ".." -- it's "..[mumble]".
5420				 * We'll store the "/.." and this character
5421				 * and continue processing.
5422				 */
5423				dest[j++] = '/';
5424				dest[j++] = '.';
5425				dest[j++] = '.';
5426				dest[j++] = c;
5427				continue;
5428			}
5429
5430			/*
5431			 * This is "/../" or "/..\0".  We need to back up
5432			 * our destination pointer until we find a "/".
5433			 */
5434			i--;
5435			while (j != 0 && dest[--j] != '/')
5436				continue;
5437
5438			if (c == '\0')
5439				dest[++j] = '/';
5440		} while (c != '\0');
5441
5442		dest[j] = '\0';
5443
5444		if (mstate->dtms_getf != NULL &&
5445		    !(mstate->dtms_access & DTRACE_ACCESS_KERNEL) &&
5446		    (z = state->dts_cred.dcr_cred->cr_zone) != kcred->cr_zone) {
5447			/*
5448			 * If we've done a getf() as a part of this ECB and we
5449			 * don't have kernel access (and we're not in the global
5450			 * zone), check if the path we cleaned up begins with
5451			 * the zone's root path, and trim it off if so.  Note
5452			 * that this is an output cleanliness issue, not a
5453			 * security issue: knowing one's zone root path does
5454			 * not enable privilege escalation.
5455			 */
5456			if (strstr(dest, z->zone_rootpath) == dest)
5457				dest += strlen(z->zone_rootpath) - 1;
5458		}
5459
5460		regs[rd] = (uintptr_t)dest;
5461		mstate->dtms_scratch_ptr += size;
5462		break;
5463	}
5464
5465	case DIF_SUBR_INET_NTOA:
5466	case DIF_SUBR_INET_NTOA6:
5467	case DIF_SUBR_INET_NTOP: {
5468		size_t size;
5469		int af, argi, i;
5470		char *base, *end;
5471
5472		if (subr == DIF_SUBR_INET_NTOP) {
5473			af = (int)tupregs[0].dttk_value;
5474			argi = 1;
5475		} else {
5476			af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;
5477			argi = 0;
5478		}
5479
5480		if (af == AF_INET) {
5481			ipaddr_t ip4;
5482			uint8_t *ptr8, val;
5483
5484			if (!dtrace_canload(tupregs[argi].dttk_value,
5485			    sizeof (ipaddr_t), mstate, vstate)) {
5486				regs[rd] = NULL;
5487				break;
5488			}
5489
5490			/*
5491			 * Safely load the IPv4 address.
5492			 */
5493			ip4 = dtrace_load32(tupregs[argi].dttk_value);
5494
5495			/*
5496			 * Check an IPv4 string will fit in scratch.
5497			 */
5498			size = INET_ADDRSTRLEN;
5499			if (!DTRACE_INSCRATCH(mstate, size)) {
5500				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5501				regs[rd] = NULL;
5502				break;
5503			}
5504			base = (char *)mstate->dtms_scratch_ptr;
5505			end = (char *)mstate->dtms_scratch_ptr + size - 1;
5506
5507			/*
5508			 * Stringify as a dotted decimal quad.
5509			 */
5510			*end-- = '\0';
5511			ptr8 = (uint8_t *)&ip4;
5512			for (i = 3; i >= 0; i--) {
5513				val = ptr8[i];
5514
5515				if (val == 0) {
5516					*end-- = '0';
5517				} else {
5518					for (; val; val /= 10) {
5519						*end-- = '0' + (val % 10);
5520					}
5521				}
5522
5523				if (i > 0)
5524					*end-- = '.';
5525			}
5526			ASSERT(end + 1 >= base);
5527
5528		} else if (af == AF_INET6) {
5529			struct in6_addr ip6;
5530			int firstzero, tryzero, numzero, v6end;
5531			uint16_t val;
5532			const char digits[] = "0123456789abcdef";
5533
5534			/*
5535			 * Stringify using RFC 1884 convention 2 - 16 bit
5536			 * hexadecimal values with a zero-run compression.
5537			 * Lower case hexadecimal digits are used.
5538			 * 	eg, fe80::214:4fff:fe0b:76c8.
5539			 * The IPv4 embedded form is returned for inet_ntop,
5540			 * just the IPv4 string is returned for inet_ntoa6.
5541			 */
5542
5543			if (!dtrace_canload(tupregs[argi].dttk_value,
5544			    sizeof (struct in6_addr), mstate, vstate)) {
5545				regs[rd] = NULL;
5546				break;
5547			}
5548
5549			/*
5550			 * Safely load the IPv6 address.
5551			 */
5552			dtrace_bcopy(
5553			    (void *)(uintptr_t)tupregs[argi].dttk_value,
5554			    (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
5555
5556			/*
5557			 * Check an IPv6 string will fit in scratch.
5558			 */
5559			size = INET6_ADDRSTRLEN;
5560			if (!DTRACE_INSCRATCH(mstate, size)) {
5561				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5562				regs[rd] = NULL;
5563				break;
5564			}
5565			base = (char *)mstate->dtms_scratch_ptr;
5566			end = (char *)mstate->dtms_scratch_ptr + size - 1;
5567			*end-- = '\0';
5568
5569			/*
5570			 * Find the longest run of 16 bit zero values
5571			 * for the single allowed zero compression - "::".
5572			 */
5573			firstzero = -1;
5574			tryzero = -1;
5575			numzero = 1;
5576			for (i = 0; i < sizeof (struct in6_addr); i++) {
5577				if (ip6._S6_un._S6_u8[i] == 0 &&
5578				    tryzero == -1 && i % 2 == 0) {
5579					tryzero = i;
5580					continue;
5581				}
5582
5583				if (tryzero != -1 &&
5584				    (ip6._S6_un._S6_u8[i] != 0 ||
5585				    i == sizeof (struct in6_addr) - 1)) {
5586
5587					if (i - tryzero <= numzero) {
5588						tryzero = -1;
5589						continue;
5590					}
5591
5592					firstzero = tryzero;
5593					numzero = i - i % 2 - tryzero;
5594					tryzero = -1;
5595
5596					if (ip6._S6_un._S6_u8[i] == 0 &&
5597					    i == sizeof (struct in6_addr) - 1)
5598						numzero += 2;
5599				}
5600			}
5601			ASSERT(firstzero + numzero <= sizeof (struct in6_addr));
5602
5603			/*
5604			 * Check for an IPv4 embedded address.
5605			 */
5606			v6end = sizeof (struct in6_addr) - 2;
5607			if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
5608			    IN6_IS_ADDR_V4COMPAT(&ip6)) {
5609				for (i = sizeof (struct in6_addr) - 1;
5610				    i >= DTRACE_V4MAPPED_OFFSET; i--) {
5611					ASSERT(end >= base);
5612
5613					val = ip6._S6_un._S6_u8[i];
5614
5615					if (val == 0) {
5616						*end-- = '0';
5617					} else {
5618						for (; val; val /= 10) {
5619							*end-- = '0' + val % 10;
5620						}
5621					}
5622
5623					if (i > DTRACE_V4MAPPED_OFFSET)
5624						*end-- = '.';
5625				}
5626
5627				if (subr == DIF_SUBR_INET_NTOA6)
5628					goto inetout;
5629
5630				/*
5631				 * Set v6end to skip the IPv4 address that
5632				 * we have already stringified.
5633				 */
5634				v6end = 10;
5635			}
5636
5637			/*
5638			 * Build the IPv6 string by working through the
5639			 * address in reverse.
5640			 */
5641			for (i = v6end; i >= 0; i -= 2) {
5642				ASSERT(end >= base);
5643
5644				if (i == firstzero + numzero - 2) {
5645					*end-- = ':';
5646					*end-- = ':';
5647					i -= numzero - 2;
5648					continue;
5649				}
5650
5651				if (i < 14 && i != firstzero - 2)
5652					*end-- = ':';
5653
5654				val = (ip6._S6_un._S6_u8[i] << 8) +
5655				    ip6._S6_un._S6_u8[i + 1];
5656
5657				if (val == 0) {
5658					*end-- = '0';
5659				} else {
5660					for (; val; val /= 16) {
5661						*end-- = digits[val % 16];
5662					}
5663				}
5664			}
5665			ASSERT(end + 1 >= base);
5666
5667		} else {
5668			/*
5669			 * The user didn't use AH_INET or AH_INET6.
5670			 */
5671			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5672			regs[rd] = NULL;
5673			break;
5674		}
5675
5676inetout:	regs[rd] = (uintptr_t)end + 1;
5677		mstate->dtms_scratch_ptr += size;
5678		break;
5679	}
5680
5681	}
5682}
5683
5684/*
5685 * Emulate the execution of DTrace IR instructions specified by the given
5686 * DIF object.  This function is deliberately void of assertions as all of
5687 * the necessary checks are handled by a call to dtrace_difo_validate().
5688 */
5689static uint64_t
5690dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
5691    dtrace_vstate_t *vstate, dtrace_state_t *state)
5692{
5693	const dif_instr_t *text = difo->dtdo_buf;
5694	const uint_t textlen = difo->dtdo_len;
5695	const char *strtab = difo->dtdo_strtab;
5696	const uint64_t *inttab = difo->dtdo_inttab;
5697
5698	uint64_t rval = 0;
5699	dtrace_statvar_t *svar;
5700	dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
5701	dtrace_difv_t *v;
5702	volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
5703	volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
5704
5705	dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
5706	uint64_t regs[DIF_DIR_NREGS];
5707	uint64_t *tmp;
5708
5709	uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
5710	int64_t cc_r;
5711	uint_t pc = 0, id, opc;
5712	uint8_t ttop = 0;
5713	dif_instr_t instr;
5714	uint_t r1, r2, rd;
5715
5716	/*
5717	 * We stash the current DIF object into the machine state: we need it
5718	 * for subsequent access checking.
5719	 */
5720	mstate->dtms_difo = difo;
5721
5722	regs[DIF_REG_R0] = 0; 		/* %r0 is fixed at zero */
5723
5724	while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
5725		opc = pc;
5726
5727		instr = text[pc++];
5728		r1 = DIF_INSTR_R1(instr);
5729		r2 = DIF_INSTR_R2(instr);
5730		rd = DIF_INSTR_RD(instr);
5731
5732		switch (DIF_INSTR_OP(instr)) {
5733		case DIF_OP_OR:
5734			regs[rd] = regs[r1] | regs[r2];
5735			break;
5736		case DIF_OP_XOR:
5737			regs[rd] = regs[r1] ^ regs[r2];
5738			break;
5739		case DIF_OP_AND:
5740			regs[rd] = regs[r1] & regs[r2];
5741			break;
5742		case DIF_OP_SLL:
5743			regs[rd] = regs[r1] << regs[r2];
5744			break;
5745		case DIF_OP_SRL:
5746			regs[rd] = regs[r1] >> regs[r2];
5747			break;
5748		case DIF_OP_SUB:
5749			regs[rd] = regs[r1] - regs[r2];
5750			break;
5751		case DIF_OP_ADD:
5752			regs[rd] = regs[r1] + regs[r2];
5753			break;
5754		case DIF_OP_MUL:
5755			regs[rd] = regs[r1] * regs[r2];
5756			break;
5757		case DIF_OP_SDIV:
5758			if (regs[r2] == 0) {
5759				regs[rd] = 0;
5760				*flags |= CPU_DTRACE_DIVZERO;
5761			} else {
5762				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5763				regs[rd] = (int64_t)regs[r1] /
5764				    (int64_t)regs[r2];
5765				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5766			}
5767			break;
5768
5769		case DIF_OP_UDIV:
5770			if (regs[r2] == 0) {
5771				regs[rd] = 0;
5772				*flags |= CPU_DTRACE_DIVZERO;
5773			} else {
5774				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5775				regs[rd] = regs[r1] / regs[r2];
5776				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5777			}
5778			break;
5779
5780		case DIF_OP_SREM:
5781			if (regs[r2] == 0) {
5782				regs[rd] = 0;
5783				*flags |= CPU_DTRACE_DIVZERO;
5784			} else {
5785				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5786				regs[rd] = (int64_t)regs[r1] %
5787				    (int64_t)regs[r2];
5788				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5789			}
5790			break;
5791
5792		case DIF_OP_UREM:
5793			if (regs[r2] == 0) {
5794				regs[rd] = 0;
5795				*flags |= CPU_DTRACE_DIVZERO;
5796			} else {
5797				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5798				regs[rd] = regs[r1] % regs[r2];
5799				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5800			}
5801			break;
5802
5803		case DIF_OP_NOT:
5804			regs[rd] = ~regs[r1];
5805			break;
5806		case DIF_OP_MOV:
5807			regs[rd] = regs[r1];
5808			break;
5809		case DIF_OP_CMP:
5810			cc_r = regs[r1] - regs[r2];
5811			cc_n = cc_r < 0;
5812			cc_z = cc_r == 0;
5813			cc_v = 0;
5814			cc_c = regs[r1] < regs[r2];
5815			break;
5816		case DIF_OP_TST:
5817			cc_n = cc_v = cc_c = 0;
5818			cc_z = regs[r1] == 0;
5819			break;
5820		case DIF_OP_BA:
5821			pc = DIF_INSTR_LABEL(instr);
5822			break;
5823		case DIF_OP_BE:
5824			if (cc_z)
5825				pc = DIF_INSTR_LABEL(instr);
5826			break;
5827		case DIF_OP_BNE:
5828			if (cc_z == 0)
5829				pc = DIF_INSTR_LABEL(instr);
5830			break;
5831		case DIF_OP_BG:
5832			if ((cc_z | (cc_n ^ cc_v)) == 0)
5833				pc = DIF_INSTR_LABEL(instr);
5834			break;
5835		case DIF_OP_BGU:
5836			if ((cc_c | cc_z) == 0)
5837				pc = DIF_INSTR_LABEL(instr);
5838			break;
5839		case DIF_OP_BGE:
5840			if ((cc_n ^ cc_v) == 0)
5841				pc = DIF_INSTR_LABEL(instr);
5842			break;
5843		case DIF_OP_BGEU:
5844			if (cc_c == 0)
5845				pc = DIF_INSTR_LABEL(instr);
5846			break;
5847		case DIF_OP_BL:
5848			if (cc_n ^ cc_v)
5849				pc = DIF_INSTR_LABEL(instr);
5850			break;
5851		case DIF_OP_BLU:
5852			if (cc_c)
5853				pc = DIF_INSTR_LABEL(instr);
5854			break;
5855		case DIF_OP_BLE:
5856			if (cc_z | (cc_n ^ cc_v))
5857				pc = DIF_INSTR_LABEL(instr);
5858			break;
5859		case DIF_OP_BLEU:
5860			if (cc_c | cc_z)
5861				pc = DIF_INSTR_LABEL(instr);
5862			break;
5863		case DIF_OP_RLDSB:
5864			if (!dtrace_canload(regs[r1], 1, mstate, vstate))
5865				break;
5866			/*FALLTHROUGH*/
5867		case DIF_OP_LDSB:
5868			regs[rd] = (int8_t)dtrace_load8(regs[r1]);
5869			break;
5870		case DIF_OP_RLDSH:
5871			if (!dtrace_canload(regs[r1], 2, mstate, vstate))
5872				break;
5873			/*FALLTHROUGH*/
5874		case DIF_OP_LDSH:
5875			regs[rd] = (int16_t)dtrace_load16(regs[r1]);
5876			break;
5877		case DIF_OP_RLDSW:
5878			if (!dtrace_canload(regs[r1], 4, mstate, vstate))
5879				break;
5880			/*FALLTHROUGH*/
5881		case DIF_OP_LDSW:
5882			regs[rd] = (int32_t)dtrace_load32(regs[r1]);
5883			break;
5884		case DIF_OP_RLDUB:
5885			if (!dtrace_canload(regs[r1], 1, mstate, vstate))
5886				break;
5887			/*FALLTHROUGH*/
5888		case DIF_OP_LDUB:
5889			regs[rd] = dtrace_load8(regs[r1]);
5890			break;
5891		case DIF_OP_RLDUH:
5892			if (!dtrace_canload(regs[r1], 2, mstate, vstate))
5893				break;
5894			/*FALLTHROUGH*/
5895		case DIF_OP_LDUH:
5896			regs[rd] = dtrace_load16(regs[r1]);
5897			break;
5898		case DIF_OP_RLDUW:
5899			if (!dtrace_canload(regs[r1], 4, mstate, vstate))
5900				break;
5901			/*FALLTHROUGH*/
5902		case DIF_OP_LDUW:
5903			regs[rd] = dtrace_load32(regs[r1]);
5904			break;
5905		case DIF_OP_RLDX:
5906			if (!dtrace_canload(regs[r1], 8, mstate, vstate))
5907				break;
5908			/*FALLTHROUGH*/
5909		case DIF_OP_LDX:
5910			regs[rd] = dtrace_load64(regs[r1]);
5911			break;
5912		case DIF_OP_ULDSB:
5913			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5914			regs[rd] = (int8_t)
5915			    dtrace_fuword8((void *)(uintptr_t)regs[r1]);
5916			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5917			break;
5918		case DIF_OP_ULDSH:
5919			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5920			regs[rd] = (int16_t)
5921			    dtrace_fuword16((void *)(uintptr_t)regs[r1]);
5922			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5923			break;
5924		case DIF_OP_ULDSW:
5925			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5926			regs[rd] = (int32_t)
5927			    dtrace_fuword32((void *)(uintptr_t)regs[r1]);
5928			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5929			break;
5930		case DIF_OP_ULDUB:
5931			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5932			regs[rd] =
5933			    dtrace_fuword8((void *)(uintptr_t)regs[r1]);
5934			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5935			break;
5936		case DIF_OP_ULDUH:
5937			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5938			regs[rd] =
5939			    dtrace_fuword16((void *)(uintptr_t)regs[r1]);
5940			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5941			break;
5942		case DIF_OP_ULDUW:
5943			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5944			regs[rd] =
5945			    dtrace_fuword32((void *)(uintptr_t)regs[r1]);
5946			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5947			break;
5948		case DIF_OP_ULDX:
5949			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5950			regs[rd] =
5951			    dtrace_fuword64((void *)(uintptr_t)regs[r1]);
5952			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5953			break;
5954		case DIF_OP_RET:
5955			rval = regs[rd];
5956			pc = textlen;
5957			break;
5958		case DIF_OP_NOP:
5959			break;
5960		case DIF_OP_SETX:
5961			regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
5962			break;
5963		case DIF_OP_SETS:
5964			regs[rd] = (uint64_t)(uintptr_t)
5965			    (strtab + DIF_INSTR_STRING(instr));
5966			break;
5967		case DIF_OP_SCMP: {
5968			size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
5969			uintptr_t s1 = regs[r1];
5970			uintptr_t s2 = regs[r2];
5971			size_t lim1, lim2;
5972
5973			if (s1 != NULL &&
5974			    !dtrace_strcanload(s1, sz, &lim1, mstate, vstate))
5975				break;
5976			if (s2 != NULL &&
5977			    !dtrace_strcanload(s2, sz, &lim2, mstate, vstate))
5978				break;
5979
5980			cc_r = dtrace_strncmp((char *)s1, (char *)s2,
5981			    MIN(lim1, lim2));
5982
5983			cc_n = cc_r < 0;
5984			cc_z = cc_r == 0;
5985			cc_v = cc_c = 0;
5986			break;
5987		}
5988		case DIF_OP_LDGA:
5989			regs[rd] = dtrace_dif_variable(mstate, state,
5990			    r1, regs[r2]);
5991			break;
5992		case DIF_OP_LDGS:
5993			id = DIF_INSTR_VAR(instr);
5994
5995			if (id >= DIF_VAR_OTHER_UBASE) {
5996				uintptr_t a;
5997
5998				id -= DIF_VAR_OTHER_UBASE;
5999				svar = vstate->dtvs_globals[id];
6000				ASSERT(svar != NULL);
6001				v = &svar->dtsv_var;
6002
6003				if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
6004					regs[rd] = svar->dtsv_data;
6005					break;
6006				}
6007
6008				a = (uintptr_t)svar->dtsv_data;
6009
6010				if (*(uint8_t *)a == UINT8_MAX) {
6011					/*
6012					 * If the 0th byte is set to UINT8_MAX
6013					 * then this is to be treated as a
6014					 * reference to a NULL variable.
6015					 */
6016					regs[rd] = NULL;
6017				} else {
6018					regs[rd] = a + sizeof (uint64_t);
6019				}
6020
6021				break;
6022			}
6023
6024			regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
6025			break;
6026
6027		case DIF_OP_STGS:
6028			id = DIF_INSTR_VAR(instr);
6029
6030			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6031			id -= DIF_VAR_OTHER_UBASE;
6032
6033			VERIFY(id < vstate->dtvs_nglobals);
6034			svar = vstate->dtvs_globals[id];
6035			ASSERT(svar != NULL);
6036			v = &svar->dtsv_var;
6037
6038			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6039				uintptr_t a = (uintptr_t)svar->dtsv_data;
6040				size_t lim;
6041
6042				ASSERT(a != NULL);
6043				ASSERT(svar->dtsv_size != 0);
6044
6045				if (regs[rd] == NULL) {
6046					*(uint8_t *)a = UINT8_MAX;
6047					break;
6048				} else {
6049					*(uint8_t *)a = 0;
6050					a += sizeof (uint64_t);
6051				}
6052				if (!dtrace_vcanload(
6053				    (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6054				    &lim, mstate, vstate))
6055					break;
6056
6057				dtrace_vcopy((void *)(uintptr_t)regs[rd],
6058				    (void *)a, &v->dtdv_type, lim);
6059				break;
6060			}
6061
6062			svar->dtsv_data = regs[rd];
6063			break;
6064
6065		case DIF_OP_LDTA:
6066			/*
6067			 * There are no DTrace built-in thread-local arrays at
6068			 * present.  This opcode is saved for future work.
6069			 */
6070			*flags |= CPU_DTRACE_ILLOP;
6071			regs[rd] = 0;
6072			break;
6073
6074		case DIF_OP_LDLS:
6075			id = DIF_INSTR_VAR(instr);
6076
6077			if (id < DIF_VAR_OTHER_UBASE) {
6078				/*
6079				 * For now, this has no meaning.
6080				 */
6081				regs[rd] = 0;
6082				break;
6083			}
6084
6085			id -= DIF_VAR_OTHER_UBASE;
6086
6087			ASSERT(id < vstate->dtvs_nlocals);
6088			ASSERT(vstate->dtvs_locals != NULL);
6089
6090			svar = vstate->dtvs_locals[id];
6091			ASSERT(svar != NULL);
6092			v = &svar->dtsv_var;
6093
6094			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6095				uintptr_t a = (uintptr_t)svar->dtsv_data;
6096				size_t sz = v->dtdv_type.dtdt_size;
6097
6098				sz += sizeof (uint64_t);
6099				ASSERT(svar->dtsv_size == NCPU * sz);
6100				a += CPU->cpu_id * sz;
6101
6102				if (*(uint8_t *)a == UINT8_MAX) {
6103					/*
6104					 * If the 0th byte is set to UINT8_MAX
6105					 * then this is to be treated as a
6106					 * reference to a NULL variable.
6107					 */
6108					regs[rd] = NULL;
6109				} else {
6110					regs[rd] = a + sizeof (uint64_t);
6111				}
6112
6113				break;
6114			}
6115
6116			ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
6117			tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6118			regs[rd] = tmp[CPU->cpu_id];
6119			break;
6120
6121		case DIF_OP_STLS:
6122			id = DIF_INSTR_VAR(instr);
6123
6124			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6125			id -= DIF_VAR_OTHER_UBASE;
6126			VERIFY(id < vstate->dtvs_nlocals);
6127
6128			ASSERT(vstate->dtvs_locals != NULL);
6129			svar = vstate->dtvs_locals[id];
6130			ASSERT(svar != NULL);
6131			v = &svar->dtsv_var;
6132
6133			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6134				uintptr_t a = (uintptr_t)svar->dtsv_data;
6135				size_t sz = v->dtdv_type.dtdt_size;
6136				size_t lim;
6137
6138				sz += sizeof (uint64_t);
6139				ASSERT(svar->dtsv_size == NCPU * sz);
6140				a += CPU->cpu_id * sz;
6141
6142				if (regs[rd] == NULL) {
6143					*(uint8_t *)a = UINT8_MAX;
6144					break;
6145				} else {
6146					*(uint8_t *)a = 0;
6147					a += sizeof (uint64_t);
6148				}
6149
6150				if (!dtrace_vcanload(
6151				    (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6152				    &lim, mstate, vstate))
6153					break;
6154
6155				dtrace_vcopy((void *)(uintptr_t)regs[rd],
6156				    (void *)a, &v->dtdv_type, lim);
6157				break;
6158			}
6159
6160			ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
6161			tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6162			tmp[CPU->cpu_id] = regs[rd];
6163			break;
6164
6165		case DIF_OP_LDTS: {
6166			dtrace_dynvar_t *dvar;
6167			dtrace_key_t *key;
6168
6169			id = DIF_INSTR_VAR(instr);
6170			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6171			id -= DIF_VAR_OTHER_UBASE;
6172			v = &vstate->dtvs_tlocals[id];
6173
6174			key = &tupregs[DIF_DTR_NREGS];
6175			key[0].dttk_value = (uint64_t)id;
6176			key[0].dttk_size = 0;
6177			DTRACE_TLS_THRKEY(key[1].dttk_value);
6178			key[1].dttk_size = 0;
6179
6180			dvar = dtrace_dynvar(dstate, 2, key,
6181			    sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
6182			    mstate, vstate);
6183
6184			if (dvar == NULL) {
6185				regs[rd] = 0;
6186				break;
6187			}
6188
6189			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6190				regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6191			} else {
6192				regs[rd] = *((uint64_t *)dvar->dtdv_data);
6193			}
6194
6195			break;
6196		}
6197
6198		case DIF_OP_STTS: {
6199			dtrace_dynvar_t *dvar;
6200			dtrace_key_t *key;
6201
6202			id = DIF_INSTR_VAR(instr);
6203			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6204			id -= DIF_VAR_OTHER_UBASE;
6205			VERIFY(id < vstate->dtvs_ntlocals);
6206
6207			key = &tupregs[DIF_DTR_NREGS];
6208			key[0].dttk_value = (uint64_t)id;
6209			key[0].dttk_size = 0;
6210			DTRACE_TLS_THRKEY(key[1].dttk_value);
6211			key[1].dttk_size = 0;
6212			v = &vstate->dtvs_tlocals[id];
6213
6214			dvar = dtrace_dynvar(dstate, 2, key,
6215			    v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6216			    v->dtdv_type.dtdt_size : sizeof (uint64_t),
6217			    regs[rd] ? DTRACE_DYNVAR_ALLOC :
6218			    DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6219
6220			/*
6221			 * Given that we're storing to thread-local data,
6222			 * we need to flush our predicate cache.
6223			 */
6224			curthread->t_predcache = NULL;
6225
6226			if (dvar == NULL)
6227				break;
6228
6229			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6230				size_t lim;
6231
6232				if (!dtrace_vcanload(
6233				    (void *)(uintptr_t)regs[rd],
6234				    &v->dtdv_type, &lim, mstate, vstate))
6235					break;
6236
6237				dtrace_vcopy((void *)(uintptr_t)regs[rd],
6238				    dvar->dtdv_data, &v->dtdv_type, lim);
6239			} else {
6240				*((uint64_t *)dvar->dtdv_data) = regs[rd];
6241			}
6242
6243			break;
6244		}
6245
6246		case DIF_OP_SRA:
6247			regs[rd] = (int64_t)regs[r1] >> regs[r2];
6248			break;
6249
6250		case DIF_OP_CALL:
6251			dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
6252			    regs, tupregs, ttop, mstate, state);
6253			break;
6254
6255		case DIF_OP_PUSHTR:
6256			if (ttop == DIF_DTR_NREGS) {
6257				*flags |= CPU_DTRACE_TUPOFLOW;
6258				break;
6259			}
6260
6261			if (r1 == DIF_TYPE_STRING) {
6262				/*
6263				 * If this is a string type and the size is 0,
6264				 * we'll use the system-wide default string
6265				 * size.  Note that we are _not_ looking at
6266				 * the value of the DTRACEOPT_STRSIZE option;
6267				 * had this been set, we would expect to have
6268				 * a non-zero size value in the "pushtr".
6269				 */
6270				tupregs[ttop].dttk_size =
6271				    dtrace_strlen((char *)(uintptr_t)regs[rd],
6272				    regs[r2] ? regs[r2] :
6273				    dtrace_strsize_default) + 1;
6274			} else {
6275				if (regs[r2] > LONG_MAX) {
6276					*flags |= CPU_DTRACE_ILLOP;
6277					break;
6278				}
6279
6280				tupregs[ttop].dttk_size = regs[r2];
6281			}
6282
6283			tupregs[ttop++].dttk_value = regs[rd];
6284			break;
6285
6286		case DIF_OP_PUSHTV:
6287			if (ttop == DIF_DTR_NREGS) {
6288				*flags |= CPU_DTRACE_TUPOFLOW;
6289				break;
6290			}
6291
6292			tupregs[ttop].dttk_value = regs[rd];
6293			tupregs[ttop++].dttk_size = 0;
6294			break;
6295
6296		case DIF_OP_POPTS:
6297			if (ttop != 0)
6298				ttop--;
6299			break;
6300
6301		case DIF_OP_FLUSHTS:
6302			ttop = 0;
6303			break;
6304
6305		case DIF_OP_LDGAA:
6306		case DIF_OP_LDTAA: {
6307			dtrace_dynvar_t *dvar;
6308			dtrace_key_t *key = tupregs;
6309			uint_t nkeys = ttop;
6310
6311			id = DIF_INSTR_VAR(instr);
6312			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6313			id -= DIF_VAR_OTHER_UBASE;
6314
6315			key[nkeys].dttk_value = (uint64_t)id;
6316			key[nkeys++].dttk_size = 0;
6317
6318			if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
6319				DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6320				key[nkeys++].dttk_size = 0;
6321				VERIFY(id < vstate->dtvs_ntlocals);
6322				v = &vstate->dtvs_tlocals[id];
6323			} else {
6324				VERIFY(id < vstate->dtvs_nglobals);
6325				v = &vstate->dtvs_globals[id]->dtsv_var;
6326			}
6327
6328			dvar = dtrace_dynvar(dstate, nkeys, key,
6329			    v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6330			    v->dtdv_type.dtdt_size : sizeof (uint64_t),
6331			    DTRACE_DYNVAR_NOALLOC, mstate, vstate);
6332
6333			if (dvar == NULL) {
6334				regs[rd] = 0;
6335				break;
6336			}
6337
6338			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6339				regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6340			} else {
6341				regs[rd] = *((uint64_t *)dvar->dtdv_data);
6342			}
6343
6344			break;
6345		}
6346
6347		case DIF_OP_STGAA:
6348		case DIF_OP_STTAA: {
6349			dtrace_dynvar_t *dvar;
6350			dtrace_key_t *key = tupregs;
6351			uint_t nkeys = ttop;
6352
6353			id = DIF_INSTR_VAR(instr);
6354			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6355			id -= DIF_VAR_OTHER_UBASE;
6356
6357			key[nkeys].dttk_value = (uint64_t)id;
6358			key[nkeys++].dttk_size = 0;
6359
6360			if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
6361				DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6362				key[nkeys++].dttk_size = 0;
6363				VERIFY(id < vstate->dtvs_ntlocals);
6364				v = &vstate->dtvs_tlocals[id];
6365			} else {
6366				VERIFY(id < vstate->dtvs_nglobals);
6367				v = &vstate->dtvs_globals[id]->dtsv_var;
6368			}
6369
6370			dvar = dtrace_dynvar(dstate, nkeys, key,
6371			    v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6372			    v->dtdv_type.dtdt_size : sizeof (uint64_t),
6373			    regs[rd] ? DTRACE_DYNVAR_ALLOC :
6374			    DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6375
6376			if (dvar == NULL)
6377				break;
6378
6379			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6380				size_t lim;
6381
6382				if (!dtrace_vcanload(
6383				    (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6384				    &lim, mstate, vstate))
6385					break;
6386
6387				dtrace_vcopy((void *)(uintptr_t)regs[rd],
6388				    dvar->dtdv_data, &v->dtdv_type, lim);
6389			} else {
6390				*((uint64_t *)dvar->dtdv_data) = regs[rd];
6391			}
6392
6393			break;
6394		}
6395
6396		case DIF_OP_ALLOCS: {
6397			uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6398			size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
6399
6400			/*
6401			 * Rounding up the user allocation size could have
6402			 * overflowed large, bogus allocations (like -1ULL) to
6403			 * 0.
6404			 */
6405			if (size < regs[r1] ||
6406			    !DTRACE_INSCRATCH(mstate, size)) {
6407				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6408				regs[rd] = NULL;
6409				break;
6410			}
6411
6412			dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
6413			mstate->dtms_scratch_ptr += size;
6414			regs[rd] = ptr;
6415			break;
6416		}
6417
6418		case DIF_OP_COPYS:
6419			if (!dtrace_canstore(regs[rd], regs[r2],
6420			    mstate, vstate)) {
6421				*flags |= CPU_DTRACE_BADADDR;
6422				*illval = regs[rd];
6423				break;
6424			}
6425
6426			if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
6427				break;
6428
6429			dtrace_bcopy((void *)(uintptr_t)regs[r1],
6430			    (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
6431			break;
6432
6433		case DIF_OP_STB:
6434			if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
6435				*flags |= CPU_DTRACE_BADADDR;
6436				*illval = regs[rd];
6437				break;
6438			}
6439			*((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
6440			break;
6441
6442		case DIF_OP_STH:
6443			if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
6444				*flags |= CPU_DTRACE_BADADDR;
6445				*illval = regs[rd];
6446				break;
6447			}
6448			if (regs[rd] & 1) {
6449				*flags |= CPU_DTRACE_BADALIGN;
6450				*illval = regs[rd];
6451				break;
6452			}
6453			*((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
6454			break;
6455
6456		case DIF_OP_STW:
6457			if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
6458				*flags |= CPU_DTRACE_BADADDR;
6459				*illval = regs[rd];
6460				break;
6461			}
6462			if (regs[rd] & 3) {
6463				*flags |= CPU_DTRACE_BADALIGN;
6464				*illval = regs[rd];
6465				break;
6466			}
6467			*((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
6468			break;
6469
6470		case DIF_OP_STX:
6471			if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
6472				*flags |= CPU_DTRACE_BADADDR;
6473				*illval = regs[rd];
6474				break;
6475			}
6476			if (regs[rd] & 7) {
6477				*flags |= CPU_DTRACE_BADALIGN;
6478				*illval = regs[rd];
6479				break;
6480			}
6481			*((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
6482			break;
6483		}
6484	}
6485
6486	if (!(*flags & CPU_DTRACE_FAULT))
6487		return (rval);
6488
6489	mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
6490	mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
6491
6492	return (0);
6493}
6494
6495static void
6496dtrace_action_breakpoint(dtrace_ecb_t *ecb)
6497{
6498	dtrace_probe_t *probe = ecb->dte_probe;
6499	dtrace_provider_t *prov = probe->dtpr_provider;
6500	char c[DTRACE_FULLNAMELEN + 80], *str;
6501	char *msg = "dtrace: breakpoint action at probe ";
6502	char *ecbmsg = " (ecb ";
6503	uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
6504	uintptr_t val = (uintptr_t)ecb;
6505	int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
6506
6507	if (dtrace_destructive_disallow)
6508		return;
6509
6510	/*
6511	 * It's impossible to be taking action on the NULL probe.
6512	 */
6513	ASSERT(probe != NULL);
6514
6515	/*
6516	 * This is a poor man's (destitute man's?) sprintf():  we want to
6517	 * print the provider name, module name, function name and name of
6518	 * the probe, along with the hex address of the ECB with the breakpoint
6519	 * action -- all of which we must place in the character buffer by
6520	 * hand.
6521	 */
6522	while (*msg != '\0')
6523		c[i++] = *msg++;
6524
6525	for (str = prov->dtpv_name; *str != '\0'; str++)
6526		c[i++] = *str;
6527	c[i++] = ':';
6528
6529	for (str = probe->dtpr_mod; *str != '\0'; str++)
6530		c[i++] = *str;
6531	c[i++] = ':';
6532
6533	for (str = probe->dtpr_func; *str != '\0'; str++)
6534		c[i++] = *str;
6535	c[i++] = ':';
6536
6537	for (str = probe->dtpr_name; *str != '\0'; str++)
6538		c[i++] = *str;
6539
6540	while (*ecbmsg != '\0')
6541		c[i++] = *ecbmsg++;
6542
6543	while (shift >= 0) {
6544		mask = (uintptr_t)0xf << shift;
6545
6546		if (val >= ((uintptr_t)1 << shift))
6547			c[i++] = "0123456789abcdef"[(val & mask) >> shift];
6548		shift -= 4;
6549	}
6550
6551	c[i++] = ')';
6552	c[i] = '\0';
6553
6554	debug_enter(c);
6555}
6556
6557static void
6558dtrace_action_panic(dtrace_ecb_t *ecb)
6559{
6560	dtrace_probe_t *probe = ecb->dte_probe;
6561
6562	/*
6563	 * It's impossible to be taking action on the NULL probe.
6564	 */
6565	ASSERT(probe != NULL);
6566
6567	if (dtrace_destructive_disallow)
6568		return;
6569
6570	if (dtrace_panicked != NULL)
6571		return;
6572
6573	if (dtrace_casptr(&dtrace_panicked, NULL, curthread) != NULL)
6574		return;
6575
6576	/*
6577	 * We won the right to panic.  (We want to be sure that only one
6578	 * thread calls panic() from dtrace_probe(), and that panic() is
6579	 * called exactly once.)
6580	 */
6581	dtrace_panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
6582	    probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
6583	    probe->dtpr_func, probe->dtpr_name, (void *)ecb);
6584}
6585
6586static void
6587dtrace_action_raise(uint64_t sig)
6588{
6589	if (dtrace_destructive_disallow)
6590		return;
6591
6592	if (sig >= NSIG) {
6593		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6594		return;
6595	}
6596
6597	/*
6598	 * raise() has a queue depth of 1 -- we ignore all subsequent
6599	 * invocations of the raise() action.
6600	 */
6601	if (curthread->t_dtrace_sig == 0)
6602		curthread->t_dtrace_sig = (uint8_t)sig;
6603
6604	curthread->t_sig_check = 1;
6605	aston(curthread);
6606}
6607
6608static void
6609dtrace_action_stop(void)
6610{
6611	if (dtrace_destructive_disallow)
6612		return;
6613
6614	if (!curthread->t_dtrace_stop) {
6615		curthread->t_dtrace_stop = 1;
6616		curthread->t_sig_check = 1;
6617		aston(curthread);
6618	}
6619}
6620
6621static void
6622dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
6623{
6624	hrtime_t now;
6625	volatile uint16_t *flags;
6626	cpu_t *cpu = CPU;
6627
6628	if (dtrace_destructive_disallow)
6629		return;
6630
6631	flags = (volatile uint16_t *)&cpu_core[cpu->cpu_id].cpuc_dtrace_flags;
6632
6633	now = dtrace_gethrtime();
6634
6635	if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
6636		/*
6637		 * We need to advance the mark to the current time.
6638		 */
6639		cpu->cpu_dtrace_chillmark = now;
6640		cpu->cpu_dtrace_chilled = 0;
6641	}
6642
6643	/*
6644	 * Now check to see if the requested chill time would take us over
6645	 * the maximum amount of time allowed in the chill interval.  (Or
6646	 * worse, if the calculation itself induces overflow.)
6647	 */
6648	if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
6649	    cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
6650		*flags |= CPU_DTRACE_ILLOP;
6651		return;
6652	}
6653
6654	while (dtrace_gethrtime() - now < val)
6655		continue;
6656
6657	/*
6658	 * Normally, we assure that the value of the variable "timestamp" does
6659	 * not change within an ECB.  The presence of chill() represents an
6660	 * exception to this rule, however.
6661	 */
6662	mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
6663	cpu->cpu_dtrace_chilled += val;
6664}
6665
6666static void
6667dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
6668    uint64_t *buf, uint64_t arg)
6669{
6670	int nframes = DTRACE_USTACK_NFRAMES(arg);
6671	int strsize = DTRACE_USTACK_STRSIZE(arg);
6672	uint64_t *pcs = &buf[1], *fps;
6673	char *str = (char *)&pcs[nframes];
6674	int size, offs = 0, i, j;
6675	size_t rem;
6676	uintptr_t old = mstate->dtms_scratch_ptr, saved;
6677	uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
6678	char *sym;
6679
6680	/*
6681	 * Should be taking a faster path if string space has not been
6682	 * allocated.
6683	 */
6684	ASSERT(strsize != 0);
6685
6686	/*
6687	 * We will first allocate some temporary space for the frame pointers.
6688	 */
6689	fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6690	size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
6691	    (nframes * sizeof (uint64_t));
6692
6693	if (!DTRACE_INSCRATCH(mstate, size)) {
6694		/*
6695		 * Not enough room for our frame pointers -- need to indicate
6696		 * that we ran out of scratch space.
6697		 */
6698		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6699		return;
6700	}
6701
6702	mstate->dtms_scratch_ptr += size;
6703	saved = mstate->dtms_scratch_ptr;
6704
6705	/*
6706	 * Now get a stack with both program counters and frame pointers.
6707	 */
6708	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6709	dtrace_getufpstack(buf, fps, nframes + 1);
6710	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6711
6712	/*
6713	 * If that faulted, we're cooked.
6714	 */
6715	if (*flags & CPU_DTRACE_FAULT)
6716		goto out;
6717
6718	/*
6719	 * Now we want to walk up the stack, calling the USTACK helper.  For
6720	 * each iteration, we restore the scratch pointer.
6721	 */
6722	for (i = 0; i < nframes; i++) {
6723		mstate->dtms_scratch_ptr = saved;
6724
6725		if (<