1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2015, Joyent Inc. All rights reserved.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
27 */
28
29/*
30 * Zones
31 *
32 *   A zone is a named collection of processes, namespace constraints,
33 *   and other system resources which comprise a secure and manageable
34 *   application containment facility.
35 *
36 *   Zones (represented by the reference counted zone_t) are tracked in
37 *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
38 *   (zoneid_t) are used to track zone association.  Zone IDs are
39 *   dynamically generated when the zone is created; if a persistent
40 *   identifier is needed (core files, accounting logs, audit trail,
41 *   etc.), the zone name should be used.
42 *
43 *
44 *   Global Zone:
45 *
46 *   The global zone (zoneid 0) is automatically associated with all
47 *   system resources that have not been bound to a user-created zone.
48 *   This means that even systems where zones are not in active use
49 *   have a global zone, and all processes, mounts, etc. are
50 *   associated with that zone.  The global zone is generally
51 *   unconstrained in terms of privileges and access, though the usual
52 *   credential and privilege based restrictions apply.
53 *
54 *
55 *   Zone States:
56 *
 *   The states a zone may be in, and the transitions between them, are
 *   as follows:
59 *
60 *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
61 *   initialized zone is added to the list of active zones on the system but
62 *   isn't accessible.
63 *
 *   ZONE_IS_INITIALIZED: Initialization is complete except that the ZSD
 *   constructor callbacks have not yet run.  It is not possible to enter
 *   the zone, but attributes can be retrieved.
67 *
68 *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
69 *   ready.  The zone is made visible after the ZSD constructor callbacks are
70 *   executed.  A zone remains in this state until it transitions into
71 *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
72 *
 *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
74 *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
75 *   state.
76 *
77 *   ZONE_IS_RUNNING: The zone is open for business: zsched has
78 *   successfully started init.   A zone remains in this state until
79 *   zone_shutdown() is called.
80 *
81 *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
82 *   killing all processes running in the zone. The zone remains
83 *   in this state until there are no more user processes running in the zone.
84 *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
85 *   Since zone_shutdown() is restartable, it may be called successfully
86 *   multiple times for the same zone_t.  Setting of the zone's state to
 *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VFS_MOUNT() may check
88 *   the zone's status without worrying about it being a moving target.
89 *
90 *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
91 *   are no more user processes in the zone.  The zone remains in this
92 *   state until there are no more kernel threads associated with the
93 *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
94 *   fail.
95 *
96 *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
97 *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
98 *   join the zone or create kernel threads therein.
99 *
100 *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
101 *   remains in this state until zsched exits.  Calls to zone_find_by_*()
102 *   return NULL from now on.
103 *
104 *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
105 *   processes or threads doing work on behalf of the zone.  The zone is
106 *   removed from the list of active zones.  zone_destroy() returns, and
107 *   the zone can be recreated.
108 *
109 *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
110 *   callbacks are executed, and all memory associated with the zone is
111 *   freed.
112 *
113 *   Threads can wait for the zone to enter a requested state by using
114 *   zone_status_wait() or zone_status_timedwait() with the desired
115 *   state passed in as an argument.  Zone state transitions are
116 *   uni-directional; it is not possible to move back to an earlier state.
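 *
 *   As an illustrative sketch (not code quoted from this file), a caller
 *   that already holds a zone reference (e.g. from zone_find_by_id()) can
 *   block until the zone has finished shutting down:
 *
 *	zone_status_wait(zone, ZONE_IS_DOWN);
 *	zone_rele(zone);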
117 *
118 *
119 *   Zone-Specific Data:
120 *
121 *   Subsystems needing to maintain zone-specific data can store that
122 *   data using the ZSD mechanism.  This provides a zone-specific data
123 *   store, similar to thread-specific data (see pthread_getspecific(3C)
 *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
125 *   to register callbacks to be invoked when a zone is created, shut
126 *   down, or destroyed.  This can be used to initialize zone-specific
127 *   data for new zones and to clean up when zones go away.
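 *
 *   As a hedged illustration (the "foo" names are hypothetical, not code
 *   from this file), a subsystem typically registers a key once and then
 *   fetches its per-zone data on demand:
 *
 *	static zone_key_t foo_zone_key;
 *
 *	zone_key_create(&foo_zone_key, foo_zone_init, foo_zone_shutdown,
 *	    foo_zone_fini);
 *	...
 *	foo_state_t *fsp = zone_getspecific(foo_zone_key, curproc->p_zone);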
128 *
129 *
130 *   Data Structures:
131 *
132 *   The per-zone structure (zone_t) is reference counted, and freed
133 *   when all references are released.  zone_hold and zone_rele can be
134 *   used to adjust the reference count.  In addition, reference counts
135 *   associated with the cred_t structure are tracked separately using
136 *   zone_cred_hold and zone_cred_rele.
137 *
138 *   Pointers to active zone_t's are stored in two hash tables; one
139 *   for searching by id, the other for searching by name.  Lookups
140 *   can be performed on either basis, using zone_find_by_id and
141 *   zone_find_by_name.  Both return zone_t pointers with the zone
142 *   held, so zone_rele should be called when the pointer is no longer
143 *   needed.  Zones can also be searched by path; zone_find_by_path
144 *   returns the zone with which a path name is associated (global
145 *   zone if the path is not within some other zone's file system
146 *   hierarchy).  This currently requires iterating through each zone,
147 *   so it is slower than an id or name search via a hash table.
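 *
 *   A minimal sketch of the resulting hold/release discipline
 *   (illustrative only):
 *
 *	zone_t *zone = zone_find_by_id(zoneid);
 *
 *	if (zone != NULL) {
 *		... use the held zone ...
 *		zone_rele(zone);
 *	}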
148 *
149 *
150 *   Locking:
151 *
152 *   zonehash_lock: This is a top-level global lock used to protect the
153 *       zone hash tables and lists.  Zones cannot be created or destroyed
154 *       while this lock is held.
155 *   zone_status_lock: This is a global lock protecting zone state.
156 *       Zones cannot change state while this lock is held.  It also
157 *       protects the list of kernel threads associated with a zone.
158 *   zone_lock: This is a per-zone lock used to protect several fields of
159 *       the zone_t (see <sys/zone.h> for details).  In addition, holding
160 *       this lock means that the zone cannot go away.
161 *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
162 *	 related to the zone.max-lwps rctl.
163 *   zone_mem_lock: This is a per-zone lock used to protect the fields
164 *	 related to the zone.max-locked-memory and zone.max-swap rctls.
165 *   zone_rctl_lock: This is a per-zone lock used to protect other rctls,
166 *       currently just max_lofi
167 *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
168 *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
169 *       list (a list of zones in the ZONE_IS_DEAD state).
170 *
171 *   Ordering requirements:
172 *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
173 *       zone_lock --> zsd_key_lock --> pidlock --> p_lock
174 *
175 *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
176 *	zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
177 *	zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
178 *
179 *   Blocking memory allocations are permitted while holding any of the
180 *   zone locks.
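 *
 *   For example, code that must examine both the zone list and a zone's
 *   status takes the locks in the documented order (an illustrative
 *   sketch, not code quoted from this file):
 *
 *	mutex_enter(&zonehash_lock);
 *	mutex_enter(&zone_status_lock);
 *	... examine zone_active and per-zone status ...
 *	mutex_exit(&zone_status_lock);
 *	mutex_exit(&zonehash_lock);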
181 *
182 *
183 *   System Call Interface:
184 *
185 *   The zone subsystem can be managed and queried from user level with
186 *   the following system calls (all subcodes of the primary "zone"
187 *   system call):
188 *   - zone_create: creates a zone with selected attributes (name,
189 *     root path, privileges, resource controls, ZFS datasets)
190 *   - zone_enter: allows the current process to enter a zone
191 *   - zone_getattr: reports attributes of a zone
192 *   - zone_setattr: set attributes of a zone
193 *   - zone_boot: set 'init' running for the zone
194 *   - zone_list: lists all zones active in the system
195 *   - zone_lookup: looks up zone id based on name
196 *   - zone_shutdown: initiates shutdown process (see states above)
197 *   - zone_destroy: completes shutdown process (see states above)
198 *
199 */
200
201#include <sys/priv_impl.h>
202#include <sys/cred.h>
203#include <c2/audit.h>
204#include <sys/debug.h>
205#include <sys/file.h>
206#include <sys/kmem.h>
207#include <sys/kstat.h>
208#include <sys/mutex.h>
209#include <sys/note.h>
210#include <sys/pathname.h>
211#include <sys/proc.h>
212#include <sys/project.h>
213#include <sys/sysevent.h>
214#include <sys/task.h>
215#include <sys/systm.h>
216#include <sys/types.h>
217#include <sys/utsname.h>
218#include <sys/vnode.h>
219#include <sys/vfs.h>
220#include <sys/systeminfo.h>
221#include <sys/policy.h>
222#include <sys/cred_impl.h>
223#include <sys/contract_impl.h>
224#include <sys/contract/process_impl.h>
225#include <sys/class.h>
226#include <sys/pool.h>
227#include <sys/pool_pset.h>
228#include <sys/pset.h>
229#include <sys/strlog.h>
230#include <sys/sysmacros.h>
231#include <sys/callb.h>
232#include <sys/vmparam.h>
233#include <sys/corectl.h>
234#include <sys/ipc_impl.h>
235#include <sys/klpd.h>
236
237#include <sys/door.h>
238#include <sys/cpuvar.h>
239#include <sys/sdt.h>
240
241#include <sys/uadmin.h>
242#include <sys/session.h>
243#include <sys/cmn_err.h>
244#include <sys/modhash.h>
245#include <sys/sunddi.h>
246#include <sys/nvpair.h>
247#include <sys/rctl.h>
248#include <sys/fss.h>
249#include <sys/brand.h>
250#include <sys/zone.h>
251#include <net/if.h>
252#include <sys/cpucaps.h>
253#include <vm/seg.h>
254#include <sys/mac.h>
255
256/*
257 * This constant specifies the number of seconds that threads waiting for
258 * subsystems to release a zone's general-purpose references will wait before
259 * they log the zone's reference counts.  The constant's value shouldn't
260 * be so small that reference counts are unnecessarily reported for zones
261 * whose references are slowly released.  On the other hand, it shouldn't be so
262 * large that users reboot their systems out of frustration over hung zones
263 * before the system logs the zones' reference counts.
264 */
265#define	ZONE_DESTROY_TIMEOUT_SECS	60
266
267/* List of data link IDs which are accessible from the zone */
268typedef struct zone_dl {
269	datalink_id_t	zdl_id;
270	nvlist_t	*zdl_net;
271	list_node_t	zdl_linkage;
272} zone_dl_t;
273
274/*
275 * cv used to signal that all references to the zone have been released.  This
276 * needs to be global since there may be multiple waiters, and the first to
277 * wake up will free the zone_t, hence we cannot use zone->zone_cv.
278 */
279static kcondvar_t zone_destroy_cv;
280/*
281 * Lock used to serialize access to zone_cv.  This could have been per-zone,
282 * but then we'd need another lock for zone_destroy_cv, and why bother?
283 */
284static kmutex_t zone_status_lock;
285
286/*
287 * ZSD-related global variables.
288 */
289static kmutex_t zsd_key_lock;	/* protects the following two */
290/*
291 * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
292 */
293static zone_key_t zsd_keyval = 0;
294/*
295 * Global list of registered keys.  We use this when a new zone is created.
296 */
297static list_t zsd_registered_keys;
298
299int zone_hash_size = 256;
300static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
301static kmutex_t zonehash_lock;
302static uint_t zonecount;
303static id_space_t *zoneid_space;
304
305/*
306 * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
307 * kernel proper runs, and which manages all other zones.
308 *
309 * Although not declared as static, the variable "zone0" should not be used
 * except by code that needs to reference the global zone early on in boot,
311 * before it is fully initialized.  All other consumers should use
312 * 'global_zone'.
313 */
314zone_t zone0;
315zone_t *global_zone = NULL;	/* Set when the global zone is initialized */
316
317/*
318 * List of active zones, protected by zonehash_lock.
319 */
320static list_t zone_active;
321
322/*
323 * List of destroyed zones that still have outstanding cred references.
324 * Used for debugging.  Uses a separate lock to avoid lock ordering
325 * problems in zone_free.
326 */
327static list_t zone_deathrow;
328static kmutex_t zone_deathrow_lock;
329
/* The number of zones is limited by the virtual interface limit in IP */
331uint_t maxzones = 8192;
332
/* Event channel used to send zone state change notifications */
334evchan_t *zone_event_chan;
335
336/*
337 * This table holds the mapping from kernel zone states to
338 * states visible in the state notification API.
339 * The idea is that we only expose "obvious" states and
340 * do not expose states which are just implementation details.
341 */
342const char  *zone_status_table[] = {
343	ZONE_EVENT_UNINITIALIZED,	/* uninitialized */
344	ZONE_EVENT_INITIALIZED,		/* initialized */
345	ZONE_EVENT_READY,		/* ready */
346	ZONE_EVENT_READY,		/* booting */
347	ZONE_EVENT_RUNNING,		/* running */
348	ZONE_EVENT_SHUTTING_DOWN,	/* shutting_down */
349	ZONE_EVENT_SHUTTING_DOWN,	/* empty */
350	ZONE_EVENT_SHUTTING_DOWN,	/* down */
351	ZONE_EVENT_SHUTTING_DOWN,	/* dying */
352	ZONE_EVENT_UNINITIALIZED,	/* dead */
353};
354
355/*
356 * This array contains the names of the subsystems listed in zone_ref_subsys_t
357 * (see sys/zone.h).
358 */
359static char *zone_ref_subsys_names[] = {
360	"NFS",		/* ZONE_REF_NFS */
361	"NFSv4",	/* ZONE_REF_NFSV4 */
362	"SMBFS",	/* ZONE_REF_SMBFS */
363	"MNTFS",	/* ZONE_REF_MNTFS */
364	"LOFI",		/* ZONE_REF_LOFI */
365	"VFS",		/* ZONE_REF_VFS */
366	"IPC"		/* ZONE_REF_IPC */
367};
368
369/*
370 * This isn't static so lint doesn't complain.
371 */
372rctl_hndl_t rc_zone_cpu_shares;
373rctl_hndl_t rc_zone_locked_mem;
374rctl_hndl_t rc_zone_max_swap;
375rctl_hndl_t rc_zone_max_lofi;
376rctl_hndl_t rc_zone_cpu_cap;
377rctl_hndl_t rc_zone_nlwps;
378rctl_hndl_t rc_zone_nprocs;
379rctl_hndl_t rc_zone_shmmax;
380rctl_hndl_t rc_zone_shmmni;
381rctl_hndl_t rc_zone_semmni;
382rctl_hndl_t rc_zone_msgmni;
383
384const char * const zone_default_initname = "/sbin/init";
385static char * const zone_prefix = "/zone/";
386static int zone_shutdown(zoneid_t zoneid);
387static int zone_add_datalink(zoneid_t, datalink_id_t);
388static int zone_remove_datalink(zoneid_t, datalink_id_t);
389static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
390static int zone_set_network(zoneid_t, zone_net_data_t *);
391static int zone_get_network(zoneid_t, zone_net_data_t *);
392
393typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);
394
395static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
396static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
397static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
398static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
399    zone_key_t);
400static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
401static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
402    kmutex_t *);
403static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
404    kmutex_t *);
405
406/*
407 * Bump this number when you alter the zone syscall interfaces; this is
408 * because we need to have support for previous API versions in libc
409 * to support patching; libc calls into the kernel to determine this number.
410 *
 * Version 1 of the API is the version originally shipped with Solaris 10.
 * Version 2 alters the zone_create system call in order to support more
 *     arguments by moving the args into a structure, and to do better
 *     error reporting when zone_create() fails.
415 * Version 3 alters the zone_create system call in order to support the
416 *     import of ZFS datasets to zones.
417 * Version 4 alters the zone_create system call in order to support
418 *     Trusted Extensions.
419 * Version 5 alters the zone_boot system call, and converts its old
420 *     bootargs parameter to be set by the zone_setattr API instead.
421 * Version 6 adds the flag argument to zone_create.
422 */
423static const int ZONE_SYSCALL_API_VERSION = 6;
424
425/*
426 * Certain filesystems (such as NFS and autofs) need to know which zone
427 * the mount is being placed in.  Because of this, we need to be able to
428 * ensure that a zone isn't in the process of being created/destroyed such
429 * that nfs_mount() thinks it is in the global/NGZ zone, while by the time
 * it gets added to the list of mounted zones, it ends up on the wrong zone's
431 * mount list. Since a zone can't reside on an NFS file system, we don't
432 * have to worry about the zonepath itself.
433 *
434 * The following functions: block_mounts()/resume_mounts() and
435 * mount_in_progress()/mount_completed() are used by zones and the VFS
436 * layer (respectively) to synchronize zone state transitions and new
 * mounts within a zone.  This synchronization is on a per-zone basis, so
438 * activity for one zone will not interfere with activity for another zone.
439 *
 * The semantics are like a reader-reader lock: multiple mounts may be in
 * progress at the same time, or multiple zone state transitions may be
 * (if those weren't already serialized by zonehash_lock), but mounts and
 * state transitions cannot be in progress simultaneously.
444 *
445 * We use cv's so the user can ctrl-C out of the operation if it's
446 * taking too long.
447 *
 * The semantics are such that there is an unfair bias towards the
449 * "current" operation.  This means that zone halt may starve if
450 * there is a rapid succession of new mounts coming in to the zone.
451 */
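/*
 * Illustrative usage sketches for the functions below (variable names are
 * placeholders, not code quoted from callers):
 *
 * VFS side, bracketing a mount into the zone:
 *
 *	mount_in_progress(zp);
 *	error = VFS_MOUNT(vfsp, mvp, uap, credp);
 *	mount_completed(zp);
 *
 * Zone side, bracketing a state transition:
 *
 *	if (block_mounts(zp)) {
 *		... perform the state transition ...
 *		resume_mounts(zp);
 *	}
 */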
452/*
453 * Prevent new mounts from progressing to the point of calling
454 * VFS_MOUNT().  If there are already mounts in this "region", wait for
455 * them to complete.
456 */
457static int
458block_mounts(zone_t *zp)
459{
460	int retval = 0;
461
462	/*
463	 * Since it may block for a long time, block_mounts() shouldn't be
464	 * called with zonehash_lock held.
465	 */
466	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
467	mutex_enter(&zp->zone_mount_lock);
468	while (zp->zone_mounts_in_progress > 0) {
469		if (cv_wait_sig(&zp->zone_mount_cv, &zp->zone_mount_lock) == 0)
470			goto signaled;
471	}
472	/*
473	 * A negative value of mounts_in_progress indicates that mounts
474	 * have been blocked by (-mounts_in_progress) different callers
475	 * (remotely possible if two threads enter zone_shutdown at the same
476	 * time).
477	 */
478	zp->zone_mounts_in_progress--;
479	retval = 1;
480signaled:
481	mutex_exit(&zp->zone_mount_lock);
482	return (retval);
483}
484
485/*
486 * The VFS layer may progress with new mounts as far as we're concerned.
487 * Allow them to progress if we were the last obstacle.
488 */
489static void
490resume_mounts(zone_t *zp)
491{
492	mutex_enter(&zp->zone_mount_lock);
493	if (++zp->zone_mounts_in_progress == 0)
494		cv_broadcast(&zp->zone_mount_cv);
495	mutex_exit(&zp->zone_mount_lock);
496}
497
/*
 * The VFS layer is about to perform a mount in this zone.  Wait while
 * mounts are blocked (i.e. while a zone state transition is in progress),
 * then record the mount as being in progress.
 */
502void
503mount_in_progress(zone_t *zp)
504{
505	mutex_enter(&zp->zone_mount_lock);
506	while (zp->zone_mounts_in_progress < 0)
507		cv_wait(&zp->zone_mount_cv, &zp->zone_mount_lock);
508	zp->zone_mounts_in_progress++;
509	mutex_exit(&zp->zone_mount_lock);
510}
511
512/*
513 * VFS is done with one mount; wake up any waiting block_mounts()
514 * callers if this is the last mount.
515 */
516void
517mount_completed(zone_t *zp)
518{
519	mutex_enter(&zp->zone_mount_lock);
520	if (--zp->zone_mounts_in_progress == 0)
521		cv_broadcast(&zp->zone_mount_cv);
522	mutex_exit(&zp->zone_mount_lock);
523}
524
525/*
526 * ZSD routines.
527 *
528 * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
529 * defined by the pthread_key_create() and related interfaces.
530 *
531 * Kernel subsystems may register one or more data items and/or
532 * callbacks to be executed when a zone is created, shutdown, or
533 * destroyed.
534 *
535 * Unlike the thread counterpart, destructor callbacks will be executed
536 * even if the data pointer is NULL and/or there are no constructor
537 * callbacks, so it is the responsibility of such callbacks to check for
538 * NULL data values if necessary.
539 *
540 * The locking strategy and overall picture is as follows:
541 *
542 * When someone calls zone_key_create(), a template ZSD entry is added to the
543 * global list "zsd_registered_keys", protected by zsd_key_lock.  While
544 * holding that lock all the existing zones are marked as
545 * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
546 * zone_zsd list (protected by zone_lock). The global list is updated first
 * (under zsd_key_lock) to make sure that newly created zones use the
548 * most recent list of keys. Then under zonehash_lock we walk the zones
549 * and mark them.  Similar locking is used in zone_key_delete().
550 *
 * The actual create, shutdown, and destroy callbacks are done without
 * holding any lock.  The zsd_flags are used to ensure that the operations
 * have completed, so that by the time zone_key_create (and zone_create)
 * or zone_key_delete (and zone_destroy) returns, all the necessary
 * callbacks have run.
556 *
 * When new zones are created, constructor callbacks for all registered ZSD
558 * entries will be called. That also uses the above two phases of marking
559 * what needs to be done, and then running the callbacks without holding
560 * any locks.
561 *
562 * The framework does not provide any locking around zone_getspecific() and
563 * zone_setspecific() apart from that needed for internal consistency, so
564 * callers interested in atomic "test-and-set" semantics will need to provide
565 * their own locking.
566 */
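/*
 * Because the framework provides no atomic test-and-set, a consumer that
 * wants "initialize once" semantics must supply its own lock.  A hedged
 * sketch (foo_lock, foo_zone_key and foo_state_t are hypothetical):
 *
 *	mutex_enter(&foo_lock);
 *	if ((data = zone_getspecific(foo_zone_key, zone)) == NULL) {
 *		data = kmem_zalloc(sizeof (foo_state_t), KM_SLEEP);
 *		(void) zone_setspecific(foo_zone_key, zone, data);
 *	}
 *	mutex_exit(&foo_lock);
 */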
567
568/*
569 * Helper function to find the zsd_entry associated with the key in the
570 * given list.
571 */
572static struct zsd_entry *
573zsd_find(list_t *l, zone_key_t key)
574{
575	struct zsd_entry *zsd;
576
577	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
578		if (zsd->zsd_key == key) {
579			return (zsd);
580		}
581	}
582	return (NULL);
583}
584
585/*
586 * Helper function to find the zsd_entry associated with the key in the
587 * given list. Move it to the front of the list.
588 */
589static struct zsd_entry *
590zsd_find_mru(list_t *l, zone_key_t key)
591{
592	struct zsd_entry *zsd;
593
594	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
595		if (zsd->zsd_key == key) {
596			/*
597			 * Move to head of list to keep list in MRU order.
598			 */
599			if (zsd != list_head(l)) {
600				list_remove(l, zsd);
601				list_insert_head(l, zsd);
602			}
603			return (zsd);
604		}
605	}
606	return (NULL);
607}
608
609void
610zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
611    void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
612{
613	struct zsd_entry *zsdp;
614	struct zsd_entry *t;
615	struct zone *zone;
616	zone_key_t  key;
617
618	zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
619	zsdp->zsd_data = NULL;
620	zsdp->zsd_create = create;
621	zsdp->zsd_shutdown = shutdown;
622	zsdp->zsd_destroy = destroy;
623
624	/*
625	 * Insert in global list of callbacks. Makes future zone creations
626	 * see it.
627	 */
628	mutex_enter(&zsd_key_lock);
629	key = zsdp->zsd_key = ++zsd_keyval;
630	ASSERT(zsd_keyval != 0);
631	list_insert_tail(&zsd_registered_keys, zsdp);
632	mutex_exit(&zsd_key_lock);
633
634	/*
635	 * Insert for all existing zones and mark them as needing
636	 * a create callback.
637	 */
638	mutex_enter(&zonehash_lock);	/* stop the world */
639	for (zone = list_head(&zone_active); zone != NULL;
640	    zone = list_next(&zone_active, zone)) {
641		zone_status_t status;
642
643		mutex_enter(&zone->zone_lock);
644
645		/* Skip zones that are on the way down or not yet up */
646		status = zone_status_get(zone);
647		if (status >= ZONE_IS_DOWN ||
648		    status == ZONE_IS_UNINITIALIZED) {
649			mutex_exit(&zone->zone_lock);
650			continue;
651		}
652
653		t = zsd_find_mru(&zone->zone_zsd, key);
654		if (t != NULL) {
655			/*
656			 * A zsd_configure already inserted it after
657			 * we dropped zsd_key_lock above.
658			 */
659			mutex_exit(&zone->zone_lock);
660			continue;
661		}
662		t = kmem_zalloc(sizeof (*t), KM_SLEEP);
663		t->zsd_key = key;
664		t->zsd_create = create;
665		t->zsd_shutdown = shutdown;
666		t->zsd_destroy = destroy;
667		if (create != NULL) {
668			t->zsd_flags = ZSD_CREATE_NEEDED;
669			DTRACE_PROBE2(zsd__create__needed,
670			    zone_t *, zone, zone_key_t, key);
671		}
672		list_insert_tail(&zone->zone_zsd, t);
673		mutex_exit(&zone->zone_lock);
674	}
675	mutex_exit(&zonehash_lock);
676
677	if (create != NULL) {
678		/* Now call the create callback for this key */
679		zsd_apply_all_zones(zsd_apply_create, key);
680	}
681	/*
	 * It is safe for consumers to use the key now; make it
	 * globally visible.  Specifically, zone_getspecific() will
684	 * always successfully return the zone specific data associated
685	 * with the key.
686	 */
687	*keyp = key;
688
689}
690
691/*
692 * Function called when a module is being unloaded, or otherwise wishes
693 * to unregister its ZSD key and callbacks.
694 *
695 * Remove from the global list and determine the functions that need to
696 * be called under a global lock. Then call the functions without
697 * holding any locks. Finally free up the zone_zsd entries. (The apply
698 * functions need to access the zone_zsd entries to find zsd_data etc.)
699 */
700int
701zone_key_delete(zone_key_t key)
702{
703	struct zsd_entry *zsdp = NULL;
704	zone_t *zone;
705
706	mutex_enter(&zsd_key_lock);
707	zsdp = zsd_find_mru(&zsd_registered_keys, key);
708	if (zsdp == NULL) {
709		mutex_exit(&zsd_key_lock);
710		return (-1);
711	}
712	list_remove(&zsd_registered_keys, zsdp);
713	mutex_exit(&zsd_key_lock);
714
715	mutex_enter(&zonehash_lock);
716	for (zone = list_head(&zone_active); zone != NULL;
717	    zone = list_next(&zone_active, zone)) {
718		struct zsd_entry *del;
719
720		mutex_enter(&zone->zone_lock);
721		del = zsd_find_mru(&zone->zone_zsd, key);
722		if (del == NULL) {
723			/*
		 * Somebody else got here first, e.g. the zone is going
725			 * away.
726			 */
727			mutex_exit(&zone->zone_lock);
728			continue;
729		}
730		ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
731		ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
732		if (del->zsd_shutdown != NULL &&
733		    (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
734			del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
735			DTRACE_PROBE2(zsd__shutdown__needed,
736			    zone_t *, zone, zone_key_t, key);
737		}
738		if (del->zsd_destroy != NULL &&
739		    (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
740			del->zsd_flags |= ZSD_DESTROY_NEEDED;
741			DTRACE_PROBE2(zsd__destroy__needed,
742			    zone_t *, zone, zone_key_t, key);
743		}
744		mutex_exit(&zone->zone_lock);
745	}
746	mutex_exit(&zonehash_lock);
747	kmem_free(zsdp, sizeof (*zsdp));
748
749	/* Now call the shutdown and destroy callback for this key */
750	zsd_apply_all_zones(zsd_apply_shutdown, key);
751	zsd_apply_all_zones(zsd_apply_destroy, key);
752
753	/* Now we can free up the zsdp structures in each zone */
754	mutex_enter(&zonehash_lock);
755	for (zone = list_head(&zone_active); zone != NULL;
756	    zone = list_next(&zone_active, zone)) {
757		struct zsd_entry *del;
758
759		mutex_enter(&zone->zone_lock);
760		del = zsd_find(&zone->zone_zsd, key);
761		if (del != NULL) {
762			list_remove(&zone->zone_zsd, del);
763			ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
764			kmem_free(del, sizeof (*del));
765		}
766		mutex_exit(&zone->zone_lock);
767	}
768	mutex_exit(&zonehash_lock);
769
770	return (0);
771}
772
773/*
774 * ZSD counterpart of pthread_setspecific().
775 *
776 * Since all zsd callbacks, including those with no create function,
777 * have an entry in zone_zsd, if the key is registered it is part of
778 * the zone_zsd list.
 * Return an error if the key wasn't registered.
780 */
781int
782zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
783{
784	struct zsd_entry *t;
785
786	mutex_enter(&zone->zone_lock);
787	t = zsd_find_mru(&zone->zone_zsd, key);
788	if (t != NULL) {
789		/*
790		 * Replace old value with new
791		 */
792		t->zsd_data = (void *)data;
793		mutex_exit(&zone->zone_lock);
794		return (0);
795	}
796	mutex_exit(&zone->zone_lock);
797	return (-1);
798}
799
800/*
801 * ZSD counterpart of pthread_getspecific().
802 */
803void *
804zone_getspecific(zone_key_t key, zone_t *zone)
805{
806	struct zsd_entry *t;
807	void *data;
808
809	mutex_enter(&zone->zone_lock);
810	t = zsd_find_mru(&zone->zone_zsd, key);
811	data = (t == NULL ? NULL : t->zsd_data);
812	mutex_exit(&zone->zone_lock);
813	return (data);
814}
815
816/*
817 * Function used to initialize a zone's list of ZSD callbacks and data
818 * when the zone is being created.  The callbacks are initialized from
819 * the template list (zsd_registered_keys). The constructor callback is
820 * executed later (once the zone exists and with locks dropped).
821 */
822static void
823zone_zsd_configure(zone_t *zone)
824{
825	struct zsd_entry *zsdp;
826	struct zsd_entry *t;
827
828	ASSERT(MUTEX_HELD(&zonehash_lock));
829	ASSERT(list_head(&zone->zone_zsd) == NULL);
830	mutex_enter(&zone->zone_lock);
831	mutex_enter(&zsd_key_lock);
832	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
833	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
834		/*
		 * Since this zone is still ZONE_IS_UNINITIALIZED, zone_key_create
836		 * should not have added anything to it.
837		 */
838		ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);
839
840		t = kmem_zalloc(sizeof (*t), KM_SLEEP);
841		t->zsd_key = zsdp->zsd_key;
842		t->zsd_create = zsdp->zsd_create;
843		t->zsd_shutdown = zsdp->zsd_shutdown;
844		t->zsd_destroy = zsdp->zsd_destroy;
845		if (zsdp->zsd_create != NULL) {
846			t->zsd_flags = ZSD_CREATE_NEEDED;
847			DTRACE_PROBE2(zsd__create__needed,
848			    zone_t *, zone, zone_key_t, zsdp->zsd_key);
849		}
850		list_insert_tail(&zone->zone_zsd, t);
851	}
852	mutex_exit(&zsd_key_lock);
853	mutex_exit(&zone->zone_lock);
854}
855
856enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };
857
858/*
859 * Helper function to execute shutdown or destructor callbacks.
860 */
861static void
862zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
863{
864	struct zsd_entry *t;
865
866	ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
867	ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
868	ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);
869
870	/*
871	 * Run the callback solely based on what is registered for the zone
872	 * in zone_zsd. The global list can change independently of this
 * as keys are registered and unregistered, and we don't register new
874	 * callbacks for a zone that is in the process of going away.
875	 */
876	mutex_enter(&zone->zone_lock);
877	for (t = list_head(&zone->zone_zsd); t != NULL;
878	    t = list_next(&zone->zone_zsd, t)) {
879		zone_key_t key = t->zsd_key;
880
881		/* Skip if no callbacks registered */
882
883		if (ct == ZSD_SHUTDOWN) {
884			if (t->zsd_shutdown != NULL &&
885			    (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
886				t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
887				DTRACE_PROBE2(zsd__shutdown__needed,
888				    zone_t *, zone, zone_key_t, key);
889			}
890		} else {
891			if (t->zsd_destroy != NULL &&
892			    (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
893				t->zsd_flags |= ZSD_DESTROY_NEEDED;
894				DTRACE_PROBE2(zsd__destroy__needed,
895				    zone_t *, zone, zone_key_t, key);
896			}
897		}
898	}
899	mutex_exit(&zone->zone_lock);
900
901	/* Now call the shutdown and destroy callback for this key */
902	zsd_apply_all_keys(zsd_apply_shutdown, zone);
903	zsd_apply_all_keys(zsd_apply_destroy, zone);
904
905}
906
907/*
908 * Called when the zone is going away; free ZSD-related memory, and
909 * destroy the zone_zsd list.
910 */
911static void
912zone_free_zsd(zone_t *zone)
913{
914	struct zsd_entry *t, *next;
915
916	/*
917	 * Free all the zsd_entry's we had on this zone.
918	 */
919	mutex_enter(&zone->zone_lock);
920	for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
921		next = list_next(&zone->zone_zsd, t);
922		list_remove(&zone->zone_zsd, t);
923		ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
924		kmem_free(t, sizeof (*t));
925	}
926	list_destroy(&zone->zone_zsd);
927	mutex_exit(&zone->zone_lock);
928
929}
930
931/*
932 * Apply a function to all zones for particular key value.
933 *
934 * The applyfn has to drop zonehash_lock if it does some work, and
935 * then reacquire it before it returns.
936 * When the lock is dropped we don't follow list_next even
937 * if it is possible to do so without any hazards. This is
938 * because we want the design to allow for the list of zones
939 * to change in any arbitrary way during the time the
940 * lock was dropped.
941 *
942 * It is safe to restart the loop at list_head since the applyfn
943 * changes the zsd_flags as it does work, so a subsequent
944 * pass through will have no effect in applyfn, hence the loop will terminate
945 * in at worst O(N^2).
946 */
947static void
948zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
949{
950	zone_t *zone;
951
952	mutex_enter(&zonehash_lock);
953	zone = list_head(&zone_active);
954	while (zone != NULL) {
955		if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
956			/* Lock dropped - restart at head */
957			zone = list_head(&zone_active);
958		} else {
959			zone = list_next(&zone_active, zone);
960		}
961	}
962	mutex_exit(&zonehash_lock);
963}
964
965/*
966 * Apply a function to all keys for a particular zone.
967 *
968 * The applyfn has to drop zonehash_lock if it does some work, and
969 * then reacquire it before it returns.
970 * When the lock is dropped we don't follow list_next even
971 * if it is possible to do so without any hazards. This is
972 * because we want the design to allow for the list of zsd callbacks
973 * to change in any arbitrary way during the time the
974 * lock was dropped.
975 *
976 * It is safe to restart the loop at list_head since the applyfn
977 * changes the zsd_flags as it does work, so a subsequent
978 * pass through will have no effect in applyfn, hence the loop will terminate
979 * in at worst O(N^2).
980 */
981static void
982zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
983{
984	struct zsd_entry *t;
985
986	mutex_enter(&zone->zone_lock);
987	t = list_head(&zone->zone_zsd);
988	while (t != NULL) {
989		if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
990			/* Lock dropped - restart at head */
991			t = list_head(&zone->zone_zsd);
992		} else {
993			t = list_next(&zone->zone_zsd, t);
994		}
995	}
996	mutex_exit(&zone->zone_lock);
997}
998
999/*
1000 * Call the create function for the zone and key if CREATE_NEEDED
1001 * is set.
1002 * If some other thread gets here first and sets CREATE_INPROGRESS, then
1003 * we wait for that thread to complete so that we can ensure that
1004 * all the callbacks are done when we've looped over all zones/keys.
1005 *
 * When we call the create function, we drop the global lock held by the
 * caller, and return true to tell the caller it needs to re-evaluate the
1008 * state.
1009 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1010 * remains held on exit.
1011 */
1012static boolean_t
1013zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
1014    zone_t *zone, zone_key_t key)
1015{
1016	void *result;
1017	struct zsd_entry *t;
1018	boolean_t dropped;
1019
1020	if (lockp != NULL) {
1021		ASSERT(MUTEX_HELD(lockp));
1022	}
1023	if (zone_lock_held) {
1024		ASSERT(MUTEX_HELD(&zone->zone_lock));
1025	} else {
1026		mutex_enter(&zone->zone_lock);
1027	}
1028
1029	t = zsd_find(&zone->zone_zsd, key);
1030	if (t == NULL) {
1031		/*
		 * Somebody else got here first, e.g. the zone is going
1033		 * away.
1034		 */
1035		if (!zone_lock_held)
1036			mutex_exit(&zone->zone_lock);
1037		return (B_FALSE);
1038	}
1039	dropped = B_FALSE;
1040	if (zsd_wait_for_inprogress(zone, t, lockp))
1041		dropped = B_TRUE;
1042
1043	if (t->zsd_flags & ZSD_CREATE_NEEDED) {
1044		t->zsd_flags &= ~ZSD_CREATE_NEEDED;
1045		t->zsd_flags |= ZSD_CREATE_INPROGRESS;
1046		DTRACE_PROBE2(zsd__create__inprogress,
1047		    zone_t *, zone, zone_key_t, key);
1048		mutex_exit(&zone->zone_lock);
1049		if (lockp != NULL)
1050			mutex_exit(lockp);
1051
1052		dropped = B_TRUE;
1053		ASSERT(t->zsd_create != NULL);
1054		DTRACE_PROBE2(zsd__create__start,
1055		    zone_t *, zone, zone_key_t, key);
1056
1057		result = (*t->zsd_create)(zone->zone_id);
1058
1059		DTRACE_PROBE2(zsd__create__end,
		    zone_t *, zone, void *, result);
1061
1062		ASSERT(result != NULL);
1063		if (lockp != NULL)
1064			mutex_enter(lockp);
1065		mutex_enter(&zone->zone_lock);
1066		t->zsd_data = result;
1067		t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
1068		t->zsd_flags |= ZSD_CREATE_COMPLETED;
1069		cv_broadcast(&t->zsd_cv);
1070		DTRACE_PROBE2(zsd__create__completed,
1071		    zone_t *, zone, zone_key_t, key);
1072	}
1073	if (!zone_lock_held)
1074		mutex_exit(&zone->zone_lock);
1075	return (dropped);
1076}
1077
1078/*
1079 * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
1080 * is set.
1081 * If some other thread gets here first and sets *_INPROGRESS, then
1082 * we wait for that thread to complete so that we can ensure that
1083 * all the callbacks are done when we've looped over all zones/keys.
1084 *
 * When we call the shutdown function, we drop the global lock held by the
 * caller, and return true to tell the caller it needs to re-evaluate the
1087 * state.
1088 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1089 * remains held on exit.
1090 */
1091static boolean_t
1092zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
1093    zone_t *zone, zone_key_t key)
1094{
1095	struct zsd_entry *t;
1096	void *data;
1097	boolean_t dropped;
1098
1099	if (lockp != NULL) {
1100		ASSERT(MUTEX_HELD(lockp));
1101	}
1102	if (zone_lock_held) {
1103		ASSERT(MUTEX_HELD(&zone->zone_lock));
1104	} else {
1105		mutex_enter(&zone->zone_lock);
1106	}
1107
1108	t = zsd_find(&zone->zone_zsd, key);
1109	if (t == NULL) {
1110		/*
		 * Somebody else got here first, e.g. the zone is going
1112		 * away.
1113		 */
1114		if (!zone_lock_held)
1115			mutex_exit(&zone->zone_lock);
1116		return (B_FALSE);
1117	}
1118	dropped = B_FALSE;
1119	if (zsd_wait_for_creator(zone, t, lockp))
1120		dropped = B_TRUE;
1121
1122	if (zsd_wait_for_inprogress(zone, t, lockp))
1123		dropped = B_TRUE;
1124
1125	if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
1126		t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
1127		t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
1128		DTRACE_PROBE2(zsd__shutdown__inprogress,
1129		    zone_t *, zone, zone_key_t, key);
1130		mutex_exit(&zone->zone_lock);
1131		if (lockp != NULL)
1132			mutex_exit(lockp);
1133		dropped = B_TRUE;
1134
1135		ASSERT(t->zsd_shutdown != NULL);
1136		data = t->zsd_data;
1137
1138		DTRACE_PROBE2(zsd__shutdown__start,
1139		    zone_t *, zone, zone_key_t, key);
1140
1141		(t->zsd_shutdown)(zone->zone_id, data);
1142		DTRACE_PROBE2(zsd__shutdown__end,
1143		    zone_t *, zone, zone_key_t, key);
1144
1145		if (lockp != NULL)
1146			mutex_enter(lockp);
1147		mutex_enter(&zone->zone_lock);
1148		t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
1149		t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
1150		cv_broadcast(&t->zsd_cv);
1151		DTRACE_PROBE2(zsd__shutdown__completed,
1152		    zone_t *, zone, zone_key_t, key);
1153	}
1154	if (!zone_lock_held)
1155		mutex_exit(&zone->zone_lock);
1156	return (dropped);
1157}
1158
1159/*
1160 * Call the destroy function for the zone and key if DESTROY_NEEDED
1161 * is set.
1162 * If some other thread gets here first and sets *_INPROGRESS, then
1163 * we wait for that thread to complete so that we can ensure that
1164 * all the callbacks are done when we've looped over all zones/keys.
1165 *
 * When we call the destroy function, we drop the global lock held by the
 * caller, and return true to tell the caller it needs to re-evaluate the
1168 * state.
1169 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1170 * remains held on exit.
1171 */
1172static boolean_t
1173zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
1174    zone_t *zone, zone_key_t key)
1175{
1176	struct zsd_entry *t;
1177	void *data;
1178	boolean_t dropped;
1179
1180	if (lockp != NULL) {
1181		ASSERT(MUTEX_HELD(lockp));
1182	}
1183	if (zone_lock_held) {
1184		ASSERT(MUTEX_HELD(&zone->zone_lock));
1185	} else {
1186		mutex_enter(&zone->zone_lock);
1187	}
1188
1189	t = zsd_find(&zone->zone_zsd, key);
1190	if (t == NULL) {
1191		/*
		 * Somebody else got here first, e.g. the zone is going
1193		 * away.
1194		 */
1195		if (!zone_lock_held)
1196			mutex_exit(&zone->zone_lock);
1197		return (B_FALSE);
1198	}
1199	dropped = B_FALSE;
1200	if (zsd_wait_for_creator(zone, t, lockp))
1201		dropped = B_TRUE;
1202
1203	if (zsd_wait_for_inprogress(zone, t, lockp))
1204		dropped = B_TRUE;
1205
1206	if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
1207		t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
1208		t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
1209		DTRACE_PROBE2(zsd__destroy__inprogress,
1210		    zone_t *, zone, zone_key_t, key);
1211		mutex_exit(&zone->zone_lock);
1212		if (lockp != NULL)
1213			mutex_exit(lockp);
1214		dropped = B_TRUE;
1215
1216		ASSERT(t->zsd_destroy != NULL);
1217		data = t->zsd_data;
1218		DTRACE_PROBE2(zsd__destroy__start,
1219		    zone_t *, zone, zone_key_t, key);
1220
1221		(t->zsd_destroy)(zone->zone_id, data);
1222		DTRACE_PROBE2(zsd__destroy__end,
1223		    zone_t *, zone, zone_key_t, key);
1224
1225		if (lockp != NULL)
1226			mutex_enter(lockp);
1227		mutex_enter(&zone->zone_lock);
1228		t->zsd_data = NULL;
1229		t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
1230		t->zsd_flags |= ZSD_DESTROY_COMPLETED;
1231		cv_broadcast(&t->zsd_cv);
1232		DTRACE_PROBE2(zsd__destroy__completed,
1233		    zone_t *, zone, zone_key_t, key);
1234	}
1235	if (!zone_lock_held)
1236		mutex_exit(&zone->zone_lock);
1237	return (dropped);
1238}
1239
1240/*
1241 * Wait for any CREATE_NEEDED flag to be cleared.
1242 * Returns true if lockp was temporarily dropped while waiting.
1243 */
1244static boolean_t
1245zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1246{
1247	boolean_t dropped = B_FALSE;
1248
1249	while (t->zsd_flags & ZSD_CREATE_NEEDED) {
1250		DTRACE_PROBE2(zsd__wait__for__creator,
1251		    zone_t *, zone, struct zsd_entry *, t);
1252		if (lockp != NULL) {
1253			dropped = B_TRUE;
1254			mutex_exit(lockp);
1255		}
1256		cv_wait(&t->zsd_cv, &zone->zone_lock);
1257		if (lockp != NULL) {
1258			/* First drop zone_lock to preserve order */
1259			mutex_exit(&zone->zone_lock);
1260			mutex_enter(lockp);
1261			mutex_enter(&zone->zone_lock);
1262		}
1263	}
1264	return (dropped);
1265}
1266
1267/*
1268 * Wait for any INPROGRESS flag to be cleared.
1269 * Returns true if lockp was temporarily dropped while waiting.
1270 */
1271static boolean_t
1272zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1273{
1274	boolean_t dropped = B_FALSE;
1275
1276	while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
1277		DTRACE_PROBE2(zsd__wait__for__inprogress,
1278		    zone_t *, zone, struct zsd_entry *, t);
1279		if (lockp != NULL) {
1280			dropped = B_TRUE;
1281			mutex_exit(lockp);
1282		}
1283		cv_wait(&t->zsd_cv, &zone->zone_lock);
1284		if (lockp != NULL) {
1285			/* First drop zone_lock to preserve order */
1286			mutex_exit(&zone->zone_lock);
1287			mutex_enter(lockp);
1288			mutex_enter(&zone->zone_lock);
1289		}
1290	}
1291	return (dropped);
1292}
1293
1294/*
1295 * Frees memory associated with the zone dataset list.
1296 */
1297static void
1298zone_free_datasets(zone_t *zone)
1299{
1300	zone_dataset_t *t, *next;
1301
1302	for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
1303		next = list_next(&zone->zone_datasets, t);
1304		list_remove(&zone->zone_datasets, t);
1305		kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
1306		kmem_free(t, sizeof (*t));
1307	}
1308	list_destroy(&zone->zone_datasets);
1309}
1310
1311/*
1312 * zone.cpu-shares resource control support.
1313 */
1314/*ARGSUSED*/
1315static rctl_qty_t
1316zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
1317{
1318	ASSERT(MUTEX_HELD(&p->p_lock));
1319	return (p->p_zone->zone_shares);
1320}
1321
1322/*ARGSUSED*/
1323static int
1324zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1325    rctl_qty_t nv)
1326{
1327	ASSERT(MUTEX_HELD(&p->p_lock));
1328	ASSERT(e->rcep_t == RCENTITY_ZONE);
1329	if (e->rcep_p.zone == NULL)
1330		return (0);
1331
1332	e->rcep_p.zone->zone_shares = nv;
1333	return (0);
1334}
1335
1336static rctl_ops_t zone_cpu_shares_ops = {
1337	rcop_no_action,
1338	zone_cpu_shares_usage,
1339	zone_cpu_shares_set,
1340	rcop_no_test
1341};
1342
1343/*
1344 * zone.cpu-cap resource control support.
1345 */
1346/*ARGSUSED*/
1347static rctl_qty_t
1348zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
1349{
1350	ASSERT(MUTEX_HELD(&p->p_lock));
1351	return (cpucaps_zone_get(p->p_zone));
1352}
1353
1354/*ARGSUSED*/
1355static int
1356zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1357    rctl_qty_t nv)
1358{
1359	zone_t *zone = e->rcep_p.zone;
1360
1361	ASSERT(MUTEX_HELD(&p->p_lock));
1362	ASSERT(e->rcep_t == RCENTITY_ZONE);
1363
1364	if (zone == NULL)
1365		return (0);
1366
1367	/*
1368	 * set cap to the new value.
1369	 */
1370	return (cpucaps_zone_set(zone, nv));
1371}
1372
1373static rctl_ops_t zone_cpu_cap_ops = {
1374	rcop_no_action,
1375	zone_cpu_cap_get,
1376	zone_cpu_cap_set,
1377	rcop_no_test
1378};
1379
1380/*ARGSUSED*/
1381static rctl_qty_t
1382zone_lwps_usage(rctl_t *r, proc_t *p)
1383{
1384	rctl_qty_t nlwps;
1385	zone_t *zone = p->p_zone;
1386
1387	ASSERT(MUTEX_HELD(&p->p_lock));
1388
1389	mutex_enter(&zone->zone_nlwps_lock);
1390	nlwps = zone->zone_nlwps;
1391	mutex_exit(&zone->zone_nlwps_lock);
1392
1393	return (nlwps);
1394}
1395
1396/*ARGSUSED*/
1397static int
1398zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1399    rctl_qty_t incr, uint_t flags)
1400{
1401	rctl_qty_t nlwps;
1402
1403	ASSERT(MUTEX_HELD(&p->p_lock));
1404	ASSERT(e->rcep_t == RCENTITY_ZONE);
1405	if (e->rcep_p.zone == NULL)
1406		return (0);
1407	ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1408	nlwps = e->rcep_p.zone->zone_nlwps;
1409
1410	if (nlwps + incr > rcntl->rcv_value)
1411		return (1);
1412
1413	return (0);
1414}
1415
1416/*ARGSUSED*/
1417static int
1418zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1419{
1420	ASSERT(MUTEX_HELD(&p->p_lock));
1421	ASSERT(e->rcep_t == RCENTITY_ZONE);
1422	if (e->rcep_p.zone == NULL)
1423		return (0);
1424	e->rcep_p.zone->zone_nlwps_ctl = nv;
1425	return (0);
1426}
1427
1428static rctl_ops_t zone_lwps_ops = {
1429	rcop_no_action,
1430	zone_lwps_usage,
1431	zone_lwps_set,
1432	zone_lwps_test,
1433};
1434
1435/*ARGSUSED*/
1436static rctl_qty_t
1437zone_procs_usage(rctl_t *r, proc_t *p)
1438{
1439	rctl_qty_t nprocs;
1440	zone_t *zone = p->p_zone;
1441
1442	ASSERT(MUTEX_HELD(&p->p_lock));
1443
1444	mutex_enter(&zone->zone_nlwps_lock);
1445	nprocs = zone->zone_nprocs;
1446	mutex_exit(&zone->zone_nlwps_lock);
1447
1448	return (nprocs);
1449}
1450
1451/*ARGSUSED*/
1452static int
1453zone_procs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1454    rctl_qty_t incr, uint_t flags)
1455{
1456	rctl_qty_t nprocs;
1457
1458	ASSERT(MUTEX_HELD(&p->p_lock));
1459	ASSERT(e->rcep_t == RCENTITY_ZONE);
1460	if (e->rcep_p.zone == NULL)
1461		return (0);
1462	ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1463	nprocs = e->rcep_p.zone->zone_nprocs;
1464
1465	if (nprocs + incr > rcntl->rcv_value)
1466		return (1);
1467
1468	return (0);
1469}
1470
1471/*ARGSUSED*/
1472static int
1473zone_procs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1474{
1475	ASSERT(MUTEX_HELD(&p->p_lock));
1476	ASSERT(e->rcep_t == RCENTITY_ZONE);
1477	if (e->rcep_p.zone == NULL)
1478		return (0);
1479	e->rcep_p.zone->zone_nprocs_ctl = nv;
1480	return (0);
1481}
1482
1483static rctl_ops_t zone_procs_ops = {
1484	rcop_no_action,
1485	zone_procs_usage,
1486	zone_procs_set,
1487	zone_procs_test,
1488};
1489
1490/*ARGSUSED*/
1491static rctl_qty_t
1492zone_shmmax_usage(rctl_t *rctl, struct proc *p)
1493{
1494	ASSERT(MUTEX_HELD(&p->p_lock));
1495	return (p->p_zone->zone_shmmax);
1496}
1497
1498/*ARGSUSED*/
1499static int
1500zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1501    rctl_qty_t incr, uint_t flags)
1502{
1503	rctl_qty_t v;
1504	ASSERT(MUTEX_HELD(&p->p_lock));
1505	ASSERT(e->rcep_t == RCENTITY_ZONE);
1506	v = e->rcep_p.zone->zone_shmmax + incr;
1507	if (v > rval->rcv_value)
1508		return (1);
1509	return (0);
1510}
1511
1512static rctl_ops_t zone_shmmax_ops = {
1513	rcop_no_action,
1514	zone_shmmax_usage,
1515	rcop_no_set,
1516	zone_shmmax_test
1517};
1518
1519/*ARGSUSED*/
1520static rctl_qty_t
1521zone_shmmni_usage(rctl_t *rctl, struct proc *p)
1522{
1523	ASSERT(MUTEX_HELD(&p->p_lock));
1524	return (p->p_zone->zone_ipc.ipcq_shmmni);
1525}
1526
1527/*ARGSUSED*/
1528static int
1529zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1530    rctl_qty_t incr, uint_t flags)
1531{
1532	rctl_qty_t v;
1533	ASSERT(MUTEX_HELD(&p->p_lock));
1534	ASSERT(e->rcep_t == RCENTITY_ZONE);
1535	v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
1536	if (v > rval->rcv_value)
1537		return (1);
1538	return (0);
1539}
1540
1541static rctl_ops_t zone_shmmni_ops = {
1542	rcop_no_action,
1543	zone_shmmni_usage,
1544	rcop_no_set,
1545	zone_shmmni_test
1546};
1547
1548/*ARGSUSED*/
1549static rctl_qty_t
1550zone_semmni_usage(rctl_t *rctl, struct proc *p)
1551{
1552	ASSERT(MUTEX_HELD(&p->p_lock));
1553	return (p->p_zone->zone_ipc.ipcq_semmni);
1554}
1555
1556/*ARGSUSED*/
1557static int
1558zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1559    rctl_qty_t incr, uint_t flags)
1560{
1561	rctl_qty_t v;
1562	ASSERT(MUTEX_HELD(&p->p_lock));
1563	ASSERT(e->rcep_t == RCENTITY_ZONE);
1564	v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
1565	if (v > rval->rcv_value)
1566		return (1);
1567	return (0);
1568}
1569
1570static rctl_ops_t zone_semmni_ops = {
1571	rcop_no_action,
1572	zone_semmni_usage,
1573	rcop_no_set,
1574	zone_semmni_test
1575};
1576
1577/*ARGSUSED*/
1578static rctl_qty_t
1579zone_msgmni_usage(rctl_t *rctl, struct proc *p)
1580{
1581	ASSERT(MUTEX_HELD(&p->p_lock));
1582	return (p->p_zone->zone_ipc.ipcq_msgmni);
1583}
1584
1585/*ARGSUSED*/
1586static int
1587zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1588    rctl_qty_t incr, uint_t flags)
1589{
1590	rctl_qty_t v;
1591	ASSERT(MUTEX_HELD(&p->p_lock));
1592	ASSERT(e->rcep_t == RCENTITY_ZONE);
1593	v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
1594	if (v > rval->rcv_value)
1595		return (1);
1596	return (0);
1597}
1598
1599static rctl_ops_t zone_msgmni_ops = {
1600	rcop_no_action,
1601	zone_msgmni_usage,
1602	rcop_no_set,
1603	zone_msgmni_test
1604};
1605
1606/*ARGSUSED*/
1607static rctl_qty_t
1608zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
1609{
1610	rctl_qty_t q;
1611	ASSERT(MUTEX_HELD(&p->p_lock));
1612	mutex_enter(&p->p_zone->zone_mem_lock);
1613	q = p->p_zone->zone_locked_mem;
1614	mutex_exit(&p->p_zone->zone_mem_lock);
1615	return (q);
1616}
1617
1618/*ARGSUSED*/
1619static int
1620zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1621    rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1622{
1623	rctl_qty_t q;
1624	zone_t *z;
1625
1626	z = e->rcep_p.zone;
1627	ASSERT(MUTEX_HELD(&p->p_lock));
1628	ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1629	q = z->zone_locked_mem;
1630	if (q + incr > rcntl->rcv_value)
1631		return (1);
1632	return (0);
1633}
1634
1635/*ARGSUSED*/
1636static int
1637zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1638    rctl_qty_t nv)
1639{
1640	ASSERT(MUTEX_HELD(&p->p_lock));
1641	ASSERT(e->rcep_t == RCENTITY_ZONE);
1642	if (e->rcep_p.zone == NULL)
1643		return (0);
1644	e->rcep_p.zone->zone_locked_mem_ctl = nv;
1645	return (0);
1646}
1647
1648static rctl_ops_t zone_locked_mem_ops = {
1649	rcop_no_action,
1650	zone_locked_mem_usage,
1651	zone_locked_mem_set,
1652	zone_locked_mem_test
1653};
1654
1655/*ARGSUSED*/
1656static rctl_qty_t
1657zone_max_swap_usage(rctl_t *rctl, struct proc *p)
1658{
1659	rctl_qty_t q;
1660	zone_t *z = p->p_zone;
1661
1662	ASSERT(MUTEX_HELD(&p->p_lock));
1663	mutex_enter(&z->zone_mem_lock);
1664	q = z->zone_max_swap;
1665	mutex_exit(&z->zone_mem_lock);
1666	return (q);
1667}
1668
1669/*ARGSUSED*/
1670static int
1671zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1672    rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1673{
1674	rctl_qty_t q;
1675	zone_t *z;
1676
1677	z = e->rcep_p.zone;
1678	ASSERT(MUTEX_HELD(&p->p_lock));
1679	ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1680	q = z->zone_max_swap;
1681	if (q + incr > rcntl->rcv_value)
1682		return (1);
1683	return (0);
1684}
1685
1686/*ARGSUSED*/
1687static int
1688zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1689    rctl_qty_t nv)
1690{
1691	ASSERT(MUTEX_HELD(&p->p_lock));
1692	ASSERT(e->rcep_t == RCENTITY_ZONE);
1693	if (e->rcep_p.zone == NULL)
1694		return (0);
1695	e->rcep_p.zone->zone_max_swap_ctl = nv;
1696	return (0);
1697}
1698
1699static rctl_ops_t zone_max_swap_ops = {
1700	rcop_no_action,
1701	zone_max_swap_usage,
1702	zone_max_swap_set,
1703	zone_max_swap_test
1704};
1705
1706/*ARGSUSED*/
1707static rctl_qty_t
1708zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
1709{
1710	rctl_qty_t q;
1711	zone_t *z = p->p_zone;
1712
1713	ASSERT(MUTEX_HELD(&p->p_lock));
1714	mutex_enter(&z->zone_rctl_lock);
1715	q = z->zone_max_lofi;
1716	mutex_exit(&z->zone_rctl_lock);
1717	return (q);
1718}
1719
1720/*ARGSUSED*/
1721static int
1722zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1723    rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1724{
1725	rctl_qty_t q;
1726	zone_t *z;
1727
1728	z = e->rcep_p.zone;
1729	ASSERT(MUTEX_HELD(&p->p_lock));
1730	ASSERT(MUTEX_HELD(&z->zone_rctl_lock));
1731	q = z->zone_max_lofi;
1732	if (q + incr > rcntl->rcv_value)
1733		return (1);
1734	return (0);
1735}
1736
1737/*ARGSUSED*/
1738static int
1739zone_max_lofi_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1740    rctl_qty_t nv)
1741{
1742	ASSERT(MUTEX_HELD(&p->p_lock));
1743	ASSERT(e->rcep_t == RCENTITY_ZONE);
1744	if (e->rcep_p.zone == NULL)
1745		return (0);
1746	e->rcep_p.zone->zone_max_lofi_ctl = nv;
1747	return (0);
1748}
1749
1750static rctl_ops_t zone_max_lofi_ops = {
1751	rcop_no_action,
1752	zone_max_lofi_usage,
1753	zone_max_lofi_set,
1754	zone_max_lofi_test
1755};
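
/*
 * Ops vectors such as those above are wired up from zone_init() via
 * rctl_register().  A hedged sketch (the exact global flags here are an
 * assumption, not copied from zone_init()):
 *
 *	rc_zone_max_lofi = rctl_register("zone.max-lofi", RCENTITY_ZONE,
 *	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_DENY_ALWAYS,
 *	    UINT64_MAX, UINT64_MAX, &zone_max_lofi_ops);
 */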
1756
1757/*
1758 * Helper function to brand the zone with a unique ID.
1759 */
1760static void
1761zone_uniqid(zone_t *zone)
1762{
1763	static uint64_t uniqid = 0;
1764
1765	ASSERT(MUTEX_HELD(&zonehash_lock));
1766	zone->zone_uniqid = uniqid++;
1767}
1768
1769/*
1770 * Returns a held pointer to the "kcred" for the specified zone.
1771 */
1772struct cred *
1773zone_get_kcred(zoneid_t zoneid)
1774{
1775	zone_t *zone;
1776	cred_t *cr;
1777
1778	if ((zone = zone_find_by_id(zoneid)) == NULL)
1779		return (NULL);
1780	cr = zone->zone_kcred;
1781	crhold(cr);
1782	zone_rele(zone);
1783	return (cr);
1784}
1785
1786static int
1787zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
1788{
1789	zone_t *zone = ksp->ks_private;
1790	zone_kstat_t *zk = ksp->ks_data;
1791
1792	if (rw == KSTAT_WRITE)
1793		return (EACCES);
1794
1795	zk->zk_usage.value.ui64 = zone->zone_locked_mem;
1796	zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
1797	return (0);
1798}
1799
1800static int
1801zone_nprocs_kstat_update(kstat_t *ksp, int rw)
1802{
1803	zone_t *zone = ksp->ks_private;
1804	zone_kstat_t *zk = ksp->ks_data;
1805
1806	if (rw == KSTAT_WRITE)
1807		return (EACCES);
1808
1809	zk->zk_usage.value.ui64 = zone->zone_nprocs;
1810	zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
1811	return (0);
1812}
1813
1814static int
1815zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1816{
1817	zone_t *zone = ksp->ks_private;
1818	zone_kstat_t *zk = ksp->ks_data;
1819
1820	if (rw == KSTAT_WRITE)
1821		return (EACCES);
1822
1823	zk->zk_usage.value.ui64 = zone->zone_max_swap;
1824	zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
1825	return (0);
1826}
1827
1828static kstat_t *
1829zone_kstat_create_common(zone_t *zone, char *name,
1830    int (*updatefunc) (kstat_t *, int))
1831{
1832	kstat_t *ksp;
1833	zone_kstat_t *zk;
1834
1835	ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
1836	    sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1837	    KSTAT_FLAG_VIRTUAL);
1838
1839	if (ksp == NULL)
1840		return (NULL);
1841
1842	zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1843	ksp->ks_data_size += strlen(zone->zone_name) + 1;
1844	kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1845	kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1846	kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1847	kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1848	ksp->ks_update = updatefunc;
1849	ksp->ks_private = zone;
1850	kstat_install(ksp);
1851	return (ksp);
1852}
1853
1854
1855static int
1856zone_mcap_kstat_update(kstat_t *ksp, int rw)
1857{
1858	zone_t *zone = ksp->ks_private;
1859	zone_mcap_kstat_t *zmp = ksp->ks_data;
1860
1861	if (rw == KSTAT_WRITE)
1862		return (EACCES);
1863
1864	zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
1865	zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
1866	zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
1867	zmp->zm_fspgin.value.ui64 = zone->zone_fspgin;
1868	zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail;
1869
1870	return (0);
1871}
1872
1873static kstat_t *
1874zone_mcap_kstat_create(zone_t *zone)
1875{
1876	kstat_t *ksp;
1877	zone_mcap_kstat_t *zmp;
1878
1879	if ((ksp = kstat_create_zone("memory_cap", zone->zone_id,
1880	    zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED,
1881	    sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t),
1882	    KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1883		return (NULL);
1884
1885	if (zone->zone_id != GLOBAL_ZONEID)
1886		kstat_zone_add(ksp, GLOBAL_ZONEID);
1887
1888	zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP);
1889	ksp->ks_data_size += strlen(zone->zone_name) + 1;
1890	ksp->ks_lock = &zone->zone_mcap_lock;
1891	zone->zone_mcap_stats = zmp;
1892
1893	/* The kstat "name" field is not large enough for a full zonename */
1894	kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1895	kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
1896	kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
1897	kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
1898	kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
1899	kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
1900	kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
1901	    KSTAT_DATA_UINT64);
1902
1903	ksp->ks_update = zone_mcap_kstat_update;
1904	ksp->ks_private = zone;
1905
1906	kstat_install(ksp);
1907	return (ksp);
1908}
1909
1910static int
1911zone_misc_kstat_update(kstat_t *ksp, int rw)
1912{
1913	zone_t *zone = ksp->ks_private;
1914	zone_misc_kstat_t *zmp = ksp->ks_data;
1915	hrtime_t hrtime;
1916	uint64_t tmp;
1917
1918	if (rw == KSTAT_WRITE)
1919		return (EACCES);
1920
1921	tmp = cpu_uarray_sum(zone->zone_ustate, ZONE_USTATE_STIME);
1922	hrtime = UINT64_OVERFLOW_TO_INT64(tmp);
1923	scalehrtime(&hrtime);
1924	zmp->zm_stime.value.ui64 = hrtime;
1925
1926	tmp = cpu_uarray_sum(zone->zone_ustate, ZONE_USTATE_UTIME);
1927	hrtime = UINT64_OVERFLOW_TO_INT64(tmp);
1928	scalehrtime(&hrtime);
1929	zmp->zm_utime.value.ui64 = hrtime;
1930
1931	tmp = cpu_uarray_sum(zone->zone_ustate, ZONE_USTATE_WTIME);
1932	hrtime = UINT64_OVERFLOW_TO_INT64(tmp);
1933	scalehrtime(&hrtime);
1934	zmp->zm_wtime.value.ui64 = hrtime;
1935
1936	zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0];
1937	zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1];
1938	zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2];
1939
1940	zmp->zm_ffcap.value.ui32 = zone->zone_ffcap;
1941	zmp->zm_ffnoproc.value.ui32 = zone->zone_ffnoproc;
1942	zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;
1943	zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc;
1944
1945	zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp;
1946
1947	zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid;
1948	zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time;
1949
1950	return (0);
1951}
1952
1953static kstat_t *
1954zone_misc_kstat_create(zone_t *zone)
1955{
1956	kstat_t *ksp;
1957	zone_misc_kstat_t *zmp;
1958
1959	if ((ksp = kstat_create_zone("zones", zone->zone_id,
1960	    zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED,
1961	    sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t),
1962	    KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1963		return (NULL);
1964
1965	if (zone->zone_id != GLOBAL_ZONEID)
1966		kstat_zone_add(ksp, GLOBAL_ZONEID);
1967
1968	zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP);
1969	ksp->ks_data_size += strlen(zone->zone_name) + 1;
1970	ksp->ks_lock = &zone->zone_misc_lock;
1971	zone->zone_misc_stats = zmp;
1972
1973	/* The kstat "name" field is not large enough for a full zonename */
1974	kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1975	kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
1976	kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64);
1977	kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64);
1978	kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64);
1979	kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32);
1980	kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32);
1981	kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min",
1982	    KSTAT_DATA_UINT32);
1983	kstat_named_init(&zmp->zm_ffcap, "forkfail_cap", KSTAT_DATA_UINT32);
1984	kstat_named_init(&zmp->zm_ffnoproc, "forkfail_noproc",
1985	    KSTAT_DATA_UINT32);
1986	kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);
1987	kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
1988	kstat_named_init(&zmp->zm_nested_intp, "nested_interp",
1989	    KSTAT_DATA_UINT32);
1990	kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32);
1991	kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64);
1992
1993	ksp->ks_update = zone_misc_kstat_update;
1994	ksp->ks_private = zone;
1995
1996	kstat_install(ksp);
1997	return (ksp);
1998}
1999
2000static void
2001zone_kstat_create(zone_t *zone)
2002{
2003	zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
2004	    "lockedmem", zone_lockedmem_kstat_update);
2005	zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
2006	    "swapresv", zone_swapresv_kstat_update);
2007	zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
2008	    "nprocs", zone_nprocs_kstat_update);
2009
2010	if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
2011		zone->zone_mcap_stats = kmem_zalloc(
2012		    sizeof (zone_mcap_kstat_t), KM_SLEEP);
2013	}
2014
2015	if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
2016		zone->zone_misc_stats = kmem_zalloc(
2017		    sizeof (zone_misc_kstat_t), KM_SLEEP);
2018	}
2019}
2020
2021static void
2022zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
2023{
2024	void *data;
2025
2026	if (*pkstat != NULL) {
2027		data = (*pkstat)->ks_data;
2028		kstat_delete(*pkstat);
2029		kmem_free(data, datasz);
2030		*pkstat = NULL;
2031	}
2032}
2033
2034static void
2035zone_kstat_delete(zone_t *zone)
2036{
2037	zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
2038	    sizeof (zone_kstat_t));
2039	zone_kstat_delete_common(&zone->zone_swapresv_kstat,
2040	    sizeof (zone_kstat_t));
2041	zone_kstat_delete_common(&zone->zone_nprocs_kstat,
2042	    sizeof (zone_kstat_t));
2043	zone_kstat_delete_common(&zone->zone_mcap_ksp,
2044	    sizeof (zone_mcap_kstat_t));
2045	zone_kstat_delete_common(&zone->zone_misc_ksp,
2046	    sizeof (zone_misc_kstat_t));
2047}
2048
2049/*
2050 * Called very early on in boot to initialize the ZSD list so that
2051 * zone_key_create() can be called before zone_init().  It also initializes
2052 * portions of zone0 which may be used before zone_init() is called.  The
2053 * variable "global_zone" will be set when zone0 is fully initialized by
2054 * zone_init().
2055 */
2056void
2057zone_zsd_init(void)
2058{
2059	mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
2060	mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
2061	list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
2062	    offsetof(struct zsd_entry, zsd_linkage));
2063	list_create(&zone_active, sizeof (zone_t),
2064	    offsetof(zone_t, zone_linkage));
2065	list_create(&zone_deathrow, sizeof (zone_t),
2066	    offsetof(zone_t, zone_linkage));
2067
2068	mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
2069	mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
2070	mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
2071	zone0.zone_shares = 1;
2072	zone0.zone_nlwps = 0;
2073	zone0.zone_nlwps_ctl = INT_MAX;
2074	zone0.zone_nprocs = 0;
2075	zone0.zone_nprocs_ctl = INT_MAX;
2076	zone0.zone_locked_mem = 0;
2077	zone0.zone_locked_mem_ctl = UINT64_MAX;
2078	ASSERT(zone0.zone_max_swap == 0);
2079	zone0.zone_max_swap_ctl = UINT64_MAX;
2080	zone0.zone_max_lofi = 0;
2081	zone0.zone_max_lofi_ctl = UINT64_MAX;
2082	zone0.zone_shmmax = 0;
2083	zone0.zone_ipc.ipcq_shmmni = 0;
2084	zone0.zone_ipc.ipcq_semmni = 0;
2085	zone0.zone_ipc.ipcq_msgmni = 0;
2086	zone0.zone_name = GLOBAL_ZONENAME;
2087	zone0.zone_nodename = utsname.nodename;
2088	zone0.zone_domain = srpc_domain;
2089	zone0.zone_hostid = HW_INVALID_HOSTID;
2090	zone0.zone_fs_allowed = NULL;
2091	psecflags_default(&zone0.zone_secflags);
2092	zone0.zone_ref = 1;
2093	zone0.zone_id = GLOBAL_ZONEID;
2094	zone0.zone_status = ZONE_IS_RUNNING;
2095	zone0.zone_rootpath = "/";
2096	zone0.zone_rootpathlen = 2;
2097	zone0.zone_psetid = ZONE_PS_INVAL;
2098	zone0.zone_ncpus = 0;
2099	zone0.zone_ncpus_online = 0;
2100	zone0.zone_proc_initpid = 1;
2101	zone0.zone_initname = initname;
2102	zone0.zone_lockedmem_kstat = NULL;
2103	zone0.zone_swapresv_kstat = NULL;
2104	zone0.zone_nprocs_kstat = NULL;
2105
2106	list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
2107	    offsetof(zone_ref_t, zref_linkage));
2108	list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
2109	    offsetof(struct zsd_entry, zsd_linkage));
2110	list_insert_head(&zone_active, &zone0);
2111
2112	/*
2113	 * The root filesystem is not mounted yet, so zone_rootvp cannot be set
2114	 * to anything meaningful.  It is assigned to be 'rootdir' in
2115	 * vfs_mountroot().
2116	 */
2117	zone0.zone_rootvp = NULL;
2118	zone0.zone_vfslist = NULL;
2119	zone0.zone_bootargs = initargs;
2120	zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
2121	/*
2122	 * The global zone has all privileges
2123	 */
2124	priv_fillset(zone0.zone_privset);
2125	/*
2126	 * Add p0 to the global zone
2127	 */
2128	zone0.zone_zsched = &p0;
2129	p0.p_zone = &zone0;
2130}
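
/*
 * Illustrative sketch (not part of the original source): a minimal ZSD
 * consumer.  A kernel subsystem can register per-zone data with
 * zone_key_create() as soon as zone_zsd_init() has run, i.e. even before
 * zone_init().  The example_zsd_* names below are hypothetical.
 */
typedef struct example_zsd_data {
	uint64_t	ezd_counter;
} example_zsd_data_t;

static zone_key_t example_zsd_key;

/*ARGSUSED*/
static void *
example_zsd_create(zoneid_t zoneid)
{
	/* Allocate the per-zone payload; invoked once for each zone. */
	return (kmem_zalloc(sizeof (example_zsd_data_t), KM_SLEEP));
}

/*ARGSUSED*/
static void
example_zsd_destroy(zoneid_t zoneid, void *data)
{
	kmem_free(data, sizeof (example_zsd_data_t));
}

static void
example_zsd_register(void)
{
	/* No shutdown callback is needed for this simple payload. */
	zone_key_create(&example_zsd_key, example_zsd_create, NULL,
	    example_zsd_destroy);
}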
2131
2132/*
2133 * Compute a hash value based on the contents of the label and the DOI.  The
2134 * hash algorithm is somewhat arbitrary, but is based on the observation that
2135 * humans will likely pick labels that differ by amounts that work out to be
2136 * multiples of the number of hash chains, and thus stirring in some primes
2137 * should help.
2138 */
2139static uint_t
2140hash_bylabel(void *hdata, mod_hash_key_t key)
2141{
2142	const ts_label_t *lab = (ts_label_t *)key;
2143	const uint32_t *up, *ue;
2144	uint_t hash;
2145	int i;
2146
2147	_NOTE(ARGUNUSED(hdata));
2148
2149	hash = lab->tsl_doi + (lab->tsl_doi << 1);
2150	/* we depend on alignment of label, but not representation */
2151	up = (const uint32_t *)&lab->tsl_label;
2152	ue = up + sizeof (lab->tsl_label) / sizeof (*up);
2153	i = 1;
2154	while (up < ue) {
2155		/* using 2^n + 1, 1 <= n <= 16 as source of many primes */
2156		hash += *up + (*up << ((i % 16) + 1));
2157		up++;
2158		i++;
2159	}
2160	return (hash);
2161}
2162
2163/*
2164 * All that mod_hash cares about here is zero (equal) versus non-zero (not
2165 * equal).  This may need to be changed if less than / greater than is ever
2166 * needed.
2167 */
2168static int
2169hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
2170{
2171	ts_label_t *lab1 = (ts_label_t *)key1;
2172	ts_label_t *lab2 = (ts_label_t *)key2;
2173
2174	return (label_equal(lab1, lab2) ? 0 : 1);
2175}
2176
2177/*
2178 * Called by main() to initialize the zones framework.
2179 */
2180void
2181zone_init(void)
2182{
2183	rctl_dict_entry_t *rde;
2184	rctl_val_t *dval;
2185	rctl_set_t *set;
2186	rctl_alloc_gp_t *gp;
2187	rctl_entity_p_t e;
2188	int res;
2189
2190	ASSERT(curproc == &p0);
2191
2192	/*
2193	 * Create ID space for zone IDs.  ID 0 is reserved for the
2194	 * global zone.
2195	 */
2196	zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
2197
2198	/*
2199	 * Initialize generic zone resource controls, if any.
2200	 */
2201	rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
2202	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2203	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2204	    FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
2205
2206	rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
2207	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
2208	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER |
2209	    RCTL_GLOBAL_INFINITE,
2210	    MAXCAP, MAXCAP, &zone_cpu_cap_ops);
2211
2212	rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
2213	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2214	    INT_MAX, INT_MAX, &zone_lwps_ops);
2215
2216	rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
2217	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2218	    INT_MAX, INT_MAX, &zone_procs_ops);
2219
2220	/*
2221	 * System V IPC resource controls
2222	 */
2223	rc_zone_msgmni = rctl_register("zone.max-msg-ids",
2224	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2225	    RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
2226
2227	rc_zone_semmni = rctl_register("zone.max-sem-ids",
2228	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2229	    RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);
2230
2231	rc_zone_shmmni = rctl_register("zone.max-shm-ids",
2232	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2233	    RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);
2234
2235	rc_zone_shmmax = rctl_register("zone.max-shm-memory",
2236	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2237	    RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);
2238
2239	/*
2240	 * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
2241	 * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
2242	 */
2243	dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2244	bzero(dval, sizeof (rctl_val_t));
2245	dval->rcv_value = 1;
2246	dval->rcv_privilege = RCPRIV_PRIVILEGED;
2247	dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2248	dval->rcv_action_recip_pid = -1;
2249
2250	rde = rctl_dict_lookup("zone.cpu-shares");
2251	(void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2252
2253	rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
2254	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2255	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2256	    &zone_locked_mem_ops);
2257
2258	rc_zone_max_swap = rctl_register("zone.max-swap",
2259	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2260	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2261	    &zone_max_swap_ops);
2262
2263	rc_zone_max_lofi = rctl_register("zone.max-lofi",
2264	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
2265	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2266	    &zone_max_lofi_ops);
2267
2268	/*
2269	 * Initialize the ``global zone''.
2270	 */
2271	set = rctl_set_create();
2272	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
2273	mutex_enter(&p0.p_lock);
2274	e.rcep_p.zone = &zone0;
2275	e.rcep_t = RCENTITY_ZONE;
2276	zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
2277	    gp);
2278
2279	zone0.zone_nlwps = p0.p_lwpcnt;
2280	zone0.zone_nprocs = 1;
2281	zone0.zone_ntasks = 1;
2282	mutex_exit(&p0.p_lock);
2283	zone0.zone_restart_init = B_TRUE;
2284	zone0.zone_brand = &native_brand;
2285	rctl_prealloc_destroy(gp);
2286	/*
2287	 * pool_default hasn't been initialized yet, so we let pool_init()
2288	 * take care of making sure the global zone is in the default pool.
2289	 */
2290
2291	/*
2292	 * Initialize global zone kstats
2293	 */
2294	zone_kstat_create(&zone0);
2295
2296	/*
2297	 * Initialize zone label.
2298	 * MLPs are initialized when tnzonecfg is loaded.
2299	 */
2300	zone0.zone_slabel = l_admin_low;
2301	rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
2302	label_hold(l_admin_low);
2303
2304	/*
2305	 * Initialize the lock for the database structure used by mntfs.
2306	 */
2307	rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
2308
2309	zone0.zone_ustate = cpu_uarray_zalloc(ZONE_USTATE_MAX, KM_SLEEP);
2310
2311	mutex_enter(&zonehash_lock);
2312	zone_uniqid(&zone0);
2313	ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
2314
2315	zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
2316	    mod_hash_null_valdtor);
2317	zonehashbyname = mod_hash_create_strhash("zone_by_name",
2318	    zone_hash_size, mod_hash_null_valdtor);
2319	/*
2320	 * maintain zonehashbylabel only for labeled systems
2321	 */
2322	if (is_system_labeled())
2323		zonehashbylabel = mod_hash_create_extended("zone_by_label",
2324		    zone_hash_size, mod_hash_null_keydtor,
2325		    mod_hash_null_valdtor, hash_bylabel, NULL,
2326		    hash_labelkey_cmp, KM_SLEEP);
2327	zonecount = 1;
2328
2329	(void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
2330	    (mod_hash_val_t)&zone0);
2331	(void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
2332	    (mod_hash_val_t)&zone0);
2333	if (is_system_labeled()) {
2334		zone0.zone_flags |= ZF_HASHED_LABEL;
2335		(void) mod_hash_insert(zonehashbylabel,
2336		    (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
2337	}
2338	mutex_exit(&zonehash_lock);
2339
2340	/*
2341	 * We avoid setting zone_kcred until now, since kcred is initialized
2342	 * sometime after zone_zsd_init() and before zone_init().
2343	 */
2344	zone0.zone_kcred = kcred;
2345	/*
2346	 * The global zone is fully initialized (except for zone_rootvp which
2347	 * will be set when the root filesystem is mounted).
2348	 */
2349	global_zone = &zone0;
2350
2351	/*
2352	 * Setup an event channel to send zone status change notifications on
2353	 */
2354	res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
2355	    EVCH_CREAT);
2356
2357	if (res)
2358		panic("Sysevent_evc_bind failed during zone setup.\n");
2359
2360}
2361
2362static void
2363zone_free(zone_t *zone)
2364{
2365	ASSERT(zone != global_zone);
2366	ASSERT(zone->zone_ntasks == 0);
2367	ASSERT(zone->zone_nlwps == 0);
2368	ASSERT(zone->zone_nprocs == 0);
2369	ASSERT(zone->zone_cred_ref == 0);
2370	ASSERT(zone->zone_kcred == NULL);
2371	ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
2372	    zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2373	ASSERT(list_is_empty(&zone->zone_ref_list));
2374
2375	/*
2376	 * Remove any zone caps.
2377	 */
2378	cpucaps_zone_remove(zone);
2379
2380	ASSERT(zone->zone_cpucap == NULL);
2381
2382	/* remove from deathrow list */
2383	if (zone_status_get(zone) == ZONE_IS_DEAD) {
2384		ASSERT(zone->zone_ref == 0);
2385		mutex_enter(&zone_deathrow_lock);
2386		list_remove(&zone_deathrow, zone);
2387		mutex_exit(&zone_deathrow_lock);
2388	}
2389
2390	list_destroy(&zone->zone_ref_list);
2391	zone_free_zsd(zone);
2392	zone_free_datasets(zone);
2393	list_destroy(&zone->zone_dl_list);
2394
2395	cpu_uarray_free(zone->zone_ustate);
2396
2397	if (zone->zone_rootvp != NULL)
2398		VN_RELE(zone->zone_rootvp);
2399	if (zone->zone_rootpath)
2400		kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
2401	if (zone->zone_name != NULL)
2402		kmem_free(zone->zone_name, ZONENAME_MAX);
2403	if (zone->zone_slabel != NULL)
2404		label_rele(zone->zone_slabel);
2405	if (zone->zone_nodename != NULL)
2406		kmem_free(zone->zone_nodename, _SYS_NMLN);
2407	if (zone->zone_domain != NULL)
2408		kmem_free(zone->zone_domain, _SYS_NMLN);
2409	if (zone->zone_privset != NULL)
2410		kmem_free(zone->zone_privset, sizeof (priv_set_t));
2411	if (zone->zone_rctls != NULL)
2412		rctl_set_free(zone->zone_rctls);
2413	if (zone->zone_bootargs != NULL)
2414		strfree(zone->zone_bootargs);
2415	if (zone->zone_initname != NULL)
2416		strfree(zone->zone_initname);
2417	if (zone->zone_fs_allowed != NULL)
2418		strfree(zone->zone_fs_allowed);
2419	if (zone->zone_pfexecd != NULL)
2420		klpd_freelist(&zone->zone_pfexecd);
2421	id_free(zoneid_space, zone->zone_id);
2422	mutex_destroy(&zone->zone_lock);
2423	cv_destroy(&zone->zone_cv);
2424	rw_destroy(&zone->zone_mlps.mlpl_rwlock);
2425	rw_destroy(&zone->zone_mntfs_db_lock);
2426	kmem_free(zone, sizeof (zone_t));
2427}
2428
2429/*
2430 * See block comment at the top of this file for information about zone
2431 * status values.
2432 */
2433/*
2434 * Convenience function for setting zone status.
2435 */
2436static void
2437zone_status_set(zone_t *zone, zone_status_t status)
2438{
2440	nvlist_t *nvl = NULL;
2441	ASSERT(MUTEX_HELD(&zone_status_lock));
2442	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
2443	    status >= zone_status_get(zone));
2444
2445	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
2446	    nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
2447	    nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
2448	    zone_status_table[status]) ||
2449	    nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
2450	    zone_status_table[zone->zone_status]) ||
2451	    nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
2452	    nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
2453	    sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
2454	    ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
2455#ifdef DEBUG
2456		(void) printf(
2457		    "Failed to allocate and send zone state change event.\n");
2458#endif
2459	}
2460	nvlist_free(nvl);
2461
2462	zone->zone_status = status;
2463
2464	cv_broadcast(&zone->zone_cv);
2465}
2466
2467/*
2468 * Public function to retrieve the zone status.  The zone status may
2469 * change after it is retrieved.
2470 */
2471zone_status_t
2472zone_status_get(zone_t *zone)
2473{
2474	return (zone->zone_status);
2475}
2476
2477static int
2478zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
2479{
2480	char *buf = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
2481	int err = 0;
2482
2483	ASSERT(zone != global_zone);
2484	if ((err = copyinstr(zone_bootargs, buf, BOOTARGS_MAX, NULL)) != 0)
2485		goto done;	/* EFAULT or ENAMETOOLONG */
2486
2487	if (zone->zone_bootargs != NULL)
2488		strfree(zone->zone_bootargs);
2489
2490	zone->zone_bootargs = strdup(buf);
2491
2492done:
2493	kmem_free(buf, BOOTARGS_MAX);
2494	return (err);
2495}
2496
2497static int
2498zone_set_brand(zone_t *zone, const char *brand)
2499{
2500	struct brand_attr *attrp;
2501	brand_t *bp;
2502
2503	attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
2504	if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
2505		kmem_free(attrp, sizeof (struct brand_attr));
2506		return (EFAULT);
2507	}
2508
2509	bp = brand_register_zone(attrp);
2510	kmem_free(attrp, sizeof (struct brand_attr));
2511	if (bp == NULL)
2512		return (EINVAL);
2513
2514	/*
2515	 * This is the only place where a zone can change its brand.
2516	 * We already need to hold zone_status_lock to check the zone
2517	 * status, so we'll just use that lock to serialize zone
2518	 * branding requests as well.
2519	 */
2520	mutex_enter(&zone_status_lock);
2521
2522	/* Re-branding is not allowed; branding must happen before boot */
2523	if ((ZONE_IS_BRANDED(zone)) ||
2524	    (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
2525		mutex_exit(&zone_status_lock);
2526		brand_unregister_zone(bp);
2527		return (EINVAL);
2528	}
2529
2530	/* set up the brand specific data */
2531	zone->zone_brand = bp;
2532	ZBROP(zone)->b_init_brand_data(zone);
2533
2534	mutex_exit(&zone_status_lock);
2535	return (0);
2536}
2537
2538static int
2539zone_set_secflags(zone_t *zone, const psecflags_t *zone_secflags)
2540{
2541	int err = 0;
2542	psecflags_t psf;
2543
2544	ASSERT(zone != global_zone);
2545
2546	if ((err = copyin(zone_secflags, &psf, sizeof (psf))) != 0)
2547		return (err);
2548
2549	if (zone_status_get(zone) > ZONE_IS_READY)
2550		return (EINVAL);
2551
2552	if (!psecflags_validate(&psf))
2553		return (EINVAL);
2554
2555	(void) memcpy(&zone->zone_secflags, &psf, sizeof (psf));
2556
2557	/* Set security flags on the zone's zsched */
2558	(void) memcpy(&zone->zone_zsched->p_secflags, &zone->zone_secflags,
2559	    sizeof (zone->zone_zsched->p_secflags));
2560
2561	return (0);
2562}
2563
2564static int
2565zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
2566{
2567	char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
2568	int err = 0;
2569
2570	ASSERT(zone != global_zone);
2571	if ((err = copyinstr(zone_fs_allowed, buf,
2572	    ZONE_FS_ALLOWED_MAX, NULL)) != 0)
2573		goto done;
2574
2575	if (zone->zone_fs_allowed != NULL)
2576		strfree(zone->zone_fs_allowed);
2577
2578	zone->zone_fs_allowed = strdup(buf);
2579
2580done:
2581	kmem_free(buf, ZONE_FS_ALLOWED_MAX);
2582	return (err);
2583}
2584
2585static int
2586zone_set_initname(zone_t *zone, const char *zone_initname)
2587{
2588	char initname[INITNAME_SZ];
2589	size_t len;
2590	int err = 0;
2591
2592	ASSERT(zone != global_zone);
2593	if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
2594		return (err);	/* EFAULT or ENAMETOOLONG */
2595
2596	if (zone->zone_initname != NULL)
2597		strfree(zone->zone_initname);
2598
2599	zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
2600	(void) strcpy(zone->zone_initname, initname);
2601	return (0);
2602}
2603
2604static int
2605zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
2606{
2607	uint64_t mcap;
2608	int err = 0;
2609
2610	if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
2611		zone->zone_phys_mcap = mcap;
2612
2613	return (err);
2614}
2615
2616static int
2617zone_set_sched_class(zone_t *zone, const char *new_class)
2618{
2619	char sched_class[PC_CLNMSZ];
2620	id_t classid;
2621	int err;
2622
2623	ASSERT(zone != global_zone);
2624	if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
2625		return (err);	/* EFAULT or ENAMETOOLONG */
2626
2627	if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
2628		return (set_errno(EINVAL));
2629	zone->zone_defaultcid = classid;
2630	ASSERT(zone->zone_defaultcid > 0 &&
2631	    zone->zone_defaultcid < loaded_classes);
2632
2633	return (0);
2634}
2635
2636/*
2637 * Block indefinitely waiting for (zone_status >= status)
2638 */
2639void
2640zone_status_wait(zone_t *zone, zone_status_t status)
2641{
2642	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2643
2644	mutex_enter(&zone_status_lock);
2645	while (zone->zone_status < status) {
2646		cv_wait(&zone->zone_cv, &zone_status_lock);
2647	}
2648	mutex_exit(&zone_status_lock);
2649}
2650
2651/*
2652 * Private CPR-safe version of zone_status_wait().
2653 */
2654static void
2655zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
2656{
2657	callb_cpr_t cprinfo;
2658
2659	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2660
2661	CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
2662	    str);
2663	mutex_enter(&zone_status_lock);
2664	while (zone->zone_status < status) {
2665		CALLB_CPR_SAFE_BEGIN(&cprinfo);
2666		cv_wait(&zone->zone_cv, &zone_status_lock);
2667		CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
2668	}
2669	/*
2670	 * zone_status_lock is implicitly released by the following.
2671	 */
2672	CALLB_CPR_EXIT(&cprinfo);
2673}
2674
2675/*
2676 * Block until zone enters requested state or signal is received.  Return (0)
2677 * if signaled, non-zero otherwise.
2678 */
2679int
2680zone_status_wait_sig(zone_t *zone, zone_status_t status)
2681{
2682	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2683
2684	mutex_enter(&zone_status_lock);
2685	while (zone->zone_status < status) {
2686		if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
2687			mutex_exit(&zone_status_lock);
2688			return (0);
2689		}
2690	}
2691	mutex_exit(&zone_status_lock);
2692	return (1);
2693}
2694
2695/*
2696 * Block until the zone enters the requested state or the timeout expires,
2697 * whichever happens first.  Return (-1) if operation timed out, time remaining
2698 * otherwise.
2699 */
2700clock_t
2701zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
2702{
2703	clock_t timeleft = 0;
2704
2705	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2706
2707	mutex_enter(&zone_status_lock);
2708	while (zone->zone_status < status && timeleft != -1) {
2709		timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
2710	}
2711	mutex_exit(&zone_status_lock);
2712	return (timeleft);
2713}
2714
2715/*
2716 * Block until the zone enters the requested state, the current process is
2717 * signaled,  or the timeout expires, whichever happens first.  Return (-1) if
2718 * operation timed out, 0 if signaled, time remaining otherwise.
2719 */
2720clock_t
2721zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
2722{
2723	clock_t timeleft = tim - ddi_get_lbolt();
2724
2725	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2726
2727	mutex_enter(&zone_status_lock);
2728	while (zone->zone_status < status) {
2729		timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
2730		    tim);
2731		if (timeleft <= 0)
2732			break;
2733	}
2734	mutex_exit(&zone_status_lock);
2735	return (timeleft);
2736}
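
/*
 * Illustrative sketch (not part of the original source): using the timed
 * wait above.  'tim' is an absolute lbolt value, matching what
 * cv_timedwait() expects; the 30 second bound and SEC_TO_TICK() from
 * <sys/time.h> are used here only as an example.
 */
static clock_t
example_wait_until_running(zone_t *zone)
{
	clock_t deadline = ddi_get_lbolt() + SEC_TO_TICK(30);

	/* Returns -1 on timeout, otherwise the time remaining. */
	return (zone_status_timedwait(zone, deadline, ZONE_IS_RUNNING));
}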
2737
2738/*
2739 * Zones have two reference counts: one for references from credential
2740 * structures (zone_cred_ref), and one (zone_ref) for everything else.
2741 * This is so we can allow a zone to be rebooted while there are still
2742 * outstanding cred references, since certain drivers cache dblks (which
2743 * implicitly results in cached creds).  We wait for zone_ref to drop to
2744 * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
2745 * later freed when the zone_cred_ref drops to 0, though nothing other
2746 * than the zone id and privilege set should be accessed once the zone
2747 * is "dead".
2748 *
2749 * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
2750 * to force halt/reboot to block waiting for the zone_cred_ref to drop
2751 * to 0.  This can be useful to flush out other sources of cached creds
2752 * that may be less innocuous than the driver case.
2753 *
2754 * Zones also provide a tracked reference counting mechanism in which zone
2755 * references are represented by "crumbs" (zone_ref structures).  Crumbs help
2756 * debuggers determine the sources of leaked zone references.  See
2757 * zone_hold_ref() and zone_rele_ref() below for more information.
2758 */
2759
2760int zone_wait_for_cred = 0;
2761
2762static void
2763zone_hold_locked(zone_t *z)
2764{
2765	ASSERT(MUTEX_HELD(&z->zone_lock));
2766	z->zone_ref++;
2767	ASSERT(z->zone_ref != 0);
2768}
2769
2770/*
2771 * Increment the specified zone's reference count.  The zone's zone_t structure
2772 * will not be freed as long as the zone's reference count is nonzero.
2773 * Decrement the zone's reference count via zone_rele().
2774 *
2775 * NOTE: This function should only be used to hold zones for short periods of
2776 * time.  Use zone_hold_ref() if the zone must be held for a long time.
2777 */
2778void
2779zone_hold(zone_t *z)
2780{
2781	mutex_enter(&z->zone_lock);
2782	zone_hold_locked(z);
2783	mutex_exit(&z->zone_lock);
2784}
2785
2786/*
2787 * If the non-cred ref count drops to 1 and either the cred ref count
2788 * is 0 or we aren't waiting for cred references, the zone is ready to
2789 * be destroyed.
2790 */
2791#define	ZONE_IS_UNREF(zone)	((zone)->zone_ref == 1 && \
2792	    (!zone_wait_for_cred || (zone)->zone_cred_ref == 0))
2793
2794/*
2795 * Common zone reference release function invoked by zone_rele() and
2796 * zone_rele_ref().  If subsys is ZONE_REF_NUM_SUBSYS, then the specified
2797 * zone's subsystem-specific reference counters are not affected by the
2798 * release.  If ref is not NULL, then the zone_ref_t to which it refers is
2799 * removed from the specified zone's reference list.  ref must be non-NULL iff
2800 * subsys is not ZONE_REF_NUM_SUBSYS.
2801 */
2802static void
2803zone_rele_common(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2804{
2805	boolean_t wakeup;
2806
2807	mutex_enter(&z->zone_lock);
2808	ASSERT(z->zone_ref != 0);
2809	z->zone_ref--;
2810	if (subsys != ZONE_REF_NUM_SUBSYS) {
2811		ASSERT(z->zone_subsys_ref[subsys] != 0);
2812		z->zone_subsys_ref[subsys]--;
2813		list_remove(&z->zone_ref_list, ref);
2814	}
2815	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2816		/* no more refs, free the structure */
2817		mutex_exit(&z->zone_lock);
2818		zone_free(z);
2819		return;
2820	}
2821	/* signal zone_destroy so the zone can finish halting */
2822	wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
2823	mutex_exit(&z->zone_lock);
2824
2825	if (wakeup) {
2826		/*
2827		 * Grabbing zonehash_lock here effectively synchronizes with
2828		 * zone_destroy() to avoid missed signals.
2829		 */
2830		mutex_enter(&zonehash_lock);
2831		cv_broadcast(&zone_destroy_cv);
2832		mutex_exit(&zonehash_lock);
2833	}
2834}
2835
2836/*
2837 * Decrement the specified zone's reference count.  The specified zone will
2838 * cease to exist after this function returns if the reference count drops to
2839 * zero.  This function should be paired with zone_hold().
2840 */
2841void
2842zone_rele(zone_t *z)
2843{
2844	zone_rele_common(z, NULL, ZONE_REF_NUM_SUBSYS);
2845}
2846
2847/*
2848 * Initialize a zone reference structure.  This function must be invoked for
2849 * a reference structure before the structure is passed to zone_hold_ref().
2850 */
2851void
2852zone_init_ref(zone_ref_t *ref)
2853{
2854	ref->zref_zone = NULL;
2855	list_link_init(&ref->zref_linkage);
2856}
2857
2858/*
2859 * Acquire a reference to zone z.  The caller must specify the
2860 * zone_ref_subsys_t constant associated with its subsystem.  The specified
2861 * zone_ref_t structure will represent a reference to the specified zone.  Use
2862 * zone_rele_ref() to release the reference.
2863 *
2864 * The referenced zone_t structure will not be freed as long as the zone_t's
2865 * zone_status field is not ZONE_IS_DEAD and the zone has outstanding
2866 * references.
2867 *
2868 * NOTE: The zone_ref_t structure must be initialized before it is used.
2869 * See zone_init_ref() above.
2870 */
2871void
2872zone_hold_ref(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2873{
2874	ASSERT(subsys >= 0 && subsys < ZONE_REF_NUM_SUBSYS);
2875
2876	/*
2877	 * Prevent consumers from reusing a reference structure before
2878	 * releasing it.
2879	 */
2880	VERIFY(ref->zref_zone == NULL);
2881
2882	ref->zref_zone = z;
2883	mutex_enter(&z->zone_lock);
2884	zone_hold_locked(z);
2885	z->zone_subsys_ref[subsys]++;
2886	ASSERT(z->zone_subsys_ref[subsys] != 0);
2887	list_insert_head(&z->zone_ref_list, ref);
2888	mutex_exit(&z->zone_lock);
2889}
2890
2891/*
2892 * Release the zone reference represented by the specified zone_ref_t.
2893 * The reference is invalid after it's released; however, the zone_ref_t
2894 * structure can be reused without having to invoke zone_init_ref().
2895 * subsys should be the same value that was passed to zone_hold_ref()
2896 * when the reference was acquired.
2897 */
2898void
2899zone_rele_ref(zone_ref_t *ref, zone_ref_subsys_t subsys)
2900{
2901	zone_rele_common(ref->zref_zone, ref, subsys);
2902
2903	/*
2904	 * Set the zone_ref_t's zref_zone field to NULL to generate panics
2905	 * when consumers dereference the reference.  This helps us catch
2906	 * consumers who use released references.  Furthermore, this lets
2907	 * consumers reuse the zone_ref_t structure without having to
2908	 * invoke zone_init_ref().
2909	 */
2910	ref->zref_zone = NULL;
2911}
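
/*
 * Illustrative sketch (not part of the original source): the tracked
 * reference pattern described above.  A long-lived subsystem keeps a
 * zone_ref_t alongside its per-zone state so leaked references can be
 * attributed to it.  The example_* names are hypothetical, and
 * ZONE_REF_NFS is used only as an example zone_ref_subsys_t constant.
 */
typedef struct example_zone_state {
	zone_t		*ezs_zone;
	zone_ref_t	ezs_ref;
} example_zone_state_t;

static void
example_zone_state_hold(example_zone_state_t *ezs, zone_t *zone)
{
	zone_init_ref(&ezs->ezs_ref);
	zone_hold_ref(zone, &ezs->ezs_ref, ZONE_REF_NFS);
	ezs->ezs_zone = zone;
}

static void
example_zone_state_rele(example_zone_state_t *ezs)
{
	zone_rele_ref(&ezs->ezs_ref, ZONE_REF_NFS);
	ezs->ezs_zone = NULL;
}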
2912
2913void
2914zone_cred_hold(zone_t *z)
2915{
2916	mutex_enter(&z->zone_lock);
2917	z->zone_cred_ref++;
2918	ASSERT(z->zone_cred_ref != 0);
2919	mutex_exit(&z->zone_lock);
2920}
2921
2922void
2923zone_cred_rele(zone_t *z)
2924{
2925	boolean_t wakeup;
2926
2927	mutex_enter(&z->zone_lock);
2928	ASSERT(z->zone_cred_ref != 0);
2929	z->zone_cred_ref--;
2930	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2931		/* no more refs, free the structure */
2932		mutex_exit(&z->zone_lock);
2933		zone_free(z);
2934		return;
2935	}
2936	/*
2937	 * If zone_destroy is waiting for the cred references to drain
2938	 * out, and they have, signal it.
2939	 */
2940	wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
2941	    zone_status_get(z) >= ZONE_IS_DEAD);
2942	mutex_exit(&z->zone_lock);
2943
2944	if (wakeup) {
2945		/*
2946		 * Grabbing zonehash_lock here effectively synchronizes with
2947		 * zone_destroy() to avoid missed signals.
2948		 */
2949		mutex_enter(&zonehash_lock);
2950		cv_broadcast(&zone_destroy_cv);
2951		mutex_exit(&zonehash_lock);
2952	}
2953}
2954
2955void
2956zone_task_hold(zone_t *z)
2957{
2958	mutex_enter(&z->zone_lock);
2959	z->zone_ntasks++;
2960	ASSERT(z->zone_ntasks != 0);
2961	mutex_exit(&z->zone_lock);
2962}
2963
2964void
2965zone_task_rele(zone_t *zone)
2966{
2967	uint_t refcnt;
2968
2969	mutex_enter(&zone->zone_lock);
2970	ASSERT(zone->zone_ntasks != 0);
2971	refcnt = --zone->zone_ntasks;
2972	if (refcnt > 1) {	/* Common case */
2973		mutex_exit(&zone->zone_lock);
2974		return;
2975	}
2976	zone_hold_locked(zone);	/* so we can use the zone_t later */
2977	mutex_exit(&zone->zone_lock);
2978	if (refcnt == 1) {
2979		/*
2980		 * See if the zone is shutting down.
2981		 */
2982		mutex_enter(&zone_status_lock);
2983		if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
2984			goto out;
2985		}
2986
2987		/*
2988		 * Make sure the ntasks didn't change since we
2989		 * dropped zone_lock.
2990		 */
2991		mutex_enter(&zone->zone_lock);
2992		if (refcnt != zone->zone_ntasks) {
2993			mutex_exit(&zone->zone_lock);
2994			goto out;
2995		}
2996		mutex_exit(&zone->zone_lock);
2997
2998		/*
2999		 * No more user processes in the zone.  The zone is empty.
3000		 */
3001		zone_status_set(zone, ZONE_IS_EMPTY);
3002		goto out;
3003	}
3004
3005	ASSERT(refcnt == 0);
3006	/*
3007	 * zsched has exited; the zone is dead.
3008	 */
3009	zone->zone_zsched = NULL;		/* paranoia */
3010	mutex_enter(&zone_status_lock);
3011	zone_status_set(zone, ZONE_IS_DEAD);
3012out:
3013	mutex_exit(&zone_status_lock);
3014	zone_rele(zone);
3015}
3016
3017zoneid_t
3018getzoneid(void)
3019{
3020	return (curproc->p_zone->zone_id);
3021}
3022
3023/*
3024 * Internal versions of zone_find_by_*().  These don't zone_hold() or
3025 * check the validity of a zone's state.
3026 */
3027static zone_t *
3028zone_find_all_by_id(zoneid_t zoneid)
3029{
3030	mod_hash_val_t hv;
3031	zone_t *zone = NULL;
3032
3033	ASSERT(MUTEX_HELD(&zonehash_lock));
3034
3035	if (mod_hash_find(zonehashbyid,
3036	    (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
3037		zone = (zone_t *)hv;
3038	return (zone);
3039}
3040
3041static zone_t *
3042zone_find_all_by_label(const ts_label_t *label)
3043{
3044	mod_hash_val_t hv;
3045	zone_t *zone = NULL;
3046
3047	ASSERT(MUTEX_HELD(&zonehash_lock));
3048
3049	/*
3050	 * zonehashbylabel is not maintained for unlabeled systems
3051	 */
3052	if (!is_system_labeled())
3053		return (NULL);
3054	if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
3055		zone = (zone_t *)hv;
3056	return (zone);
3057}
3058
3059static zone_t *
3060zone_find_all_by_name(char *name)
3061{
3062	mod_hash_val_t hv;
3063	zone_t *zone = NULL;
3064
3065	ASSERT(MUTEX_HELD(&zonehash_lock));
3066
3067	if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
3068		zone = (zone_t *)hv;
3069	return (zone);
3070}
3071
3072/*
3073 * Public interface for looking up a zone by zoneid.  Only returns the zone if
3074 * it is fully initialized, and has not yet begun the zone_destroy() sequence.
3075 * Caller must call zone_rele() once it is done with the zone.
3076 *
3077 * The zone may begin the zone_destroy() sequence immediately after this
3078 * function returns, but may be safely used until zone_rele() is called.
3079 */
3080zone_t *
3081zone_find_by_id(zoneid_t zoneid)
3082{
3083	zone_t *zone;
3084	zone_status_t status;
3085
3086	mutex_enter(&zonehash_lock);
3087	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3088		mutex_exit(&zonehash_lock);
3089		return (NULL);
3090	}
3091	status = zone_status_get(zone);
3092	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3093		/*
3094		 * For all practical purposes the zone doesn't exist.
3095		 */
3096		mutex_exit(&zonehash_lock);
3097		return (NULL);
3098	}
3099	zone_hold(zone);
3100	mutex_exit(&zonehash_lock);
3101	return (zone);
3102}
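
/*
 * Illustrative sketch (not part of the original source): the hold/release
 * discipline expected of zone_find_by_*() callers.
 */
static boolean_t
example_zone_is_running(zoneid_t zoneid)
{
	zone_t *zone;
	boolean_t running;

	if ((zone = zone_find_by_id(zoneid)) == NULL)
		return (B_FALSE);
	running = (zone_status_get(zone) == ZONE_IS_RUNNING);
	zone_rele(zone);	/* drop the hold taken by zone_find_by_id() */
	return (running);
}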
3103
3104/*
3105 * Similar to zone_find_by_id, but using zone label as the key.
3106 */
3107zone_t *
3108zone_find_by_label(const ts_label_t *label)
3109{
3110	zone_t *zone;
3111	zone_status_t status;
3112
3113	mutex_enter(&zonehash_lock);
3114	if ((zone = zone_find_all_by_label(label)) == NULL) {
3115		mutex_exit(&zonehash_lock);
3116		return (NULL);
3117	}
3118
3119	status = zone_status_get(zone);
3120	if (status > ZONE_IS_DOWN) {
3121		/*
3122		 * For all practical purposes the zone doesn't exist.
3123		 */
3124		mutex_exit(&zonehash_lock);
3125		return (NULL);
3126	}
3127	zone_hold(zone);
3128	mutex_exit(&zonehash_lock);
3129	return (zone);
3130}
3131
3132/*
3133 * Similar to zone_find_by_id, but using zone name as the key.
3134 */
3135zone_t *
3136zone_find_by_name(char *name)
3137{
3138	zone_t *zone;
3139	zone_status_t status;
3140
3141	mutex_enter(&zonehash_lock);
3142	if ((zone = zone_find_all_by_name(name)) == NULL) {
3143		mutex_exit(&zonehash_lock);
3144		return (NULL);
3145	}
3146	status = zone_status_get(zone);
3147	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3148		/*
3149		 * For all practical purposes the zone doesn't exist.
3150		 */
3151		mutex_exit(&zonehash_lock);
3152		return (NULL);
3153	}
3154	zone_hold(zone);
3155	mutex_exit(&zonehash_lock);
3156	return (zone);
3157}
3158
3159/*
3160 * Similar to zone_find_by_id(), using the path as a key.  For instance,
3161 * if there is a zone "foo" rooted at /foo/root, and the path argument
3162 * is "/foo/root/proc", it will return the held zone_t corresponding to
3163 * zone "foo".
3164 *
3165 * zone_find_by_path() always returns a non-NULL value, since at the
3166 * very least every path will be contained in the global zone.
3167 *
3168 * As with the other zone_find_by_*() functions, the caller is
3169 * responsible for zone_rele()ing the return value of this function.
3170 */
3171zone_t *
3172zone_find_by_path(const char *path)
3173{
3174	zone_t *zone;
3175	zone_t *zret = NULL;
3176	zone_status_t status;
3177
3178	if (path == NULL) {
3179		/*
3180		 * Call from rootconf().
3181		 */
3182		zone_hold(global_zone);
3183		return (global_zone);
3184	}
3185	ASSERT(*path == '/');
3186	mutex_enter(&zonehash_lock);
3187	for (zone = list_head(&zone_active); zone != NULL;
3188	    zone = list_next(&zone_active, zone)) {
3189		if (ZONE_PATH_VISIBLE(path, zone))
3190			zret = zone;
3191	}
3192	ASSERT(zret != NULL);
3193	status = zone_status_get(zret);
3194	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3195		/*
3196		 * Zone practically doesn't exist.
3197		 */
3198		zret = global_zone;
3199	}
3200	zone_hold(zret);
3201	mutex_exit(&zonehash_lock);
3202	return (zret);
3203}
3204
3205/*
3206 * Public interface for updating per-zone load averages.  Called once per
3207 * second.
3208 *
3209 * Based on loadavg_update(), genloadavg() and calcloadavg() from clock.c.
3210 */
3211void
3212zone_loadavg_update(void)
3213{
3214	zone_t *zp;
3215	zone_status_t status;
3216	struct loadavg_s *lavg;
3217	hrtime_t zone_total;
3218	uint64_t tmp;
3219	int i;
3220	hrtime_t hr_avg;
3221	int nrun;
3222	static int64_t f[3] = { 135, 27, 9 };
3223	int64_t q, r;
3224
3225	mutex_enter(&zonehash_lock);
3226	for (zp = list_head(&zone_active); zp != NULL;
3227	    zp = list_next(&zone_active, zp)) {
3228		mutex_enter(&zp->zone_lock);
3229
3230		/* Skip zones that are on the way down or not yet up */
3231		status = zone_status_get(zp);
3232		if (status < ZONE_IS_READY || status >= ZONE_IS_DOWN) {
3233			/* For all practical purposes the zone doesn't exist. */
3234			mutex_exit(&zp->zone_lock);
3235			continue;
3236		}
3237
3238		/*
3239		 * Update the 10 second moving average data in zone_loadavg.
3240		 */
3241		lavg = &zp->zone_loadavg;
3242
3243		tmp = cpu_uarray_sum_all(zp->zone_ustate);
3244		zone_total = UINT64_OVERFLOW_TO_INT64(tmp);
3245
3246		scalehrtime(&zone_total);
3247
3248		/* The zone_total should always be increasing. */
3249		lavg->lg_loads[lavg->lg_cur] = (zone_total > lavg->lg_total) ?
3250		    zone_total - lavg->lg_total : 0;
3251		lavg->lg_cur = (lavg->lg_cur + 1) % S_LOADAVG_SZ;
3252		/* lg_total holds the prev. 1 sec. total */
3253		lavg->lg_total = zone_total;
3254
3255		/*
3256		 * To simplify the calculation, we don't calculate the load avg.
3257		 * until the zone has been up for at least 10 seconds and our
3258		 * moving average is thus full.
3259		 */
3260		if ((lavg->lg_len + 1) < S_LOADAVG_SZ) {
3261			lavg->lg_len++;
3262			mutex_exit(&zp->zone_lock);
3263			continue;
3264		}
3265
3266		/* Now calculate the 1min, 5min, 15 min load avg. */
3267		hr_avg = 0;
3268		for (i = 0; i < S_LOADAVG_SZ; i++)
3269			hr_avg += lavg->lg_loads[i];
3270		hr_avg = hr_avg / S_LOADAVG_SZ;
3271		nrun = hr_avg / (NANOSEC / LGRP_LOADAVG_IN_THREAD_MAX);
3272
3273		/* Compute load avg. See comment in calcloadavg() */
3274		for (i = 0; i < 3; i++) {
3275			q = (zp->zone_hp_avenrun[i] >> 16) << 7;
3276			r = (zp->zone_hp_avenrun[i] & 0xffff) << 7;
3277			zp->zone_hp_avenrun[i] +=
3278			    ((nrun - q) * f[i] - ((r * f[i]) >> 16)) >> 4;
3279
3280			/* avenrun[] can only hold 31 bits of load avg. */
3281			if (zp->zone_hp_avenrun[i] <
3282			    ((uint64_t)1<<(31+16-FSHIFT)))
3283				zp->zone_avenrun[i] = (int32_t)
3284				    (zp->zone_hp_avenrun[i] >> (16 - FSHIFT));
3285			else
3286				zp->zone_avenrun[i] = 0x7fffffff;
3287		}
3288
3289		mutex_exit(&zp->zone_lock);
3290	}
3291	mutex_exit(&zonehash_lock);
3292}
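
/*
 * Illustrative sketch (not part of the original source): decoding the
 * fixed-point averages computed above.  zone_avenrun[] uses the
 * traditional avenrun format with FSHIFT fractional bits, so the
 * 1-minute load average expressed in hundredths is obtained by scaling
 * and shifting.
 */
static uint32_t
example_zone_load_1min_x100(zone_t *zone)
{
	return ((uint32_t)(((uint64_t)zone->zone_avenrun[0] * 100) >> FSHIFT));
}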
3293
3294/*
3295 * Get the number of cpus visible to this zone.  The system-wide global
3296 * 'ncpus' is returned if pools are disabled, the caller is in the
3297 * global zone, or a NULL zone argument is passed in.
3298 */
3299int
3300zone_ncpus_get(zone_t *zone)
3301{
3302	int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
3303
3304	return (myncpus != 0 ? myncpus : ncpus);
3305}
3306
3307/*
3308 * Get the number of online cpus visible to this zone.  The system-wide
3309 * global 'ncpus_online' is returned if pools are disabled, the caller
3310 * is in the global zone, or a NULL zone argument is passed in.
3311 */
3312int
3313zone_ncpus_online_get(zone_t *zone)
3314{
3315	int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
3316
3317	return (myncpus_online != 0 ? myncpus_online : ncpus_online);
3318}
3319
3320/*
3321 * Return the pool to which the zone is currently bound.
3322 */
3323pool_t *
3324zone_pool_get(zone_t *zone)
3325{
3326	ASSERT(pool_lock_held());
3327
3328	return (zone->zone_pool);
3329}
3330
3331/*
3332 * Set the zone's pool pointer and update the zone's visibility to match
3333 * the resources in the new pool.
3334 */
3335void
3336zone_pool_set(zone_t *zone, pool_t *pool)
3337{
3338	ASSERT(pool_lock_held());
3339	ASSERT(MUTEX_HELD(&cpu_lock));
3340
3341	zone->zone_pool = pool;
3342	zone_pset_set(zone, pool->pool_pset->pset_id);
3343}
3344
3345/*
3346 * Return the cached value of the id of the processor set to which the
3347 * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
3348 * facility is disabled.
3349 */
3350psetid_t
3351zone_pset_get(zone_t *zone)
3352{
3353	ASSERT(MUTEX_HELD(&cpu_lock));
3354
3355	return (zone->zone_psetid);
3356}
3357
3358/*
3359 * Set the cached value of the id of the processor set to which the zone
3360 * is currently bound.  Also update the zone's visibility to match the
3361 * resources in the new processor set.
3362 */
3363void
3364zone_pset_set(zone_t *zone, psetid_t newpsetid)
3365{
3366	psetid_t oldpsetid;
3367
3368	ASSERT(MUTEX_HELD(&cpu_lock));
3369	oldpsetid = zone_pset_get(zone);
3370
3371	if (oldpsetid == newpsetid)
3372		return;
3373	/*
3374	 * Global zone sees all.
3375	 */
3376	if (zone != global_zone) {
3377		zone->zone_psetid = newpsetid;
3378		if (newpsetid != ZONE_PS_INVAL)
3379			pool_pset_visibility_add(newpsetid, zone);
3380		if (oldpsetid != ZONE_PS_INVAL)
3381			pool_pset_visibility_remove(oldpsetid, zone);
3382	}
3383	/*
3384	 * Disabling pools, so we should start using the global values
3385	 * for ncpus and ncpus_online.
3386	 */
3387	if (newpsetid == ZONE_PS_INVAL) {
3388		zone->zone_ncpus = 0;
3389		zone->zone_ncpus_online = 0;
3390	}
3391}
3392
3393/*
3394 * Walk the list of active zones and issue the provided callback for
3395 * each of them.
3396 *
3397 * Caller must not be holding any locks that may be acquired under
3398 * zonehash_lock.  See comment at the beginning of the file for a list of
3399 * common locks and their interactions with zones.
3400 */
3401int
3402zone_walk(int (*cb)(zone_t *, void *), void *data)
3403{
3404	zone_t *zone;
3405	int ret = 0;
3406	zone_status_t status;
3407
3408	mutex_enter(&zonehash_lock);
3409	for (zone = list_head(&zone_active); zone != NULL;
3410	    zone = list_next(&zone_active, zone)) {
3411		/*
3412		 * Skip zones that shouldn't be externally visible.
3413		 */
3414		status = zone_status_get(zone);
3415		if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
3416			continue;
3417		/*
3418		 * Bail immediately if any callback invocation returns a
3419		 * non-zero value.
3420		 */
3421		ret = (*cb)(zone, data);
3422		if (ret != 0)
3423			break;
3424	}
3425	mutex_exit(&zonehash_lock);
3426	return (ret);
3427}
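
/*
 * Illustrative sketch (not part of the original source): a zone_walk()
 * callback that counts the externally visible zones.  Returning zero
 * keeps the walk going; a non-zero return would end it early, as
 * described above.
 */
/*ARGSUSED*/
static int
example_count_zone_cb(zone_t *zone, void *arg)
{
	uint_t *countp = arg;

	(*countp)++;
	return (0);
}

static uint_t
example_count_zones(void)
{
	uint_t count = 0;

	(void) zone_walk(example_count_zone_cb, &count);
	return (count);
}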
3428
3429static int
3430zone_set_root(zone_t *zone, const char *upath)
3431{
3432	vnode_t *vp;
3433	int trycount;
3434	int error = 0;
3435	char *path;
3436	struct pathname upn, pn;
3437	size_t pathlen;
3438
3439	if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
3440		return (error);
3441
3442	pn_alloc(&pn);
3443
3444	/* prevent infinite loop */
3445	trycount = 10;
3446	for (;;) {
3447		if (--trycount <= 0) {
3448			error = ESTALE;
3449			goto out;
3450		}
3451
3452		if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
3453			/*
3454			 * VOP_ACCESS() may cover 'vp' with a new
3455			 * filesystem, if 'vp' is an autoFS vnode.
3456			 * Get the new 'vp' if so.
3457			 */
3458			if ((error =
3459			    VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) == 0 &&
3460			    (!vn_ismntpt(vp) ||
3461			    (error = traverse(&vp)) == 0)) {
3462				pathlen = pn.pn_pathlen + 2;
3463				path = kmem_alloc(pathlen, KM_SLEEP);
3464				(void) strncpy(path, pn.pn_path,
3465				    pn.pn_pathlen + 1);
3466				path[pathlen - 2] = '/';
3467				path[pathlen - 1] = '\0';
3468				pn_free(&pn);
3469				pn_free(&upn);
3470
3471				/* Success! */
3472				break;
3473			}
3474			VN_RELE(vp);
3475		}
3476		if (error != ESTALE)
3477			goto out;
3478	}
3479
3480	ASSERT(error == 0);
3481	zone->zone_rootvp = vp;		/* we hold a reference to vp */
3482	zone->zone_rootpath = path;
3483	zone->zone_rootpathlen = pathlen;
3484	if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
3485		zone->zone_flags |= ZF_IS_SCRATCH;
3486	return (0);
3487
3488out:
3489	pn_free(&pn);
3490	pn_free(&upn);
3491	return (error);
3492}
3493
3494#define	isalnum(c)	(((c) >= '0' && (c) <= '9') || \
3495			((c) >= 'a' && (c) <= 'z') || \
3496			((c) >= 'A' && (c) <= 'Z'))
3497
3498static int
3499zone_set_name(zone_t *zone, const char *uname)
3500{
3501	char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
3502	size_t len;
3503	int i, err;
3504
3505	if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
3506		kmem_free(kname, ZONENAME_MAX);
3507		return (err);	/* EFAULT or ENAMETOOLONG */
3508	}
3509
3510	/* must be less than ZONENAME_MAX */
3511	if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
3512		kmem_free(kname, ZONENAME_MAX);
3513		return (EINVAL);
3514	}
3515
3516	/*
3517	 * Name must start with an alphanumeric and must contain only
3518	 * alphanumerics, '-', '_' and '.'.
3519	 */
3520	if (!isalnum(kname[0])) {
3521		kmem_free(kname, ZONENAME_MAX);
3522		return (EINVAL);
3523	}
3524	for (i = 1; i < len - 1; i++) {
3525		if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
3526		    kname[i] != '.') {
3527			kmem_free(kname, ZONENAME_MAX);
3528			return (EINVAL);
3529		}
3530	}
3531
3532	zone->zone_name = kname;
3533	return (0);
3534}
3535
3536/*
3537 * Gets the 32-bit hostid of the specified zone as an unsigned int.  If 'zonep'
3538 * is NULL or it points to a zone with no hostid emulation, then the machine's
3539 * hostid (i.e., the global zone's hostid) is returned.  This function returns
3540 * zero if neither the zone nor the host machine (global zone) has a hostid.  It
3541 * returns HW_INVALID_HOSTID if the function attempts to return the machine's
3542 * hostid and the machine's hostid is invalid.
3543 */
3544uint32_t
3545zone_get_hostid(zone_t *zonep)
3546{
3547	unsigned long machine_hostid;
3548
3549	if (zonep == NULL || zonep->zone_hostid == HW_INVALID_HOSTID) {
3550		if (ddi_strtoul(hw_serial, NULL, 10, &machine_hostid) != 0)
3551			return (HW_INVALID_HOSTID);
3552		return ((uint32_t)machine_hostid);
3553	}
3554	return (zonep->zone_hostid);
3555}
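
/*
 * Illustrative sketch (not part of the original source): typical use of
 * zone_get_hostid() from a thread running in a zone, treating an invalid
 * machine hostid as zero.
 */
static uint32_t
example_current_hostid(void)
{
	uint32_t hostid = zone_get_hostid(curproc->p_zone);

	return (hostid == HW_INVALID_HOSTID ? 0 : hostid);
}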
3556
3557/*
3558 * Similar to thread_create(), but makes sure the thread is in the appropriate
3559 * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
3560 */
3561/*ARGSUSED*/
3562kthread_t *
3563zthread_create(
3564    caddr_t stk,
3565    size_t stksize,
3566    void (*proc)(),
3567    void *arg,
3568    size_t len,
3569    pri_t pri)
3570{
3571	kthread_t *t;
3572	zone_t *zone = curproc->p_zone;
3573	proc_t *pp = zone->zone_zsched;
3574
3575	zone_hold(zone);	/* Reference to be dropped when thread exits */
3576
3577	/*
3578	 * No-one should be trying to create threads if the zone is shutting
3579	 * down and there aren't any kernel threads around.  See comment
3580	 * in zthread_exit().
3581	 */
3582	ASSERT(!(zone->zone_kthreads == NULL &&
3583	    zone_status_get(zone) >= ZONE_IS_EMPTY));
3584	/*
3585	 * Create a thread, but don't let it run until we've finished setting
3586	 * things up.
3587	 */
3588	t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
3589	ASSERT(t->t_forw == NULL);
3590	mutex_enter(&zone_status_lock);
3591	if (zone->zone_kthreads == NULL) {
3592		t->t_forw = t->t_back = t;
3593	} else {
3594		kthread_t *tx = zone->zone_kthreads;
3595
3596		t->t_forw = tx;
3597		t->t_back = tx->t_back;
3598		tx->t_back->t_forw = t;
3599		tx->t_back = t;
3600	}
3601	zone->zone_kthreads = t;
3602	mutex_exit(&zone_status_lock);
3603
3604	mutex_enter(&pp->p_lock);
3605	t->t_proc_flag |= TP_ZTHREAD;
3606	project_rele(t->t_proj);
3607	t->t_proj = project_hold(pp->p_task->tk_proj);
3608
3609	/*
3610	 * Setup complete, let it run.
3611	 */
3612	thread_lock(t);
3613	t->t_schedflag |= TS_ALLSTART;
3614	setrun_locked(t);
3615	thread_unlock(t);
3616
3617	mutex_exit(&pp->p_lock);
3618
3619	return (t);
3620}
3621
3622/*
3623 * Similar to thread_exit().  Must be called by threads created via
3624 * zthread_create().
3625 */
3626void
3627zthread_exit(void)
3628{
3629	kthread_t *t = curthread;
3630	proc_t *pp = curproc;
3631	zone_t *zone = pp->p_zone;
3632
3633	mutex_enter(&zone_status_lock);
3634
3635	/*
3636	 * Reparent to p0
3637	 */
3638	kpreempt_disable();
3639	mutex_enter(&pp->p_lock);
3640	t->t_proc_flag &= ~TP_ZTHREAD;
3641	t->t_procp = &p0;
3642	hat_thread_exit(t);
3643	mutex_exit(&pp->p_lock);
3644	kpreempt_enable();
3645
3646	if (t->t_back == t) {
3647		ASSERT(t->t_forw == t);
3648		/*
3649		 * If the zone is empty, once the thread count
3650		 * goes to zero no further kernel threads can be
3651		 * created.  This is because if the creator is a process
3652		 * in the zone, then it must have exited before the zone
3653		 * state could be set to ZONE_IS_EMPTY.
3654		 * Otherwise, if the creator is a kernel thread in the
3655		 * zone, the thread count is non-zero.
3656		 *
3657		 * This really means that non-zone kernel threads should
3658		 * not create zone kernel threads.
3659		 */
3660		zone->zone_kthreads = NULL;
3661		if (zone_status_get(zone) == ZONE_IS_EMPTY) {
3662			zone_status_set(zone, ZONE_IS_DOWN);
3663			/*
3664			 * Remove any CPU caps on this zone.
3665			 */
3666			cpucaps_zone_remove(zone);
3667		}
3668	} else {
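		/*
		 * Unlink this thread from the zone's circular kthread list,
		 * advancing zone_kthreads if it pointed at us.
		 */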
3669		t->t_forw->t_back = t->t_back;
3670		t->t_back->t_forw = t->t_forw;
3671		if (zone->zone_kthreads == t)
3672			zone->zone_kthreads = t->t_forw;
3673	}
3674	mutex_exit(&zone_status_lock);
3675	zone_rele(zone);
3676	thread_exit();
3677	/* NOTREACHED */
3678}
3679
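/*
 * Swap the process's current or root directory vnode (*vpp) for 'vp',
 * holding the new vnode and releasing the old one.
 */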
3680static void
3681zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
3682{
3683	vnode_t *oldvp;
3684
3685	/* we're going to hold a reference here to the directory */
3686	VN_HOLD(vp);
3687
3688	/* update abs cwd/root path; see c2/audit.c */
3689	if (AU_AUDITING())
3690		audit_chdirec(vp, vpp);
3691
3692	mutex_enter(&pp->p_lock);
3693	oldvp = *vpp;
3694	*vpp = vp;
3695	mutex_exit(&pp->p_lock);
3696	if (oldvp != NULL)
3697		VN_RELE(oldvp);
3698}
3699
3700/*
3701 * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
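 * The nvlist must contain "privilege", "limit" and "action" pairs (each a
 * uint64); unknown names or missing pairs result in EINVAL.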
3702 */
3703static int
3704nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
3705{
3706	nvpair_t *nvp = NULL;
3707	boolean_t priv_set = B_FALSE;
3708	boolean_t limit_set = B_FALSE;
3709	boolean_t action_set = B_FALSE;
3710
3711	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3712		const char *name;
3713		uint64_t ui64;
3714
3715		name = nvpair_name(nvp);
3716		if (nvpair_type(nvp) != DATA_TYPE_UINT64)
3717			return (EINVAL);
3718		(void) nvpair_value_uint64(nvp, &ui64);
3719		if (strcmp(name, "privilege") == 0) {
3720			/*
3721			 * Currently only privileged values are allowed, but
3722			 * this may change in the future.
3723			 */
3724			if (ui64 != RCPRIV_PRIVILEGED)
3725				return (EINVAL);
3726			rv->rcv_privilege = ui64;
3727			priv_set = B_TRUE;
3728		} else if (strcmp(name, "limit") == 0) {
3729			rv->rcv_value = ui64;
3730			limit_set = B_TRUE;
3731		} else if (strcmp(name, "action") == 0) {
3732			if (ui64 != RCTL_LOCAL_NOACTION &&
3733			    ui64 != RCTL_LOCAL_DENY)
3734				return (EINVAL);
3735			rv->rcv_flagaction = ui64;
3736			action_set = B_TRUE;
3737		} else {
3738			return (EINVAL);
3739		}
3740	}
3741
3742	if (!(priv_set && limit_set && action_set))
3743		return (EINVAL);
3744	rv->rcv_action_signal = 0;
3745	rv->rcv_action_recipient = NULL;
3746	rv->rcv_action_recip_pid = -1;
3747	rv->rcv_firing_time = 0;
3748
3749	return (0);
3750}
3751
3752/*
3753 * Non-global zone version of start_init.
3754 */
3755void
3756zone_start_init(void)
3757{
3758	proc_t *p = ttoproc(curthread);
3759	zone_t *z = p->p_zone;
3760
3761	ASSERT(!INGLOBALZONE(curproc));
3762
3763	/*
3764	 * For all purposes (ZONE_ATTR_INITPID and restart_init),
3765	 * storing just the pid of init is sufficient.
3766	 */
3767	z->zone_proc_initpid = p->p_pid;
3768
3769	/*
3770	 * We maintain zone_boot_err so that we can return the cause of the
3771	 * failure back to the caller of the zone_boot syscall.
3772	 */
3773	p->p_zone->zone_boot_err = start_init_common();
3774
3775	/*
3776	 * We will prevent booting zones from becoming running zones if the
3777	 * global zone is shutting down.
3778	 */
3779	mutex_enter(&zone_status_lock);
3780	if (z->zone_boot_err != 0 || zone_status_get(global_zone) >=
3781	    ZONE_IS_SHUTTING_DOWN) {
3782		/*
3783		 * Make sure we are still in the booting state-- we could have
3784		 * raced and already be shutting down, or even further along.
3785		 */
3786		if (zone_status_get(z) == ZONE_IS_BOOTING) {
3787			zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
3788		}
3789		mutex_exit(&zone_status_lock);
3790		/* It's gone bad, dispose of the process */
3791		if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
3792			mutex_enter(&p->p_lock);
3793			ASSERT(p->p_flag & SEXITLWPS);
3794			lwp_exit();
3795		}
3796	} else {
3797		if (zone_status_get(z) == ZONE_IS_BOOTING)
3798			zone_status_set(z, ZONE_IS_RUNNING);
3799		mutex_exit(&zone_status_lock);
3800		/* cause the process to return to userland. */
3801		lwp_rtt();
3802	}
3803}
3804
3805struct zsched_arg {
3806	zone_t *zone;
3807	nvlist_t *nvlist;
3808};
3809
3810/*
3811 * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
3812 * anything to do with scheduling, but rather with the fact that
3813 * per-zone kernel threads are parented to zsched, just like regular
3814 * kernel threads are parented to sched (p0).
3815 *
3816 * zsched is also responsible for launching init for the zone.
3817 */
3818static void
3819zsched(void *arg)
3820{
3821	struct zsched_arg *za = arg;
3822	proc_t *pp = curproc;
3823	proc_t *initp = proc_init;
3824	zone_t *zone = za->zone;
3825	cred_t *cr, *oldcred;
3826	rctl_set_t *set;
3827	rctl_alloc_gp_t *gp;
3828	contract_t *ct = NULL;
3829	task_t *tk, *oldtk;
3830	rctl_entity_p_t e;
3831	kproject_t *pj;
3832
3833	nvlist_t *nvl = za->nvlist;
3834	nvpair_t *nvp = NULL;
3835
3836	bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
3837	bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
3838	PTOU(pp)->u_argc = 0;
3839	PTOU(pp)->u_argv = 0;
3840	PTOU(pp)->u_envp = 0;
3841	PTOU(pp)->u_commpagep = 0;
3842	closeall(P_FINFO(pp));
3843
3844	/*
3845	 * We are this zone's "zsched" process.  As the zone isn't generally
3846	 * visible yet we don't need to grab any locks before initializing its
3847	 * zone_proc pointer.
3848	 */
3849	zone_hold(zone);  /* this hold is released by zone_destroy() */
3850	zone->zone_zsched = pp;
3851	mutex_enter(&pp->p_lock);
3852	pp->p_zone = zone;
3853	mutex_exit(&pp->p_lock);
3854
3855	/*
3856	 * Disassociate process from its 'parent'; parent ourselves to init
3857	 * (pid 1) and change other values as needed.
3858	 */
3859	sess_create();
3860
3861	mutex_enter(&pidlock);
3862	proc_detach(pp);
3863	pp->p_ppid = 1;
3864	pp->p_flag |= SZONETOP;
3865	pp->p_ancpid = 1;
3866	pp->p_parent = initp;
3867	pp->p_psibling = NULL;
3868	if (initp->p_child)
3869		initp->p_child->p_psibling = pp;
3870	pp->p_sibling = initp->p_child;
3871	initp->p_child = pp;
3872
3873	/* Decrement what newproc() incremented. */
3874	upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
3875	/*
3876	 * Our credentials are about to become kcred-like, so we don't care
3877	 * about the caller's ruid.
3878	 */
3879	upcount_inc(crgetruid(kcred), zone->zone_id);
3880	mutex_exit(&pidlock);
3881
3882	/*
3883	 * getting out of global zone, so decrement lwp and process counts
3884	 */
3885	pj = pp->p_task->tk_proj;
3886	mutex_enter(&global_zone->zone_nlwps_lock);
3887	pj->kpj_nlwps -= pp->p_lwpcnt;
3888	global_zone->zone_nlwps -= pp->p_lwpcnt;
3889	pj->kpj_nprocs--;
3890	global_zone->zone_nprocs--;
3891	mutex_exit(&global_zone->zone_nlwps_lock);
3892
3893	/*
3894	 * Decrement locked memory counts on old zone and project.
3895	 */
3896	mutex_enter(&global_zone->zone_mem_lock);
3897	global_zone->zone_locked_mem -= pp->p_locked_mem;
3898	pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
3899	mutex_exit(&global_zone->zone_mem_lock);
3900
3901	/*
3902	 * Create and join a new task in project '0' of this zone.
3903	 *
3904	 * We don't need to call holdlwps() since we know we're the only lwp in
3905	 * this process.
3906	 *
3907	 * task_join() returns with p_lock held.
3908	 */
3909	tk = task_create(0, zone);
3910	mutex_enter(&cpu_lock);
3911	oldtk = task_join(tk, 0);
3912
3913	pj = pp->p_task->tk_proj;
3914
3915	mutex_enter(&zone->zone_mem_lock);
3916	zone->zone_locked_mem += pp->p_locked_mem;
3917	pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
3918	mutex_exit(&zone->zone_mem_lock);
3919
3920	/*
3921	 * add lwp and process counts to zsched's zone, and increment
3922	 * project's task and process count due to the task created in
3923	 * the above task_create.
3924	 */
3925	mutex_enter(&zone->zone_nlwps_lock);
3926	pj->kpj_nlwps += pp->p_lwpcnt;
3927	pj->kpj_ntasks += 1;
3928	zone->zone_nlwps += pp->p_lwpcnt;
3929	pj->kpj_nprocs++;
3930	zone->zone_nprocs++;
3931	mutex_exit(&zone->zone_nlwps_lock);
3932
3933	mutex_exit(&curproc->p_lock);
3934	mutex_exit(&cpu_lock);
3935	task_rele(oldtk);
3936
3937	/*
3938	 * The process was created by a process in the global zone, hence the
3939	 * credentials are wrong.  We might as well have kcred-ish credentials.
3940	 */
3941	cr = zone->zone_kcred;
3942	crhold(cr);
3943	mutex_enter(&pp->p_crlock);
3944	oldcred = pp->p_cred;
3945	pp->p_cred = cr;
3946	mutex_exit(&pp->p_crlock);
3947	crfree(oldcred);
3948
3949	/*
3950	 * Hold credentials again (for thread)
3951	 */
3952	crhold(cr);
3953
3954	/*
3955	 * p_lwpcnt can't change since this is a kernel process.
3956	 */
3957	crset(pp, cr);
3958
3959	/*
3960	 * Chroot
3961	 */
3962	zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
3963	zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);
3964
3965	/*
3966	 * Initialize zone's rctl set.
3967	 */
3968	set = rctl_set_create();
3969	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
3970	mutex_enter(&pp->p_lock);
3971	e.rcep_p.zone = zone;
3972	e.rcep_t = RCENTITY_ZONE;
3973	zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
3974	mutex_exit(&pp->p_lock);
3975	rctl_prealloc_destroy(gp);
3976
3977	/*
3978	 * Apply the rctls passed in to zone_create().  This is basically a list
3979	 * assignment: all of the old values are removed and the new ones
3980	 * inserted.  That is, if an empty list is passed in, all values are
3981	 * removed.
3982	 */
3983	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3984		rctl_dict_entry_t *rde;
3985		rctl_hndl_t hndl;
3986		char *name;
3987		nvlist_t **nvlarray;
3988		uint_t i, nelem;
3989		int error;	/* For ASSERT()s */
3990
3991		name = nvpair_name(nvp);
3992		hndl = rctl_hndl_lookup(name);
3993		ASSERT(hndl != -1);
3994		rde = rctl_dict_lookup_hndl(hndl);
3995		ASSERT(rde != NULL);
3996
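		/*
		 * Strip any existing (non-system) values for this control so
		 * that the caller-supplied list fully replaces them.
		 */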
3997		for (; /* ever */; ) {
3998			rctl_val_t oval;
3999
4000			mutex_enter(&pp->p_lock);
4001			error = rctl_local_get(hndl, NULL, &oval, pp);
4002			mutex_exit(&pp->p_lock);
4003			ASSERT(error == 0);	/* Can't fail for RCTL_FIRST */
4004			ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
4005			if (oval.rcv_privilege == RCPRIV_SYSTEM)
4006				break;
4007			mutex_enter(&pp->p_lock);
4008			error = rctl_local_delete(hndl, &oval, pp);
4009			mutex_exit(&pp->p_lock);
4010			ASSERT(error == 0);
4011		}
4012		error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4013		ASSERT(error == 0);
4014		for (i = 0; i < nelem; i++) {
4015			rctl_val_t *nvalp;
4016
4017			nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
4018			error = nvlist2rctlval(nvlarray[i], nvalp);
4019			ASSERT(error == 0);
4020			/*
4021			 * rctl_local_insert can fail if the value being
4022			 * inserted is a duplicate; this is OK.
4023			 */
4024			mutex_enter(&pp->p_lock);
4025			if (rctl_local_insert(hndl, nvalp, pp) != 0)
4026				kmem_cache_free(rctl_val_cache, nvalp);
4027			mutex_exit(&pp->p_lock);
4028		}
4029	}
4030
4031	/*
4032	 * Tell the world that we're done setting up.
4033	 *
4034	 * At this point we want to set the zone status to ZONE_IS_INITIALIZED
4035	 * and atomically set the zone's processor set visibility.  Once
4036	 * we drop pool_lock() this zone will automatically get updated
4037	 * to reflect any future changes to the pools configuration.
4038	 *
4039	 * Note that after we drop the locks below (zonehash_lock in
4040	 * particular) other operations such as a zone_getattr call can
4041	 * now proceed and observe the zone. That is the reason for doing a
4042	 * state transition to the INITIALIZED state.
4043	 */
4044	pool_lock();
4045	mutex_enter(&cpu_lock);
4046	mutex_enter(&zonehash_lock);
4047	zone_uniqid(zone);
4048	zone_zsd_configure(zone);
4049	if (pool_state == POOL_ENABLED)
4050		zone_pset_set(zone, pool_default->pool_pset->pset_id);
4051	mutex_enter(&zone_status_lock);
4052	ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
4053	zone_status_set(zone, ZONE_IS_INITIALIZED);
4054	mutex_exit(&zone_status_lock);
4055	mutex_exit(&zonehash_lock);
4056	mutex_exit(&cpu_lock);
4057	pool_unlock();
4058
4059	/* Now call the create callback for this key */
4060	zsd_apply_all_keys(zsd_apply_create, zone);
4061
4062	/* The callbacks are complete. Mark ZONE_IS_READY */
4063	mutex_enter(&zone_status_lock);
4064	ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
4065	zone_status_set(zone, ZONE_IS_READY);
4066	mutex_exit(&zone_status_lock);
4067
4068	/*
4069	 * Once we see the zone transition to the ZONE_IS_BOOTING state,
4070	 * we launch init, and set the state to running.
4071	 */
4072	zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");
4073
4074	if (zone_status_get(zone) == ZONE_IS_BOOTING) {
4075		id_t cid;
4076
4077		/*
4078		 * Ok, this is a little complicated.  We need to grab the
4079		 * zone's pool's scheduling class ID; note that by now, we
4080		 * are already bound to a pool if we need to be (zoneadmd
4081		 * will have done that to us while we're in the READY
4082		 * state).  *But* the scheduling class for the zone's 'init'
4083		 * must be explicitly passed to newproc, which doesn't
4084		 * respect pool bindings.
4085		 *
4086		 * We hold the pool_lock across the call to newproc() to
4087		 * close the obvious race: the pool's scheduling class
4088		 * could change before we manage to create the LWP with
4089		 * classid 'cid'.
4090		 */
4091		pool_lock();
4092		if (zone->zone_defaultcid > 0)
4093			cid = zone->zone_defaultcid;
4094		else
4095			cid = pool_get_class(zone->zone_pool);
4096		if (cid == -1)
4097			cid = defaultcid;
4098
4099		/*
4100		 * If this fails, zone_boot will ultimately fail.  The
4101		 * state of the zone will be set to SHUTTING_DOWN-- userland
4102		 * will have to tear down the zone, and fail, or try again.
4103		 */
4104		if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
4105		    minclsyspri - 1, &ct, 0)) != 0) {
4106			mutex_enter(&zone_status_lock);
4107			zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4108			mutex_exit(&zone_status_lock);
4109		} else {
4110			zone->zone_boot_time = gethrestime_sec();
4111		}
4112
4113		pool_unlock();
4114	}
4115
4116	/*
4117	 * Wait for zone_destroy() to be called.  This is what we spend
4118	 * most of our life doing.
4119	 */
4120	zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");
4121
4122	if (ct)
4123		/*
4124		 * At this point the process contract should be empty.
4125		 * (Though if it isn't, it's not the end of the world.)
4126		 */
4127		VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);
4128
4129	/*
4130	 * Allow kcred to be freed when all referring processes
4131	 * (including this one) go away.  We can't just do this in
4132	 * zone_free because we need to wait for the zone_cred_ref to
4133	 * drop to 0 before calling zone_free, and the existence of
4134	 * zone_kcred will prevent that.  Thus, we call crfree here to
4135	 * balance the crdup in zone_create.  The crhold calls earlier
4136	 * in zsched will be dropped when the thread and process exit.
4137	 */
4138	crfree(zone->zone_kcred);
4139	zone->zone_kcred = NULL;
4140
4141	exit(CLD_EXITED, 0);
4142}
4143
4144/*
4145 * Helper function to determine if there are any submounts of the
4146 * provided path.  Used to make sure the zone doesn't "inherit" any
4147 * mounts from before it is created.
4148 */
4149static uint_t
4150zone_mount_count(const char *rootpath)
4151{
4152	vfs_t *vfsp;
4153	uint_t count = 0;
4154	size_t rootpathlen = strlen(rootpath);
4155
4156	/*
4157	 * Holding zonehash_lock prevents race conditions with
4158	 * vfs_list_add()/vfs_list_remove() since we serialize with
4159	 * zone_find_by_path().
4160	 */
4161	ASSERT(MUTEX_HELD(&zonehash_lock));
4162	/*
4163	 * The rootpath must end with a '/'
4164	 */
4165	ASSERT(rootpath[rootpathlen - 1] == '/');
4166
4167	/*
4168	 * This intentionally does not count the rootpath itself if that
4169	 * happens to be a mount point.
4170	 */
4171	vfs_list_read_lock();
4172	vfsp = rootvfs;
4173	do {
4174		if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
4175		    rootpathlen) == 0)
4176			count++;
4177		vfsp = vfsp->vfs_next;
4178	} while (vfsp != rootvfs);
4179	vfs_list_unlock();
4180	return (count);
4181}
4182
4183/*
4184 * Helper function to make sure that a zone created on 'rootpath'
4185 * wouldn't end up containing other zones' rootpaths.
4186 */
4187static boolean_t
4188zone_is_nested(const char *rootpath)
4189{
4190	zone_t *zone;
4191	size_t rootpathlen = strlen(rootpath);
4192	size_t len;
4193
4194	ASSERT(MUTEX_HELD(&zonehash_lock));
4195
4196	/*
4197	 * zone_set_root() appended '/' and '\0' at the end of rootpath
4198	 */
4199	if ((rootpathlen <= 3) && (rootpath[0] == '/') &&
4200	    (rootpath[1] == '/') && (rootpath[2] == '\0'))
4201		return (B_TRUE);
4202
4203	for (zone = list_head(&zone_active); zone != NULL;
4204	    zone = list_next(&zone_active, zone)) {
4205		if (zone == global_zone)
4206			continue;
4207		len = strlen(zone->zone_rootpath);
4208		if (strncmp(rootpath, zone->zone_rootpath,
4209		    MIN(rootpathlen, len)) == 0)
4210			return (B_TRUE);
4211	}
4212	return (B_FALSE);
4213}
4214
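/*
 * Copy in the zone-wide privilege limit set supplied by the zone_create()
 * caller and attach it to the zone.
 */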
4215static int
4216zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
4217    size_t zone_privssz)
4218{
4219	priv_set_t *privs;
4220
4221	if (zone_privssz < sizeof (priv_set_t))
4222		return (ENOMEM);
4223
4224	privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
4225
4226	if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
4227		kmem_free(privs, sizeof (priv_set_t));
4228		return (EFAULT);
4229	}
4230
4231	zone->zone_privset = privs;
4232	return (0);
4233}
4234
4235/*
4236 * We make creative use of nvlists to pass in rctls from userland.  The list is
4237 * a list of the following structures:
4238 *
4239 * (name = rctl_name, value = nvpair_list_array)
4240 *
4241 * Where each element of the nvpair_list_array is of the form:
4242 *
4243 * [(name = "privilege", value = RCPRIV_PRIVILEGED),
4244 *	(name = "limit", value = uint64_t),
4245 *	(name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
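 *
 * For example (illustrative only), a single privileged "zone.max-lwps"
 * rctl with a deny limit of 1000 would be passed as:
 *
 * (name = "zone.max-lwps", value = [
 *	[(name = "privilege", value = RCPRIV_PRIVILEGED),
 *	 (name = "limit", value = 1000),
 *	 (name = "action", value = RCTL_LOCAL_DENY)]
 * ])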
4246 */
4247static int
4248parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
4249{
4250	nvpair_t *nvp = NULL;
4251	nvlist_t *nvl = NULL;
4252	char *kbuf;
4253	int error;
4254	rctl_val_t rv;
4255
4256	*nvlp = NULL;
4257
4258	if (buflen == 0)
4259		return (0);
4260
4261	if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4262		return (ENOMEM);
4263	if (copyin(ubuf, kbuf, buflen)) {
4264		error = EFAULT;
4265		goto out;
4266	}
4267	if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
4268		/*
4269		 * nvlist_unpack() may have allocated and freed nvl but left the
4270		 * pointer set non-NULL, so we reset it here.
4271		 */
4272		nvl = NULL;
4273		error = EINVAL;
4274		goto out;
4275	}
4276	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4277		rctl_dict_entry_t *rde;
4278		rctl_hndl_t hndl;
4279		nvlist_t **nvlarray;
4280		uint_t i, nelem;
4281		char *name;
4282
4283		error = EINVAL;
4284		name = nvpair_name(nvp);
4285		if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
4286		    != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
4287			goto out;
4288		}
4289		if ((hndl = rctl_hndl_lookup(name)) == -1) {
4290			goto out;
4291		}
4292		rde = rctl_dict_lookup_hndl(hndl);
4293		error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4294		ASSERT(error == 0);
4295		for (i = 0; i < nelem; i++) {
4296			if (error = nvlist2rctlval(nvlarray[i], &rv))
4297				goto out;
4298		}
4299		if (rctl_invalid_value(rde, &rv)) {
4300			error = EINVAL;
4301			goto out;
4302		}
4303	}
4304	error = 0;
4305	*nvlp = nvl;
4306out:
4307	kmem_free(kbuf, buflen);
4308	if (error && nvl != NULL)
4309		nvlist_free(nvl);
4310	return (error);
4311}
4312
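/*
 * Helper for returning an error from zone_create(): optionally copies the
 * extended error code out to the caller-supplied address, then sets errno.
 */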
4313int
4314zone_create_error(int er_error, int er_ext, int *er_out)
4315{
4316	if (er_out != NULL) {
4317		if (copyout(&er_ext, er_out, sizeof (int))) {
4318			return (set_errno(EFAULT));
4319		}
4320	}
4321	return (set_errno(er_error));
4322}
4323
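/*
 * Copy in the sensitivity label supplied by the zone_create() caller and
 * attach it, with the given DOI, to the zone.
 */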
4324static int
4325zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
4326{
4327	ts_label_t *tsl;
4328	bslabel_t blab;
4329
4330	/* Get label from user */
4331	if (copyin(lab, &blab, sizeof (blab)) != 0)
4332		return (EFAULT);
4333	tsl = labelalloc(&blab, doi, KM_NOSLEEP);
4334	if (tsl == NULL)
4335		return (ENOMEM);
4336
4337	zone->zone_slabel = tsl;
4338	return (0);
4339}
4340
4341/*
4342 * Parses a comma-separated list of ZFS datasets into the zone's dataset list.
4343 */
4344static int
4345parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
4346{
4347	char *kbuf;
4348	char *dataset, *next;
4349	zone_dataset_t *zd;
4350	size_t len;
4351
4352	if (ubuf == NULL || buflen == 0)
4353		return (0);
4354
4355	if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4356		return (ENOMEM);
4357
4358	if (copyin(ubuf, kbuf, buflen) != 0) {
4359		kmem_free(kbuf, buflen);
4360		return (EFAULT);
4361	}
4362
4363	dataset = next = kbuf;
4364	for (;;) {
4365		zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
4366
4367		next = strchr(dataset, ',');
4368
4369		if (next == NULL)
4370			len = strlen(dataset);
4371		else
4372			len = next - dataset;
4373
4374		zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
4375		bcopy(dataset, zd->zd_dataset, len);
4376		zd->zd_dataset[len] = '\0';
4377
4378		list_insert_head(&zone->zone_datasets, zd);
4379
4380		if (next == NULL)
4381			break;
4382
4383		dataset = next + 1;
4384	}
4385
4386	kmem_free(kbuf, buflen);
4387	return (0);
4388}
4389
4390/*
4391 * System call to create/initialize a new zone named 'zone_name', rooted
4392 * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
4393 * and initialized with the zone-wide rctls described in 'rctlbuf', and
4394 * with labeling set by 'match', 'doi', and 'label'.
4395 *
4396 * If extended error is non-null, we may use it to return more detailed
4397 * error information.
4398 */
4399static zoneid_t
4400zone_create(const char *zone_name, const char *zone_root,
4401    const priv_set_t *zone_privs, size_t zone_privssz,
4402    caddr_t rctlbuf, size_t rctlbufsz,
4403    caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
4404    int match, uint32_t doi, const bslabel_t *label,
4405    int flags)
4406{
4407	struct zsched_arg zarg;
4408	nvlist_t *rctls = NULL;
4409	proc_t *pp = curproc;
4410	zone_t *zone, *ztmp;
4411	zoneid_t zoneid, start = GLOBAL_ZONEID;
4412	int error;
4413	int error2 = 0;
4414	char *str;
4415	cred_t *zkcr;
4416	boolean_t insert_label_hash;
4417
4418	if (secpolicy_zone_config(CRED()) != 0)
4419		return (set_errno(EPERM));
4420
4421	/* can't boot zone from within chroot environment */
4422	if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
4423		return (zone_create_error(ENOTSUP, ZE_CHROOTED,
4424		    extended_error));
4425	/*
4426	 * As the first step of zone creation, we want to allocate a zoneid.
4427	 * This allocation is complicated by the fact that netstacks use the
4428	 * zoneid to determine their stackid, but netstacks themselves are
4429	 * freed asynchronously with respect to zone destruction.  This means
4430	 * that a netstack reference leak (or in principle, an extraordinarily
4431	 * long netstack reference hold) could result in a zoneid being
4432	 * allocated that in fact corresponds to a stackid from an active
4433	 * (referenced) netstack -- unleashing all sorts of havoc when that
4434	 * netstack is actually (re)used.  (In the abstract, we might wish a
4435	 * zoneid to not be deallocated until its last referencing netstack
4436	 * has been released, but netstacks lack a backpointer into their
4437	 * referencing zone -- and changing them to have such a pointer would
4438	 * be substantial, to put it euphemistically.)  To avoid this, we
4439	 * detect this condition on allocation: if we have allocated a zoneid
4440	 * that corresponds to a netstack that's still in use, we warn about
4441	 * it (as it is much more likely to be a reference leak than an actual
4442	 * netstack reference), free it, and allocate another.  That these
4443	 * identifiers are allocated out of an ID space ensures that we won't
4444	 * see the identifier we just allocated.
4445	 */
4446	for (;;) {
4447		zoneid = id_alloc(zoneid_space);
4448
4449		if (!netstack_inuse_by_stackid(zoneid_to_netstackid(zoneid)))
4450			break;
4451
4452		id_free(zoneid_space, zoneid);
4453
4454		if (start == GLOBAL_ZONEID) {
4455			start = zoneid;
4456		} else if (zoneid == start) {
4457			/*
4458			 * We have managed to iterate over the entire available
4459			 * zoneid space -- there are no identifiers available,
4460			 * presumably due to some number of leaked netstack
4461			 * references.  While it's in principle possible for us
4462			 * to continue to try, it seems wiser to give up at
4463			 * this point to warn and fail explicitly with a
4464			 * distinctive error.
4465			 */
4466			cmn_err(CE_WARN, "zone_create() failed: all available "
4467			    "zone IDs have netstacks still in use");
4468			return (set_errno(ENFILE));
4469		}
4470
4471		cmn_err(CE_WARN, "unable to reuse zone ID %d; "
4472		    "netstack still in use", zoneid);
4473	}
4474
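	/*
	 * Allocate the new zone and give its attributes provisional
	 * defaults; many of these are refined below or once zsched() is
	 * running.
	 */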
4475	zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
4476	zone->zone_id = zoneid;
4477	zone->zone_status = ZONE_IS_UNINITIALIZED;
4478	zone->zone_pool = pool_default;
4479	zone->zone_pool_mod = gethrtime();
4480	zone->zone_psetid = ZONE_PS_INVAL;
4481	zone->zone_ncpus = 0;
4482	zone->zone_ncpus_online = 0;
4483	zone->zone_restart_init = B_TRUE;
4484	zone->zone_brand = &native_brand;
4485	zone->zone_initname = NULL;
4486	mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
4487	mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
4488	mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
4489	cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
4490	list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
4491	    offsetof(zone_ref_t, zref_linkage));
4492	list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
4493	    offsetof(struct zsd_entry, zsd_linkage));
4494	list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
4495	    offsetof(zone_dataset_t, zd_linkage));
4496	list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
4497	    offsetof(zone_dl_t, zdl_linkage));
4498	rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
4499	rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
4500
4501	if (flags & ZCF_NET_EXCL) {
4502		zone->zone_flags |= ZF_NET_EXCL;
4503	}
4504
4505	if ((error = zone_set_name(zone, zone_name)) != 0) {
4506		zone_free(zone);
4507		return (zone_create_error(error, 0, extended_error));
4508	}
4509
4510	if ((error = zone_set_root(zone, zone_root)) != 0) {
4511		zone_free(zone);
4512		return (zone_create_error(error, 0, extended_error));
4513	}
4514	if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
4515		zone_free(zone);
4516		return (zone_create_error(error, 0, extended_error));
4517	}
4518
4519	/* initialize node name to be the same as zone name */
4520	zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4521	(void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
4522	zone->zone_nodename[_SYS_NMLN - 1] = '\0';
4523
4524	zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4525	zone->zone_domain[0] = '\0';
4526	zone->zone_hostid = HW_INVALID_HOSTID;
4527	zone->zone_shares = 1;
4528	zone->zone_shmmax = 0;
4529	zone->zone_ipc.ipcq_shmmni = 0;
4530	zone->zone_ipc.ipcq_semmni = 0;
4531	zone->zone_ipc.ipcq_msgmni = 0;
4532	zone->zone_bootargs = NULL;
4533	zone->zone_fs_allowed = NULL;
4534
4535	psecflags_default(&zone->zone_secflags);
4536
4537	zone->zone_initname =
4538	    kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
4539	(void) strcpy(zone->zone_initname, zone_default_initname);
4540	zone->zone_nlwps = 0;
4541	zone->zone_nlwps_ctl = INT_MAX;
4542	zone->zone_nprocs = 0;
4543	zone->zone_nprocs_ctl = INT_MAX;
4544	zone->zone_locked_mem = 0;
4545	zone->zone_locked_mem_ctl = UINT64_MAX;
4546	zone->zone_max_swap = 0;
4547	zone->zone_max_swap_ctl = UINT64_MAX;
4548	zone->zone_max_lofi = 0;
4549	zone->zone_max_lofi_ctl = UINT64_MAX;
4550	zone0.zone_lockedmem_kstat = NULL;
4551	zone0.zone_swapresv_kstat = NULL;
4552
4553	zone->zone_ustate = cpu_uarray_zalloc(ZONE_USTATE_MAX, KM_SLEEP);
4554
4555	/*
4556	 * Zsched initializes the rctls.
4557	 */
4558	zone->zone_rctls = NULL;
4559
4560	if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
4561		zone_free(zone);
4562		return (zone_create_error(error, 0, extended_error));
4563	}
4564
4565	if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
4566		zone_free(zone);
4567		return (set_errno(error));
4568	}
4569
4570	/*
4571	 * Read in the trusted system parameters:
4572	 * match flag and sensitivity label.
4573	 */
4574	zone->zone_match = match;
4575	if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4576		/* Fail if requested to set doi to anything but system's doi */
4577		if (doi != 0 && doi != default_doi) {
4578			zone_free(zone);
4579			return (set_errno(EINVAL));
4580		}
4581		/* Always apply system's doi to the zone */
4582		error = zone_set_label(zone, label, default_doi);
4583		if (error != 0) {
4584			zone_free(zone);
4585			return (set_errno(error));
4586		}
4587		insert_label_hash = B_TRUE;
4588	} else {
4589		/* all zones get an admin_low label if system is not labeled */
4590		zone->zone_slabel = l_admin_low;
4591		label_hold(l_admin_low);
4592		insert_label_hash = B_FALSE;
4593	}
4594
4595	/*
4596	 * Stop all lwps since that's what normally happens as part of fork().
4597	 * This needs to happen before we grab any locks to avoid deadlock
4598	 * (another lwp in the process could be waiting for the held lock).
4599	 */
4600	if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
4601		zone_free(zone);
4602		nvlist_free(rctls);
4603		return (zone_create_error(error, 0, extended_error));
4604	}
4605
4606	if (block_mounts(zone) == 0) {
4607		mutex_enter(&pp->p_lock);
4608		if (curthread != pp->p_agenttp)
4609			continuelwps(pp);
4610		mutex_exit(&pp->p_lock);
4611		zone_free(zone);
4612		nvlist_free(rctls);
4613		return (zone_create_error(error, 0, extended_error));
4614	}
4615
4616	/*
4617	 * Set up credential for kernel access.  After this, any errors
4618	 * should go through the dance in errout rather than calling
4619	 * zone_free directly.
4620	 */
4621	zone->zone_kcred = crdup(kcred);
4622	crsetzone(zone->zone_kcred, zone);
4623	priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
4624	priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
4625	priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
4626	priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
4627
4628	mutex_enter(&zonehash_lock);
4629	/*
4630	 * Make sure zone doesn't already exist.
4631	 *
4632	 * If the system and zone are labeled,
4633	 * make sure no other zone exists that has the same label.
4634	 */
4635	if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
4636	    (insert_label_hash &&
4637	    (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
4638		zone_status_t status;
4639
4640		status = zone_status_get(ztmp);
4641		if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
4642			error = EEXIST;
4643		else
4644			error = EBUSY;
4645
4646		if (insert_label_hash)
4647			error2 = ZE_LABELINUSE;
4648
4649		goto errout;
4650	}
4651
4652	/*
4653	 * Don't allow zone creations which would cause one zone's rootpath to
4654	 * be accessible from that of another (non-global) zone.
4655	 */
4656	if (zone_is_nested(zone->zone_rootpath)) {
4657		error = EBUSY;
4658		goto errout;
4659	}
4660
4661	ASSERT(zonecount != 0);		/* check for leaks */
4662	if (zonecount + 1 > maxzones) {
4663		error = ENOMEM;
4664		goto errout;
4665	}
4666
4667	if (zone_mount_count(zone->zone_rootpath) != 0) {
4668		error = EBUSY;
4669		error2 = ZE_AREMOUNTS;
4670		goto errout;
4671	}
4672
4673	/*
4674	 * Zone is still incomplete, but we need to drop all locks while
4675	 * zsched() initializes this zone's kernel process.  We
4676	 * optimistically add the zone to the hashtable and associated
4677	 * lists so a parallel zone_create() doesn't try to create the
4678	 * same zone.
4679	 */
4680	zonecount++;
4681	(void) mod_hash_insert(zonehashbyid,
4682	    (mod_hash_key_t)(uintptr_t)zone->zone_id,
4683	    (mod_hash_val_t)(uintptr_t)zone);
4684	str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
4685	(void) strcpy(str, zone->zone_name);
4686	(void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
4687	    (mod_hash_val_t)(uintptr_t)zone);
4688	if (insert_label_hash) {
4689		(void) mod_hash_insert(zonehashbylabel,
4690		    (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
4691		zone->zone_flags |= ZF_HASHED_LABEL;
4692	}
4693
4694	/*
4695	 * Insert into active list.  At this point there are no 'hold's
4696	 * on the zone, but everyone else knows not to use it, so we can
4697	 * continue to use it.  zsched() will do a zone_hold() if the
4698	 * newproc() is successful.
4699	 */
4700	list_insert_tail(&zone_active, zone);
4701	mutex_exit(&zonehash_lock);
4702
4703	zarg.zone = zone;
4704	zarg.nvlist = rctls;
4705	/*
4706	 * The process, task, and project rctls are probably wrong;
4707	 * we need an interface to get the default values of all rctls,
4708	 * and initialize zsched appropriately.  I'm not sure that that
4709	 * makes much of a difference, though.
4710	 */
4711	error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
4712	if (error != 0) {
4713		/*
4714		 * We need to undo all globally visible state.
4715		 */
4716		mutex_enter(&zonehash_lock);
4717		list_remove(&zone_active, zone);
4718		if (zone->zone_flags & ZF_HASHED_LABEL) {
4719			ASSERT(zone->zone_slabel != NULL);
4720			(void) mod_hash_destroy(zonehashbylabel,
4721			    (mod_hash_key_t)zone->zone_slabel);
4722		}
4723		(void) mod_hash_destroy(zonehashbyname,
4724		    (mod_hash_key_t)(uintptr_t)zone->zone_name);
4725		(void) mod_hash_destroy(zonehashbyid,
4726		    (mod_hash_key_t)(uintptr_t)zone->zone_id);
4727		ASSERT(zonecount > 1);
4728		zonecount--;
4729		goto errout;
4730	}
4731
4732	/*
4733	 * Zone creation can't fail from now on.
4734	 */
4735
4736	/*
4737	 * Create zone kstats
4738	 */
4739	zone_kstat_create(zone);
4740
4741	/*
4742	 * Let the other lwps continue.
4743	 */
4744	mutex_enter(&pp->p_lock);
4745	if (curthread != pp->p_agenttp)
4746		continuelwps(pp);
4747	mutex_exit(&pp->p_lock);
4748
4749	/*
4750	 * Wait for zsched to finish initializing the zone.
4751	 */
4752	zone_status_wait(zone, ZONE_IS_READY);
4753	/*
4754	 * The zone is fully visible, so we can let mounts progress.
4755	 */
4756	resume_mounts(zone);
4757	nvlist_free(rctls);
4758
4759	return (zoneid);
4760
4761errout:
4762	mutex_exit(&zonehash_lock);
4763	/*
4764	 * Let the other lwps continue.
4765	 */
4766	mutex_enter(&pp->p_lock);
4767	if (curthread != pp->p_agenttp)
4768		continuelwps(pp);
4769	mutex_exit(&pp->p_lock);
4770
4771	resume_mounts(zone);
4772	nvlist_free(rctls);
4773	/*
4774	 * There is currently one reference to the zone, a cred_ref from
4775	 * zone_kcred.  To free the zone, we call crfree, which will call
4776	 * zone_cred_rele, which will call zone_free.
4777	 */
4778	ASSERT(zone->zone_cred_ref == 1);
4779	ASSERT(zone->zone_kcred->cr_ref == 1);
4780	ASSERT(zone->zone_ref == 0);
4781	zkcr = zone->zone_kcred;
4782	zone->zone_kcred = NULL;
4783	crfree(zkcr);				/* triggers call to zone_free */
4784	return (zone_create_error(error, error2, extended_error));
4785}
4786
4787/*
4788 * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
4789 * the heavy lifting.  The zone must be in the ZONE_IS_READY state; we mark
4790 * it ZONE_IS_BOOTING and wait for zsched to launch the zone's init (see
4791 * zone_start_init()), returning any boot error via zone_boot_err.
4792 */
4793static int
4794zone_boot(zoneid_t zoneid)
4795{
4796	int err;
4797	zone_t *zone;
4798
4799	if (secpolicy_zone_config(CRED()) != 0)
4800		return (set_errno(EPERM));
4801	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4802		return (set_errno(EINVAL));
4803
4804	mutex_enter(&zonehash_lock);
4805	/*
4806	 * Look for zone under hash lock to prevent races with calls to
4807	 * zone_shutdown, zone_destroy, etc.
4808	 */
4809	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4810		mutex_exit(&zonehash_lock);
4811		return (set_errno(EINVAL));
4812	}
4813
4814	mutex_enter(&zone_status_lock);
4815	if (zone_status_get(zone) != ZONE_IS_READY) {
4816		mutex_exit(&zone_status_lock);
4817		mutex_exit(&zonehash_lock);
4818		return (set_errno(EINVAL));
4819	}
4820	zone_status_set(zone, ZONE_IS_BOOTING);
4821	mutex_exit(&zone_status_lock);
4822
4823	zone_hold(zone);	/* so we can use the zone_t later */
4824	mutex_exit(&zonehash_lock);
4825
4826	if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
4827		zone_rele(zone);
4828		return (set_errno(EINTR));
4829	}
4830
4831	/*
4832	 * Boot (starting init) might have failed, in which case the zone
4833	 * will go to the SHUTTING_DOWN state; an appropriate errno will
4834	 * be placed in zone->zone_boot_err, and so we return that.
4835	 */
4836	err = zone->zone_boot_err;
4837	zone_rele(zone);
4838	return (err ? set_errno(err) : 0);
4839}
4840
4841/*
4842 * Kills all user processes in the zone, waiting for them all to exit
4843 * before returning.
4844 */
4845static int
4846zone_empty(zone_t *zone)
4847{
4848	int waitstatus;
4849
4850	/*
4851	 * We need to drop zonehash_lock before killing all
4852	 * processes, otherwise we'll deadlock with zone_find_*
4853	 * which can be called from the exit path.
4854	 */
4855	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
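	/*
	 * Keep killing the zone's user processes, re-checking roughly once
	 * per second, until the zone is empty or this thread is signaled.
	 */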
4856	while ((waitstatus = zone_status_timedwait_sig(zone,
4857	    ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
4858		killall(zone->zone_id);
4859	}
4860	/*
4861	 * return EINTR if we were signaled
4862	 */
4863	if (waitstatus == 0)
4864		return (EINTR);
4865	return (0);
4866}
4867
4868/*
4869 * This function implements the policy for zone visibility.
4870 *
4871 * In standard Solaris, a non-global zone can only see itself.
4872 *
4873 * In Trusted Extensions, a labeled zone can lookup any zone whose label
4874 * it dominates. For this test, the label of the global zone is treated as
4875 * admin_high so it is special-cased instead of being checked for dominance.
4876 *
4877 * Returns true if zone attributes are viewable, false otherwise.
4878 */
4879static boolean_t
4880zone_list_access(zone_t *zone)
4881{
4882
4883	if (curproc->p_zone == global_zone ||
4884	    curproc->p_zone == zone) {
4885		return (B_TRUE);
4886	} else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4887		bslabel_t *curproc_label;
4888		bslabel_t *zone_label;
4889
4890		curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
4891		zone_label = label2bslabel(zone->zone_slabel);
4892
4893		if (zone->zone_id != GLOBAL_ZONEID &&
4894		    bldominates(curproc_label, zone_label)) {
4895			return (B_TRUE);
4896		} else {
4897			return (B_FALSE);
4898		}
4899	} else {
4900		return (B_FALSE);
4901	}
4902}
4903
4904/*
4905 * Systemcall to start the zone's halt sequence.  By the time this
4906 * function successfully returns, all user processes and kernel threads
4907 * executing in it will have exited, ZSD shutdown callbacks executed,
4908 * and the zone status set to ZONE_IS_DOWN.
4909 *
4910 * It is possible that the call will interrupt itself if the caller is the
4911 * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
4912 */
4913static int
4914zone_shutdown(zoneid_t zoneid)
4915{
4916	int error;
4917	zone_t *zone;
4918	zone_status_t status;
4919
4920	if (secpolicy_zone_config(CRED()) != 0)
4921		return (set_errno(EPERM));
4922	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4923		return (set_errno(EINVAL));
4924
4925	mutex_enter(&zonehash_lock);
4926	/*
4927	 * Look for zone under hash lock to prevent races with other
4928	 * calls to zone_shutdown and zone_destroy.
4929	 */
4930	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4931		mutex_exit(&zonehash_lock);
4932		return (set_errno(EINVAL));
4933	}
4934
4935	/*
4936	 * We have to drop zonehash_lock before calling block_mounts.
4937	 * Hold the zone so we can continue to use the zone_t.
4938	 */
4939	zone_hold(zone);
4940	mutex_exit(&zonehash_lock);
4941
4942	/*
4943	 * Block mounts so that VFS_MOUNT() can get an accurate view of
4944	 * the zone's status with regard to ZONE_IS_SHUTTING_DOWN.
4945	 *
4946	 * e.g. NFS can fail the mount if it determines that the zone
4947	 * has already begun the shutdown sequence.
4948	 *
4949	 */
4950	if (block_mounts(zone) == 0) {
4951		zone_rele(zone);
4952		return (set_errno(EINTR));
4953	}
4954
4955	mutex_enter(&zonehash_lock);
4956	mutex_enter(&zone_status_lock);
4957	status = zone_status_get(zone);
4958	/*
4959	 * Fail if the zone isn't fully initialized yet.
4960	 */
4961	if (status < ZONE_IS_READY) {
4962		mutex_exit(&zone_status_lock);
4963		mutex_exit(&zonehash_lock);
4964		resume_mounts(zone);
4965		zone_rele(zone);
4966		return (set_errno(EINVAL));
4967	}
4968	/*
4969	 * If conditions required for zone_shutdown() to return have been met,
4970	 * return success.
4971	 */
4972	if (status >= ZONE_IS_DOWN) {
4973		mutex_exit(&zone_status_lock);
4974		mutex_exit(&zonehash_lock);
4975		resume_mounts(zone);
4976		zone_rele(zone);
4977		return (0);
4978	}
4979	/*
4980	 * If zone_shutdown() hasn't been called before, go through the motions.
4981	 * If it has, there's nothing to do but wait for the kernel threads to
4982	 * drain.
4983	 */
4984	if (status < ZONE_IS_EMPTY) {
4985		uint_t ntasks;
4986
4987		mutex_enter(&zone->zone_lock);
4988		if ((ntasks = zone->zone_ntasks) != 1) {
4989			/*
4990			 * There's still stuff running.
4991			 */
4992			zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4993		}
4994		mutex_exit(&zone->zone_lock);
4995		if (ntasks == 1) {
4996			/*
4997			 * The only way to create another task is through
4998			 * zone_enter(), which will block until we drop
4999			 * zonehash_lock.  The zone is empty.
5000			 */
5001			if (zone->zone_kthreads == NULL) {
5002				/*
5003				 * Skip ahead to ZONE_IS_DOWN
5004				 */
5005				zone_status_set(zone, ZONE_IS_DOWN);
5006			} else {
5007				zone_status_set(zone, ZONE_IS_EMPTY);
5008			}
5009		}
5010	}
5011	mutex_exit(&zone_status_lock);
5012	mutex_exit(&zonehash_lock);
5013	resume_mounts(zone);
5014
5015	if (error = zone_empty(zone)) {
5016		zone_rele(zone);
5017		return (set_errno(error));
5018	}
5019	/*
5020	 * After the zone status goes to ZONE_IS_DOWN this zone will no
5021	 * longer be notified of changes to the pools configuration, so
5022	 * in order to not end up with a stale pool pointer, we point
5023	 * ourselves at the default pool and remove all resource
5024	 * visibility.  This is especially important as the zone_t may
5025	 * languish on the deathrow for a very long time waiting for
5026	 * cred's to drain out.
5027	 *
5028	 * This rebinding of the zone can happen multiple times
5029	 * (presumably due to interrupted or parallel systemcalls)
5030	 * without any adverse effects.
5031	 */
5032	if (pool_lock_intr() != 0) {
5033		zone_rele(zone);
5034		return (set_errno(EINTR));
5035	}
5036	if (pool_state == POOL_ENABLED) {
5037		mutex_enter(&cpu_lock);
5038		zone_pool_set(zone, pool_default);
5039		/*
5040		 * The zone no longer needs to be able to see any cpus.
5041		 */
5042		zone_pset_set(zone, ZONE_PS_INVAL);
5043		mutex_exit(&cpu_lock);
5044	}
5045	pool_unlock();
5046
5047	/*
5048	 * ZSD shutdown callbacks can be executed multiple times, hence
5049	 * it is safe to not be holding any locks across this call.
5050	 */
5051	zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
5052
5053	mutex_enter(&zone_status_lock);
5054	if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
5055		zone_status_set(zone, ZONE_IS_DOWN);
5056	mutex_exit(&zone_status_lock);
5057
5058	/*
5059	 * Wait for kernel threads to drain.
5060	 */
5061	if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
5062		zone_rele(zone);
5063		return (set_errno(EINTR));
5064	}
5065
5066	/*
5067	 * Zone can become down/destroyable even if the above wait
5068	 * returns EINTR, so any code added here may never execute.
5069	 * (i.e. don't add code here)
5070	 */
5071
5072	zone_rele(zone);
5073	return (0);
5074}
5075
5076/*
5077 * Log the specified zone's reference counts.  The caller should not be
5078 * holding the zone's zone_lock.
5079 */
5080static void
5081zone_log_refcounts(zone_t *zone)
5082{
5083	char *buffer;
5084	char *buffer_position;
5085	uint32_t buffer_size;
5086	uint32_t index;
5087	uint_t ref;
5088	uint_t cred_ref;
5089
5090	/*
5091	 * Construct a string representing the subsystem-specific reference
5092	 * counts.  The counts are printed in ascending order by index into the
5093	 * zone_t::zone_subsys_ref array.  The list will be surrounded by
5094	 * square brackets [] and will only contain nonzero reference counts.
5095	 *
5096	 * The buffer will hold two square bracket characters plus ten digits,
5097	 * one colon, one space, one comma, and some characters for a
5098	 * subsystem name per subsystem-specific reference count.  (Unsigned 32-
5099	 * bit integers have at most ten decimal digits.)  The last
5100	 * reference count's comma is replaced by the closing square
5101	 * bracket and a NULL character to terminate the string.
5102	 *
5103	 * NOTE: We have to grab the zone's zone_lock to create a consistent
5104	 * snapshot of the zone's reference counters.
5105	 *
5106	 * First, figure out how much space the string buffer will need.
5107	 * The buffer's size is stored in buffer_size.
5108	 */
5109	buffer_size = 2;			/* for the square brackets */
5110	mutex_enter(&zone->zone_lock);
5111	zone->zone_flags |= ZF_REFCOUNTS_LOGGED;
5112	ref = zone->zone_ref;
5113	cred_ref = zone->zone_cred_ref;
5114	for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index)
5115		if (zone->zone_subsys_ref[index] != 0)
5116			buffer_size += strlen(zone_ref_subsys_names[index]) +
5117			    13;
5118	if (buffer_size == 2) {
5119		/*
5120		 * No subsystems had nonzero reference counts.  Don't bother
5121		 * with allocating a buffer; just log the general-purpose and
5122		 * credential reference counts.
5123		 */
5124		mutex_exit(&zone->zone_lock);
5125		(void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
5126		    "Zone '%s' (ID: %d) is shutting down, but %u zone "
5127		    "references and %u credential references are still extant",
5128		    zone->zone_name, zone->zone_id, ref, cred_ref);
5129		return;
5130	}
5131
5132	/*
5133	 * buffer_size contains the exact number of characters that the
5134	 * buffer will need.  Allocate the buffer and fill it with nonzero
5135	 * subsystem-specific reference counts.  Surround the results with
5136	 * square brackets afterwards.
5137	 */
5138	buffer = kmem_alloc(buffer_size, KM_SLEEP);
5139	buffer_position = &buffer[1];
5140	for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index) {
5141		/*
5142		 * NOTE: The DDI's version of sprintf() returns a pointer to
5143		 * the modified buffer rather than the number of bytes written
5144		 * (as in snprintf(3C)).  This is unfortunate and annoying.
5145		 * Therefore, we'll use snprintf() with INT_MAX to get the
5146		 * number of bytes written.  Using INT_MAX is safe because
5147		 * the buffer is perfectly sized for the data: we'll never
5148		 * overrun the buffer.
5149		 */
5150		if (zone->zone_subsys_ref[index] != 0)
5151			buffer_position += snprintf(buffer_position, INT_MAX,
5152			    "%s: %u,", zone_ref_subsys_names[index],
5153			    zone->zone_subsys_ref[index]);
5154	}
5155	mutex_exit(&zone->zone_lock);
5156	buffer[0] = '[';
5157	ASSERT((uintptr_t)(buffer_position - buffer) < buffer_size);
5158	ASSERT(buffer_position[0] == '\0' && buffer_position[-1] == ',');
5159	buffer_position[-1] = ']';
5160
5161	/*
5162	 * Log the reference counts and free the message buffer.
5163	 */
5164	(void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
5165	    "Zone '%s' (ID: %d) is shutting down, but %u zone references and "
5166	    "%u credential references are still extant %s", zone->zone_name,
5167	    zone->zone_id, ref, cred_ref, buffer);
5168	kmem_free(buffer, buffer_size);
5169}
5170
5171/*
5172 * Systemcall entry point to finalize the zone halt process.  The caller
5173 * must have already successfully called zone_shutdown().
5174 *
5175 * Upon successful completion, the zone will have been fully destroyed:
5176 * zsched will have exited, destructor callbacks executed, and the zone
5177 * removed from the list of active zones.
5178 */
5179static int
5180zone_destroy(zoneid_t zoneid)
5181{
5182	uint64_t uniqid;
5183	zone_t *zone;
5184	zone_status_t status;
5185	clock_t wait_time;
5186	boolean_t log_refcounts;
5187
5188	if (secpolicy_zone_config(CRED()) != 0)
5189		return (set_errno(EPERM));
5190	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5191		return (set_errno(EINVAL));
5192
5193	mutex_enter(&zonehash_lock);
5194	/*
5195	 * Look for zone under hash lock to prevent races with other
5196	 * calls to zone_destroy.
5197	 */
5198	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5199		mutex_exit(&zonehash_lock);
5200		return (set_errno(EINVAL));
5201	}
5202
5203	if (zone_mount_count(zone->zone_rootpath) != 0) {
5204		mutex_exit(&zonehash_lock);
5205		return (set_errno(EBUSY));
5206	}
5207	mutex_enter(&zone_status_lock);
5208	status = zone_status_get(zone);
5209	if (status < ZONE_IS_DOWN) {
5210		mutex_exit(&zone_status_lock);
5211		mutex_exit(&zonehash_lock);
5212		return (set_errno(EBUSY));
5213	} else if (status == ZONE_IS_DOWN) {
5214		zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
5215	}
5216	mutex_exit(&zone_status_lock);
5217	zone_hold(zone);
5218	mutex_exit(&zonehash_lock);
5219
5220	/*
5221	 * wait for zsched to exit
5222	 */
5223	zone_status_wait(zone, ZONE_IS_DEAD);
5224	zone_zsd_callbacks(zone, ZSD_DESTROY);
5225	zone->zone_netstack = NULL;
5226	uniqid = zone->zone_uniqid;
5227	zone_rele(zone);
5228	zone = NULL;	/* potentially free'd */
5229
5230	log_refcounts = B_FALSE;
5231	wait_time = SEC_TO_TICK(ZONE_DESTROY_TIMEOUT_SECS);
5232	mutex_enter(&zonehash_lock);
5233	for (; /* ever */; ) {
5234		boolean_t unref;
5235		boolean_t refs_have_been_logged;
5236
5237		if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
5238		    zone->zone_uniqid != uniqid) {
5239			/*
5240			 * The zone has gone away.  Necessary conditions
5241			 * are met, so we return success.
5242			 */
5243			mutex_exit(&zonehash_lock);
5244			return (0);
5245		}
5246		mutex_enter(&zone->zone_lock);
5247		unref = ZONE_IS_UNREF(zone);
5248		refs_have_been_logged = (zone->zone_flags &
5249		    ZF_REFCOUNTS_LOGGED);
5250		mutex_exit(&zone->zone_lock);
5251		if (unref) {
5252			/*
5253			 * There is only one reference to the zone -- that
5254			 * added when the zone was added to the hashtables --
5255			 * and things will remain this way until we drop
5256			 * zonehash_lock... we can go ahead and cleanup the
5257			 * zone.
5258			 */
5259			break;
5260		}
5261
5262		/*
5263		 * Wait for zone_rele_common() or zone_cred_rele() to signal
5264		 * zone_destroy_cv.  zone_destroy_cv is signaled only when
5265		 * some zone's general-purpose reference count reaches one.
5266		 * If ZONE_DESTROY_TIMEOUT_SECS seconds elapse while waiting
5267		 * on zone_destroy_cv, then log the zone's reference counts and
5268		 * continue to wait for zone_rele() and zone_cred_rele().
5269		 */
5270		if (!refs_have_been_logged) {
5271			if (!log_refcounts) {
5272				/*
5273				 * This thread hasn't timed out waiting on
5274				 * zone_destroy_cv yet.  Wait wait_time clock
5275				 * ticks (initially ZONE_DESTROY_TIMEOUT_SECS
5276				 * seconds) for the zone's references to clear.
5277				 */
5278				ASSERT(wait_time > 0);
5279				wait_time = cv_reltimedwait_sig(
5280				    &zone_destroy_cv, &zonehash_lock, wait_time,
5281				    TR_SEC);
5282				if (wait_time > 0) {
5283					/*
5284					 * A thread in zone_rele() or
5285					 * zone_cred_rele() signaled
5286					 * zone_destroy_cv before this thread's
5287					 * wait timed out.  The zone might have
5288					 * only one reference left; find out!
5289					 */
5290					continue;
5291				} else if (wait_time == 0) {
5292					/* The thread's process was signaled. */
5293					mutex_exit(&zonehash_lock);
5294					return (set_errno(EINTR));
5295				}
5296
5297				/*
5298				 * The thread timed out while waiting on
5299				 * zone_destroy_cv.  Even though the thread
5300				 * timed out, it has to check whether another
5301				 * thread woke up from zone_destroy_cv and
5302				 * destroyed the zone.
5303				 *
5304				 * If the zone still exists and has more than
5305				 * one unreleased general-purpose reference,
5306				 * then log the zone's reference counts.
5307				 */
5308				log_refcounts = B_TRUE;
5309				continue;
5310			}
5311
5312			/*
5313			 * The thread already timed out on zone_destroy_cv while
5314			 * waiting for subsystems to release the zone's last
5315			 * general-purpose references.  Log the zone's reference
5316			 * counts and wait indefinitely on zone_destroy_cv.
5317			 */
5318			zone_log_refcounts(zone);
5319		}
5320		if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
5321			/* The thread's process was signaled. */
5322			mutex_exit(&zonehash_lock);
5323			return (set_errno(EINTR));
5324		}
5325	}
5326
5327	/*
5328	 * Remove CPU cap for this zone now since we're not going to
5329	 * fail below this point.
5330	 */
5331	cpucaps_zone_remove(zone);
5332
5333	/* Get rid of the zone's kstats */
5334	zone_kstat_delete(zone);
5335
5336	/* remove the pfexecd doors */
5337	if (zone->zone_pfexecd != NULL) {
5338		klpd_freelist(&zone->zone_pfexecd);
5339		zone->zone_pfexecd = NULL;
5340	}
5341
5342	/* free brand specific data */
5343	if (ZONE_IS_BRANDED(zone))
5344		ZBROP(zone)->b_free_brand_data(zone);
5345
5346	/* Say goodbye to brand framework. */
5347	brand_unregister_zone(zone->zone_brand);
5348
5349	/*
5350	 * It is now safe to let the zone be recreated; remove it from the
5351	 * lists.  The memory will not be freed until the last cred
5352	 * reference goes away.
5353	 */
5354	ASSERT(zonecount > 1);	/* must be > 1; can't destroy global zone */
5355	zonecount--;
5356	/* remove from active list and hash tables */
5357	list_remove(&zone_active, zone);
5358	(void) mod_hash_destroy(zonehashbyname,
5359	    (mod_hash_key_t)zone->zone_name);
5360	(void) mod_hash_destroy(zonehashbyid,
5361	    (mod_hash_key_t)(uintptr_t)zone->zone_id);
5362	if (zone->zone_flags & ZF_HASHED_LABEL)
5363		(void) mod_hash_destroy(zonehashbylabel,
5364		    (mod_hash_key_t)zone->zone_slabel);
5365	mutex_exit(&zonehash_lock);
5366
5367	/*
5368	 * Release the root vnode; we're not using it anymore.  Nor should any
5369	 * other thread that might access it exist.
5370	 */
5371	if (zone->zone_rootvp != NULL) {
5372		VN_RELE(zone->zone_rootvp);
5373		zone->zone_rootvp = NULL;
5374	}
5375
5376	/* add to deathrow list */
5377	mutex_enter(&zone_deathrow_lock);
5378	list_insert_tail(&zone_deathrow, zone);
5379	mutex_exit(&zone_deathrow_lock);
5380
5381	/*
5382	 * Drop last reference (which was added by zsched()), this will
5383	 * free the zone unless there are outstanding cred references.
5384	 */
5385	zone_rele(zone);
5386	return (0);
5387}
5388
5389/*
5390 * Systemcall entry point for zone_getattr(2).
5391 */
5392static ssize_t
5393zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5394{
5395	size_t size;
5396	int error = 0, err;
5397	zone_t *zone;
5398	char *zonepath;
5399	char *outstr;
5400	zone_status_t zone_status;
5401	pid_t initpid;
5402	boolean_t global = (curzone == global_zone);
5403	boolean_t inzone = (curzone->zone_id == zoneid);
5404	ushort_t flags;
5405	zone_net_data_t *zbuf;
5406
5407	mutex_enter(&zonehash_lock);
5408	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5409		mutex_exit(&zonehash_lock);
5410		return (set_errno(EINVAL));
5411	}
5412	zone_status = zone_status_get(zone);
5413	if (zone_status < ZONE_IS_INITIALIZED) {
5414		mutex_exit(&zonehash_lock);
5415		return (set_errno(EINVAL));
5416	}
5417	zone_hold(zone);
5418	mutex_exit(&zonehash_lock);
5419
5420	/*
5421	 * If not in the global zone, don't show information about other zones,
5422	 * unless the system is labeled and the local zone's label dominates
5423	 * the other zone.
5424	 */
5425	if (!zone_list_access(zone)) {
5426		zone_rele(zone);
5427		return (set_errno(EINVAL));
5428	}
5429
5430	switch (attr) {
5431	case ZONE_ATTR_ROOT:
5432		if (global) {
5433			/*
5434			 * Copy the path to trim the trailing "/" (except for
5435			 * the global zone).
5436			 */
5437			if (zone != global_zone)
5438				size = zone->zone_rootpathlen - 1;
5439			else
5440				size = zone->zone_rootpathlen;
5441			zonepath = kmem_alloc(size, KM_SLEEP);
5442			bcopy(zone->zone_rootpath, zonepath, size);
5443			zonepath[size - 1] = '\0';
5444		} else {
5445			if (inzone || !is_system_labeled()) {
5446				/*
				 * The caller is not in the global zone.  If
				 * the query is for the current zone or the
				 * system is not labeled, just return the
				 * faked-up path for the current zone.
5451				 */
5452				zonepath = "/";
5453				size = 2;
5454			} else {
5455				/*
				 * Return the prefix-relative path for the
				 * queried zone (zone_prefix followed by the
				 * zone name).
5457				 */
5458				int prefix_len = strlen(zone_prefix);
5459				int zname_len = strlen(zone->zone_name);
5460
5461				size = prefix_len + zname_len + 1;
5462				zonepath = kmem_alloc(size, KM_SLEEP);
5463				bcopy(zone_prefix, zonepath, prefix_len);
5464				bcopy(zone->zone_name, zonepath +
5465				    prefix_len, zname_len);
5466				zonepath[size - 1] = '\0';
5467			}
5468		}
5469		if (bufsize > size)
5470			bufsize = size;
5471		if (buf != NULL) {
5472			err = copyoutstr(zonepath, buf, bufsize, NULL);
5473			if (err != 0 && err != ENAMETOOLONG)
5474				error = EFAULT;
5475		}
5476		if (global || (is_system_labeled() && !inzone))
5477			kmem_free(zonepath, size);
5478		break;
5479
5480	case ZONE_ATTR_NAME:
5481		size = strlen(zone->zone_name) + 1;
5482		if (bufsize > size)
5483			bufsize = size;
5484		if (buf != NULL) {
5485			err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
5486			if (err != 0 && err != ENAMETOOLONG)
5487				error = EFAULT;
5488		}
5489		break;
5490
5491	case ZONE_ATTR_STATUS:
5492		/*
5493		 * Since we're not holding zonehash_lock, the zone status
5494		 * may be anything; leave it up to userland to sort it out.
5495		 */
5496		size = sizeof (zone_status);
5497		if (bufsize > size)
5498			bufsize = size;
5499		zone_status = zone_status_get(zone);
5500		if (buf != NULL &&
5501		    copyout(&zone_status, buf, bufsize) != 0)
5502			error = EFAULT;
5503		break;
5504	case ZONE_ATTR_FLAGS:
5505		size = sizeof (zone->zone_flags);
5506		if (bufsize > size)
5507			bufsize = size;
5508		flags = zone->zone_flags;
5509		if (buf != NULL &&
5510		    copyout(&flags, buf, bufsize) != 0)
5511			error = EFAULT;
5512		break;
5513	case ZONE_ATTR_PRIVSET:
5514		size = sizeof (priv_set_t);
5515		if (bufsize > size)
5516			bufsize = size;
5517		if (buf != NULL &&
5518		    copyout(zone->zone_privset, buf, bufsize) != 0)
5519			error = EFAULT;
5520		break;
5521	case ZONE_ATTR_UNIQID:
5522		size = sizeof (zone->zone_uniqid);
5523		if (bufsize > size)
5524			bufsize = size;
5525		if (buf != NULL &&
5526		    copyout(&zone->zone_uniqid, buf, bufsize) != 0)
5527			error = EFAULT;
5528		break;
5529	case ZONE_ATTR_POOLID:
5530		{
5531			pool_t *pool;
5532			poolid_t poolid;
5533
5534			if (pool_lock_intr() != 0) {
5535				error = EINTR;
5536				break;
5537			}
5538			pool = zone_pool_get(zone);
5539			poolid = pool->pool_id;
5540			pool_unlock();
5541			size = sizeof (poolid);
5542			if (bufsize > size)
5543				bufsize = size;
			if (buf != NULL &&
			    copyout(&poolid, buf, bufsize) != 0)
5545				error = EFAULT;
5546		}
5547		break;
5548	case ZONE_ATTR_SLBL:
5549		size = sizeof (bslabel_t);
5550		if (bufsize > size)
5551			bufsize = size;
5552		if (zone->zone_slabel == NULL)
5553			error = EINVAL;
5554		else if (buf != NULL &&
5555		    copyout(label2bslabel(zone->zone_slabel), buf,
5556		    bufsize) != 0)
5557			error = EFAULT;
5558		break;
5559	case ZONE_ATTR_INITPID:
5560		size = sizeof (initpid);
5561		if (bufsize > size)
5562			bufsize = size;
5563		initpid = zone->zone_proc_initpid;
5564		if (initpid == -1) {
5565			error = ESRCH;
5566			break;
5567		}
5568		if (buf != NULL &&
5569		    copyout(&initpid, buf, bufsize) != 0)
5570			error = EFAULT;
5571		break;
5572	case ZONE_ATTR_BRAND:
5573		size = strlen(zone->zone_brand->b_name) + 1;
5574
5575		if (bufsize > size)
5576			bufsize = size;
5577		if (buf != NULL) {
5578			err = copyoutstr(zone->zone_brand->b_name, buf,
5579			    bufsize, NULL);
5580			if (err != 0 && err != ENAMETOOLONG)
5581				error = EFAULT;
5582		}
5583		break;
5584	case ZONE_ATTR_INITNAME:
5585		size = strlen(zone->zone_initname) + 1;
5586		if (bufsize > size)
5587			bufsize = size;
5588		if (buf != NULL) {
5589			err = copyoutstr(zone->zone_initname, buf, bufsize,
5590			    NULL);
5591			if (err != 0 && err != ENAMETOOLONG)
5592				error = EFAULT;
5593		}
5594		break;
5595	case ZONE_ATTR_BOOTARGS:
5596		if (zone->zone_bootargs == NULL)
5597			outstr = "";
5598		else
5599			outstr = zone->zone_bootargs;
5600		size = strlen(outstr) + 1;
5601		if (bufsize > size)
5602			bufsize = size;
5603		if (buf != NULL) {
5604			err = copyoutstr(outstr, buf, bufsize, NULL);
5605			if (err != 0 && err != ENAMETOOLONG)
5606				error = EFAULT;
5607		}
5608		break;
5609	case ZONE_ATTR_PHYS_MCAP:
5610		size = sizeof (zone->zone_phys_mcap);
5611		if (bufsize > size)
5612			bufsize = size;
5613		if (buf != NULL &&
5614		    copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
5615			error = EFAULT;
5616		break;
5617	case ZONE_ATTR_SCHED_CLASS:
5618		mutex_enter(&class_lock);
5619
5620		if (zone->zone_defaultcid >= loaded_classes)
5621			outstr = "";
5622		else
5623			outstr = sclass[zone->zone_defaultcid].cl_name;
5624		size = strlen(outstr) + 1;
5625		if (bufsize > size)
5626			bufsize = size;
5627		if (buf != NULL) {
5628			err = copyoutstr(outstr, buf, bufsize, NULL);
5629			if (err != 0 && err != ENAMETOOLONG)
5630				error = EFAULT;
5631		}
5632
5633		mutex_exit(&class_lock);
5634		break;
5635	case ZONE_ATTR_HOSTID:
5636		if (zone->zone_hostid != HW_INVALID_HOSTID &&
5637		    bufsize == sizeof (zone->zone_hostid)) {
5638			size = sizeof (zone->zone_hostid);
5639			if (buf != NULL && copyout(&zone->zone_hostid, buf,
5640			    bufsize) != 0)
5641				error = EFAULT;
5642		} else {
5643			error = EINVAL;
5644		}
5645		break;
5646	case ZONE_ATTR_FS_ALLOWED:
5647		if (zone->zone_fs_allowed == NULL)
5648			outstr = "";
5649		else
5650			outstr = zone->zone_fs_allowed;
5651		size = strlen(outstr) + 1;
5652		if (bufsize > size)
5653			bufsize = size;
5654		if (buf != NULL) {
5655			err = copyoutstr(outstr, buf, bufsize, NULL);
5656			if (err != 0 && err != ENAMETOOLONG)
5657				error = EFAULT;
5658		}
5659		break;
5660	case ZONE_ATTR_SECFLAGS:
5661		size = sizeof (zone->zone_secflags);
5662		if (bufsize > size)
5663			bufsize = size;
5664		if ((err = copyout(&zone->zone_secflags, buf, bufsize)) != 0)
5665			error = EFAULT;
5666		break;
5667	case ZONE_ATTR_NETWORK:
5668		bufsize = MIN(bufsize, PIPE_BUF + sizeof (zone_net_data_t));
5669		size = bufsize;
5670		zbuf = kmem_alloc(bufsize, KM_SLEEP);
5671		if (copyin(buf, zbuf, bufsize) != 0) {
5672			error = EFAULT;
5673		} else {
5674			error = zone_get_network(zoneid, zbuf);
5675			if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
5676				error = EFAULT;
5677		}
5678		kmem_free(zbuf, bufsize);
5679		break;
5680	default:
5681		if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
5682			size = bufsize;
5683			error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
5684		} else {
5685			error = EINVAL;
5686		}
5687	}
5688	zone_rele(zone);
5689
5690	if (error)
5691		return (set_errno(error));
5692	return ((ssize_t)size);
5693}
5694
5695/*
 * System call entry point for zone_setattr(2).
5697 */
5698/*ARGSUSED*/
5699static int
5700zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5701{
5702	zone_t *zone;
5703	zone_status_t zone_status;
5704	int err = -1;
5705	zone_net_data_t *zbuf;
5706
5707	if (secpolicy_zone_config(CRED()) != 0)
5708		return (set_errno(EPERM));
5709
5710	/*
5711	 * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
5712	 * global zone.
5713	 */
5714	if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
5715		return (set_errno(EINVAL));
5716	}
5717
5718	mutex_enter(&zonehash_lock);
5719	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5720		mutex_exit(&zonehash_lock);
5721		return (set_errno(EINVAL));
5722	}
5723	zone_hold(zone);
5724	mutex_exit(&zonehash_lock);
5725
5726	/*
5727	 * At present most attributes can only be set on non-running,
5728	 * non-global zones.
5729	 */
5730	zone_status = zone_status_get(zone);
5731	if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
5732		err = EINVAL;
5733		goto done;
5734	}
5735
5736	switch (attr) {
5737	case ZONE_ATTR_INITNAME:
5738		err = zone_set_initname(zone, (const char *)buf);
5739		break;
5740	case ZONE_ATTR_INITNORESTART:
5741		zone->zone_restart_init = B_FALSE;
5742		err = 0;
5743		break;
5744	case ZONE_ATTR_BOOTARGS:
5745		err = zone_set_bootargs(zone, (const char *)buf);
5746		break;
5747	case ZONE_ATTR_BRAND:
5748		err = zone_set_brand(zone, (const char *)buf);
5749		break;
5750	case ZONE_ATTR_FS_ALLOWED:
5751		err = zone_set_fs_allowed(zone, (const char *)buf);
5752		break;
5753	case ZONE_ATTR_SECFLAGS:
5754		err = zone_set_secflags(zone, (psecflags_t *)buf);
5755		break;
5756	case ZONE_ATTR_PHYS_MCAP:
5757		err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
5758		break;
5759	case ZONE_ATTR_SCHED_CLASS:
5760		err = zone_set_sched_class(zone, (const char *)buf);
5761		break;
5762	case ZONE_ATTR_HOSTID:
5763		if (bufsize == sizeof (zone->zone_hostid)) {
5764			if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
5765				err = 0;
5766			else
5767				err = EFAULT;
5768		} else {
5769			err = EINVAL;
5770		}
5771		break;
5772	case ZONE_ATTR_NETWORK:
5773		if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) {
5774			err = EINVAL;
5775			break;
5776		}
5777		zbuf = kmem_alloc(bufsize, KM_SLEEP);
5778		if (copyin(buf, zbuf, bufsize) != 0) {
5779			kmem_free(zbuf, bufsize);
5780			err = EFAULT;
5781			break;
5782		}
5783		err = zone_set_network(zoneid, zbuf);
5784		kmem_free(zbuf, bufsize);
5785		break;
5786	default:
5787		if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
5788			err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
5789		else
5790			err = EINVAL;
5791	}
5792
5793done:
5794	zone_rele(zone);
5795	ASSERT(err != -1);
5796	return (err != 0 ? set_errno(err) : 0);
5797}
5798
5799/*
 * Return zero if the process has at least one vnode mapped into its
 * address space which shouldn't be allowed to change zones.
 *
 * Also return zero if the process has any shared mappings which reserve
 * swap.  This is because the accounting for zone.max-swap does not allow
 * swap reservation to be shared between zones.  Zone swap reservation is
 * counted against zone->zone_max_swap.
5807 */
5808static int
5809as_can_change_zones(void)
5810{
5811	proc_t *pp = curproc;
5812	struct seg *seg;
5813	struct as *as = pp->p_as;
5814	vnode_t *vp;
5815	int allow = 1;
5816
5817	ASSERT(pp->p_as != &kas);
5818	AS_LOCK_ENTER(as, RW_READER);
5819	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
5820
5821		/*
5822		 * Cannot enter zone with shared anon memory which
5823		 * reserves swap.  See comment above.
5824		 */
5825		if (seg_can_change_zones(seg) == B_FALSE) {
5826			allow = 0;
5827			break;
5828		}
5829		/*
		 * If we can't get a backing vnode for this segment, skip it.
5832		 */
5833		vp = NULL;
5834		if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
5835			continue;
5836		if (!vn_can_change_zones(vp)) { /* bail on first match */
5837			allow = 0;
5838			break;
5839		}
5840	}
5841	AS_LOCK_EXIT(as);
5842	return (allow);
5843}
5844
5845/*
5846 * Count swap reserved by curproc's address space
5847 */
5848static size_t
5849as_swresv(void)
5850{
5851	proc_t *pp = curproc;
5852	struct seg *seg;
5853	struct as *as = pp->p_as;
5854	size_t swap = 0;
5855
5856	ASSERT(pp->p_as != &kas);
5857	ASSERT(AS_WRITE_HELD(as));
5858	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
5859		swap += seg_swresv(seg);
5860
5861	return (swap);
5862}
5863
5864/*
 * System call entry point for zone_enter().
5866 *
5867 * The current process is injected into said zone.  In the process
5868 * it will change its project membership, privileges, rootdir/cwd,
5869 * zone-wide rctls, and pool association to match those of the zone.
5870 *
5871 * The first zone_enter() called while the zone is in the ZONE_IS_READY
5872 * state will transition it to ZONE_IS_RUNNING.  Processes may only
5873 * enter a zone that is "ready" or "running".
5874 */
5875static int
5876zone_enter(zoneid_t zoneid)
5877{
5878	zone_t *zone;
5879	vnode_t *vp;
5880	proc_t *pp = curproc;
5881	contract_t *ct;
5882	cont_process_t *ctp;
5883	task_t *tk, *oldtk;
5884	kproject_t *zone_proj0;
5885	cred_t *cr, *newcr;
5886	pool_t *oldpool, *newpool;
5887	sess_t *sp;
5888	uid_t uid;
5889	zone_status_t status;
5890	int err = 0;
5891	rctl_entity_p_t e;
5892	size_t swap;
5893	kthread_id_t t;
5894
5895	if (secpolicy_zone_config(CRED()) != 0)
5896		return (set_errno(EPERM));
5897	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5898		return (set_errno(EINVAL));
5899
5900	/*
5901	 * Stop all lwps so we don't need to hold a lock to look at
5902	 * curproc->p_zone.  This needs to happen before we grab any
5903	 * locks to avoid deadlock (another lwp in the process could
5904	 * be waiting for the held lock).
5905	 */
5906	if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
5907		return (set_errno(EINTR));
5908
5909	/*
	 * Make sure we don't have files open or objects mapped into our
	 * address space that shouldn't be allowed to change zones.
5912	 */
5913	if (!files_can_change_zones()) {
5914		err = EBADF;
5915		goto out;
5916	}
5917	if (!as_can_change_zones()) {
5918		err = EFAULT;
5919		goto out;
5920	}
5921
5922	mutex_enter(&zonehash_lock);
5923	if (pp->p_zone != global_zone) {
5924		mutex_exit(&zonehash_lock);
5925		err = EINVAL;
5926		goto out;
5927	}
5928
5929	zone = zone_find_all_by_id(zoneid);
5930	if (zone == NULL) {
5931		mutex_exit(&zonehash_lock);
5932		err = EINVAL;
5933		goto out;
5934	}
5935
5936	/*
5937	 * To prevent processes in a zone from holding contracts on
5938	 * extrazonal resources, and to avoid process contract
5939	 * memberships which span zones, contract holders and processes
5940	 * which aren't the sole members of their encapsulating process
5941	 * contracts are not allowed to zone_enter.
5942	 */
5943	ctp = pp->p_ct_process;
5944	ct = &ctp->conp_contract;
5945	mutex_enter(&ct->ct_lock);
5946	mutex_enter(&pp->p_lock);
5947	if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
5948		mutex_exit(&pp->p_lock);
5949		mutex_exit(&ct->ct_lock);
5950		mutex_exit(&zonehash_lock);
5951		err = EINVAL;
5952		goto out;
5953	}
5954
5955	/*
5956	 * Moreover, we don't allow processes whose encapsulating
5957	 * process contracts have inherited extrazonal contracts.
5958	 * While it would be easier to eliminate all process contracts
5959	 * with inherited contracts, we need to be able to give a
5960	 * restarted init (or other zone-penetrating process) its
5961	 * predecessor's contracts.
5962	 */
5963	if (ctp->conp_ninherited != 0) {
5964		contract_t *next;
5965		for (next = list_head(&ctp->conp_inherited); next;
5966		    next = list_next(&ctp->conp_inherited, next)) {
5967			if (contract_getzuniqid(next) != zone->zone_uniqid) {
5968				mutex_exit(&pp->p_lock);
5969				mutex_exit(&ct->ct_lock);
5970				mutex_exit(&zonehash_lock);
5971				err = EINVAL;
5972				goto out;
5973			}
5974		}
5975	}
5976
5977	mutex_exit(&pp->p_lock);
5978	mutex_exit(&ct->ct_lock);
5979
5980	status = zone_status_get(zone);
5981	if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
5982		/*
5983		 * Can't join
5984		 */
5985		mutex_exit(&zonehash_lock);
5986		err = EINVAL;
5987		goto out;
5988	}
5989
5990	/*
	 * Make sure the new privilege set is within the caller's permitted set
5992	 */
5993	if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
5994		mutex_exit(&zonehash_lock);
5995		err = EPERM;
5996		goto out;
5997	}
5998	/*
5999	 * We want to momentarily drop zonehash_lock while we optimistically
6000	 * bind curproc to the pool it should be running in.  This is safe
6001	 * since the zone can't disappear (we have a hold on it).
6002	 */
6003	zone_hold(zone);
6004	mutex_exit(&zonehash_lock);
6005
6006	/*
6007	 * Grab pool_lock to keep the pools configuration from changing
6008	 * and to stop ourselves from getting rebound to another pool
6009	 * until we join the zone.
6010	 */
6011	if (pool_lock_intr() != 0) {
6012		zone_rele(zone);
6013		err = EINTR;
6014		goto out;
6015	}
6016	ASSERT(secpolicy_pool(CRED()) == 0);
6017	/*
6018	 * Bind ourselves to the pool currently associated with the zone.
6019	 */
6020	oldpool = curproc->p_pool;
6021	newpool = zone_pool_get(zone);
6022	if (pool_state == POOL_ENABLED && newpool != oldpool &&
6023	    (err = pool_do_bind(newpool, P_PID, P_MYID,
6024	    POOL_BIND_ALL)) != 0) {
6025		pool_unlock();
6026		zone_rele(zone);
6027		goto out;
6028	}
6029
6030	/*
6031	 * Grab cpu_lock now; we'll need it later when we call
6032	 * task_join().
6033	 */
6034	mutex_enter(&cpu_lock);
6035	mutex_enter(&zonehash_lock);
6036	/*
6037	 * Make sure the zone hasn't moved on since we dropped zonehash_lock.
6038	 */
6039	if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
6040		/*
6041		 * Can't join anymore.
6042		 */
6043		mutex_exit(&zonehash_lock);
6044		mutex_exit(&cpu_lock);
6045		if (pool_state == POOL_ENABLED &&
6046		    newpool != oldpool)
6047			(void) pool_do_bind(oldpool, P_PID, P_MYID,
6048			    POOL_BIND_ALL);
6049		pool_unlock();
6050		zone_rele(zone);
6051		err = EINVAL;
6052		goto out;
6053	}
6054
6055	/*
	 * a_lock must be held while transferring locked memory and swap
	 * reservation from the global zone to the non-global zone because
	 * asynchronous faults on the process's address space can lock
	 * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
	 * segments respectively.
6061	 */
6062	AS_LOCK_ENTER(pp->p_as, RW_WRITER);
6063	swap = as_swresv();
6064	mutex_enter(&pp->p_lock);
6065	zone_proj0 = zone->zone_zsched->p_task->tk_proj;
	/* verify that we do not exceed any task or lwp limits */
6067	mutex_enter(&zone->zone_nlwps_lock);
6068	/* add new lwps to zone and zone's proj0 */
6069	zone_proj0->kpj_nlwps += pp->p_lwpcnt;
6070	zone->zone_nlwps += pp->p_lwpcnt;
6071	/* add 1 task to zone's proj0 */
6072	zone_proj0->kpj_ntasks += 1;
6073
6074	zone_proj0->kpj_nprocs++;
6075	zone->zone_nprocs++;
6076	mutex_exit(&zone->zone_nlwps_lock);
6077
6078	mutex_enter(&zone->zone_mem_lock);
6079	zone->zone_locked_mem += pp->p_locked_mem;
6080	zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
6081	zone->zone_max_swap += swap;
6082	mutex_exit(&zone->zone_mem_lock);
6083
6084	mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock));
6085	zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem;
6086	mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock));
6087
6088	/* remove lwps and process from proc's old zone and old project */
6089	mutex_enter(&pp->p_zone->zone_nlwps_lock);
6090	pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
6091	pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
6092	pp->p_task->tk_proj->kpj_nprocs--;
6093	pp->p_zone->zone_nprocs--;
6094	mutex_exit(&pp->p_zone->zone_nlwps_lock);
6095
6096	mutex_enter(&pp->p_zone->zone_mem_lock);
6097	pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
6098	pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
6099	pp->p_zone->zone_max_swap -= swap;
6100	mutex_exit(&pp->p_zone->zone_mem_lock);
6101
6102	mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
6103	pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem;
6104	mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
6105
6106	pp->p_flag |= SZONETOP;
6107	pp->p_zone = zone;
6108	mutex_exit(&pp->p_lock);
6109	AS_LOCK_EXIT(pp->p_as);
6110
6111	/*
6112	 * Joining the zone cannot fail from now on.
6113	 *
6114	 * This means that a lot of the following code can be commonized and
6115	 * shared with zsched().
6116	 */
6117
6118	/*
	 * If the process contract FMRI was inherited, flag this so that
	 * any contract status will not leak extra zone information
	 * (svc_fmri in this case).
6122	 */
6123	if (ctp->conp_svc_ctid != ct->ct_id) {
6124		mutex_enter(&ct->ct_lock);
6125		ctp->conp_svc_zone_enter = ct->ct_id;
6126		mutex_exit(&ct->ct_lock);
6127	}
6128
6129	/*
6130	 * Reset the encapsulating process contract's zone.
6131	 */
6132	ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
6133	contract_setzuniqid(ct, zone->zone_uniqid);
6134
6135	/*
6136	 * Create a new task and associate the process with the project keyed
6137	 * by (projid,zoneid).
6138	 *
6139	 * We might as well be in project 0; the global zone's projid doesn't
6140	 * make much sense in a zone anyhow.
6141	 *
6142	 * This also increments zone_ntasks, and returns with p_lock held.
6143	 */
6144	tk = task_create(0, zone);
6145	oldtk = task_join(tk, 0);
6146	mutex_exit(&cpu_lock);
6147
6148	/*
6149	 * call RCTLOP_SET functions on this proc
6150	 */
6151	e.rcep_p.zone = zone;
6152	e.rcep_t = RCENTITY_ZONE;
6153	(void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
6154	    RCD_CALLBACK);
6155	mutex_exit(&pp->p_lock);
6156
6157	/*
6158	 * We don't need to hold any of zsched's locks here; not only do we know
6159	 * the process and zone aren't going away, we know its session isn't
6160	 * changing either.
6161	 *
6162	 * By joining zsched's session here, we mimic the behavior in the
6163	 * global zone of init's sid being the pid of sched.  We extend this
6164	 * to all zlogin-like zone_enter()'ing processes as well.
6165	 */
6166	mutex_enter(&pidlock);
6167	sp = zone->zone_zsched->p_sessp;
6168	sess_hold(zone->zone_zsched);
6169	mutex_enter(&pp->p_lock);
6170	pgexit(pp);
6171	sess_rele(pp->p_sessp, B_TRUE);
6172	pp->p_sessp = sp;
6173	pgjoin(pp, zone->zone_zsched->p_pidp);
6174
6175	/*
	 * If any threads are scheduled to be placed on the zone wait queue,
	 * they should abandon the idea, since the wait queue is changing.
	 * We need to be holding pidlock & p_lock to do this.
6179	 */
6180	if ((t = pp->p_tlist) != NULL) {
6181		do {
6182			thread_lock(t);
6183			/*
6184			 * Kick this thread so that it doesn't sit
6185			 * on a wrong wait queue.
6186			 */
6187			if (ISWAITING(t))
6188				setrun_locked(t);
6189
6190			if (t->t_schedflag & TS_ANYWAITQ)
6191				t->t_schedflag &= ~ TS_ANYWAITQ;
6192
6193			thread_unlock(t);
6194		} while ((t = t->t_forw) != pp->p_tlist);
6195	}
6196
6197	/*
6198	 * If there is a default scheduling class for the zone and it is not
6199	 * the class we are currently in, change all of the threads in the
6200	 * process to the new class.  We need to be holding pidlock & p_lock
6201	 * when we call parmsset so this is a good place to do it.
6202	 */
6203	if (zone->zone_defaultcid > 0 &&
6204	    zone->zone_defaultcid != curthread->t_cid) {
6205		pcparms_t pcparms;
6206
6207		pcparms.pc_cid = zone->zone_defaultcid;
6208		pcparms.pc_clparms[0] = 0;
6209
6210		/*
6211		 * If setting the class fails, we still want to enter the zone.
6212		 */
6213		if ((t = pp->p_tlist) != NULL) {
6214			do {
6215				(void) parmsset(&pcparms, t);
6216			} while ((t = t->t_forw) != pp->p_tlist);
6217		}
6218	}
6219
6220	mutex_exit(&pp->p_lock);
6221	mutex_exit(&pidlock);
6222
6223	mutex_exit(&zonehash_lock);
6224	/*
6225	 * We're firmly in the zone; let pools progress.
6226	 */
6227	pool_unlock();
6228	task_rele(oldtk);
6229	/*
6230	 * We don't need to retain a hold on the zone since we already
6231	 * incremented zone_ntasks, so the zone isn't going anywhere.
6232	 */
6233	zone_rele(zone);
6234
6235	/*
6236	 * Chroot
6237	 */
6238	vp = zone->zone_rootvp;
6239	zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
6240	zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
6241
6242	/*
6243	 * Change process security flags.  Note that the _effective_ flags
	 * cannot change.
6245	 */
6246	secflags_copy(&pp->p_secflags.psf_lower,
6247	    &zone->zone_secflags.psf_lower);
6248	secflags_copy(&pp->p_secflags.psf_upper,
6249	    &zone->zone_secflags.psf_upper);
6250	secflags_copy(&pp->p_secflags.psf_inherit,
6251	    &zone->zone_secflags.psf_inherit);
6252
6253	/*
6254	 * Change process credentials
6255	 */
6256	newcr = cralloc();
6257	mutex_enter(&pp->p_crlock);
6258	cr = pp->p_cred;
6259	crcopy_to(cr, newcr);
6260	crsetzone(newcr, zone);
6261	pp->p_cred = newcr;
6262
6263	/*
6264	 * Restrict all process privilege sets to zone limit
6265	 */
6266	priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
6267	priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
6268	priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
6269	priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
6270	mutex_exit(&pp->p_crlock);
6271	crset(pp, newcr);
6272
6273	/*
6274	 * Adjust upcount to reflect zone entry.
6275	 */
6276	uid = crgetruid(newcr);
6277	mutex_enter(&pidlock);
6278	upcount_dec(uid, GLOBAL_ZONEID);
6279	upcount_inc(uid, zoneid);
6280	mutex_exit(&pidlock);
6281
6282	/*
6283	 * Set up core file path and content.
6284	 */
6285	set_core_defaults();
6286
6287out:
6288	/*
6289	 * Let the other lwps continue.
6290	 */
6291	mutex_enter(&pp->p_lock);
6292	if (curthread != pp->p_agenttp)
6293		continuelwps(pp);
6294	mutex_exit(&pp->p_lock);
6295
6296	return (err != 0 ? set_errno(err) : 0);
6297}
6298
6299/*
 * System call entry point for zone_list(2).
6301 *
6302 * Processes running in a (non-global) zone only see themselves.
6303 * On labeled systems, they see all zones whose label they dominate.
6304 */
6305static int
6306zone_list(zoneid_t *zoneidlist, uint_t *numzones)
6307{
6308	zoneid_t *zoneids;
6309	zone_t *zone, *myzone;
6310	uint_t user_nzones, real_nzones;
6311	uint_t domi_nzones;
6312	int error;
6313
6314	if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
6315		return (set_errno(EFAULT));
6316
6317	myzone = curproc->p_zone;
6318	ASSERT(zonecount > 0);
6319	if (myzone != global_zone) {
6320		bslabel_t *mybslab;
6321
6322		if (!is_system_labeled()) {
6323			/* just return current zone */
6324			real_nzones = domi_nzones = 1;
6325			zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
6326			zoneids[0] = myzone->zone_id;
6327		} else {
6328			/* return all zones that are dominated */
6329			mutex_enter(&zonehash_lock);
6330			real_nzones = zonecount;
6331			domi_nzones = 0;
6332			zoneids = kmem_alloc(real_nzones *
6333			    sizeof (zoneid_t), KM_SLEEP);
6334			mybslab = label2bslabel(myzone->zone_slabel);
6335			for (zone = list_head(&zone_active);
6336			    zone != NULL;
6337			    zone = list_next(&zone_active, zone)) {
6338				if (zone->zone_id == GLOBAL_ZONEID)
6339					continue;
6340				if (zone != myzone &&
6341				    (zone->zone_flags & ZF_IS_SCRATCH))
6342					continue;
6343				/*
6344				 * Note that a label always dominates
6345				 * itself, so myzone is always included
6346				 * in the list.
6347				 */
6348				if (bldominates(mybslab,
6349				    label2bslabel(zone->zone_slabel))) {
6350					zoneids[domi_nzones++] = zone->zone_id;
6351				}
6352			}
6353			mutex_exit(&zonehash_lock);
6354		}
6355	} else {
6356		mutex_enter(&zonehash_lock);
6357		real_nzones = zonecount;
6358		domi_nzones = 0;
6359		zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t), KM_SLEEP);
6360		for (zone = list_head(&zone_active); zone != NULL;
6361		    zone = list_next(&zone_active, zone))
6362			zoneids[domi_nzones++] = zone->zone_id;
6363
6364		ASSERT(domi_nzones == real_nzones);
6365		mutex_exit(&zonehash_lock);
6366	}
6367
6368	/*
	 * If the user has allocated space for fewer entries than we found,
	 * return only up to their limit.  Either way, tell them exactly how
	 * many we found.
6372	 */
6373	if (domi_nzones < user_nzones)
6374		user_nzones = domi_nzones;
6375	error = 0;
6376	if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
6377		error = EFAULT;
6378	} else if (zoneidlist != NULL && user_nzones != 0) {
6379		if (copyout(zoneids, zoneidlist,
6380		    user_nzones * sizeof (zoneid_t)) != 0)
6381			error = EFAULT;
6382	}
6383
6384	kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
6385
6386	if (error != 0)
6387		return (set_errno(error));
6388	else
6389		return (0);
6390}
6391
6392/*
 * System call entry point for zone_lookup(2).
6394 *
6395 * Non-global zones are only able to see themselves and (on labeled systems)
6396 * the zones they dominate.
6397 */
6398static zoneid_t
6399zone_lookup(const char *zone_name)
6400{
6401	char *kname;
6402	zone_t *zone;
6403	zoneid_t zoneid;
6404	int err;
6405
6406	if (zone_name == NULL) {
6407		/* return caller's zone id */
6408		return (getzoneid());
6409	}
6410
6411	kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
6412	if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
6413		kmem_free(kname, ZONENAME_MAX);
6414		return (set_errno(err));
6415	}
6416
6417	mutex_enter(&zonehash_lock);
6418	zone = zone_find_all_by_name(kname);
6419	kmem_free(kname, ZONENAME_MAX);
6420	/*
	 * In a non-global zone, only the caller's own name can be looked up;
	 * under Trusted Extensions, zone label dominance rules apply.
6423	 */
6424	if (zone == NULL ||
6425	    zone_status_get(zone) < ZONE_IS_READY ||
6426	    !zone_list_access(zone)) {
6427		mutex_exit(&zonehash_lock);
6428		return (set_errno(EINVAL));
6429	} else {
6430		zoneid = zone->zone_id;
6431		mutex_exit(&zonehash_lock);
6432		return (zoneid);
6433	}
6434}
6435
6436static int
6437zone_version(int *version_arg)
6438{
6439	int version = ZONE_SYSCALL_API_VERSION;
6440
6441	if (copyout(&version, version_arg, sizeof (int)) != 0)
6442		return (set_errno(EFAULT));
6443	return (0);
6444}
6445
6446/* ARGSUSED */
6447long
6448zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
6449{
6450	zone_def zs;
6451	int err;
6452
6453	switch (cmd) {
6454	case ZONE_CREATE:
6455		if (get_udatamodel() == DATAMODEL_NATIVE) {
6456			if (copyin(arg1, &zs, sizeof (zone_def))) {
6457				return (set_errno(EFAULT));
6458			}
6459		} else {
6460#ifdef _SYSCALL32_IMPL
6461			zone_def32 zs32;
6462
6463			if (copyin(arg1, &zs32, sizeof (zone_def32))) {
6464				return (set_errno(EFAULT));
6465			}
6466			zs.zone_name =
6467			    (const char *)(unsigned long)zs32.zone_name;
6468			zs.zone_root =
6469			    (const char *)(unsigned long)zs32.zone_root;
6470			zs.zone_privs =
6471			    (const struct priv_set *)
6472			    (unsigned long)zs32.zone_privs;
6473			zs.zone_privssz = zs32.zone_privssz;
6474			zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
6475			zs.rctlbufsz = zs32.rctlbufsz;
6476			zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
6477			zs.zfsbufsz = zs32.zfsbufsz;
6478			zs.extended_error =
6479			    (int *)(unsigned long)zs32.extended_error;
6480			zs.match = zs32.match;
6481			zs.doi = zs32.doi;
6482			zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
6483			zs.flags = zs32.flags;
6484#else
6485			panic("get_udatamodel() returned bogus result\n");
6486#endif
6487		}
6488
6489		return (zone_create(zs.zone_name, zs.zone_root,
6490		    zs.zone_privs, zs.zone_privssz,
6491		    (caddr_t)zs.rctlbuf, zs.rctlbufsz,
6492		    (caddr_t)zs.zfsbuf, zs.zfsbufsz,
6493		    zs.extended_error, zs.match, zs.doi,
6494		    zs.label, zs.flags));
6495	case ZONE_BOOT:
6496		return (zone_boot((zoneid_t)(uintptr_t)arg1));
6497	case ZONE_DESTROY:
6498		return (zone_destroy((zoneid_t)(uintptr_t)arg1));
6499	case ZONE_GETATTR:
6500		return (zone_getattr((zoneid_t)(uintptr_t)arg1,
6501		    (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6502	case ZONE_SETATTR:
6503		return (zone_setattr((zoneid_t)(uintptr_t)arg1,