zone.c revision 0fbb751d81ab0a7c7ddfd8d4e447e075a9f7024f
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Zones
 *
 *   A zone is a named collection of processes, namespace constraints,
 *   and other system resources which comprise a secure and manageable
 *   application containment facility.
 *
 *   Zones (represented by the reference counted zone_t) are tracked in
 *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
 *   (zoneid_t) are used to track zone association.  Zone IDs are
 *   dynamically generated when the zone is created; if a persistent
 *   identifier is needed (core files, accounting logs, audit trail,
 *   etc.), the zone name should be used.
 *
 *
 *   Global Zone:
 *
 *   The global zone (zoneid 0) is automatically associated with all
 *   system resources that have not been bound to a user-created zone.
 *   This means that even systems where zones are not in active use
 *   have a global zone, and all processes, mounts, etc. are
 *   associated with that zone.  The global zone is generally
 *   unconstrained in terms of privileges and access, though the usual
 *   credential and privilege based restrictions apply.
 *
 *
 *   Zone States:
 *
 *   The states a zone may be in, and the transitions between them, are
 *   as follows:
 *
 *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
 *   initialized zone is added to the list of active zones on the system but
 *   isn't accessible.
 *
 *   ZONE_IS_INITIALIZED: Initialization complete except the ZSD callbacks are
 *   not yet completed. Not possible to enter the zone, but attributes can
 *   be retrieved.
 *
 *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
 *   ready.  The zone is made visible after the ZSD constructor callbacks are
 *   executed.  A zone remains in this state until it transitions into
 *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
 *
 *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
 *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
 *   state.
 *
 *   ZONE_IS_RUNNING: The zone is open for business: zsched has
 *   successfully started init.   A zone remains in this state until
 *   zone_shutdown() is called.
 *
 *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
 *   killing all processes running in the zone. The zone remains
 *   in this state until there are no more user processes running in the zone.
 *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
 *   Since zone_shutdown() is restartable, it may be called successfully
 *   multiple times for the same zone_t.  Setting of the zone's state to
 *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
 *   the zone's status without worrying about it being a moving target.
 *
 *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
 *   are no more user processes in the zone.  The zone remains in this
 *   state until there are no more kernel threads associated with the
 *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
 *   fail.
 *
 *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
 *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
 *   join the zone or create kernel threads therein.
 *
 *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
 *   remains in this state until zsched exits.  Calls to zone_find_by_*()
 *   return NULL from now on.
 *
 *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
 *   processes or threads doing work on behalf of the zone.  The zone is
 *   removed from the list of active zones.  zone_destroy() returns, and
 *   the zone can be recreated.
 *
 *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
 *   callbacks are executed, and all memory associated with the zone is
 *   freed.
 *
 *   Threads can wait for the zone to enter a requested state by using
 *   zone_status_wait() or zone_status_timedwait() with the desired
 *   state passed in as an argument.  Zone state transitions are
 *   uni-directional; it is not possible to move back to an earlier state.
 *
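 *   For example, a kernel thread that must not proceed until a given
 *   zone is fully booted might simply do the following (a sketch; it
 *   assumes the caller already holds a reference on the zone):
 *
 *	zone_status_wait(zone, ZONE_IS_RUNNING);
 *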
 *
 *   Zone-Specific Data:
 *
 *   Subsystems needing to maintain zone-specific data can store that
 *   data using the ZSD mechanism.  This provides a zone-specific data
 *   store, similar to thread-specific data (see pthread_getspecific(3C)
 *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
 *   to register callbacks to be invoked when a zone is created, shut
 *   down, or destroyed.  This can be used to initialize zone-specific
 *   data for new zones and to clean up when zones go away.
 *
 *
 *   Data Structures:
 *
 *   The per-zone structure (zone_t) is reference counted, and freed
 *   when all references are released.  zone_hold and zone_rele can be
 *   used to adjust the reference count.  In addition, reference counts
 *   associated with the cred_t structure are tracked separately using
 *   zone_cred_hold and zone_cred_rele.
 *
 *   Pointers to active zone_t's are stored in two hash tables; one
 *   for searching by id, the other for searching by name.  Lookups
 *   can be performed on either basis, using zone_find_by_id and
 *   zone_find_by_name.  Both return zone_t pointers with the zone
 *   held, so zone_rele should be called when the pointer is no longer
 *   needed.  Zones can also be searched by path; zone_find_by_path
 *   returns the zone with which a path name is associated (global
 *   zone if the path is not within some other zone's file system
 *   hierarchy).  This currently requires iterating through each zone,
 *   so it is slower than an id or name search via a hash table.
 *
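 *   A typical lookup therefore pairs the find with a release, e.g.
 *   (a sketch; "my-zone" is a hypothetical zone name):
 *
 *	if ((zone = zone_find_by_name("my-zone")) != NULL) {
 *		... use zone ...
 *		zone_rele(zone);
 *	}
 *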
 *
 *   Locking:
 *
 *   zonehash_lock: This is a top-level global lock used to protect the
 *       zone hash tables and lists.  Zones cannot be created or destroyed
 *       while this lock is held.
 *   zone_status_lock: This is a global lock protecting zone state.
 *       Zones cannot change state while this lock is held.  It also
 *       protects the list of kernel threads associated with a zone.
 *   zone_lock: This is a per-zone lock used to protect several fields of
 *       the zone_t (see <sys/zone.h> for details).  In addition, holding
 *       this lock means that the zone cannot go away.
 *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
 *	 related to the zone.max-lwps rctl.
 *   zone_mem_lock: This is a per-zone lock used to protect the fields
 *	 related to the zone.max-locked-memory and zone.max-swap rctls.
 *   zone_rctl_lock: This is a per-zone lock used to protect other rctls,
 *       currently just max_lofi
 *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
 *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
 *       list (a list of zones in the ZONE_IS_DEAD state).
 *
 *   Ordering requirements:
 *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 *       	zone_lock --> zsd_key_lock --> pidlock --> p_lock
 *
 *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
 *	zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
 *	zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
 *
 *   Blocking memory allocations are permitted while holding any of the
 *   zone locks.
 *
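 *   For example, a thread charging locked memory to a zone takes the
 *   later locks in the chain in order (a sketch; earlier locks such as
 *   zonehash_lock and pidlock, when needed, are taken the same way):
 *
 *	mutex_enter(&p->p_lock);
 *	mutex_enter(&zone->zone_mem_lock);
 *	zone->zone_locked_mem += incr;
 *	mutex_exit(&zone->zone_mem_lock);
 *	mutex_exit(&p->p_lock);
 *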
 *
 *   System Call Interface:
 *
 *   The zone subsystem can be managed and queried from user level with
 *   the following system calls (all subcodes of the primary "zone"
 *   system call):
 *   - zone_create: creates a zone with selected attributes (name,
 *     root path, privileges, resource controls, ZFS datasets)
 *   - zone_enter: allows the current process to enter a zone
 *   - zone_getattr: reports attributes of a zone
 *   - zone_setattr: sets attributes of a zone
 *   - zone_boot: sets 'init' running for the zone
 *   - zone_list: lists all zones active in the system
 *   - zone_lookup: looks up zone id based on name
 *   - zone_shutdown: initiates shutdown process (see states above)
 *   - zone_destroy: completes shutdown process (see states above)
 *
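 *   From user level these are normally reached via libc wrappers; for
 *   instance, a process can find its own zone ID with getzoneid(3C) or
 *   map a zone name to an ID with getzoneidbyname(3C) (a sketch;
 *   "my-zone" is a hypothetical name):
 *
 *	zoneid_t id = getzoneidbyname("my-zone");
 *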
 */

#include <sys/priv_impl.h>
#include <sys/cred.h>
#include <c2/audit.h>
#include <sys/debug.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/mutex.h>
#include <sys/note.h>
#include <sys/pathname.h>
#include <sys/proc.h>
#include <sys/project.h>
#include <sys/sysevent.h>
#include <sys/task.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/utsname.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/systeminfo.h>
#include <sys/policy.h>
#include <sys/cred_impl.h>
#include <sys/contract_impl.h>
#include <sys/contract/process_impl.h>
#include <sys/class.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/pset.h>
#include <sys/sysmacros.h>
#include <sys/callb.h>
#include <sys/vmparam.h>
#include <sys/corectl.h>
#include <sys/ipc_impl.h>
#include <sys/klpd.h>

#include <sys/door.h>
#include <sys/cpuvar.h>
#include <sys/sdt.h>

#include <sys/uadmin.h>
#include <sys/session.h>
#include <sys/cmn_err.h>
#include <sys/modhash.h>
#include <sys/sunddi.h>
#include <sys/nvpair.h>
#include <sys/rctl.h>
#include <sys/fss.h>
#include <sys/brand.h>
#include <sys/zone.h>
#include <net/if.h>
#include <sys/cpucaps.h>
#include <vm/seg.h>
#include <sys/mac.h>

/* List of data link IDs which are accessible from the zone */
typedef struct zone_dl {
	datalink_id_t	zdl_id;
	list_node_t	zdl_linkage;
} zone_dl_t;

/*
 * cv used to signal that all references to the zone have been released.  This
 * needs to be global since there may be multiple waiters, and the first to
 * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 */
static kcondvar_t zone_destroy_cv;
/*
 * Lock used to serialize access to zone_cv.  This could have been per-zone,
 * but then we'd need another lock for zone_destroy_cv, and why bother?
 */
static kmutex_t zone_status_lock;

/*
 * ZSD-related global variables.
 */
static kmutex_t zsd_key_lock;	/* protects the following two */
/*
 * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 */
static zone_key_t zsd_keyval = 0;
/*
 * Global list of registered keys.  We use this when a new zone is created.
 */
static list_t zsd_registered_keys;

int zone_hash_size = 256;
static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
static kmutex_t zonehash_lock;
static uint_t zonecount;
static id_space_t *zoneid_space;

/*
 * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 * kernel proper runs, and which manages all other zones.
 *
 * Although not declared as static, the variable "zone0" should not be used
 * except by code that needs to reference the global zone early in boot,
 * before it is fully initialized.  All other consumers should use
 * 'global_zone'.
 */
zone_t zone0;
zone_t *global_zone = NULL;	/* Set when the global zone is initialized */

/*
 * List of active zones, protected by zonehash_lock.
 */
static list_t zone_active;

/*
 * List of destroyed zones that still have outstanding cred references.
 * Used for debugging.  Uses a separate lock to avoid lock ordering
 * problems in zone_free.
 */
static list_t zone_deathrow;
static kmutex_t zone_deathrow_lock;

/* number of zones is limited by virtual interface limit in IP */
uint_t maxzones = 8192;

/* Event channel to send zone state change notifications */
evchan_t *zone_event_chan;

/*
 * This table holds the mapping from kernel zone states to
 * states visible in the state notification API.
 * The idea is that we only expose "obvious" states and
 * do not expose states which are just implementation details.
 */
const char  *zone_status_table[] = {
	ZONE_EVENT_UNINITIALIZED,	/* uninitialized */
	ZONE_EVENT_INITIALIZED,		/* initialized */
	ZONE_EVENT_READY,		/* ready */
	ZONE_EVENT_READY,		/* booting */
	ZONE_EVENT_RUNNING,		/* running */
	ZONE_EVENT_SHUTTING_DOWN,	/* shutting_down */
	ZONE_EVENT_SHUTTING_DOWN,	/* empty */
	ZONE_EVENT_SHUTTING_DOWN,	/* down */
	ZONE_EVENT_SHUTTING_DOWN,	/* dying */
	ZONE_EVENT_UNINITIALIZED,	/* dead */
};

/*
 * This isn't static so lint doesn't complain.
 */
rctl_hndl_t rc_zone_cpu_shares;
rctl_hndl_t rc_zone_locked_mem;
rctl_hndl_t rc_zone_max_swap;
rctl_hndl_t rc_zone_max_lofi;
rctl_hndl_t rc_zone_cpu_cap;
rctl_hndl_t rc_zone_nlwps;
rctl_hndl_t rc_zone_shmmax;
rctl_hndl_t rc_zone_shmmni;
rctl_hndl_t rc_zone_semmni;
rctl_hndl_t rc_zone_msgmni;
/*
 * Synchronization primitives used to synchronize between mounts and zone
 * creation/destruction.
 */
static int mounts_in_progress;
static kcondvar_t mount_cv;
static kmutex_t mount_lock;

const char * const zone_default_initname = "/sbin/init";
static char * const zone_prefix = "/zone/";
static int zone_shutdown(zoneid_t zoneid);
static int zone_add_datalink(zoneid_t, datalink_id_t);
static int zone_remove_datalink(zoneid_t, datalink_id_t);
static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);

typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);

static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
    zone_key_t);
static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
    kmutex_t *);
static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
    kmutex_t *);

/*
 * Bump this number when you alter the zone syscall interfaces; this is
 * because we need to have support for previous API versions in libc
 * to support patching; libc calls into the kernel to determine this number.
 *
 * Version 1 of the API is the version originally shipped with Solaris 10
 * Version 2 alters the zone_create system call in order to support more
 *     arguments by moving the args into a structure; and to do better
 *     error reporting when zone_create() fails.
 * Version 3 alters the zone_create system call in order to support the
 *     import of ZFS datasets to zones.
 * Version 4 alters the zone_create system call in order to support
 *     Trusted Extensions.
 * Version 5 alters the zone_boot system call, and converts its old
 *     bootargs parameter to be set by the zone_setattr API instead.
 * Version 6 adds the flag argument to zone_create.
 */
static const int ZONE_SYSCALL_API_VERSION = 6;

/*
 * Certain filesystems (such as NFS and autofs) need to know which zone
 * the mount is being placed in.  Because of this, we need to be able to
 * ensure that a zone isn't in the process of being created such that
 * nfs_mount() thinks it is in the global zone, while by the time the
 * mount gets added to the list of mounted file systems, it ends up on
 * zoneA's mount list.
 *
 * The following functions: block_mounts()/resume_mounts() and
 * mount_in_progress()/mount_completed() are used by zones and the VFS
 * layer (respectively) to synchronize zone creation and new mounts.
 *
 * The semantics are like a reader-reader lock such that there may
 * either be multiple mounts (or zone creations, if that weren't
 * serialized by zonehash_lock) in progress at the same time, but not
 * both.
 *
 * We use cv's so the user can ctrl-C out of the operation if it's
 * taking too long.
 *
 * The semantics are such that there is an unfair bias towards the
 * "current" operation.  This means that zone creations may starve if
 * there is a rapid succession of new mounts coming in to the system, or
 * there is a remote possibility that zones will be created at such a
 * rate that new mounts will not be able to proceed.
 */
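/*
 * A sketch of how the two sides pair up (illustration only, not part
 * of the original file).  The VFS layer brackets each mount:
 *
 *	mount_in_progress();
 *	error = VFS_MOUNT(vfsp, mvp, uap, credp);
 *	mount_completed();
 *
 * while zone creation brackets its critical region, failing with EINTR
 * if the wait for in-flight mounts is interrupted by a signal:
 *
 *	if (block_mounts() == 0)
 *		return (EINTR);
 *	...create and install the zone...
 *	resume_mounts();
 */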
/*
 * Prevent new mounts from progressing to the point of calling
 * VFS_MOUNT().  If there are already mounts in this "region", wait for
 * them to complete.
 */
static int
block_mounts(void)
{
	int retval = 0;

	/*
	 * Since it may block for a long time, block_mounts() shouldn't be
	 * called with zonehash_lock held.
	 */
	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
	mutex_enter(&mount_lock);
	while (mounts_in_progress > 0) {
		if (cv_wait_sig(&mount_cv, &mount_lock) == 0)
			goto signaled;
	}
	/*
	 * A negative value of mounts_in_progress indicates that mounts
	 * have been blocked by (-mounts_in_progress) different callers.
	 */
	mounts_in_progress--;
	retval = 1;
signaled:
	mutex_exit(&mount_lock);
	return (retval);
}

/*
 * The VFS layer may progress with new mounts as far as we're concerned.
 * Allow them to progress if we were the last obstacle.
 */
static void
resume_mounts(void)
{
	mutex_enter(&mount_lock);
	if (++mounts_in_progress == 0)
		cv_broadcast(&mount_cv);
	mutex_exit(&mount_lock);
}

/*
 * The VFS layer is busy with a mount; zones should wait until all
 * mounts are completed to progress.
 */
void
mount_in_progress(void)
{
	mutex_enter(&mount_lock);
	while (mounts_in_progress < 0)
		cv_wait(&mount_cv, &mount_lock);
	mounts_in_progress++;
	mutex_exit(&mount_lock);
}

/*
 * VFS is done with one mount; wake up any waiting block_mounts()
 * callers if this is the last mount.
 */
void
mount_completed(void)
{
	mutex_enter(&mount_lock);
	if (--mounts_in_progress == 0)
		cv_broadcast(&mount_cv);
	mutex_exit(&mount_lock);
}

/*
 * ZSD routines.
 *
 * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 * defined by the pthread_key_create() and related interfaces.
 *
 * Kernel subsystems may register one or more data items and/or
 * callbacks to be executed when a zone is created, shutdown, or
 * destroyed.
 *
 * Unlike the thread counterpart, destructor callbacks will be executed
 * even if the data pointer is NULL and/or there are no constructor
 * callbacks, so it is the responsibility of such callbacks to check for
 * NULL data values if necessary.
 *
 * The locking strategy and overall picture is as follows:
 *
 * When someone calls zone_key_create(), a template ZSD entry is added to the
 * global list "zsd_registered_keys", protected by zsd_key_lock.  While
 * holding that lock all the existing zones are marked as
 * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
 * zone_zsd list (protected by zone_lock). The global list is updated first
 * (under zsd_key_lock) to make sure that newly created zones use the
 * most recent list of keys. Then under zonehash_lock we walk the zones
 * and mark them.  Similar locking is used in zone_key_delete().
 *
 * The actual create, shutdown, and destroy callbacks are done without
 * holding any lock. The zsd_flags field is used to track their progress,
 * so that by the time zone_key_create() (and zone_create()), or
 * zone_key_delete() (and zone_destroy()), returns, all the necessary
 * callbacks have completed.
 *
 * When new zones are created constructor callbacks for all registered ZSD
 * entries will be called. That also uses the above two phases of marking
 * what needs to be done, and then running the callbacks without holding
 * any locks.
 *
 * The framework does not provide any locking around zone_getspecific() and
 * zone_setspecific() apart from that needed for internal consistency, so
 * callers interested in atomic "test-and-set" semantics will need to provide
 * their own locking.
 */
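/*
 * A sketch of typical ZSD usage by a kernel subsystem (illustration
 * only; the my_* names are hypothetical):
 *
 *	static zone_key_t my_key;
 *
 *	static void *
 *	my_zone_create(zoneid_t zoneid)
 *	{
 *		return (kmem_zalloc(sizeof (struct my_state), KM_SLEEP));
 *	}
 *
 *	static void
 *	my_zone_destroy(zoneid_t zoneid, void *data)
 *	{
 *		kmem_free(data, sizeof (struct my_state));
 *	}
 *
 *	zone_key_create(&my_key, my_zone_create, NULL, my_zone_destroy);
 *
 * Thereafter the per-zone state can be fetched with
 * zone_getspecific(my_key, zone), and zone_key_delete(my_key) runs the
 * remaining callbacks and unregisters the key when the subsystem is
 * unloaded.
 */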

/*
 * Helper function to find the zsd_entry associated with the key in the
 * given list.
 */
static struct zsd_entry *
zsd_find(list_t *l, zone_key_t key)
{
	struct zsd_entry *zsd;

	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
		if (zsd->zsd_key == key) {
			return (zsd);
		}
	}
	return (NULL);
}

/*
 * Helper function to find the zsd_entry associated with the key in the
 * given list. Move it to the front of the list.
 */
static struct zsd_entry *
zsd_find_mru(list_t *l, zone_key_t key)
{
	struct zsd_entry *zsd;

	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
		if (zsd->zsd_key == key) {
			/*
			 * Move to head of list to keep list in MRU order.
			 */
			if (zsd != list_head(l)) {
				list_remove(l, zsd);
				list_insert_head(l, zsd);
			}
			return (zsd);
		}
	}
	return (NULL);
}

void
zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
    void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
{
	struct zsd_entry *zsdp;
	struct zsd_entry *t;
	struct zone *zone;
	zone_key_t  key;

	zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
	zsdp->zsd_data = NULL;
	zsdp->zsd_create = create;
	zsdp->zsd_shutdown = shutdown;
	zsdp->zsd_destroy = destroy;

	/*
	 * Insert in global list of callbacks. Makes future zone creations
	 * see it.
	 */
	mutex_enter(&zsd_key_lock);
	key = zsdp->zsd_key = ++zsd_keyval;
	ASSERT(zsd_keyval != 0);
	list_insert_tail(&zsd_registered_keys, zsdp);
	mutex_exit(&zsd_key_lock);

	/*
	 * Insert for all existing zones and mark them as needing
	 * a create callback.
	 */
	mutex_enter(&zonehash_lock);	/* stop the world */
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		zone_status_t status;

		mutex_enter(&zone->zone_lock);

		/* Skip zones that are on the way down or not yet up */
		status = zone_status_get(zone);
		if (status >= ZONE_IS_DOWN ||
		    status == ZONE_IS_UNINITIALIZED) {
			mutex_exit(&zone->zone_lock);
			continue;
		}

		t = zsd_find_mru(&zone->zone_zsd, key);
		if (t != NULL) {
			/*
			 * A zone_zsd_configure() already inserted it after
			 * we dropped zsd_key_lock above.
			 */
			mutex_exit(&zone->zone_lock);
			continue;
		}
		t = kmem_zalloc(sizeof (*t), KM_SLEEP);
		t->zsd_key = key;
		t->zsd_create = create;
		t->zsd_shutdown = shutdown;
		t->zsd_destroy = destroy;
		if (create != NULL) {
			t->zsd_flags = ZSD_CREATE_NEEDED;
			DTRACE_PROBE2(zsd__create__needed,
			    zone_t *, zone, zone_key_t, key);
		}
		list_insert_tail(&zone->zone_zsd, t);
		mutex_exit(&zone->zone_lock);
	}
	mutex_exit(&zonehash_lock);

	if (create != NULL) {
		/* Now call the create callback for this key */
		zsd_apply_all_zones(zsd_apply_create, key);
	}
	/*
	 * It is safe for consumers to use the key now; make it
	 * globally visible.  Specifically, zone_getspecific() will
	 * always successfully return the zone-specific data associated
	 * with the key.
	 */
	*keyp = key;
}

/*
 * Function called when a module is being unloaded, or otherwise wishes
 * to unregister its ZSD key and callbacks.
 *
 * Remove from the global list and determine the functions that need to
 * be called under a global lock. Then call the functions without
 * holding any locks. Finally free up the zone_zsd entries. (The apply
 * functions need to access the zone_zsd entries to find zsd_data etc.)
 */
int
zone_key_delete(zone_key_t key)
{
	struct zsd_entry *zsdp = NULL;
	zone_t *zone;

	mutex_enter(&zsd_key_lock);
	zsdp = zsd_find_mru(&zsd_registered_keys, key);
	if (zsdp == NULL) {
		mutex_exit(&zsd_key_lock);
		return (-1);
	}
	list_remove(&zsd_registered_keys, zsdp);
	mutex_exit(&zsd_key_lock);

	mutex_enter(&zonehash_lock);
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		struct zsd_entry *del;

		mutex_enter(&zone->zone_lock);
		del = zsd_find_mru(&zone->zone_zsd, key);
		if (del == NULL) {
			/*
			 * Somebody else got here first, e.g. the zone going
			 * away.
			 */
			mutex_exit(&zone->zone_lock);
			continue;
		}
		ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
		ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
		if (del->zsd_shutdown != NULL &&
		    (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
			del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
			DTRACE_PROBE2(zsd__shutdown__needed,
			    zone_t *, zone, zone_key_t, key);
		}
		if (del->zsd_destroy != NULL &&
		    (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
			del->zsd_flags |= ZSD_DESTROY_NEEDED;
			DTRACE_PROBE2(zsd__destroy__needed,
			    zone_t *, zone, zone_key_t, key);
		}
		mutex_exit(&zone->zone_lock);
	}
	mutex_exit(&zonehash_lock);
	kmem_free(zsdp, sizeof (*zsdp));

	/* Now call the shutdown and destroy callback for this key */
	zsd_apply_all_zones(zsd_apply_shutdown, key);
	zsd_apply_all_zones(zsd_apply_destroy, key);

	/* Now we can free up the zsdp structures in each zone */
	mutex_enter(&zonehash_lock);
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		struct zsd_entry *del;

		mutex_enter(&zone->zone_lock);
		del = zsd_find(&zone->zone_zsd, key);
		if (del != NULL) {
			list_remove(&zone->zone_zsd, del);
			ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
			kmem_free(del, sizeof (*del));
		}
		mutex_exit(&zone->zone_lock);
	}
	mutex_exit(&zonehash_lock);

	return (0);
}

/*
 * ZSD counterpart of pthread_setspecific().
 *
 * Since all zsd callbacks, including those with no create function,
 * have an entry in zone_zsd, if the key is registered it is part of
 * the zone_zsd list.
 * Return an error if the key wasn't registered.
 */
int
zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
{
	struct zsd_entry *t;

	mutex_enter(&zone->zone_lock);
	t = zsd_find_mru(&zone->zone_zsd, key);
	if (t != NULL) {
		/*
		 * Replace old value with new
		 */
		t->zsd_data = (void *)data;
		mutex_exit(&zone->zone_lock);
		return (0);
	}
	mutex_exit(&zone->zone_lock);
	return (-1);
}

/*
 * ZSD counterpart of pthread_getspecific().
 */
void *
zone_getspecific(zone_key_t key, zone_t *zone)
{
	struct zsd_entry *t;
	void *data;

	mutex_enter(&zone->zone_lock);
	t = zsd_find_mru(&zone->zone_zsd, key);
	data = (t == NULL ? NULL : t->zsd_data);
	mutex_exit(&zone->zone_lock);
	return (data);
}

/*
 * Function used to initialize a zone's list of ZSD callbacks and data
 * when the zone is being created.  The callbacks are initialized from
 * the template list (zsd_registered_keys). The constructor callback is
 * executed later (once the zone exists and with locks dropped).
 */
static void
zone_zsd_configure(zone_t *zone)
{
	struct zsd_entry *zsdp;
	struct zsd_entry *t;

	ASSERT(MUTEX_HELD(&zonehash_lock));
	ASSERT(list_head(&zone->zone_zsd) == NULL);
	mutex_enter(&zone->zone_lock);
	mutex_enter(&zsd_key_lock);
	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
		/*
		 * Since this zone is ZONE_IS_UNINITIALIZED, zone_key_create
		 * should not have added anything to it.
		 */
		ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);

		t = kmem_zalloc(sizeof (*t), KM_SLEEP);
		t->zsd_key = zsdp->zsd_key;
		t->zsd_create = zsdp->zsd_create;
		t->zsd_shutdown = zsdp->zsd_shutdown;
		t->zsd_destroy = zsdp->zsd_destroy;
		if (zsdp->zsd_create != NULL) {
			t->zsd_flags = ZSD_CREATE_NEEDED;
			DTRACE_PROBE2(zsd__create__needed,
			    zone_t *, zone, zone_key_t, zsdp->zsd_key);
		}
		list_insert_tail(&zone->zone_zsd, t);
	}
	mutex_exit(&zsd_key_lock);
	mutex_exit(&zone->zone_lock);
}

enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };

/*
 * Helper function to execute shutdown or destructor callbacks.
 */
static void
zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
{
	struct zsd_entry *t;

	ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
	ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
	ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);

	/*
	 * Run the callback solely based on what is registered for the zone
	 * in zone_zsd. The global list can change independently of this
	 * as keys are registered and unregistered and we don't register new
	 * callbacks for a zone that is in the process of going away.
	 */
	mutex_enter(&zone->zone_lock);
	for (t = list_head(&zone->zone_zsd); t != NULL;
	    t = list_next(&zone->zone_zsd, t)) {
		zone_key_t key = t->zsd_key;

		/* Skip if no callbacks registered */

		if (ct == ZSD_SHUTDOWN) {
			if (t->zsd_shutdown != NULL &&
			    (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
				t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
				DTRACE_PROBE2(zsd__shutdown__needed,
				    zone_t *, zone, zone_key_t, key);
			}
		} else {
			if (t->zsd_destroy != NULL &&
			    (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
				t->zsd_flags |= ZSD_DESTROY_NEEDED;
				DTRACE_PROBE2(zsd__destroy__needed,
				    zone_t *, zone, zone_key_t, key);
			}
		}
	}
	mutex_exit(&zone->zone_lock);

	/* Now call the shutdown and destroy callback for this key */
	zsd_apply_all_keys(zsd_apply_shutdown, zone);
	zsd_apply_all_keys(zsd_apply_destroy, zone);
}

/*
 * Called when the zone is going away; free ZSD-related memory, and
 * destroy the zone_zsd list.
 */
static void
zone_free_zsd(zone_t *zone)
{
	struct zsd_entry *t, *next;

	/*
	 * Free all the zsd_entry's we had on this zone.
	 */
	mutex_enter(&zone->zone_lock);
	for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
		next = list_next(&zone->zone_zsd, t);
		list_remove(&zone->zone_zsd, t);
		ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
		kmem_free(t, sizeof (*t));
	}
	list_destroy(&zone->zone_zsd);
	mutex_exit(&zone->zone_lock);
}

/*
 * Apply a function to all zones for a particular key value.
 *
 * The applyfn has to drop zonehash_lock if it does some work, and
 * then reacquire it before it returns.
 * When the lock is dropped we don't follow list_next even
 * if it is possible to do so without any hazards. This is
 * because we want the design to allow for the list of zones
 * to change in any arbitrary way during the time the
 * lock was dropped.
 *
 * It is safe to restart the loop at list_head since the applyfn
 * changes the zsd_flags as it does work, so a subsequent
 * pass through will have no effect in applyfn, hence the loop will terminate
 * in at worst O(N^2).
 */
static void
zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
{
	zone_t *zone;

	mutex_enter(&zonehash_lock);
	zone = list_head(&zone_active);
	while (zone != NULL) {
		if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
			/* Lock dropped - restart at head */
			zone = list_head(&zone_active);
		} else {
			zone = list_next(&zone_active, zone);
		}
	}
	mutex_exit(&zonehash_lock);
}

/*
 * Apply a function to all keys for a particular zone.
 *
 * The applyfn has to drop zonehash_lock if it does some work, and
 * then reacquire it before it returns.
 * When the lock is dropped we don't follow list_next even
 * if it is possible to do so without any hazards. This is
 * because we want the design to allow for the list of zsd callbacks
 * to change in any arbitrary way during the time the
 * lock was dropped.
 *
 * It is safe to restart the loop at list_head since the applyfn
 * changes the zsd_flags as it does work, so a subsequent
 * pass through will have no effect in applyfn, hence the loop will terminate
 * in at worst O(N^2).
 */
static void
zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
{
	struct zsd_entry *t;

	mutex_enter(&zone->zone_lock);
	t = list_head(&zone->zone_zsd);
	while (t != NULL) {
		if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
			/* Lock dropped - restart at head */
			t = list_head(&zone->zone_zsd);
		} else {
			t = list_next(&zone->zone_zsd, t);
		}
	}
	mutex_exit(&zone->zone_lock);
}

/*
 * Call the create function for the zone and key if CREATE_NEEDED
 * is set.
 * If some other thread gets here first and sets CREATE_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all zones/keys.
 *
 * When we call the create function, we drop the global lock held by the
 * caller, and return true to tell the caller it needs to re-evaluate the
 * state.
 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
 * remains held on exit.
 */
static boolean_t
zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
    zone_t *zone, zone_key_t key)
{
	void *result;
	struct zsd_entry *t;
	boolean_t dropped;

	if (lockp != NULL) {
		ASSERT(MUTEX_HELD(lockp));
	}
	if (zone_lock_held) {
		ASSERT(MUTEX_HELD(&zone->zone_lock));
	} else {
		mutex_enter(&zone->zone_lock);
	}

	t = zsd_find(&zone->zone_zsd, key);
	if (t == NULL) {
		/*
		 * Somebody else got here first, e.g. the zone going
		 * away.
		 */
		if (!zone_lock_held)
			mutex_exit(&zone->zone_lock);
		return (B_FALSE);
	}
	dropped = B_FALSE;
	if (zsd_wait_for_inprogress(zone, t, lockp))
		dropped = B_TRUE;

	if (t->zsd_flags & ZSD_CREATE_NEEDED) {
		t->zsd_flags &= ~ZSD_CREATE_NEEDED;
		t->zsd_flags |= ZSD_CREATE_INPROGRESS;
		DTRACE_PROBE2(zsd__create__inprogress,
		    zone_t *, zone, zone_key_t, key);
		mutex_exit(&zone->zone_lock);
		if (lockp != NULL)
			mutex_exit(lockp);

		dropped = B_TRUE;
		ASSERT(t->zsd_create != NULL);
		DTRACE_PROBE2(zsd__create__start,
		    zone_t *, zone, zone_key_t, key);

		result = (*t->zsd_create)(zone->zone_id);

		DTRACE_PROBE2(zsd__create__end,
		    zone_t *, zone, void *, result);

		ASSERT(result != NULL);
		if (lockp != NULL)
			mutex_enter(lockp);
		mutex_enter(&zone->zone_lock);
		t->zsd_data = result;
		t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
		t->zsd_flags |= ZSD_CREATE_COMPLETED;
		cv_broadcast(&t->zsd_cv);
		DTRACE_PROBE2(zsd__create__completed,
		    zone_t *, zone, zone_key_t, key);
	}
	if (!zone_lock_held)
		mutex_exit(&zone->zone_lock);
	return (dropped);
}

/*
 * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
 * is set.
 * If some other thread gets here first and sets *_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all zones/keys.
 *
 * When we call the shutdown function, we drop the global lock held by the
 * caller, and return true to tell the caller it needs to re-evaluate the
 * state.
 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
 * remains held on exit.
 */
static boolean_t
zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
    zone_t *zone, zone_key_t key)
{
	struct zsd_entry *t;
	void *data;
	boolean_t dropped;

	if (lockp != NULL) {
		ASSERT(MUTEX_HELD(lockp));
	}
	if (zone_lock_held) {
		ASSERT(MUTEX_HELD(&zone->zone_lock));
	} else {
		mutex_enter(&zone->zone_lock);
	}

	t = zsd_find(&zone->zone_zsd, key);
	if (t == NULL) {
		/*
		 * Somebody else got here first, e.g. the zone going
		 * away.
		 */
		if (!zone_lock_held)
			mutex_exit(&zone->zone_lock);
		return (B_FALSE);
	}
	dropped = B_FALSE;
	if (zsd_wait_for_creator(zone, t, lockp))
		dropped = B_TRUE;

	if (zsd_wait_for_inprogress(zone, t, lockp))
		dropped = B_TRUE;

	if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
		t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
		t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
		DTRACE_PROBE2(zsd__shutdown__inprogress,
		    zone_t *, zone, zone_key_t, key);
		mutex_exit(&zone->zone_lock);
		if (lockp != NULL)
			mutex_exit(lockp);
		dropped = B_TRUE;

		ASSERT(t->zsd_shutdown != NULL);
		data = t->zsd_data;

		DTRACE_PROBE2(zsd__shutdown__start,
		    zone_t *, zone, zone_key_t, key);

		(t->zsd_shutdown)(zone->zone_id, data);
		DTRACE_PROBE2(zsd__shutdown__end,
		    zone_t *, zone, zone_key_t, key);

		if (lockp != NULL)
			mutex_enter(lockp);
		mutex_enter(&zone->zone_lock);
		t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
		t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
		cv_broadcast(&t->zsd_cv);
		DTRACE_PROBE2(zsd__shutdown__completed,
		    zone_t *, zone, zone_key_t, key);
	}
	if (!zone_lock_held)
		mutex_exit(&zone->zone_lock);
	return (dropped);
}

/*
 * Call the destroy function for the zone and key if DESTROY_NEEDED
 * is set.
 * If some other thread gets here first and sets *_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all zones/keys.
 *
 * When we call the destroy function, we drop the global lock held by the
 * caller, and return true to tell the caller it needs to re-evaluate the
 * state.
 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
 * remains held on exit.
 */
static boolean_t
zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
    zone_t *zone, zone_key_t key)
{
	struct zsd_entry *t;
	void *data;
	boolean_t dropped;

	if (lockp != NULL) {
		ASSERT(MUTEX_HELD(lockp));
	}
	if (zone_lock_held) {
		ASSERT(MUTEX_HELD(&zone->zone_lock));
	} else {
		mutex_enter(&zone->zone_lock);
	}

	t = zsd_find(&zone->zone_zsd, key);
	if (t == NULL) {
		/*
		 * Somebody else got here first, e.g. the zone going
		 * away.
		 */
		if (!zone_lock_held)
			mutex_exit(&zone->zone_lock);
		return (B_FALSE);
	}
	dropped = B_FALSE;
	if (zsd_wait_for_creator(zone, t, lockp))
		dropped = B_TRUE;

	if (zsd_wait_for_inprogress(zone, t, lockp))
		dropped = B_TRUE;

	if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
		t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
		t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
		DTRACE_PROBE2(zsd__destroy__inprogress,
		    zone_t *, zone, zone_key_t, key);
		mutex_exit(&zone->zone_lock);
		if (lockp != NULL)
			mutex_exit(lockp);
		dropped = B_TRUE;

		ASSERT(t->zsd_destroy != NULL);
		data = t->zsd_data;
		DTRACE_PROBE2(zsd__destroy__start,
		    zone_t *, zone, zone_key_t, key);

		(t->zsd_destroy)(zone->zone_id, data);
		DTRACE_PROBE2(zsd__destroy__end,
		    zone_t *, zone, zone_key_t, key);

		if (lockp != NULL)
			mutex_enter(lockp);
		mutex_enter(&zone->zone_lock);
		t->zsd_data = NULL;
		t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
		t->zsd_flags |= ZSD_DESTROY_COMPLETED;
		cv_broadcast(&t->zsd_cv);
		DTRACE_PROBE2(zsd__destroy__completed,
		    zone_t *, zone, zone_key_t, key);
	}
	if (!zone_lock_held)
		mutex_exit(&zone->zone_lock);
	return (dropped);
}

/*
 * Wait for any CREATE_NEEDED flag to be cleared.
 * Returns true if lockp was temporarily dropped while waiting.
 */
static boolean_t
zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
{
	boolean_t dropped = B_FALSE;

	while (t->zsd_flags & ZSD_CREATE_NEEDED) {
		DTRACE_PROBE2(zsd__wait__for__creator,
		    zone_t *, zone, struct zsd_entry *, t);
		if (lockp != NULL) {
			dropped = B_TRUE;
			mutex_exit(lockp);
		}
		cv_wait(&t->zsd_cv, &zone->zone_lock);
		if (lockp != NULL) {
			/* First drop zone_lock to preserve order */
			mutex_exit(&zone->zone_lock);
			mutex_enter(lockp);
			mutex_enter(&zone->zone_lock);
		}
	}
	return (dropped);
}

/*
 * Wait for any INPROGRESS flag to be cleared.
 * Returns true if lockp was temporarily dropped while waiting.
 */
static boolean_t
zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
{
	boolean_t dropped = B_FALSE;

	while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
		DTRACE_PROBE2(zsd__wait__for__inprogress,
		    zone_t *, zone, struct zsd_entry *, t);
		if (lockp != NULL) {
			dropped = B_TRUE;
			mutex_exit(lockp);
		}
		cv_wait(&t->zsd_cv, &zone->zone_lock);
		if (lockp != NULL) {
			/* First drop zone_lock to preserve order */
			mutex_exit(&zone->zone_lock);
			mutex_enter(lockp);
			mutex_enter(&zone->zone_lock);
		}
	}
	return (dropped);
}

/*
 * Frees memory associated with the zone dataset list.
 */
static void
zone_free_datasets(zone_t *zone)
{
	zone_dataset_t *t, *next;

	for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
		next = list_next(&zone->zone_datasets, t);
		list_remove(&zone->zone_datasets, t);
		kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
		kmem_free(t, sizeof (*t));
	}
	list_destroy(&zone->zone_datasets);
}

/*
 * zone.cpu-shares resource control support.
 */
/*ARGSUSED*/
static rctl_qty_t
zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	return (p->p_zone->zone_shares);
}

/*ARGSUSED*/
static int
zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
    rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);

	e->rcep_p.zone->zone_shares = nv;
	return (0);
}

static rctl_ops_t zone_cpu_shares_ops = {
	rcop_no_action,
	zone_cpu_shares_usage,
	zone_cpu_shares_set,
	rcop_no_test
};
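
/*
 * The ops vector above takes effect when the rctl is registered during
 * zone_init(); a sketch of that registration (illustration only; the
 * exact global flags follow the pattern used for the other zone rctls):
 *
 *	rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
 *	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
 *	    FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
 */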
1313
1314/*
1315 * zone.cpu-cap resource control support.
1316 */
1317/*ARGSUSED*/
1318static rctl_qty_t
1319zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
1320{
1321	ASSERT(MUTEX_HELD(&p->p_lock));
1322	return (cpucaps_zone_get(p->p_zone));
1323}
1324
1325/*ARGSUSED*/
1326static int
1327zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1328    rctl_qty_t nv)
1329{
1330	zone_t *zone = e->rcep_p.zone;
1331
1332	ASSERT(MUTEX_HELD(&p->p_lock));
1333	ASSERT(e->rcep_t == RCENTITY_ZONE);
1334
1335	if (zone == NULL)
1336		return (0);
1337
1338	/*
1339	 * set cap to the new value.
1340	 */
1341	return (cpucaps_zone_set(zone, nv));
1342}
1343
1344static rctl_ops_t zone_cpu_cap_ops = {
1345	rcop_no_action,
1346	zone_cpu_cap_get,
1347	zone_cpu_cap_set,
1348	rcop_no_test
1349};
1350
1351/*ARGSUSED*/
1352static rctl_qty_t
1353zone_lwps_usage(rctl_t *r, proc_t *p)
1354{
1355	rctl_qty_t nlwps;
1356	zone_t *zone = p->p_zone;
1357
1358	ASSERT(MUTEX_HELD(&p->p_lock));
1359
1360	mutex_enter(&zone->zone_nlwps_lock);
1361	nlwps = zone->zone_nlwps;
1362	mutex_exit(&zone->zone_nlwps_lock);
1363
1364	return (nlwps);
1365}
1366
1367/*ARGSUSED*/
1368static int
1369zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1370    rctl_qty_t incr, uint_t flags)
1371{
1372	rctl_qty_t nlwps;
1373
1374	ASSERT(MUTEX_HELD(&p->p_lock));
1375	ASSERT(e->rcep_t == RCENTITY_ZONE);
1376	if (e->rcep_p.zone == NULL)
1377		return (0);
1378	ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1379	nlwps = e->rcep_p.zone->zone_nlwps;
1380
1381	if (nlwps + incr > rcntl->rcv_value)
1382		return (1);
1383
1384	return (0);
1385}
1386
1387/*ARGSUSED*/
1388static int
1389zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1390{
1391	ASSERT(MUTEX_HELD(&p->p_lock));
1392	ASSERT(e->rcep_t == RCENTITY_ZONE);
1393	if (e->rcep_p.zone == NULL)
1394		return (0);
1395	e->rcep_p.zone->zone_nlwps_ctl = nv;
1396	return (0);
1397}
1398
1399static rctl_ops_t zone_lwps_ops = {
1400	rcop_no_action,
1401	zone_lwps_usage,
1402	zone_lwps_set,
1403	zone_lwps_test,
1404};
1405
1406/*ARGSUSED*/
1407static int
1408zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1409    rctl_qty_t incr, uint_t flags)
1410{
1411	rctl_qty_t v;
1412	ASSERT(MUTEX_HELD(&p->p_lock));
1413	ASSERT(e->rcep_t == RCENTITY_ZONE);
1414	v = e->rcep_p.zone->zone_shmmax + incr;
1415	if (v > rval->rcv_value)
1416		return (1);
1417	return (0);
1418}
1419
1420static rctl_ops_t zone_shmmax_ops = {
1421	rcop_no_action,
1422	rcop_no_usage,
1423	rcop_no_set,
1424	zone_shmmax_test
1425};
1426
1427/*ARGSUSED*/
1428static int
1429zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1430    rctl_qty_t incr, uint_t flags)
1431{
1432	rctl_qty_t v;
1433	ASSERT(MUTEX_HELD(&p->p_lock));
1434	ASSERT(e->rcep_t == RCENTITY_ZONE);
1435	v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
1436	if (v > rval->rcv_value)
1437		return (1);
1438	return (0);
1439}
1440
1441static rctl_ops_t zone_shmmni_ops = {
1442	rcop_no_action,
1443	rcop_no_usage,
1444	rcop_no_set,
1445	zone_shmmni_test
1446};
1447
1448/*ARGSUSED*/
1449static int
1450zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1451    rctl_qty_t incr, uint_t flags)
1452{
1453	rctl_qty_t v;
1454	ASSERT(MUTEX_HELD(&p->p_lock));
1455	ASSERT(e->rcep_t == RCENTITY_ZONE);
1456	v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
1457	if (v > rval->rcv_value)
1458		return (1);
1459	return (0);
1460}
1461
1462static rctl_ops_t zone_semmni_ops = {
1463	rcop_no_action,
1464	rcop_no_usage,
1465	rcop_no_set,
1466	zone_semmni_test
1467};
1468
1469/*ARGSUSED*/
1470static int
1471zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1472    rctl_qty_t incr, uint_t flags)
1473{
1474	rctl_qty_t v;
1475	ASSERT(MUTEX_HELD(&p->p_lock));
1476	ASSERT(e->rcep_t == RCENTITY_ZONE);
1477	v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
1478	if (v > rval->rcv_value)
1479		return (1);
1480	return (0);
1481}
1482
1483static rctl_ops_t zone_msgmni_ops = {
1484	rcop_no_action,
1485	rcop_no_usage,
1486	rcop_no_set,
1487	zone_msgmni_test
1488};
1489
1490/*ARGSUSED*/
1491static rctl_qty_t
1492zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
1493{
1494	rctl_qty_t q;
1495	ASSERT(MUTEX_HELD(&p->p_lock));
1496	mutex_enter(&p->p_zone->zone_mem_lock);
1497	q = p->p_zone->zone_locked_mem;
1498	mutex_exit(&p->p_zone->zone_mem_lock);
1499	return (q);
1500}
1501
1502/*ARGSUSED*/
1503static int
1504zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1505    rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1506{
1507	rctl_qty_t q;
1508	zone_t *z;
1509
1510	z = e->rcep_p.zone;
1511	ASSERT(MUTEX_HELD(&p->p_lock));
1512	ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1513	q = z->zone_locked_mem;
1514	if (q + incr > rcntl->rcv_value)
1515		return (1);
1516	return (0);
1517}
1518
1519/*ARGSUSED*/
1520static int
1521zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1522    rctl_qty_t nv)
1523{
1524	ASSERT(MUTEX_HELD(&p->p_lock));
1525	ASSERT(e->rcep_t == RCENTITY_ZONE);
1526	if (e->rcep_p.zone == NULL)
1527		return (0);
1528	e->rcep_p.zone->zone_locked_mem_ctl = nv;
1529	return (0);
1530}
1531
1532static rctl_ops_t zone_locked_mem_ops = {
1533	rcop_no_action,
1534	zone_locked_mem_usage,
1535	zone_locked_mem_set,
1536	zone_locked_mem_test
1537};
1538
1539/*ARGSUSED*/
1540static rctl_qty_t
1541zone_max_swap_usage(rctl_t *rctl, struct proc *p)
1542{
1543	rctl_qty_t q;
1544	zone_t *z = p->p_zone;
1545
1546	ASSERT(MUTEX_HELD(&p->p_lock));
1547	mutex_enter(&z->zone_mem_lock);
1548	q = z->zone_max_swap;
1549	mutex_exit(&z->zone_mem_lock);
1550	return (q);
1551}
1552
1553/*ARGSUSED*/
1554static int
1555zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1556    rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1557{
1558	rctl_qty_t q;
1559	zone_t *z;
1560
1561	z = e->rcep_p.zone;
1562	ASSERT(MUTEX_HELD(&p->p_lock));
1563	ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1564	q = z->zone_max_swap;
1565	if (q + incr > rcntl->rcv_value)
1566		return (1);
1567	return (0);
1568}
1569
1570/*ARGSUSED*/
1571static int
1572zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1573    rctl_qty_t nv)
1574{
1575	ASSERT(MUTEX_HELD(&p->p_lock));
1576	ASSERT(e->rcep_t == RCENTITY_ZONE);
1577	if (e->rcep_p.zone == NULL)
1578		return (0);
1579	e->rcep_p.zone->zone_max_swap_ctl = nv;
1580	return (0);
1581}
1582
1583static rctl_ops_t zone_max_swap_ops = {
1584	rcop_no_action,
1585	zone_max_swap_usage,
1586	zone_max_swap_set,
1587	zone_max_swap_test
1588};
1589
1590/*ARGSUSED*/
1591static rctl_qty_t
1592zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
1593{
1594	rctl_qty_t q;
1595	zone_t *z = p->p_zone;
1596
1597	ASSERT(MUTEX_HELD(&p->p_lock));
1598	mutex_enter(&z->zone_rctl_lock);
1599	q = z->zone_max_lofi;
1600	mutex_exit(&z->zone_rctl_lock);
1601	return (q);
1602}
1603
1604/*ARGSUSED*/
1605static int
1606zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1607    rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1608{
1609	rctl_qty_t q;
1610	zone_t *z;
1611
1612	z = e->rcep_p.zone;
1613	ASSERT(MUTEX_HELD(&p->p_lock));
1614	ASSERT(MUTEX_HELD(&z->zone_rctl_lock));
1615	q = z->zone_max_lofi;
1616	if (q + incr > rcntl->rcv_value)
1617		return (1);
1618	return (0);
1619}
1620
1621/*ARGSUSED*/
1622static int
1623zone_max_lofi_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1624    rctl_qty_t nv)
1625{
1626	ASSERT(MUTEX_HELD(&p->p_lock));
1627	ASSERT(e->rcep_t == RCENTITY_ZONE);
1628	if (e->rcep_p.zone == NULL)
1629		return (0);
1630	e->rcep_p.zone->zone_max_lofi_ctl = nv;
1631	return (0);
1632}
1633
1634static rctl_ops_t zone_max_lofi_ops = {
1635	rcop_no_action,
1636	zone_max_lofi_usage,
1637	zone_max_lofi_set,
1638	zone_max_lofi_test
1639};
1640
1641/*
1642 * Helper function to brand the zone with a unique ID.
1643 */
1644static void
1645zone_uniqid(zone_t *zone)
1646{
1647	static uint64_t uniqid = 0;
1648
1649	ASSERT(MUTEX_HELD(&zonehash_lock));
1650	zone->zone_uniqid = uniqid++;
1651}
1652
1653/*
1654 * Returns a held pointer to the "kcred" for the specified zone.
1655 */
1656struct cred *
1657zone_get_kcred(zoneid_t zoneid)
1658{
1659	zone_t *zone;
1660	cred_t *cr;
1661
1662	if ((zone = zone_find_by_id(zoneid)) == NULL)
1663		return (NULL);
1664	cr = zone->zone_kcred;
1665	crhold(cr);
1666	zone_rele(zone);
1667	return (cr);
1668}
1669
1670static int
1671zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
1672{
1673	zone_t *zone = ksp->ks_private;
1674	zone_kstat_t *zk = ksp->ks_data;
1675
1676	if (rw == KSTAT_WRITE)
1677		return (EACCES);
1678
1679	zk->zk_usage.value.ui64 = zone->zone_locked_mem;
1680	zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
1681	return (0);
1682}
1683
1684static int
1685zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1686{
1687	zone_t *zone = ksp->ks_private;
1688	zone_kstat_t *zk = ksp->ks_data;
1689
1690	if (rw == KSTAT_WRITE)
1691		return (EACCES);
1692
1693	zk->zk_usage.value.ui64 = zone->zone_max_swap;
1694	zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
1695	return (0);
1696}
1697
1698static void
1699zone_kstat_create(zone_t *zone)
1700{
1701	kstat_t *ksp;
1702	zone_kstat_t *zk;
1703
1704	ksp = rctl_kstat_create_zone(zone, "lockedmem", KSTAT_TYPE_NAMED,
1705	    sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1706	    KSTAT_FLAG_VIRTUAL);
1707
1708	if (ksp == NULL)
1709		return;
1710
1711	zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1712	ksp->ks_data_size += strlen(zone->zone_name) + 1;
1713	kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1714	kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1715	kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1716	kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1717	ksp->ks_update = zone_lockedmem_kstat_update;
1718	ksp->ks_private = zone;
1719	kstat_install(ksp);
1720
1721	zone->zone_lockedmem_kstat = ksp;
1722
1723	ksp = rctl_kstat_create_zone(zone, "swapresv", KSTAT_TYPE_NAMED,
1724	    sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1725	    KSTAT_FLAG_VIRTUAL);
1726
1727	if (ksp == NULL)
1728		return;
1729
1730	zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1731	ksp->ks_data_size += strlen(zone->zone_name) + 1;
1732	kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1733	kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1734	kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1735	kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1736	ksp->ks_update = zone_swapresv_kstat_update;
1737	ksp->ks_private = zone;
1738	kstat_install(ksp);
1739
1740	zone->zone_swapresv_kstat = ksp;
1741}
1742
1743static void
1744zone_kstat_delete(zone_t *zone)
1745{
1746	void *data;
1747
1748	if (zone->zone_lockedmem_kstat != NULL) {
1749		data = zone->zone_lockedmem_kstat->ks_data;
1750		kstat_delete(zone->zone_lockedmem_kstat);
1751		kmem_free(data, sizeof (zone_kstat_t));
1752	}
1753	if (zone->zone_swapresv_kstat != NULL) {
1754		data = zone->zone_swapresv_kstat->ks_data;
1755		kstat_delete(zone->zone_swapresv_kstat);
1756		kmem_free(data, sizeof (zone_kstat_t));
1757	}
1758}
1759
1760/*
1761 * Called very early on in boot to initialize the ZSD list so that
1762 * zone_key_create() can be called before zone_init().  It also initializes
1763 * portions of zone0 which may be used before zone_init() is called.  The
1764 * variable "global_zone" will be set when zone0 is fully initialized by
1765 * zone_init().
1766 */
1767void
1768zone_zsd_init(void)
1769{
1770	mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
1771	mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
1772	list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
1773	    offsetof(struct zsd_entry, zsd_linkage));
1774	list_create(&zone_active, sizeof (zone_t),
1775	    offsetof(zone_t, zone_linkage));
1776	list_create(&zone_deathrow, sizeof (zone_t),
1777	    offsetof(zone_t, zone_linkage));
1778
1779	mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
1780	mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
1781	mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
1782	zone0.zone_shares = 1;
1783	zone0.zone_nlwps = 0;
1784	zone0.zone_nlwps_ctl = INT_MAX;
1785	zone0.zone_locked_mem = 0;
1786	zone0.zone_locked_mem_ctl = UINT64_MAX;
1787	ASSERT(zone0.zone_max_swap == 0);
1788	zone0.zone_max_swap_ctl = UINT64_MAX;
1789	zone0.zone_max_lofi = 0;
1790	zone0.zone_max_lofi_ctl = UINT64_MAX;
1791	zone0.zone_shmmax = 0;
1792	zone0.zone_ipc.ipcq_shmmni = 0;
1793	zone0.zone_ipc.ipcq_semmni = 0;
1794	zone0.zone_ipc.ipcq_msgmni = 0;
1795	zone0.zone_name = GLOBAL_ZONENAME;
1796	zone0.zone_nodename = utsname.nodename;
1797	zone0.zone_domain = srpc_domain;
1798	zone0.zone_hostid = HW_INVALID_HOSTID;
1799	zone0.zone_fs_allowed = NULL;
1800	zone0.zone_ref = 1;
1801	zone0.zone_id = GLOBAL_ZONEID;
1802	zone0.zone_status = ZONE_IS_RUNNING;
1803	zone0.zone_rootpath = "/";
1804	zone0.zone_rootpathlen = 2;
1805	zone0.zone_psetid = ZONE_PS_INVAL;
1806	zone0.zone_ncpus = 0;
1807	zone0.zone_ncpus_online = 0;
1808	zone0.zone_proc_initpid = 1;
1809	zone0.zone_initname = initname;
1810	zone0.zone_lockedmem_kstat = NULL;
1811	zone0.zone_swapresv_kstat = NULL;
1812	list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
1813	    offsetof(struct zsd_entry, zsd_linkage));
1814	list_insert_head(&zone_active, &zone0);
1815
1816	/*
1817	 * The root filesystem is not mounted yet, so zone_rootvp cannot be set
1818	 * to anything meaningful.  It is assigned to be 'rootdir' in
1819	 * vfs_mountroot().
1820	 */
1821	zone0.zone_rootvp = NULL;
1822	zone0.zone_vfslist = NULL;
1823	zone0.zone_bootargs = initargs;
1824	zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
1825	/*
1826	 * The global zone has all privileges
1827	 */
1828	priv_fillset(zone0.zone_privset);
1829	/*
1830	 * Add p0 to the global zone
1831	 */
1832	zone0.zone_zsched = &p0;
1833	p0.p_zone = &zone0;
1834}
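
/*
 * Illustrative early ZSD registration that this function makes
 * possible (the callbacks are hypothetical):
 *
 *	static zone_key_t my_zsd_key;
 *
 *	zone_key_create(&my_zsd_key, my_create_cb, my_shutdown_cb,
 *	    my_destroy_cb);
 *
 * Since zone_zsd_init() runs very early in boot, such a call is safe
 * even before zone_init() finishes constructing zone0.
 */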
1835
1836/*
1837 * Compute a hash value based on the contents of the label and the DOI.  The
1838 * hash algorithm is somewhat arbitrary, but is based on the observation that
1839 * humans will likely pick labels that differ by amounts that work out to be
1840 * multiples of the number of hash chains, and thus stirring in some primes
1841 * should help.
1842 */
1843static uint_t
1844hash_bylabel(void *hdata, mod_hash_key_t key)
1845{
1846	const ts_label_t *lab = (ts_label_t *)key;
1847	const uint32_t *up, *ue;
1848	uint_t hash;
1849	int i;
1850
1851	_NOTE(ARGUNUSED(hdata));
1852
1853	hash = lab->tsl_doi + (lab->tsl_doi << 1);
1854	/* we depend on alignment of label, but not representation */
1855	up = (const uint32_t *)&lab->tsl_label;
1856	ue = up + sizeof (lab->tsl_label) / sizeof (*up);
1857	i = 1;
1858	while (up < ue) {
1859		/* using 2^n + 1, 1 <= n <= 16 as source of many primes */
1860		hash += *up + (*up << ((i % 16) + 1));
1861		up++;
1862		i++;
1863	}
1864	return (hash);
1865}
1866
1867/*
1868 * All that mod_hash cares about here is zero (equal) versus non-zero (not
1869 * equal).  This may need to be changed if less than / greater than is ever
1870 * needed.
1871 */
1872static int
1873hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
1874{
1875	ts_label_t *lab1 = (ts_label_t *)key1;
1876	ts_label_t *lab2 = (ts_label_t *)key2;
1877
1878	return (label_equal(lab1, lab2) ? 0 : 1);
1879}
1880
1881/*
1882 * Called by main() to initialize the zones framework.
1883 */
1884void
1885zone_init(void)
1886{
1887	rctl_dict_entry_t *rde;
1888	rctl_val_t *dval;
1889	rctl_set_t *set;
1890	rctl_alloc_gp_t *gp;
1891	rctl_entity_p_t e;
1892	int res;
1893
1894	ASSERT(curproc == &p0);
1895
1896	/*
1897	 * Create ID space for zone IDs.  ID 0 is reserved for the
1898	 * global zone.
1899	 */
1900	zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
1901
1902	/*
1903	 * Initialize generic zone resource controls, if any.
1904	 */
1905	rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
1906	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
1907	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
1908	    FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
1909
1910	rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
1911	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
1912	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER |
1913	    RCTL_GLOBAL_INFINITE,
1914	    MAXCAP, MAXCAP, &zone_cpu_cap_ops);
1915
1916	rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
1917	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
1918	    INT_MAX, INT_MAX, &zone_lwps_ops);
1919	/*
1920	 * System V IPC resource controls
1921	 */
1922	rc_zone_msgmni = rctl_register("zone.max-msg-ids",
1923	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
1924	    RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
1925
1926	rc_zone_semmni = rctl_register("zone.max-sem-ids",
1927	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
1928	    RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);
1929
1930	rc_zone_shmmni = rctl_register("zone.max-shm-ids",
1931	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
1932	    RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);
1933
1934	rc_zone_shmmax = rctl_register("zone.max-shm-memory",
1935	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
1936	    RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);
1937
1938	/*
1939	 * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
1940	 * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
1941	 */
1942	dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
1943	bzero(dval, sizeof (rctl_val_t));
1944	dval->rcv_value = 1;
1945	dval->rcv_privilege = RCPRIV_PRIVILEGED;
1946	dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
1947	dval->rcv_action_recip_pid = -1;
1948
1949	rde = rctl_dict_lookup("zone.cpu-shares");
1950	(void) rctl_val_list_insert(&rde->rcd_default_value, dval);
1951
1952	rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
1953	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
1954	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
1955	    &zone_locked_mem_ops);
1956
1957	rc_zone_max_swap = rctl_register("zone.max-swap",
1958	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
1959	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
1960	    &zone_max_swap_ops);
1961
1962	rc_zone_max_lofi = rctl_register("zone.max-lofi",
1963	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
1964	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
1965	    &zone_max_lofi_ops);
1966
1967	/*
1968	 * Initialize the ``global zone''.
1969	 */
1970	set = rctl_set_create();
1971	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
1972	mutex_enter(&p0.p_lock);
1973	e.rcep_p.zone = &zone0;
1974	e.rcep_t = RCENTITY_ZONE;
1975	zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
1976	    gp);
1977
1978	zone0.zone_nlwps = p0.p_lwpcnt;
1979	zone0.zone_ntasks = 1;
1980	mutex_exit(&p0.p_lock);
1981	zone0.zone_restart_init = B_TRUE;
1982	zone0.zone_brand = &native_brand;
1983	rctl_prealloc_destroy(gp);
1984	/*
1985	 * pool_default hasn't been initialized yet, so we let pool_init()
1986	 * take care of making sure the global zone is in the default pool.
1987	 */
1988
1989	/*
1990	 * Initialize global zone kstats
1991	 */
1992	zone_kstat_create(&zone0);
1993
1994	/*
1995	 * Initialize zone label.
1996	 * MLPs (multilevel ports) are initialized when tnzonecfg is loaded.
1997	 */
1998	zone0.zone_slabel = l_admin_low;
1999	rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
2000	label_hold(l_admin_low);
2001
2002	/*
2003	 * Initialize the lock for the database structure used by mntfs.
2004	 */
2005	rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
2006
2007	mutex_enter(&zonehash_lock);
2008	zone_uniqid(&zone0);
2009	ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
2010
2011	zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
2012	    mod_hash_null_valdtor);
2013	zonehashbyname = mod_hash_create_strhash("zone_by_name",
2014	    zone_hash_size, mod_hash_null_valdtor);
2015	/*
2016	 * maintain zonehashbylabel only for labeled systems
2017	 */
2018	if (is_system_labeled())
2019		zonehashbylabel = mod_hash_create_extended("zone_by_label",
2020		    zone_hash_size, mod_hash_null_keydtor,
2021		    mod_hash_null_valdtor, hash_bylabel, NULL,
2022		    hash_labelkey_cmp, KM_SLEEP);
2023	zonecount = 1;
2024
2025	(void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
2026	    (mod_hash_val_t)&zone0);
2027	(void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
2028	    (mod_hash_val_t)&zone0);
2029	if (is_system_labeled()) {
2030		zone0.zone_flags |= ZF_HASHED_LABEL;
2031		(void) mod_hash_insert(zonehashbylabel,
2032		    (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
2033	}
2034	mutex_exit(&zonehash_lock);
2035
2036	/*
2037	 * We avoid setting zone_kcred until now, since kcred is initialized
2038	 * sometime after zone_zsd_init() and before zone_init().
2039	 */
2040	zone0.zone_kcred = kcred;
2041	/*
2042	 * The global zone is fully initialized (except for zone_rootvp which
2043	 * will be set when the root filesystem is mounted).
2044	 */
2045	global_zone = &zone0;
2046
2047	/*
2048	 * Setup an event channel to send zone status change notifications on
2049	 */
2050	res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
2051	    EVCH_CREAT);
2052
2053	if (res)
2054		panic("sysevent_evc_bind failed during zone setup.\n");
2056}
2057
2058static void
2059zone_free(zone_t *zone)
2060{
2061	ASSERT(zone != global_zone);
2062	ASSERT(zone->zone_ntasks == 0);
2063	ASSERT(zone->zone_nlwps == 0);
2064	ASSERT(zone->zone_cred_ref == 0);
2065	ASSERT(zone->zone_kcred == NULL);
2066	ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
2067	    zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2068
2069	/*
2070	 * Remove any zone caps.
2071	 */
2072	cpucaps_zone_remove(zone);
2073
2074	ASSERT(zone->zone_cpucap == NULL);
2075
2076	/* remove from deathrow list */
2077	if (zone_status_get(zone) == ZONE_IS_DEAD) {
2078		ASSERT(zone->zone_ref == 0);
2079		mutex_enter(&zone_deathrow_lock);
2080		list_remove(&zone_deathrow, zone);
2081		mutex_exit(&zone_deathrow_lock);
2082	}
2083
2084	zone_free_zsd(zone);
2085	zone_free_datasets(zone);
2086	list_destroy(&zone->zone_dl_list);
2087
2088	if (zone->zone_rootvp != NULL)
2089		VN_RELE(zone->zone_rootvp);
2090	if (zone->zone_rootpath)
2091		kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
2092	if (zone->zone_name != NULL)
2093		kmem_free(zone->zone_name, ZONENAME_MAX);
2094	if (zone->zone_slabel != NULL)
2095		label_rele(zone->zone_slabel);
2096	if (zone->zone_nodename != NULL)
2097		kmem_free(zone->zone_nodename, _SYS_NMLN);
2098	if (zone->zone_domain != NULL)
2099		kmem_free(zone->zone_domain, _SYS_NMLN);
2100	if (zone->zone_privset != NULL)
2101		kmem_free(zone->zone_privset, sizeof (priv_set_t));
2102	if (zone->zone_rctls != NULL)
2103		rctl_set_free(zone->zone_rctls);
2104	if (zone->zone_bootargs != NULL)
2105		strfree(zone->zone_bootargs);
2106	if (zone->zone_initname != NULL)
2107		strfree(zone->zone_initname);
2108	if (zone->zone_fs_allowed != NULL)
2109		strfree(zone->zone_fs_allowed);
2110	if (zone->zone_pfexecd != NULL)
2111		klpd_freelist(&zone->zone_pfexecd);
2112	id_free(zoneid_space, zone->zone_id);
2113	mutex_destroy(&zone->zone_lock);
2114	cv_destroy(&zone->zone_cv);
2115	rw_destroy(&zone->zone_mlps.mlpl_rwlock);
2116	rw_destroy(&zone->zone_mntfs_db_lock);
2117	kmem_free(zone, sizeof (zone_t));
2118}
2119
2120/*
2121 * Convenience function for setting zone status; see the block comment
2122 * at the top of this file for details on the status values and legal
2123 * transitions between them.  A transition may only move a zone forward
2124 * through its lifecycle (the ASSERT below enforces this), and each
2125 * transition is published as a sysevent on the zone event channel.
2126 */
2127static void
2128zone_status_set(zone_t *zone, zone_status_t status)
2129{
2130	nvlist_t *nvl = NULL;
2131
2132	ASSERT(MUTEX_HELD(&zone_status_lock));
2133	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
2134	    status >= zone_status_get(zone));
2135
2136	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
2137	    nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
2138	    nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
2139	    zone_status_table[status]) ||
2140	    nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
2141	    zone_status_table[zone->zone_status]) ||
2142	    nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
2143	    nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
2144	    sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
2145	    ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
2146#ifdef DEBUG
2147		(void) printf(
2148		    "Failed to allocate and send zone state change event.\n");
2149#endif
2150	}
2151	nvlist_free(nvl);
2152
2153	zone->zone_status = status;
2154
2155	cv_broadcast(&zone->zone_cv);
2156}
2157
2158/*
2159 * Public function to retrieve the zone status.  The zone status may
2160 * change after it is retrieved.
2161 */
2162zone_status_t
2163zone_status_get(zone_t *zone)
2164{
2165	return (zone->zone_status);
2166}
2167
2168static int
2169zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
2170{
2171	char *buf = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
2172	int err = 0;
2173
2174	ASSERT(zone != global_zone);
2175	if ((err = copyinstr(zone_bootargs, buf, BOOTARGS_MAX, NULL)) != 0)
2176		goto done;	/* EFAULT or ENAMETOOLONG */
2177
2178	if (zone->zone_bootargs != NULL)
2179		strfree(zone->zone_bootargs);
2180
2181	zone->zone_bootargs = strdup(buf);
2182
2183done:
2184	kmem_free(buf, BOOTARGS_MAX);
2185	return (err);
2186}
2187
2188static int
2189zone_set_brand(zone_t *zone, const char *brand)
2190{
2191	struct brand_attr *attrp;
2192	brand_t *bp;
2193
2194	attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
2195	if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
2196		kmem_free(attrp, sizeof (struct brand_attr));
2197		return (EFAULT);
2198	}
2199
2200	bp = brand_register_zone(attrp);
2201	kmem_free(attrp, sizeof (struct brand_attr));
2202	if (bp == NULL)
2203		return (EINVAL);
2204
2205	/*
2206	 * This is the only place where a zone can change its brand.
2207	 * We already need to hold zone_status_lock to check the zone
2208	 * status, so we'll just use that lock to serialize zone
2209	 * branding requests as well.
2210	 */
2211	mutex_enter(&zone_status_lock);
2212
2213	/* Re-branding is not allowed, and a brand may only be set before boot */
2214	if ((ZONE_IS_BRANDED(zone)) ||
2215	    (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
2216		mutex_exit(&zone_status_lock);
2217		brand_unregister_zone(bp);
2218		return (EINVAL);
2219	}
2220
2221	/* set up the brand specific data */
2222	zone->zone_brand = bp;
2223	ZBROP(zone)->b_init_brand_data(zone);
2224
2225	mutex_exit(&zone_status_lock);
2226	return (0);
2227}
2228
2229static int
2230zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
2231{
2232	char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
2233	int err = 0;
2234
2235	ASSERT(zone != global_zone);
2236	if ((err = copyinstr(zone_fs_allowed, buf,
2237	    ZONE_FS_ALLOWED_MAX, NULL)) != 0)
2238		goto done;
2239
2240	if (zone->zone_fs_allowed != NULL)
2241		strfree(zone->zone_fs_allowed);
2242
2243	zone->zone_fs_allowed = strdup(buf);
2244
2245done:
2246	kmem_free(buf, ZONE_FS_ALLOWED_MAX);
2247	return (err);
2248}
2249
2250static int
2251zone_set_initname(zone_t *zone, const char *zone_initname)
2252{
2253	char initname[INITNAME_SZ];
2254	size_t len;
2255	int err = 0;
2256
2257	ASSERT(zone != global_zone);
2258	if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
2259		return (err);	/* EFAULT or ENAMETOOLONG */
2260
2261	if (zone->zone_initname != NULL)
2262		strfree(zone->zone_initname);
2263
2264	zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
2265	(void) strcpy(zone->zone_initname, initname);
2266	return (0);
2267}
2268
2269static int
2270zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
2271{
2272	uint64_t mcap;
2273	int err = 0;
2274
2275	if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
2276		zone->zone_phys_mcap = mcap;
2277
2278	return (err);
2279}
2280
2281static int
2282zone_set_sched_class(zone_t *zone, const char *new_class)
2283{
2284	char sched_class[PC_CLNMSZ];
2285	id_t classid;
2286	int err;
2287
2288	ASSERT(zone != global_zone);
2289	if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
2290		return (err);	/* EFAULT or ENAMETOOLONG */
2291
2292	if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
2293		return (set_errno(EINVAL));
2294	zone->zone_defaultcid = classid;
2295	ASSERT(zone->zone_defaultcid > 0 &&
2296	    zone->zone_defaultcid < loaded_classes);
2297
2298	return (0);
2299}
2300
2301/*
2302 * Block indefinitely waiting for (zone_status >= status)
2303 */
2304void
2305zone_status_wait(zone_t *zone, zone_status_t status)
2306{
2307	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2308
2309	mutex_enter(&zone_status_lock);
2310	while (zone->zone_status < status) {
2311		cv_wait(&zone->zone_cv, &zone_status_lock);
2312	}
2313	mutex_exit(&zone_status_lock);
2314}
2315
2316/*
2317 * Private CPR-safe version of zone_status_wait().
2318 */
2319static void
2320zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
2321{
2322	callb_cpr_t cprinfo;
2323
2324	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2325
2326	CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
2327	    str);
2328	mutex_enter(&zone_status_lock);
2329	while (zone->zone_status < status) {
2330		CALLB_CPR_SAFE_BEGIN(&cprinfo);
2331		cv_wait(&zone->zone_cv, &zone_status_lock);
2332		CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
2333	}
2334	/*
2335	 * zone_status_lock is implicitly released by the following.
2336	 */
2337	CALLB_CPR_EXIT(&cprinfo);
2338}
2339
2340/*
2341 * Block until zone enters requested state or signal is received.  Return (0)
2342 * if signaled, non-zero otherwise.
2343 */
2344int
2345zone_status_wait_sig(zone_t *zone, zone_status_t status)
2346{
2347	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2348
2349	mutex_enter(&zone_status_lock);
2350	while (zone->zone_status < status) {
2351		if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
2352			mutex_exit(&zone_status_lock);
2353			return (0);
2354		}
2355	}
2356	mutex_exit(&zone_status_lock);
2357	return (1);
2358}
2359
2360/*
2361 * Block until the zone enters the requested state or the timeout expires,
2362 * whichever happens first.  Return (-1) if operation timed out, time remaining
2363 * otherwise.
2364 */
2365clock_t
2366zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
2367{
2368	clock_t timeleft = 0;
2369
2370	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2371
2372	mutex_enter(&zone_status_lock);
2373	while (zone->zone_status < status && timeleft != -1) {
2374		timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
2375	}
2376	mutex_exit(&zone_status_lock);
2377	return (timeleft);
2378}
2379
2380/*
2381 * Block until the zone enters the requested state, the current process is
2382 * signaled,  or the timeout expires, whichever happens first.  Return (-1) if
2383 * operation timed out, 0 if signaled, time remaining otherwise.
2384 */
2385clock_t
2386zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
2387{
2388	clock_t timeleft = tim - ddi_get_lbolt();
2389
2390	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2391
2392	mutex_enter(&zone_status_lock);
2393	while (zone->zone_status < status) {
2394		timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
2395		    tim);
2396		if (timeleft <= 0)
2397			break;
2398	}
2399	mutex_exit(&zone_status_lock);
2400	return (timeleft);
2401}
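
/*
 * Illustrative sketch of the wait family above: a hypothetical caller
 * that waits at most five seconds for a zone to come up might do:
 *
 *	clock_t deadline = ddi_get_lbolt() + SEC_TO_TICK(5);
 *
 *	if (zone_status_timedwait(zone, deadline, ZONE_IS_RUNNING) == -1)
 *		... timed out; the zone never reached ZONE_IS_RUNNING ...
 *
 * The timeout argument is an absolute lbolt value, as it is passed
 * straight through to cv_timedwait().
 */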
2402
2403/*
2404 * Zones have two reference counts: one for references from credential
2405 * structures (zone_cred_ref), and one (zone_ref) for everything else.
2406 * This is so we can allow a zone to be rebooted while there are still
2407 * outstanding cred references, since certain drivers cache dblks (which
2408 * implicitly results in cached creds).  We wait for zone_ref to drop to
2409 * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
2410 * later freed when the zone_cred_ref drops to 0, though nothing other
2411 * than the zone id and privilege set should be accessed once the zone
2412 * is "dead".
2413 *
2414 * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
2415 * to force halt/reboot to block waiting for the zone_cred_ref to drop
2416 * to 0.  This can be useful to flush out other sources of cached creds
2417 * that may be less innocuous than the driver case.
2418 */
2419
2420int zone_wait_for_cred = 0;
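
/*
 * Typical hold/release pattern for the non-cred reference count
 * (illustrative; zone_find_by_id() below returns with the hold
 * already taken):
 *
 *	zone_t *zp;
 *
 *	if ((zp = zone_find_by_id(zid)) != NULL) {
 *		... use zp; it cannot be freed while held ...
 *		zone_rele(zp);
 *	}
 */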
2421
2422static void
2423zone_hold_locked(zone_t *z)
2424{
2425	ASSERT(MUTEX_HELD(&z->zone_lock));
2426	z->zone_ref++;
2427	ASSERT(z->zone_ref != 0);
2428}
2429
2430void
2431zone_hold(zone_t *z)
2432{
2433	mutex_enter(&z->zone_lock);
2434	zone_hold_locked(z);
2435	mutex_exit(&z->zone_lock);
2436}
2437
2438/*
2439 * If the non-cred ref count drops to 1 and either the cred ref count
2440 * is 0 or we aren't waiting for cred references, the zone is ready to
2441 * be destroyed.
2442 */
2443#define	ZONE_IS_UNREF(zone)	((zone)->zone_ref == 1 && \
2444	    (!zone_wait_for_cred || (zone)->zone_cred_ref == 0))
2445
2446void
2447zone_rele(zone_t *z)
2448{
2449	boolean_t wakeup;
2450
2451	mutex_enter(&z->zone_lock);
2452	ASSERT(z->zone_ref != 0);
2453	z->zone_ref--;
2454	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2455		/* no more refs, free the structure */
2456		mutex_exit(&z->zone_lock);
2457		zone_free(z);
2458		return;
2459	}
2460	/* signal zone_destroy so the zone can finish halting */
2461	wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
2462	mutex_exit(&z->zone_lock);
2463
2464	if (wakeup) {
2465		/*
2466		 * Grabbing zonehash_lock here effectively synchronizes with
2467		 * zone_destroy() to avoid missed signals.
2468		 */
2469		mutex_enter(&zonehash_lock);
2470		cv_broadcast(&zone_destroy_cv);
2471		mutex_exit(&zonehash_lock);
2472	}
2473}
2474
2475void
2476zone_cred_hold(zone_t *z)
2477{
2478	mutex_enter(&z->zone_lock);
2479	z->zone_cred_ref++;
2480	ASSERT(z->zone_cred_ref != 0);
2481	mutex_exit(&z->zone_lock);
2482}
2483
2484void
2485zone_cred_rele(zone_t *z)
2486{
2487	boolean_t wakeup;
2488
2489	mutex_enter(&z->zone_lock);
2490	ASSERT(z->zone_cred_ref != 0);
2491	z->zone_cred_ref--;
2492	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2493		/* no more refs, free the structure */
2494		mutex_exit(&z->zone_lock);
2495		zone_free(z);
2496		return;
2497	}
2498	/*
2499	 * If zone_destroy is waiting for the cred references to drain
2500	 * out, and they have, signal it.
2501	 */
2502	wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
2503	    zone_status_get(z) >= ZONE_IS_DEAD);
2504	mutex_exit(&z->zone_lock);
2505
2506	if (wakeup) {
2507		/*
2508		 * Grabbing zonehash_lock here effectively synchronizes with
2509		 * zone_destroy() to avoid missed signals.
2510		 */
2511		mutex_enter(&zonehash_lock);
2512		cv_broadcast(&zone_destroy_cv);
2513		mutex_exit(&zonehash_lock);
2514	}
2515}
2516
2517void
2518zone_task_hold(zone_t *z)
2519{
2520	mutex_enter(&z->zone_lock);
2521	z->zone_ntasks++;
2522	ASSERT(z->zone_ntasks != 0);
2523	mutex_exit(&z->zone_lock);
2524}
2525
2526void
2527zone_task_rele(zone_t *zone)
2528{
2529	uint_t refcnt;
2530
2531	mutex_enter(&zone->zone_lock);
2532	ASSERT(zone->zone_ntasks != 0);
2533	refcnt = --zone->zone_ntasks;
2534	if (refcnt > 1) {	/* Common case */
2535		mutex_exit(&zone->zone_lock);
2536		return;
2537	}
2538	zone_hold_locked(zone);	/* so we can use the zone_t later */
2539	mutex_exit(&zone->zone_lock);
2540	if (refcnt == 1) {
2541		/*
2542		 * See if the zone is shutting down.
2543		 */
2544		mutex_enter(&zone_status_lock);
2545		if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
2546			goto out;
2547		}
2548
2549		/*
2550		 * Make sure the ntasks didn't change since we
2551		 * dropped zone_lock.
2552		 */
2553		mutex_enter(&zone->zone_lock);
2554		if (refcnt != zone->zone_ntasks) {
2555			mutex_exit(&zone->zone_lock);
2556			goto out;
2557		}
2558		mutex_exit(&zone->zone_lock);
2559
2560		/*
2561		 * No more user processes in the zone.  The zone is empty.
2562		 */
2563		zone_status_set(zone, ZONE_IS_EMPTY);
2564		goto out;
2565	}
2566
2567	ASSERT(refcnt == 0);
2568	/*
2569	 * zsched has exited; the zone is dead.
2570	 */
2571	zone->zone_zsched = NULL;		/* paranoia */
2572	mutex_enter(&zone_status_lock);
2573	zone_status_set(zone, ZONE_IS_DEAD);
2574out:
2575	mutex_exit(&zone_status_lock);
2576	zone_rele(zone);
2577}
2578
2579zoneid_t
2580getzoneid(void)
2581{
2582	return (curproc->p_zone->zone_id);
2583}
2584
2585/*
2586 * Internal versions of zone_find_by_*().  These don't zone_hold() or
2587 * check the validity of a zone's state.
2588 */
2589static zone_t *
2590zone_find_all_by_id(zoneid_t zoneid)
2591{
2592	mod_hash_val_t hv;
2593	zone_t *zone = NULL;
2594
2595	ASSERT(MUTEX_HELD(&zonehash_lock));
2596
2597	if (mod_hash_find(zonehashbyid,
2598	    (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
2599		zone = (zone_t *)hv;
2600	return (zone);
2601}
2602
2603static zone_t *
2604zone_find_all_by_label(const ts_label_t *label)
2605{
2606	mod_hash_val_t hv;
2607	zone_t *zone = NULL;
2608
2609	ASSERT(MUTEX_HELD(&zonehash_lock));
2610
2611	/*
2612	 * zonehashbylabel is not maintained for unlabeled systems
2613	 */
2614	if (!is_system_labeled())
2615		return (NULL);
2616	if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
2617		zone = (zone_t *)hv;
2618	return (zone);
2619}
2620
2621static zone_t *
2622zone_find_all_by_name(char *name)
2623{
2624	mod_hash_val_t hv;
2625	zone_t *zone = NULL;
2626
2627	ASSERT(MUTEX_HELD(&zonehash_lock));
2628
2629	if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
2630		zone = (zone_t *)hv;
2631	return (zone);
2632}
2633
2634/*
2635 * Public interface for looking up a zone by zoneid.  Only returns the zone if
2636 * it is fully initialized, and has not yet begun the zone_destroy() sequence.
2637 * Caller must call zone_rele() once it is done with the zone.
2638 *
2639 * The zone may begin the zone_destroy() sequence immediately after this
2640 * function returns, but may be safely used until zone_rele() is called.
2641 */
2642zone_t *
2643zone_find_by_id(zoneid_t zoneid)
2644{
2645	zone_t *zone;
2646	zone_status_t status;
2647
2648	mutex_enter(&zonehash_lock);
2649	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
2650		mutex_exit(&zonehash_lock);
2651		return (NULL);
2652	}
2653	status = zone_status_get(zone);
2654	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
2655		/*
2656		 * For all practical purposes the zone doesn't exist.
2657		 */
2658		mutex_exit(&zonehash_lock);
2659		return (NULL);
2660	}
2661	zone_hold(zone);
2662	mutex_exit(&zonehash_lock);
2663	return (zone);
2664}
2665
2666/*
2667 * Similar to zone_find_by_id, but using zone label as the key.
2668 */
2669zone_t *
2670zone_find_by_label(const ts_label_t *label)
2671{
2672	zone_t *zone;
2673	zone_status_t status;
2674
2675	mutex_enter(&zonehash_lock);
2676	if ((zone = zone_find_all_by_label(label)) == NULL) {
2677		mutex_exit(&zonehash_lock);
2678		return (NULL);
2679	}
2680
2681	status = zone_status_get(zone);
2682	if (status > ZONE_IS_DOWN) {
2683		/*
2684		 * For all practical purposes the zone doesn't exist.
2685		 */
2686		mutex_exit(&zonehash_lock);
2687		return (NULL);
2688	}
2689	zone_hold(zone);
2690	mutex_exit(&zonehash_lock);
2691	return (zone);
2692}
2693
2694/*
2695 * Similar to zone_find_by_id, but using zone name as the key.
2696 */
2697zone_t *
2698zone_find_by_name(char *name)
2699{
2700	zone_t *zone;
2701	zone_status_t status;
2702
2703	mutex_enter(&zonehash_lock);
2704	if ((zone = zone_find_all_by_name(name)) == NULL) {
2705		mutex_exit(&zonehash_lock);
2706		return (NULL);
2707	}
2708	status = zone_status_get(zone);
2709	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
2710		/*
2711		 * For all practical purposes the zone doesn't exist.
2712		 */
2713		mutex_exit(&zonehash_lock);
2714		return (NULL);
2715	}
2716	zone_hold(zone);
2717	mutex_exit(&zonehash_lock);
2718	return (zone);
2719}
2720
2721/*
2722 * Similar to zone_find_by_id(), using the path as a key.  For instance,
2723 * if there is a zone "foo" rooted at /foo/root, and the path argument
2724 * is "/foo/root/proc", it will return the held zone_t corresponding to
2725 * zone "foo".
2726 *
2727 * zone_find_by_path() always returns a non-NULL value, since at the
2728 * very least every path will be contained in the global zone.
2729 *
2730 * As with the other zone_find_by_*() functions, the caller is
2731 * responsible for zone_rele()ing the return value of this function.
2732 */
2733zone_t *
2734zone_find_by_path(const char *path)
2735{
2736	zone_t *zone;
2737	zone_t *zret = NULL;
2738	zone_status_t status;
2739
2740	if (path == NULL) {
2741		/*
2742		 * Call from rootconf().
2743		 */
2744		zone_hold(global_zone);
2745		return (global_zone);
2746	}
2747	ASSERT(*path == '/');
2748	mutex_enter(&zonehash_lock);
2749	for (zone = list_head(&zone_active); zone != NULL;
2750	    zone = list_next(&zone_active, zone)) {
2751		if (ZONE_PATH_VISIBLE(path, zone))
2752			zret = zone;
2753	}
2754	ASSERT(zret != NULL);
2755	status = zone_status_get(zret);
2756	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
2757		/*
2758		 * Zone practically doesn't exist.
2759		 */
2760		zret = global_zone;
2761	}
2762	zone_hold(zret);
2763	mutex_exit(&zonehash_lock);
2764	return (zret);
2765}
2766
2767/*
2768 * Get the number of cpus visible to this zone.  The system-wide global
2769 * 'ncpus' is returned if pools are disabled, the caller is in the
2770 * global zone, or a NULL zone argument is passed in.
2771 */
2772int
2773zone_ncpus_get(zone_t *zone)
2774{
2775	int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
2776
2777	return (myncpus != 0 ? myncpus : ncpus);
2778}
2779
2780/*
2781 * Get the number of online cpus visible to this zone.  The system-wide
2782 * global 'ncpus_online' is returned if pools are disabled, the caller
2783 * is in the global zone, or a NULL zone argument is passed in.
2784 */
2785int
2786zone_ncpus_online_get(zone_t *zone)
2787{
2788	int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
2789
2790	return (myncpus_online != 0 ? myncpus_online : ncpus_online);
2791}
2792
2793/*
2794 * Return the pool to which the zone is currently bound.
2795 */
2796pool_t *
2797zone_pool_get(zone_t *zone)
2798{
2799	ASSERT(pool_lock_held());
2800
2801	return (zone->zone_pool);
2802}
2803
2804/*
2805 * Set the zone's pool pointer and update the zone's visibility to match
2806 * the resources in the new pool.
2807 */
2808void
2809zone_pool_set(zone_t *zone, pool_t *pool)
2810{
2811	ASSERT(pool_lock_held());
2812	ASSERT(MUTEX_HELD(&cpu_lock));
2813
2814	zone->zone_pool = pool;
2815	zone_pset_set(zone, pool->pool_pset->pset_id);
2816}
2817
2818/*
2819 * Return the cached value of the id of the processor set to which the
2820 * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
2821 * facility is disabled.
2822 */
2823psetid_t
2824zone_pset_get(zone_t *zone)
2825{
2826	ASSERT(MUTEX_HELD(&cpu_lock));
2827
2828	return (zone->zone_psetid);
2829}
2830
2831/*
2832 * Set the cached value of the id of the processor set to which the zone
2833 * is currently bound.  Also update the zone's visibility to match the
2834 * resources in the new processor set.
2835 */
2836void
2837zone_pset_set(zone_t *zone, psetid_t newpsetid)
2838{
2839	psetid_t oldpsetid;
2840
2841	ASSERT(MUTEX_HELD(&cpu_lock));
2842	oldpsetid = zone_pset_get(zone);
2843
2844	if (oldpsetid == newpsetid)
2845		return;
2846	/*
2847	 * Global zone sees all.
2848	 */
2849	if (zone != global_zone) {
2850		zone->zone_psetid = newpsetid;
2851		if (newpsetid != ZONE_PS_INVAL)
2852			pool_pset_visibility_add(newpsetid, zone);
2853		if (oldpsetid != ZONE_PS_INVAL)
2854			pool_pset_visibility_remove(oldpsetid, zone);
2855	}
2856	/*
2857	 * Disabling pools, so we should start using the global values
2858	 * for ncpus and ncpus_online.
2859	 */
2860	if (newpsetid == ZONE_PS_INVAL) {
2861		zone->zone_ncpus = 0;
2862		zone->zone_ncpus_online = 0;
2863	}
2864}
2865
2866/*
2867 * Walk the list of active zones and issue the provided callback for
2868 * each of them.
2869 *
2870 * Caller must not be holding any locks that may be acquired under
2871 * zonehash_lock.  See comment at the beginning of the file for a list of
2872 * common locks and their interactions with zones.
2873 */
2874int
2875zone_walk(int (*cb)(zone_t *, void *), void *data)
2876{
2877	zone_t *zone;
2878	int ret = 0;
2879	zone_status_t status;
2880
2881	mutex_enter(&zonehash_lock);
2882	for (zone = list_head(&zone_active); zone != NULL;
2883	    zone = list_next(&zone_active, zone)) {
2884		/*
2885		 * Skip zones that shouldn't be externally visible.
2886		 */
2887		status = zone_status_get(zone);
2888		if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
2889			continue;
2890		/*
2891		 * Bail immediately if any callback invocation returns a
2892		 * non-zero value.
2893		 */
2894		ret = (*cb)(zone, data);
2895		if (ret != 0)
2896			break;
2897	}
2898	mutex_exit(&zonehash_lock);
2899	return (ret);
2900}
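
/*
 * Illustrative zone_walk() callback (hypothetical) that counts the
 * externally visible zones; a non-zero return would stop the walk:
 *
 *	static int
 *	zone_count_cb(zone_t *zp, void *arg)
 *	{
 *		(*(uint_t *)arg)++;
 *		return (0);
 *	}
 *
 *	uint_t nzones = 0;
 *	(void) zone_walk(zone_count_cb, &nzones);
 */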
2901
2902static int
2903zone_set_root(zone_t *zone, const char *upath)
2904{
2905	vnode_t *vp;
2906	int trycount;
2907	int error = 0;
2908	char *path;
2909	struct pathname upn, pn;
2910	size_t pathlen;
2911
2912	if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
2913		return (error);
2914
2915	pn_alloc(&pn);
2916
2917	/* prevent infinite loop */
2918	trycount = 10;
2919	for (;;) {
2920		if (--trycount <= 0) {
2921			error = ESTALE;
2922			goto out;
2923		}
2924
2925		if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
2926			/*
2927			 * VOP_ACCESS() may cover 'vp' with a new
2928			 * filesystem, if 'vp' is an autoFS vnode.
2929			 * Get the new 'vp' if so.
2930			 */
2931			if ((error =
2932			    VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) == 0 &&
2933			    (!vn_ismntpt(vp) ||
2934			    (error = traverse(&vp)) == 0)) {
2935				pathlen = pn.pn_pathlen + 2;
2936				path = kmem_alloc(pathlen, KM_SLEEP);
2937				(void) strncpy(path, pn.pn_path,
2938				    pn.pn_pathlen + 1);
2939				path[pathlen - 2] = '/';
2940				path[pathlen - 1] = '\0';
2941				pn_free(&pn);
2942				pn_free(&upn);
2943
2944				/* Success! */
2945				break;
2946			}
2947			VN_RELE(vp);
2948		}
2949		if (error != ESTALE)
2950			goto out;
2951	}
2952
2953	ASSERT(error == 0);
2954	zone->zone_rootvp = vp;		/* we hold a reference to vp */
2955	zone->zone_rootpath = path;
2956	zone->zone_rootpathlen = pathlen;
2957	if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
2958		zone->zone_flags |= ZF_IS_SCRATCH;
2959	return (0);
2960
2961out:
2962	pn_free(&pn);
2963	pn_free(&upn);
2964	return (error);
2965}
2966
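/*
 * Local ASCII-only isalnum(); sufficient here since valid zone names
 * are restricted to a subset of ASCII.  Used by zone_set_name() below.
 */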
2967#define	isalnum(c)	(((c) >= '0' && (c) <= '9') || \
2968			((c) >= 'a' && (c) <= 'z') || \
2969			((c) >= 'A' && (c) <= 'Z'))
2970
2971static int
2972zone_set_name(zone_t *zone, const char *uname)
2973{
2974	char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
2975	size_t len;
2976	int i, err;
2977
2978	if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
2979		kmem_free(kname, ZONENAME_MAX);
2980		return (err);	/* EFAULT or ENAMETOOLONG */
2981	}
2982
2983	/* must be less than ZONENAME_MAX */
2984	if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
2985		kmem_free(kname, ZONENAME_MAX);
2986		return (EINVAL);
2987	}
2988
2989	/*
2990	 * Name must start with an alphanumeric and must contain only
2991	 * alphanumerics, '-', '_' and '.'.
2992	 */
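	/* e.g., "web01" and "b-2.zone" pass; "-tmp" and "a b" do not. */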
2993	if (!isalnum(kname[0])) {
2994		kmem_free(kname, ZONENAME_MAX);
2995		return (EINVAL);
2996	}
2997	for (i = 1; i < len - 1; i++) {
2998		if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
2999		    kname[i] != '.') {
3000			kmem_free(kname, ZONENAME_MAX);
3001			return (EINVAL);
3002		}
3003	}
3004
3005	zone->zone_name = kname;
3006	return (0);
3007}
3008
3009/*
3010 * Gets the 32-bit hostid of the specified zone as an unsigned int.  If 'zonep'
3011 * is NULL or it points to a zone with no hostid emulation, then the machine's
3012 * hostid (i.e., the global zone's hostid) is returned.  This function returns
3013 * zero if neither the zone nor the host machine (global zone) have hostids.  It
3014 * zero if neither the zone nor the host machine (global zone) has a hostid.  It
3015 * hostid and the machine's hostid is invalid.
3016 */
3017uint32_t
3018zone_get_hostid(zone_t *zonep)
3019{
3020	unsigned long machine_hostid;
3021
3022	if (zonep == NULL || zonep->zone_hostid == HW_INVALID_HOSTID) {
3023		if (ddi_strtoul(hw_serial, NULL, 10, &machine_hostid) != 0)
3024			return (HW_INVALID_HOSTID);
3025		return ((uint32_t)machine_hostid);
3026	}
3027	return (zonep->zone_hostid);
3028}
3029
3030/*
3031 * Similar to thread_create(), but makes sure the thread is in the appropriate
3032 * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
3033 */
3034/*ARGSUSED*/
3035kthread_t *
3036zthread_create(
3037    caddr_t stk,
3038    size_t stksize,
3039    void (*proc)(),
3040    void *arg,
3041    size_t len,
3042    pri_t pri)
3043{
3044	kthread_t *t;
3045	zone_t *zone = curproc->p_zone;
3046	proc_t *pp = zone->zone_zsched;
3047
3048	zone_hold(zone);	/* Reference to be dropped when thread exits */
3049
3050	/*
3051	 * No one should be trying to create threads if the zone is shutting
3052	 * down and there aren't any kernel threads around.  See comment
3053	 * in zthread_exit().
3054	 */
3055	ASSERT(!(zone->zone_kthreads == NULL &&
3056	    zone_status_get(zone) >= ZONE_IS_EMPTY));
3057	/*
3058	 * Create a thread, but don't let it run until we've finished setting
3059	 * things up.
3060	 */
3061	t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
3062	ASSERT(t->t_forw == NULL);
3063	mutex_enter(&zone_status_lock);
3064	if (zone->zone_kthreads == NULL) {
3065		t->t_forw = t->t_back = t;
3066	} else {
3067		kthread_t *tx = zone->zone_kthreads;
3068
3069		t->t_forw = tx;
3070		t->t_back = tx->t_back;
3071		tx->t_back->t_forw = t;
3072		tx->t_back = t;
3073	}
3074	zone->zone_kthreads = t;
3075	mutex_exit(&zone_status_lock);
3076
3077	mutex_enter(&pp->p_lock);
3078	t->t_proc_flag |= TP_ZTHREAD;
3079	project_rele(t->t_proj);
3080	t->t_proj = project_hold(pp->p_task->tk_proj);
3081
3082	/*
3083	 * Setup complete, let it run.
3084	 */
3085	thread_lock(t);
3086	t->t_schedflag |= TS_ALLSTART;
3087	setrun_locked(t);
3088	thread_unlock(t);
3089
3090	mutex_exit(&pp->p_lock);
3091
3092	return (t);
3093}
3094
3095/*
3096 * Similar to thread_exit().  Must be called by threads created via
3097 * zthread_create().
3098 */
3099void
3100zthread_exit(void)
3101{
3102	kthread_t *t = curthread;
3103	proc_t *pp = curproc;
3104	zone_t *zone = pp->p_zone;
3105
3106	mutex_enter(&zone_status_lock);
3107
3108	/*
3109	 * Reparent to p0
3110	 */
3111	kpreempt_disable();
3112	mutex_enter(&pp->p_lock);
3113	t->t_proc_flag &= ~TP_ZTHREAD;
3114	t->t_procp = &p0;
3115	hat_thread_exit(t);
3116	mutex_exit(&pp->p_lock);
3117	kpreempt_enable();
3118
3119	if (t->t_back == t) {
3120		ASSERT(t->t_forw == t);
3121		/*
3122		 * If the zone is empty, once the thread count
3123		 * goes to zero no further kernel threads can be
3124		 * created.  This is because if the creator is a process
3125		 * in the zone, then it must have exited before the zone
3126		 * state could be set to ZONE_IS_EMPTY.
3127		 * Otherwise, if the creator is a kernel thread in the
3128		 * zone, the thread count is non-zero.
3129		 *
3130		 * This really means that non-zone kernel threads should
3131		 * not create zone kernel threads.
3132		 */
3133		zone->zone_kthreads = NULL;
3134		if (zone_status_get(zone) == ZONE_IS_EMPTY) {
3135			zone_status_set(zone, ZONE_IS_DOWN);
3136			/*
3137			 * Remove any CPU caps on this zone.
3138			 */
3139			cpucaps_zone_remove(zone);
3140		}
3141	} else {
3142		t->t_forw->t_back = t->t_back;
3143		t->t_back->t_forw = t->t_forw;
3144		if (zone->zone_kthreads == t)
3145			zone->zone_kthreads = t->t_forw;
3146	}
3147	mutex_exit(&zone_status_lock);
3148	zone_rele(zone);
3149	thread_exit();
3150	/* NOTREACHED */
3151}
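
/*
 * Illustrative pairing of the two functions above (the worker is
 * hypothetical).  The thread body must finish with zthread_exit() so
 * that the zone_hold() taken in zthread_create() is dropped:
 *
 *	static void
 *	my_zone_worker(void *arg)
 *	{
 *		... per-zone work ...
 *		zthread_exit();
 *	}
 *
 *	(void) zthread_create(NULL, 0, my_zone_worker, arg, 0,
 *	    minclsyspri);
 */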
3152
3153static void
3154zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
3155{
3156	vnode_t *oldvp;
3157
3158	/* we're going to hold a reference here to the directory */
3159	VN_HOLD(vp);
3160
3161	/* update the audited absolute cwd/root path; see c2/audit.c */
3162	if (AU_AUDITING())
3163		audit_chdirec(vp, vpp);
3164
3165	mutex_enter(&pp->p_lock);
3166	oldvp = *vpp;
3167	*vpp = vp;
3168	mutex_exit(&pp->p_lock);
3169	if (oldvp != NULL)
3170		VN_RELE(oldvp);
3171}
3172
3173/*
3174 * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
3175 */
3176static int
3177nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
3178{
3179	nvpair_t *nvp = NULL;
3180	boolean_t priv_set = B_FALSE;
3181	boolean_t limit_set = B_FALSE;
3182	boolean_t action_set = B_FALSE;
3183
3184	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3185		const char *name;
3186		uint64_t ui64;
3187
3188		name = nvpair_name(nvp);
3189		if (nvpair_type(nvp) != DATA_TYPE_UINT64)
3190			return (EINVAL);
3191		(void) nvpair_value_uint64(nvp, &ui64);
3192		if (strcmp(name, "privilege") == 0) {
3193			/*
3194			 * Currently only privileged values are allowed, but
3195			 * this may change in the future.
3196			 */
3197			if (ui64 != RCPRIV_PRIVILEGED)
3198				return (EINVAL);
3199			rv->rcv_privilege = ui64;
3200			priv_set = B_TRUE;
3201		} else if (strcmp(name, "limit") == 0) {
3202			rv->rcv_value = ui64;
3203			limit_set = B_TRUE;
3204		} else if (strcmp(name, "action") == 0) {
3205			if (ui64 != RCTL_LOCAL_NOACTION &&
3206			    ui64 != RCTL_LOCAL_DENY)
3207				return (EINVAL);
3208			rv->rcv_flagaction = ui64;
3209			action_set = B_TRUE;
3210		} else {
3211			return (EINVAL);
3212		}
3213	}
3214
3215	if (!(priv_set && limit_set && action_set))
3216		return (EINVAL);
3217	rv->rcv_action_signal = 0;
3218	rv->rcv_action_recipient = NULL;
3219	rv->rcv_action_recip_pid = -1;
3220	rv->rcv_firing_time = 0;
3221
3222	return (0);
3223}
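
/*
 * Illustrative well-formed input for the function above; all three
 * pairs are mandatory and must be uint64s:
 *
 *	privilege = RCPRIV_PRIVILEGED	(the only accepted privilege)
 *	limit     = 1000
 *	action    = RCTL_LOCAL_NOACTION or RCTL_LOCAL_DENY
 */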
3224
3225/*
3226 * Non-global zone version of start_init.
3227 */
3228void
3229zone_start_init(void)
3230{
3231	proc_t *p = ttoproc(curthread);
3232	zone_t *z = p->p_zone;
3233
3234	ASSERT(!INGLOBALZONE(curproc));
3235
3236	/*
3237	 * For all purposes (ZONE_ATTR_INITPID and restart_init),
3238	 * storing just the pid of init is sufficient.
3239	 */
3240	z->zone_proc_initpid = p->p_pid;
3241
3242	/*
3243	 * We maintain zone_boot_err so that we can return the cause of the
3244	 * failure back to the caller of the zone_boot syscall.
3245	 */
3246	p->p_zone->zone_boot_err = start_init_common();
3247
3248	/*
3249	 * We will prevent booting zones from becoming running zones if the
3250	 * global zone is shutting down.
3251	 */
3252	mutex_enter(&zone_status_lock);
3253	if (z->zone_boot_err != 0 || zone_status_get(global_zone) >=
3254	    ZONE_IS_SHUTTING_DOWN) {
3255		/*
3256		 * Make sure we are still in the booting state-- we could have
3257		 * raced and already be shutting down, or even further along.
3258		 */
3259		if (zone_status_get(z) == ZONE_IS_BOOTING) {
3260			zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
3261		}
3262		mutex_exit(&zone_status_lock);
3263		/* It's gone bad, dispose of the process */
3264		if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
3265			mutex_enter(&p->p_lock);
3266			ASSERT(p->p_flag & SEXITLWPS);
3267			lwp_exit();
3268		}
3269	} else {
3270		if (zone_status_get(z) == ZONE_IS_BOOTING)
3271			zone_status_set(z, ZONE_IS_RUNNING);
3272		mutex_exit(&zone_status_lock);
3273		/* cause the process to return to userland. */
3274		lwp_rtt();
3275	}
3276}
3277
3278struct zsched_arg {
3279	zone_t *zone;
3280	nvlist_t *nvlist;
3281};
3282
3283/*
3284 * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
3285 * anything to do with scheduling, but rather with the fact that
3286 * per-zone kernel threads are parented to zsched, just like regular
3287 * kernel threads are parented to sched (p0).
3288 *
3289 * zsched is also responsible for launching init for the zone.
3290 */
3291static void
3292zsched(void *arg)
3293{
3294	struct zsched_arg *za = arg;
3295	proc_t *pp = curproc;
3296	proc_t *initp = proc_init;
3297	zone_t *zone = za->zone;
3298	cred_t *cr, *oldcred;
3299	rctl_set_t *set;
3300	rctl_alloc_gp_t *gp;
3301	contract_t *ct = NULL;
3302	task_t *tk, *oldtk;
3303	rctl_entity_p_t e;
3304	kproject_t *pj;
3305
3306	nvlist_t *nvl = za->nvlist;
3307	nvpair_t *nvp = NULL;
3308
3309	bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
3310	bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
3311	PTOU(pp)->u_argc = 0;
3312	PTOU(pp)->u_argv = NULL;
3313	PTOU(pp)->u_envp = NULL;
3314	closeall(P_FINFO(pp));
3315
3316	/*
3317	 * We are this zone's "zsched" process.  As the zone isn't generally
3318	 * visible yet we don't need to grab any locks before initializing its
3319	 * zone_proc pointer.
3320	 */
3321	zone_hold(zone);  /* this hold is released by zone_destroy() */
3322	zone->zone_zsched = pp;
3323	mutex_enter(&pp->p_lock);
3324	pp->p_zone = zone;
3325	mutex_exit(&pp->p_lock);
3326
3327	/*
3328	 * Disassociate process from its 'parent'; parent ourselves to init
3329	 * (pid 1) and change other values as needed.
3330	 */
3331	sess_create();
3332
3333	mutex_enter(&pidlock);
3334	proc_detach(pp);
3335	pp->p_ppid = 1;
3336	pp->p_flag |= SZONETOP;
3337	pp->p_ancpid = 1;
3338	pp->p_parent = initp;
3339	pp->p_psibling = NULL;
3340	if (initp->p_child)
3341		initp->p_child->p_psibling = pp;
3342	pp->p_sibling = initp->p_child;
3343	initp->p_child = pp;
3344
3345	/* Decrement what newproc() incremented. */
3346	upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
3347	/*
3348	 * Our credentials are about to become kcred-like, so we don't care
3349	 * about the caller's ruid.
3350	 */
3351	upcount_inc(crgetruid(kcred), zone->zone_id);
3352	mutex_exit(&pidlock);
3353
3354	/*
3355	 * getting out of global zone, so decrement lwp counts
3356	 */
3357	pj = pp->p_task->tk_proj;
3358	mutex_enter(&global_zone->zone_nlwps_lock);
3359	pj->kpj_nlwps -= pp->p_lwpcnt;
3360	global_zone->zone_nlwps -= pp->p_lwpcnt;
3361	mutex_exit(&global_zone->zone_nlwps_lock);
3362
3363	/*
3364	 * Decrement locked memory counts on old zone and project.
3365	 */
3366	mutex_enter(&global_zone->zone_mem_lock);
3367	global_zone->zone_locked_mem -= pp->p_locked_mem;
3368	pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
3369	mutex_exit(&global_zone->zone_mem_lock);
3370
3371	/*
3372	 * Create and join a new task in project '0' of this zone.
3373	 *
3374	 * We don't need to call holdlwps() since we know we're the only lwp in
3375	 * this process.
3376	 *
3377	 * task_join() returns with p_lock held.
3378	 */
3379	tk = task_create(0, zone);
3380	mutex_enter(&cpu_lock);
3381	oldtk = task_join(tk, 0);
3382
3383	pj = pp->p_task->tk_proj;
3384
3385	mutex_enter(&zone->zone_mem_lock);
3386	zone->zone_locked_mem += pp->p_locked_mem;
3387	pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
3388	mutex_exit(&zone->zone_mem_lock);
3389
3390	/*
3391	 * Add lwp counts to zsched's zone, and increment the project's task
3392	 * count for the task created by the task_create()/task_join() above.
3393	 */
3395	mutex_enter(&zone->zone_nlwps_lock);
3396	pj->kpj_nlwps += pp->p_lwpcnt;
3397	pj->kpj_ntasks += 1;
3398	zone->zone_nlwps += pp->p_lwpcnt;
3399	mutex_exit(&zone->zone_nlwps_lock);
3400
3401	mutex_exit(&curproc->p_lock);
3402	mutex_exit(&cpu_lock);
3403	task_rele(oldtk);
3404
3405	/*
3406	 * The process was created by a process in the global zone, hence the
3407	 * credentials are wrong.  We might as well have kcred-ish credentials.
3408	 */
3409	cr = zone->zone_kcred;
3410	crhold(cr);
3411	mutex_enter(&pp->p_crlock);
3412	oldcred = pp->p_cred;
3413	pp->p_cred = cr;
3414	mutex_exit(&pp->p_crlock);
3415	crfree(oldcred);
3416
3417	/*
3418	 * Hold credentials again (for thread)
3419	 */
3420	crhold(cr);
3421
3422	/*
3423	 * p_lwpcnt can't change since this is a kernel process.
3424	 */
3425	crset(pp, cr);
3426
3427	/*
3428	 * Chroot
3429	 */
3430	zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
3431	zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);
3432
3433	/*
3434	 * Initialize zone's rctl set.
3435	 */
3436	set = rctl_set_create();
3437	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
3438	mutex_enter(&pp->p_lock);
3439	e.rcep_p.zone = zone;
3440	e.rcep_t = RCENTITY_ZONE;
3441	zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
3442	mutex_exit(&pp->p_lock);
3443	rctl_prealloc_destroy(gp);
3444
3445	/*
3446	 * Apply the rctls passed in to zone_create().  This is basically a list
3447	 * assignment: all of the old values are removed and the new ones
3448	 * inserted.  That is, if an empty list is passed in, all values are
3449	 * removed.
3450	 */
3451	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3452		rctl_dict_entry_t *rde;
3453		rctl_hndl_t hndl;
3454		char *name;
3455		nvlist_t **nvlarray;
3456		uint_t i, nelem;
3457		int error;	/* For ASSERT()s */
3458
3459		name = nvpair_name(nvp);
3460		hndl = rctl_hndl_lookup(name);
3461		ASSERT(hndl != -1);
3462		rde = rctl_dict_lookup_hndl(hndl);
3463		ASSERT(rde != NULL);
3464
3465		for (; /* ever */; ) {
3466			rctl_val_t oval;
3467
3468			mutex_enter(&pp->p_lock);
3469			error = rctl_local_get(hndl, NULL, &oval, pp);
3470			mutex_exit(&pp->p_lock);
3471			ASSERT(error == 0);	/* Can't fail for RCTL_FIRST */
3472			ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
3473			if (oval.rcv_privilege == RCPRIV_SYSTEM)
3474				break;
3475			mutex_enter(&pp->p_lock);
3476			error = rctl_local_delete(hndl, &oval, pp);
3477			mutex_exit(&pp->p_lock);
3478			ASSERT(error == 0);
3479		}
3480		error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
3481		ASSERT(error == 0);
3482		for (i = 0; i < nelem; i++) {
3483			rctl_val_t *nvalp;
3484
3485			nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
3486			error = nvlist2rctlval(nvlarray[i], nvalp);
3487			ASSERT(error == 0);
3488			/*
3489			 * rctl_local_insert can fail if the value being
3490			 * inserted is a duplicate; this is OK.
3491			 */
3492			mutex_enter(&pp->p_lock);
3493			if (rctl_local_insert(hndl, nvalp, pp) != 0)
3494				kmem_cache_free(rctl_val_cache, nvalp);
3495			mutex_exit(&pp->p_lock);
3496		}
3497	}
3498	/*
3499	 * Tell the world that we're done setting up.
3500	 *
3501	 * At this point we want to set the zone status to ZONE_IS_INITIALIZED
3502	 * and atomically set the zone's processor set visibility.  Once
3503	 * we drop pool_lock() this zone will automatically get updated
3504	 * to reflect any future changes to the pools configuration.
3505	 *
3506	 * Note that after we drop the locks below (zonehash_lock in
3507	 * particular) other operations such as a zone_getattr call can
3508	 * now proceed and observe the zone. That is the reason for doing a
3509	 * state transition to the INITIALIZED state.
3510	 */
3511	pool_lock();
3512	mutex_enter(&cpu_lock);
3513	mutex_enter(&zonehash_lock);
3514	zone_uniqid(zone);
3515	zone_zsd_configure(zone);
3516	if (pool_state == POOL_ENABLED)
3517		zone_pset_set(zone, pool_default->pool_pset->pset_id);
3518	mutex_enter(&zone_status_lock);
3519	ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
3520	zone_status_set(zone, ZONE_IS_INITIALIZED);
3521	mutex_exit(&zone_status_lock);
3522	mutex_exit(&zonehash_lock);
3523	mutex_exit(&cpu_lock);
3524	pool_unlock();
3525
3526	/* Now call the create callback for this key */
3527	zsd_apply_all_keys(zsd_apply_create, zone);
3528
3529	/* The callbacks are complete. Mark ZONE_IS_READY */
3530	mutex_enter(&zone_status_lock);
3531	ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
3532	zone_status_set(zone, ZONE_IS_READY);
3533	mutex_exit(&zone_status_lock);
3534
3535	/*
3536	 * Once we see the zone transition to the ZONE_IS_BOOTING state,
3537	 * we launch init, and set the state to running.
3538	 */
3539	zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");
3540
3541	if (zone_status_get(zone) == ZONE_IS_BOOTING) {
3542		id_t cid;
3543
3544		/*
3545		 * Ok, this is a little complicated.  We need to grab the
3546		 * zone's pool's scheduling class ID; note that by now, we
3547		 * are already bound to a pool if we need to be (zoneadmd
3548		 * will have done that to us while we're in the READY
3549		 * state).  *But* the scheduling class for the zone's 'init'
3550		 * must be explicitly passed to newproc, which doesn't
3551		 * respect pool bindings.
3552		 *
3553		 * We hold the pool_lock across the call to newproc() to
3554		 * close the obvious race: the pool's scheduling class
3555		 * could change before we manage to create the LWP with
3556		 * classid 'cid'.
3557		 */
3558		pool_lock();
3559		if (zone->zone_defaultcid > 0)
3560			cid = zone->zone_defaultcid;
3561		else
3562			cid = pool_get_class(zone->zone_pool);
3563		if (cid == -1)
3564			cid = defaultcid;
3565
3566		/*
3567		 * If this fails, zone_boot will ultimately fail.  The
3568		 * state of the zone will be set to SHUTTING_DOWN; userland
3569		 * will have to tear down the zone and either fail or retry.
3570		 */
3571		if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
3572		    minclsyspri - 1, &ct, 0)) != 0) {
3573			mutex_enter(&zone_status_lock);
3574			zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
3575			mutex_exit(&zone_status_lock);
3576		}
3577		pool_unlock();
3578	}
3579
3580	/*
3581	 * Wait for zone_destroy() to be called.  This is what we spend
3582	 * most of our life doing.
3583	 */
3584	zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");
3585
3586	if (ct)
3587		/*
3588		 * At this point the process contract should be empty.
3589		 * (Though if it isn't, it's not the end of the world.)
3590		 */
3591		VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);
3592
3593	/*
3594	 * Allow kcred to be freed when all referring processes
3595	 * (including this one) go away.  We can't just do this in
3596	 * zone_free because we need to wait for the zone_cred_ref to
3597	 * drop to 0 before calling zone_free, and the existence of
3598	 * zone_kcred will prevent that.  Thus, we call crfree here to
3599	 * balance the crdup in zone_create.  The crhold calls earlier
3600	 * in zsched will be dropped when the thread and process exit.
3601	 */
3602	crfree(zone->zone_kcred);
3603	zone->zone_kcred = NULL;
3604
3605	exit(CLD_EXITED, 0);
3606}
3607
3608/*
3609 * Helper function to determine if there are any submounts of the
3610 * provided path.  Used to make sure the zone doesn't "inherit" any
3611 * mounts from before it is created.
3612 */
3613static uint_t
3614zone_mount_count(const char *rootpath)
3615{
3616	vfs_t *vfsp;
3617	uint_t count = 0;
3618	size_t rootpathlen = strlen(rootpath);
3619
3620	/*
3621	 * Holding zonehash_lock prevents race conditions with
3622	 * vfs_list_add()/vfs_list_remove() since we serialize with
3623	 * zone_find_by_path().
3624	 */
3625	ASSERT(MUTEX_HELD(&zonehash_lock));
3626	/*
3627	 * The rootpath must end with a '/'
3628	 */
3629	ASSERT(rootpath[rootpathlen - 1] == '/');
3630
3631	/*
3632	 * This intentionally does not count the rootpath itself if that
3633	 * happens to be a mount point.
3634	 */
3635	vfs_list_read_lock();
3636	vfsp = rootvfs;
3637	do {
3638		if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
3639		    rootpathlen) == 0)
3640			count++;
3641		vfsp = vfsp->vfs_next;
3642	} while (vfsp != rootvfs);
3643	vfs_list_unlock();
3644	return (count);
3645}
3646
3647/*
3648 * Helper function to make sure that a zone created on 'rootpath'
3649 * wouldn't end up containing other zones' rootpaths.
3650 */
3651static boolean_t
3652zone_is_nested(const char *rootpath)
3653{
3654	zone_t *zone;
3655	size_t rootpathlen = strlen(rootpath);
3656	size_t len;
3657
3658	ASSERT(MUTEX_HELD(&zonehash_lock));
3659
3660	/*
3661	 * zone_set_root() appended '/' and '\0' at the end of rootpath
3662	 */
3663	if ((rootpathlen <= 3) && (rootpath[0] == '/') &&
3664	    (rootpath[1] == '/') && (rootpath[2] == '\0'))
3665		return (B_TRUE);
3666
3667	for (zone = list_head(&zone_active); zone != NULL;
3668	    zone = list_next(&zone_active, zone)) {
3669		if (zone == global_zone)
3670			continue;
3671		len = strlen(zone->zone_rootpath);
3672		if (strncmp(rootpath, zone->zone_rootpath,
3673		    MIN(rootpathlen, len)) == 0)
3674			return (B_TRUE);
3675	}
3676	return (B_FALSE);
3677}
3678
3679static int
3680zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
3681    size_t zone_privssz)
3682{
	priv_set_t *privs;

	/* check the size first so a short buffer can't leak the allocation */
	if (zone_privssz < sizeof (priv_set_t))
		return (ENOMEM);

	privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
3687
3688	if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
3689		kmem_free(privs, sizeof (priv_set_t));
3690		return (EFAULT);
3691	}
3692
3693	zone->zone_privset = privs;
3694	return (0);
3695}
3696
3697/*
 * We make creative use of nvlists to pass in rctls from userland.  The
 * buffer is a packed nvlist of entries of the following form:
3700 *
3701 * (name = rctl_name, value = nvpair_list_array)
3702 *
3703 * Where each element of the nvpair_list_array is of the form:
3704 *
3705 * [(name = "privilege", value = RCPRIV_PRIVILEGED),
3706 * 	(name = "limit", value = uint64_t),
3707 * 	(name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
3708 */
3709static int
3710parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
3711{
3712	nvpair_t *nvp = NULL;
3713	nvlist_t *nvl = NULL;
3714	char *kbuf;
3715	int error;
3716	rctl_val_t rv;
3717
3718	*nvlp = NULL;
3719
3720	if (buflen == 0)
3721		return (0);
3722
3723	if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
3724		return (ENOMEM);
3725	if (copyin(ubuf, kbuf, buflen)) {
3726		error = EFAULT;
3727		goto out;
3728	}
3729	if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
3730		/*
		 * nvlist_unpack() may have allocated and freed nvl while
		 * leaving the pointer non-NULL, so reset it here.
3733		 */
3734		nvl = NULL;
3735		error = EINVAL;
3736		goto out;
3737	}
3738	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3739		rctl_dict_entry_t *rde;
3740		rctl_hndl_t hndl;
3741		nvlist_t **nvlarray;
3742		uint_t i, nelem;
3743		char *name;
3744
3745		error = EINVAL;
3746		name = nvpair_name(nvp);
3747		if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
3748		    != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
3749			goto out;
3750		}
3751		if ((hndl = rctl_hndl_lookup(name)) == -1) {
3752			goto out;
3753		}
3754		rde = rctl_dict_lookup_hndl(hndl);
3755		error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
3756		ASSERT(error == 0);
3757		for (i = 0; i < nelem; i++) {
			if ((error = nvlist2rctlval(nvlarray[i], &rv)) != 0)
3759				goto out;
3760		}
3761		if (rctl_invalid_value(rde, &rv)) {
3762			error = EINVAL;
3763			goto out;
3764		}
3765	}
3766	error = 0;
3767	*nvlp = nvl;
3768out:
3769	kmem_free(kbuf, buflen);
3770	if (error && nvl != NULL)
3771		nvlist_free(nvl);
3772	return (error);
3773}
3774
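/*
 * Helper for returning errors from zone_create(): er_ext is copied out to
 * the optional user-supplied 'extended error' location, and er_error is
 * posted as the caller's errno via set_errno().
 */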
3775int
zone_create_error(int er_error, int er_ext, int *er_out)
{
3777	if (er_out != NULL) {
3778		if (copyout(&er_ext, er_out, sizeof (int))) {
3779			return (set_errno(EFAULT));
3780		}
3781	}
3782	return (set_errno(er_error));
3783}
3784
3785static int
3786zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
3787{
3788	ts_label_t *tsl;
3789	bslabel_t blab;
3790
3791	/* Get label from user */
3792	if (copyin(lab, &blab, sizeof (blab)) != 0)
3793		return (EFAULT);
3794	tsl = labelalloc(&blab, doi, KM_NOSLEEP);
3795	if (tsl == NULL)
3796		return (ENOMEM);
3797
3798	zone->zone_slabel = tsl;
3799	return (0);
3800}
3801
3802/*
 * Parses a comma-separated list of ZFS datasets into the zone's dataset list.
3804 */
3805static int
3806parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
3807{
3808	char *kbuf;
3809	char *dataset, *next;
3810	zone_dataset_t *zd;
3811	size_t len;
3812
3813	if (ubuf == NULL || buflen == 0)
3814		return (0);
3815
3816	if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
3817		return (ENOMEM);
3818
3819	if (copyin(ubuf, kbuf, buflen) != 0) {
3820		kmem_free(kbuf, buflen);
3821		return (EFAULT);
3822	}
3823
3824	dataset = next = kbuf;
3825	for (;;) {
3826		zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
3827
3828		next = strchr(dataset, ',');
3829
3830		if (next == NULL)
3831			len = strlen(dataset);
3832		else
3833			len = next - dataset;
3834
3835		zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
3836		bcopy(dataset, zd->zd_dataset, len);
3837		zd->zd_dataset[len] = '\0';
3838
3839		list_insert_head(&zone->zone_datasets, zd);
3840
3841		if (next == NULL)
3842			break;
3843
3844		dataset = next + 1;
3845	}
3846
3847	kmem_free(kbuf, buflen);
3848	return (0);
3849}
3850
3851/*
3852 * System call to create/initialize a new zone named 'zone_name', rooted
3853 * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
 * initialized with the zone-wide rctls described in 'rctlbuf', and with
 * labeling set by 'match', 'doi', and 'label'.
 *
 * If 'extended_error' is non-null, we may use it to return more detailed
3858 * error information.
3859 */
3860static zoneid_t
3861zone_create(const char *zone_name, const char *zone_root,
3862    const priv_set_t *zone_privs, size_t zone_privssz,
3863    caddr_t rctlbuf, size_t rctlbufsz,
3864    caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
3865    int match, uint32_t doi, const bslabel_t *label,
3866    int flags)
3867{
3868	struct zsched_arg zarg;
3869	nvlist_t *rctls = NULL;
3870	proc_t *pp = curproc;
3871	zone_t *zone, *ztmp;
3872	zoneid_t zoneid;
3873	int error;
3874	int error2 = 0;
3875	char *str;
3876	cred_t *zkcr;
3877	boolean_t insert_label_hash;
3878
3879	if (secpolicy_zone_config(CRED()) != 0)
3880		return (set_errno(EPERM));
3881
3882	/* can't boot zone from within chroot environment */
3883	if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
3884		return (zone_create_error(ENOTSUP, ZE_CHROOTED,
3885		    extended_error));
3886
3887	zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
3888	zoneid = zone->zone_id = id_alloc(zoneid_space);
3889	zone->zone_status = ZONE_IS_UNINITIALIZED;
3890	zone->zone_pool = pool_default;
3891	zone->zone_pool_mod = gethrtime();
3892	zone->zone_psetid = ZONE_PS_INVAL;
3893	zone->zone_ncpus = 0;
3894	zone->zone_ncpus_online = 0;
3895	zone->zone_restart_init = B_TRUE;
3896	zone->zone_brand = &native_brand;
3897	zone->zone_initname = NULL;
3898	mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
3899	mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
3900	mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
3901	cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
3902	list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
3903	    offsetof(struct zsd_entry, zsd_linkage));
3904	list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
3905	    offsetof(zone_dataset_t, zd_linkage));
3906	list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
3907	    offsetof(zone_dl_t, zdl_linkage));
3908	rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
3909	rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
3910
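	/*
	 * ZCF_NET_EXCL requests an exclusive IP network stack for the zone;
	 * record it as ZF_NET_EXCL.
	 */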
3911	if (flags & ZCF_NET_EXCL) {
3912		zone->zone_flags |= ZF_NET_EXCL;
3913	}
3914
3915	if ((error = zone_set_name(zone, zone_name)) != 0) {
3916		zone_free(zone);
3917		return (zone_create_error(error, 0, extended_error));
3918	}
3919
3920	if ((error = zone_set_root(zone, zone_root)) != 0) {
3921		zone_free(zone);
3922		return (zone_create_error(error, 0, extended_error));
3923	}
3924	if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
3925		zone_free(zone);
3926		return (zone_create_error(error, 0, extended_error));
3927	}
3928
3929	/* initialize node name to be the same as zone name */
3930	zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
3931	(void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
3932	zone->zone_nodename[_SYS_NMLN - 1] = '\0';
3933
3934	zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
3935	zone->zone_domain[0] = '\0';
3936	zone->zone_hostid = HW_INVALID_HOSTID;
3937	zone->zone_shares = 1;
3938	zone->zone_shmmax = 0;
3939	zone->zone_ipc.ipcq_shmmni = 0;
3940	zone->zone_ipc.ipcq_semmni = 0;
3941	zone->zone_ipc.ipcq_msgmni = 0;
3942	zone->zone_bootargs = NULL;
3943	zone->zone_fs_allowed = NULL;
3944	zone->zone_initname =
3945	    kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
3946	(void) strcpy(zone->zone_initname, zone_default_initname);
3947	zone->zone_nlwps = 0;
3948	zone->zone_nlwps_ctl = INT_MAX;
3949	zone->zone_locked_mem = 0;
3950	zone->zone_locked_mem_ctl = UINT64_MAX;
3951	zone->zone_max_swap = 0;
3952	zone->zone_max_swap_ctl = UINT64_MAX;
3953	zone->zone_max_lofi = 0;
3954	zone->zone_max_lofi_ctl = UINT64_MAX;
	zone->zone_lockedmem_kstat = NULL;
	zone->zone_swapresv_kstat = NULL;
3957
3958	/*
3959	 * Zsched initializes the rctls.
3960	 */
3961	zone->zone_rctls = NULL;
3962
3963	if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
3964		zone_free(zone);
3965		return (zone_create_error(error, 0, extended_error));
3966	}
3967
3968	if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
3969		zone_free(zone);
3970		return (set_errno(error));
3971	}
3972
3973	/*
3974	 * Read in the trusted system parameters:
3975	 * match flag and sensitivity label.
3976	 */
3977	zone->zone_match = match;
3978	if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
3979		/* Fail if requested to set doi to anything but system's doi */
3980		if (doi != 0 && doi != default_doi) {
3981			zone_free(zone);
3982			return (set_errno(EINVAL));
3983		}
3984		/* Always apply system's doi to the zone */
3985		error = zone_set_label(zone, label, default_doi);
3986		if (error != 0) {
3987			zone_free(zone);
3988			return (set_errno(error));
3989		}
3990		insert_label_hash = B_TRUE;
3991	} else {
3992		/* all zones get an admin_low label if system is not labeled */
3993		zone->zone_slabel = l_admin_low;
3994		label_hold(l_admin_low);
3995		insert_label_hash = B_FALSE;
3996	}
3997
3998	/*
3999	 * Stop all lwps since that's what normally happens as part of fork().
4000	 * This needs to happen before we grab any locks to avoid deadlock
4001	 * (another lwp in the process could be waiting for the held lock).
4002	 */
4003	if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
4004		zone_free(zone);
4005		if (rctls)
4006			nvlist_free(rctls);
		/* error is still zero here; holdlwps() failure means EINTR */
		return (zone_create_error(EINTR, 0, extended_error));
4008	}
4009
4010	if (block_mounts() == 0) {
4011		mutex_enter(&pp->p_lock);
4012		if (curthread != pp->p_agenttp)
4013			continuelwps(pp);
4014		mutex_exit(&pp->p_lock);
4015		zone_free(zone);
4016		if (rctls)
4017			nvlist_free(rctls);
		/* block_mounts() returns zero only when signaled */
		return (zone_create_error(EINTR, 0, extended_error));
4019	}
4020
4021	/*
4022	 * Set up credential for kernel access.  After this, any errors
4023	 * should go through the dance in errout rather than calling
4024	 * zone_free directly.
4025	 */
4026	zone->zone_kcred = crdup(kcred);
4027	crsetzone(zone->zone_kcred, zone);
4028	priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
4029	priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
4030	priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
4031	priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
4032
4033	mutex_enter(&zonehash_lock);
4034	/*
4035	 * Make sure zone doesn't already exist.
4036	 *
4037	 * If the system and zone are labeled,
4038	 * make sure no other zone exists that has the same label.
4039	 */
4040	if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
4041	    (insert_label_hash &&
4042	    (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
4043		zone_status_t status;
4044
4045		status = zone_status_get(ztmp);
4046		if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
4047			error = EEXIST;
4048		else
4049			error = EBUSY;
4050
4051		if (insert_label_hash)
4052			error2 = ZE_LABELINUSE;
4053
4054		goto errout;
4055	}
4056
4057	/*
4058	 * Don't allow zone creations which would cause one zone's rootpath to
4059	 * be accessible from that of another (non-global) zone.
4060	 */
4061	if (zone_is_nested(zone->zone_rootpath)) {
4062		error = EBUSY;
4063		goto errout;
4064	}
4065
4066	ASSERT(zonecount != 0);		/* check for leaks */
4067	if (zonecount + 1 > maxzones) {
4068		error = ENOMEM;
4069		goto errout;
4070	}
4071
4072	if (zone_mount_count(zone->zone_rootpath) != 0) {
4073		error = EBUSY;
4074		error2 = ZE_AREMOUNTS;
4075		goto errout;
4076	}
4077
4078	/*
4079	 * Zone is still incomplete, but we need to drop all locks while
4080	 * zsched() initializes this zone's kernel process.  We
4081	 * optimistically add the zone to the hashtable and associated
4082	 * lists so a parallel zone_create() doesn't try to create the
4083	 * same zone.
4084	 */
4085	zonecount++;
4086	(void) mod_hash_insert(zonehashbyid,
4087	    (mod_hash_key_t)(uintptr_t)zone->zone_id,
4088	    (mod_hash_val_t)(uintptr_t)zone);
4089	str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
4090	(void) strcpy(str, zone->zone_name);
4091	(void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
4092	    (mod_hash_val_t)(uintptr_t)zone);
4093	if (insert_label_hash) {
4094		(void) mod_hash_insert(zonehashbylabel,
4095		    (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
4096		zone->zone_flags |= ZF_HASHED_LABEL;
4097	}
4098
4099	/*
4100	 * Insert into active list.  At this point there are no 'hold's
4101	 * on the zone, but everyone else knows not to use it, so we can
4102	 * continue to use it.  zsched() will do a zone_hold() if the
4103	 * newproc() is successful.
4104	 */
4105	list_insert_tail(&zone_active, zone);
4106	mutex_exit(&zonehash_lock);
4107
4108	zarg.zone = zone;
4109	zarg.nvlist = rctls;
4110	/*
4111	 * The process, task, and project rctls are probably wrong;
4112	 * we need an interface to get the default values of all rctls,
4113	 * and initialize zsched appropriately.  I'm not sure that that
4114	 * makes much of a difference, though.
4115	 */
4116	error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
4117	if (error != 0) {
4118		/*
4119		 * We need to undo all globally visible state.
4120		 */
4121		mutex_enter(&zonehash_lock);
4122		list_remove(&zone_active, zone);
4123		if (zone->zone_flags & ZF_HASHED_LABEL) {
4124			ASSERT(zone->zone_slabel != NULL);
4125			(void) mod_hash_destroy(zonehashbylabel,
4126			    (mod_hash_key_t)zone->zone_slabel);
4127		}
4128		(void) mod_hash_destroy(zonehashbyname,
4129		    (mod_hash_key_t)(uintptr_t)zone->zone_name);
4130		(void) mod_hash_destroy(zonehashbyid,
4131		    (mod_hash_key_t)(uintptr_t)zone->zone_id);
4132		ASSERT(zonecount > 1);
4133		zonecount--;
4134		goto errout;
4135	}
4136
4137	/*
4138	 * Zone creation can't fail from now on.
4139	 */
4140
4141	/*
4142	 * Create zone kstats
4143	 */
4144	zone_kstat_create(zone);
4145
4146	/*
4147	 * Let the other lwps continue.
4148	 */
4149	mutex_enter(&pp->p_lock);
4150	if (curthread != pp->p_agenttp)
4151		continuelwps(pp);
4152	mutex_exit(&pp->p_lock);
4153
4154	/*
4155	 * Wait for zsched to finish initializing the zone.
4156	 */
4157	zone_status_wait(zone, ZONE_IS_READY);
4158	/*
4159	 * The zone is fully visible, so we can let mounts progress.
4160	 */
4161	resume_mounts();
4162	if (rctls)
4163		nvlist_free(rctls);
4164
4165	return (zoneid);
4166
4167errout:
4168	mutex_exit(&zonehash_lock);
4169	/*
4170	 * Let the other lwps continue.
4171	 */
4172	mutex_enter(&pp->p_lock);
4173	if (curthread != pp->p_agenttp)
4174		continuelwps(pp);
4175	mutex_exit(&pp->p_lock);
4176
4177	resume_mounts();
4178	if (rctls)
4179		nvlist_free(rctls);
4180	/*
4181	 * There is currently one reference to the zone, a cred_ref from
4182	 * zone_kcred.  To free the zone, we call crfree, which will call
4183	 * zone_cred_rele, which will call zone_free.
4184	 */
4185	ASSERT(zone->zone_cred_ref == 1);	/* for zone_kcred */
4186	ASSERT(zone->zone_kcred->cr_ref == 1);
4187	ASSERT(zone->zone_ref == 0);
4188	zkcr = zone->zone_kcred;
4189	zone->zone_kcred = NULL;
4190	crfree(zkcr);				/* triggers call to zone_free */
4191	return (zone_create_error(error, error2, extended_error));
4192}
4193
4194/*
4195 * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
 * the heavy lifting.  The program launched at the "top" of the zone is
 * the zone's zone_initname (settable via ZONE_ATTR_INITNAME); if never
 * set, it is the system default stored at zone_default_initname.
4199 */
4200static int
4201zone_boot(zoneid_t zoneid)
4202{
4203	int err;
4204	zone_t *zone;
4205
4206	if (secpolicy_zone_config(CRED()) != 0)
4207		return (set_errno(EPERM));
4208	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4209		return (set_errno(EINVAL));
4210
4211	mutex_enter(&zonehash_lock);
4212	/*
4213	 * Look for zone under hash lock to prevent races with calls to
4214	 * zone_shutdown, zone_destroy, etc.
4215	 */
4216	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4217		mutex_exit(&zonehash_lock);
4218		return (set_errno(EINVAL));
4219	}
4220
4221	mutex_enter(&zone_status_lock);
4222	if (zone_status_get(zone) != ZONE_IS_READY) {
4223		mutex_exit(&zone_status_lock);
4224		mutex_exit(&zonehash_lock);
4225		return (set_errno(EINVAL));
4226	}
4227	zone_status_set(zone, ZONE_IS_BOOTING);
4228	mutex_exit(&zone_status_lock);
4229
4230	zone_hold(zone);	/* so we can use the zone_t later */
4231	mutex_exit(&zonehash_lock);
4232
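	/*
	 * zsched's attempt to start init drives the zone's status to
	 * ZONE_IS_RUNNING on success, or past it (towards
	 * ZONE_IS_SHUTTING_DOWN) on failure; either outcome ends this wait.
	 */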
4233	if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
4234		zone_rele(zone);
4235		return (set_errno(EINTR));
4236	}
4237
4238	/*
4239	 * Boot (starting init) might have failed, in which case the zone
4240	 * will go to the SHUTTING_DOWN state; an appropriate errno will
4241	 * be placed in zone->zone_boot_err, and so we return that.
4242	 */
4243	err = zone->zone_boot_err;
4244	zone_rele(zone);
4245	return (err ? set_errno(err) : 0);
4246}
4247
4248/*
4249 * Kills all user processes in the zone, waiting for them all to exit
4250 * before returning.
4251 */
4252static int
4253zone_empty(zone_t *zone)
4254{
4255	int waitstatus;
4256
4257	/*
4258	 * We need to drop zonehash_lock before killing all
4259	 * processes, otherwise we'll deadlock with zone_find_*
4260	 * which can be called from the exit path.
4261	 */
4262	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
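	/*
	 * Repeatedly kill everything in the zone until the wait for
	 * ZONE_IS_EMPTY succeeds; the one-second timeout catches processes
	 * created after a killall() pass.
	 */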
4263	while ((waitstatus = zone_status_timedwait_sig(zone,
4264	    ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
4265		killall(zone->zone_id);
4266	}
4267	/*
4268	 * return EINTR if we were signaled
4269	 */
4270	if (waitstatus == 0)
4271		return (EINTR);
4272	return (0);
4273}
4274
4275/*
4276 * This function implements the policy for zone visibility.
4277 *
4278 * In standard Solaris, a non-global zone can only see itself.
4279 *
4280 * In Trusted Extensions, a labeled zone can lookup any zone whose label
4281 * it dominates. For this test, the label of the global zone is treated as
4282 * admin_high so it is special-cased instead of being checked for dominance.
4283 *
4284 * Returns true if zone attributes are viewable, false otherwise.
4285 */
4286static boolean_t
4287zone_list_access(zone_t *zone)
4288{
4289
4290	if (curproc->p_zone == global_zone ||
4291	    curproc->p_zone == zone) {
4292		return (B_TRUE);
4293	} else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4294		bslabel_t *curproc_label;
4295		bslabel_t *zone_label;
4296
4297		curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
4298		zone_label = label2bslabel(zone->zone_slabel);
4299
4300		if (zone->zone_id != GLOBAL_ZONEID &&
4301		    bldominates(curproc_label, zone_label)) {
4302			return (B_TRUE);
4303		} else {
4304			return (B_FALSE);
4305		}
4306	} else {
4307		return (B_FALSE);
4308	}
4309}
4310
4311/*
4312 * Systemcall to start the zone's halt sequence.  By the time this
4313 * function successfully returns, all user processes and kernel threads
4314 * executing in it will have exited, ZSD shutdown callbacks executed,
4315 * and the zone status set to ZONE_IS_DOWN.
4316 *
4317 * It is possible that the call will interrupt itself if the caller is the
4318 * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
4319 */
4320static int
4321zone_shutdown(zoneid_t zoneid)
4322{
4323	int error;
4324	zone_t *zone;
4325	zone_status_t status;
4326
4327	if (secpolicy_zone_config(CRED()) != 0)
4328		return (set_errno(EPERM));
4329	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4330		return (set_errno(EINVAL));
4331
4332	/*
4333	 * Block mounts so that VFS_MOUNT() can get an accurate view of
	 * the zone's status with regard to ZONE_IS_SHUTTING_DOWN.
4335	 *
4336	 * e.g. NFS can fail the mount if it determines that the zone
4337	 * has already begun the shutdown sequence.
4338	 */
4339	if (block_mounts() == 0)
4340		return (set_errno(EINTR));
4341	mutex_enter(&zonehash_lock);
4342	/*
4343	 * Look for zone under hash lock to prevent races with other
4344	 * calls to zone_shutdown and zone_destroy.
4345	 */
4346	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4347		mutex_exit(&zonehash_lock);
4348		resume_mounts();
4349		return (set_errno(EINVAL));
4350	}
4351	mutex_enter(&zone_status_lock);
4352	status = zone_status_get(zone);
4353	/*
4354	 * Fail if the zone isn't fully initialized yet.
4355	 */
4356	if (status < ZONE_IS_READY) {
4357		mutex_exit(&zone_status_lock);
4358		mutex_exit(&zonehash_lock);
4359		resume_mounts();
4360		return (set_errno(EINVAL));
4361	}
4362	/*
4363	 * If conditions required for zone_shutdown() to return have been met,
4364	 * return success.
4365	 */
4366	if (status >= ZONE_IS_DOWN) {
4367		mutex_exit(&zone_status_lock);
4368		mutex_exit(&zonehash_lock);
4369		resume_mounts();
4370		return (0);
4371	}
4372	/*
4373	 * If zone_shutdown() hasn't been called before, go through the motions.
4374	 * If it has, there's nothing to do but wait for the kernel threads to
4375	 * drain.
4376	 */
4377	if (status < ZONE_IS_EMPTY) {
4378		uint_t ntasks;
4379
4380		mutex_enter(&zone->zone_lock);
4381		if ((ntasks = zone->zone_ntasks) != 1) {
4382			/*
4383			 * There's still stuff running.
4384			 */
4385			zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4386		}
4387		mutex_exit(&zone->zone_lock);
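		/*
		 * A single remaining task is zsched's own, meaning no
		 * user processes are left in the zone.
		 */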
4388		if (ntasks == 1) {
4389			/*
4390			 * The only way to create another task is through
4391			 * zone_enter(), which will block until we drop
4392			 * zonehash_lock.  The zone is empty.
4393			 */
4394			if (zone->zone_kthreads == NULL) {
4395				/*
4396				 * Skip ahead to ZONE_IS_DOWN
4397				 */
4398				zone_status_set(zone, ZONE_IS_DOWN);
4399			} else {
4400				zone_status_set(zone, ZONE_IS_EMPTY);
4401			}
4402		}
4403	}
4404	zone_hold(zone);	/* so we can use the zone_t later */
4405	mutex_exit(&zone_status_lock);
4406	mutex_exit(&zonehash_lock);
4407	resume_mounts();
4408
	if ((error = zone_empty(zone)) != 0) {
4410		zone_rele(zone);
4411		return (set_errno(error));
4412	}
4413	/*
4414	 * After the zone status goes to ZONE_IS_DOWN this zone will no
4415	 * longer be notified of changes to the pools configuration, so
4416	 * in order to not end up with a stale pool pointer, we point
4417	 * ourselves at the default pool and remove all resource
4418	 * visibility.  This is especially important as the zone_t may
4419	 * languish on the deathrow for a very long time waiting for
	 * creds to drain out.
4421	 *
4422	 * This rebinding of the zone can happen multiple times
4423	 * (presumably due to interrupted or parallel systemcalls)
4424	 * without any adverse effects.
4425	 */
4426	if (pool_lock_intr() != 0) {
4427		zone_rele(zone);
4428		return (set_errno(EINTR));
4429	}
4430	if (pool_state == POOL_ENABLED) {
4431		mutex_enter(&cpu_lock);
4432		zone_pool_set(zone, pool_default);
4433		/*
4434		 * The zone no longer needs to be able to see any cpus.
4435		 */
4436		zone_pset_set(zone, ZONE_PS_INVAL);
4437		mutex_exit(&cpu_lock);
4438	}
4439	pool_unlock();
4440
4441	/*
4442	 * ZSD shutdown callbacks can be executed multiple times, hence
4443	 * it is safe to not be holding any locks across this call.
4444	 */
4445	zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
4446
4447	mutex_enter(&zone_status_lock);
4448	if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
4449		zone_status_set(zone, ZONE_IS_DOWN);
4450	mutex_exit(&zone_status_lock);
4451
4452	/*
4453	 * Wait for kernel threads to drain.
4454	 */
4455	if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
4456		zone_rele(zone);
4457		return (set_errno(EINTR));
4458	}
4459
4460	/*
	 * The zone can become down/destroyable even if the above wait
4462	 * returns EINTR, so any code added here may never execute.
4463	 * (i.e. don't add code here)
4464	 */
4465
4466	zone_rele(zone);
4467	return (0);
4468}
4469
4470/*
4471 * Systemcall entry point to finalize the zone halt process.  The caller
4472 * must have already successfully called zone_shutdown().
4473 *
4474 * Upon successful completion, the zone will have been fully destroyed:
4475 * zsched will have exited, destructor callbacks executed, and the zone
4476 * removed from the list of active zones.
4477 */
4478static int
4479zone_destroy(zoneid_t zoneid)
4480{
4481	uint64_t uniqid;
4482	zone_t *zone;
4483	zone_status_t status;
4484
4485	if (secpolicy_zone_config(CRED()) != 0)
4486		return (set_errno(EPERM));
4487	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4488		return (set_errno(EINVAL));
4489
4490	mutex_enter(&zonehash_lock);
4491	/*
4492	 * Look for zone under hash lock to prevent races with other
4493	 * calls to zone_destroy.
4494	 */
4495	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4496		mutex_exit(&zonehash_lock);
4497		return (set_errno(EINVAL));
4498	}
4499
4500	if (zone_mount_count(zone->zone_rootpath) != 0) {
4501		mutex_exit(&zonehash_lock);
4502		return (set_errno(EBUSY));
4503	}
4504	mutex_enter(&zone_status_lock);
4505	status = zone_status_get(zone);
4506	if (status < ZONE_IS_DOWN) {
4507		mutex_exit(&zone_status_lock);
4508		mutex_exit(&zonehash_lock);
4509		return (set_errno(EBUSY));
4510	} else if (status == ZONE_IS_DOWN) {
4511		zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
4512	}
4513	mutex_exit(&zone_status_lock);
4514	zone_hold(zone);
4515	mutex_exit(&zonehash_lock);
4516
4517	/*
4518	 * wait for zsched to exit
4519	 */
4520	zone_status_wait(zone, ZONE_IS_DEAD);
4521	zone_zsd_callbacks(zone, ZSD_DESTROY);
4522	zone->zone_netstack = NULL;
4523	uniqid = zone->zone_uniqid;
4524	zone_rele(zone);
4525	zone = NULL;	/* potentially free'd */
4526
4527	mutex_enter(&zonehash_lock);
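	/*
	 * Wait for all references other than the hashtable hold to be
	 * released; zone_rele() broadcasts zone_destroy_cv as the zone
	 * becomes unreferenced.
	 */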
4528	for (; /* ever */; ) {
4529		boolean_t unref;
4530
4531		if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
4532		    zone->zone_uniqid != uniqid) {
4533			/*
4534			 * The zone has gone away.  Necessary conditions
4535			 * are met, so we return success.
4536			 */
4537			mutex_exit(&zonehash_lock);
4538			return (0);
4539		}
4540		mutex_enter(&zone->zone_lock);
4541		unref = ZONE_IS_UNREF(zone);
4542		mutex_exit(&zone->zone_lock);
4543		if (unref) {
4544			/*
4545			 * There is only one reference to the zone -- that
4546			 * added when the zone was added to the hashtables --
4547			 * and things will remain this way until we drop
4548			 * zonehash_lock... we can go ahead and cleanup the
4549			 * zone.
4550			 */
4551			break;
4552		}
4553
4554		if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
4555			/* Signaled */
4556			mutex_exit(&zonehash_lock);
4557			return (set_errno(EINTR));
4558		}
4559
4560	}
4561
4562	/*
4563	 * Remove CPU cap for this zone now since we're not going to
4564	 * fail below this point.
4565	 */
4566	cpucaps_zone_remove(zone);
4567
4568	/* Get rid of the zone's kstats */
4569	zone_kstat_delete(zone);
4570
4571	/* remove the pfexecd doors */
4572	if (zone->zone_pfexecd != NULL) {
4573		klpd_freelist(&zone->zone_pfexecd);
4574		zone->zone_pfexecd = NULL;
4575	}
4576
4577	/* free brand specific data */
4578	if (ZONE_IS_BRANDED(zone))
4579		ZBROP(zone)->b_free_brand_data(zone);
4580
4581	/* Say goodbye to brand framework. */
4582	brand_unregister_zone(zone->zone_brand);
4583
4584	/*
4585	 * It is now safe to let the zone be recreated; remove it from the
4586	 * lists.  The memory will not be freed until the last cred
4587	 * reference goes away.
4588	 */
4589	ASSERT(zonecount > 1);	/* must be > 1; can't destroy global zone */
4590	zonecount--;
4591	/* remove from active list and hash tables */
4592	list_remove(&zone_active, zone);
4593	(void) mod_hash_destroy(zonehashbyname,
4594	    (mod_hash_key_t)zone->zone_name);
4595	(void) mod_hash_destroy(zonehashbyid,
4596	    (mod_hash_key_t)(uintptr_t)zone->zone_id);
4597	if (zone->zone_flags & ZF_HASHED_LABEL)
4598		(void) mod_hash_destroy(zonehashbylabel,
4599		    (mod_hash_key_t)zone->zone_slabel);
4600	mutex_exit(&zonehash_lock);
4601
4602	/*
	 * Release the root vnode; we're not using it anymore, and no other
	 * thread that might access it should exist by now.
4605	 */
4606	if (zone->zone_rootvp != NULL) {
4607		VN_RELE(zone->zone_rootvp);
4608		zone->zone_rootvp = NULL;
4609	}
4610
4611	/* add to deathrow list */
4612	mutex_enter(&zone_deathrow_lock);
4613	list_insert_tail(&zone_deathrow, zone);
4614	mutex_exit(&zone_deathrow_lock);
4615
4616	/*
4617	 * Drop last reference (which was added by zsched()), this will
4618	 * free the zone unless there are outstanding cred references.
4619	 */
4620	zone_rele(zone);
4621	return (0);
4622}
4623
4624/*
4625 * Systemcall entry point for zone_getattr(2).
4626 */
4627static ssize_t
4628zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
4629{
4630	size_t size;
4631	int error = 0, err;
4632	zone_t *zone;
4633	char *zonepath;
4634	char *outstr;
4635	zone_status_t zone_status;
4636	pid_t initpid;
4637	boolean_t global = (curzone == global_zone);
4638	boolean_t inzone = (curzone->zone_id == zoneid);
4639	ushort_t flags;
4640
4641	mutex_enter(&zonehash_lock);
4642	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4643		mutex_exit(&zonehash_lock);
4644		return (set_errno(EINVAL));
4645	}
4646	zone_status = zone_status_get(zone);
4647	if (zone_status < ZONE_IS_INITIALIZED) {
4648		mutex_exit(&zonehash_lock);
4649		return (set_errno(EINVAL));
4650	}
4651	zone_hold(zone);
4652	mutex_exit(&zonehash_lock);
4653
4654	/*
4655	 * If not in the global zone, don't show information about other zones,
4656	 * unless the system is labeled and the local zone's label dominates
4657	 * the other zone.
4658	 */
4659	if (!zone_list_access(zone)) {
4660		zone_rele(zone);
4661		return (set_errno(EINVAL));
4662	}
4663
4664	switch (attr) {
4665	case ZONE_ATTR_ROOT:
4666		if (global) {
4667			/*
4668			 * Copy the path to trim the trailing "/" (except for
4669			 * the global zone).
4670			 */
4671			if (zone != global_zone)
4672				size = zone->zone_rootpathlen - 1;
4673			else
4674				size = zone->zone_rootpathlen;
4675			zonepath = kmem_alloc(size, KM_SLEEP);
4676			bcopy(zone->zone_rootpath, zonepath, size);
4677			zonepath[size - 1] = '\0';
4678		} else {
4679			if (inzone || !is_system_labeled()) {
4680				/*
4681				 * Caller is not in the global zone.
				 * If the query is on the current zone
				 * or the system is not labeled, just
				 * return a faked-up path for the
				 * current zone.
4685				 */
4686				zonepath = "/";
4687				size = 2;
4688			} else {
4689				/*
4690				 * Return related path for current zone.
4691				 */
4692				int prefix_len = strlen(zone_prefix);
4693				int zname_len = strlen(zone->zone_name);
4694
4695				size = prefix_len + zname_len + 1;
4696				zonepath = kmem_alloc(size, KM_SLEEP);
4697				bcopy(zone_prefix, zonepath, prefix_len);
4698				bcopy(zone->zone_name, zonepath +
4699				    prefix_len, zname_len);
4700				zonepath[size - 1] = '\0';
4701			}
4702		}
4703		if (bufsize > size)
4704			bufsize = size;
4705		if (buf != NULL) {
4706			err = copyoutstr(zonepath, buf, bufsize, NULL);
4707			if (err != 0 && err != ENAMETOOLONG)
4708				error = EFAULT;
4709		}
4710		if (global || (is_system_labeled() && !inzone))
4711			kmem_free(zonepath, size);
4712		break;
4713
4714	case ZONE_ATTR_NAME:
4715		size = strlen(zone->zone_name) + 1;
4716		if (bufsize > size)
4717			bufsize = size;
4718		if (buf != NULL) {
4719			err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
4720			if (err != 0 && err != ENAMETOOLONG)
4721				error = EFAULT;
4722		}
4723		break;
4724
4725	case ZONE_ATTR_STATUS:
4726		/*
4727		 * Since we're not holding zonehash_lock, the zone status
4728		 * may be anything; leave it up to userland to sort it out.
4729		 */
4730		size = sizeof (zone_status);
4731		if (bufsize > size)
4732			bufsize = size;
4733		zone_status = zone_status_get(zone);
4734		if (buf != NULL &&
4735		    copyout(&zone_status, buf, bufsize) != 0)
4736			error = EFAULT;
4737		break;
4738	case ZONE_ATTR_FLAGS:
4739		size = sizeof (zone->zone_flags);
4740		if (bufsize > size)
4741			bufsize = size;
4742		flags = zone->zone_flags;
4743		if (buf != NULL &&
4744		    copyout(&flags, buf, bufsize) != 0)
4745			error = EFAULT;
4746		break;
4747	case ZONE_ATTR_PRIVSET:
4748		size = sizeof (priv_set_t);
4749		if (bufsize > size)
4750			bufsize = size;
4751		if (buf != NULL &&
4752		    copyout(zone->zone_privset, buf, bufsize) != 0)
4753			error = EFAULT;
4754		break;
4755	case ZONE_ATTR_UNIQID:
4756		size = sizeof (zone->zone_uniqid);
4757		if (bufsize > size)
4758			bufsize = size;
4759		if (buf != NULL &&
4760		    copyout(&zone->zone_uniqid, buf, bufsize) != 0)
4761			error = EFAULT;
4762		break;
4763	case ZONE_ATTR_POOLID:
4764		{
4765			pool_t *pool;
4766			poolid_t poolid;
4767
4768			if (pool_lock_intr() != 0) {
4769				error = EINTR;
4770				break;
4771			}
4772			pool = zone_pool_get(zone);
4773			poolid = pool->pool_id;
4774			pool_unlock();
4775			size = sizeof (poolid);
4776			if (bufsize > size)
4777				bufsize = size;
			if (buf != NULL && copyout(&poolid, buf, bufsize) != 0)
4779				error = EFAULT;
4780		}
4781		break;
4782	case ZONE_ATTR_SLBL:
4783		size = sizeof (bslabel_t);
4784		if (bufsize > size)
4785			bufsize = size;
4786		if (zone->zone_slabel == NULL)
4787			error = EINVAL;
4788		else if (buf != NULL &&
4789		    copyout(label2bslabel(zone->zone_slabel), buf,
4790		    bufsize) != 0)
4791			error = EFAULT;
4792		break;
4793	case ZONE_ATTR_INITPID:
4794		size = sizeof (initpid);
4795		if (bufsize > size)
4796			bufsize = size;
4797		initpid = zone->zone_proc_initpid;
4798		if (initpid == -1) {
4799			error = ESRCH;
4800			break;
4801		}
4802		if (buf != NULL &&
4803		    copyout(&initpid, buf, bufsize) != 0)
4804			error = EFAULT;
4805		break;
4806	case ZONE_ATTR_BRAND:
4807		size = strlen(zone->zone_brand->b_name) + 1;
4808
4809		if (bufsize > size)
4810			bufsize = size;
4811		if (buf != NULL) {
4812			err = copyoutstr(zone->zone_brand->b_name, buf,
4813			    bufsize, NULL);
4814			if (err != 0 && err != ENAMETOOLONG)
4815				error = EFAULT;
4816		}
4817		break;
4818	case ZONE_ATTR_INITNAME:
4819		size = strlen(zone->zone_initname) + 1;
4820		if (bufsize > size)
4821			bufsize = size;
4822		if (buf != NULL) {
4823			err = copyoutstr(zone->zone_initname, buf, bufsize,
4824			    NULL);
4825			if (err != 0 && err != ENAMETOOLONG)
4826				error = EFAULT;
4827		}
4828		break;
4829	case ZONE_ATTR_BOOTARGS:
4830		if (zone->zone_bootargs == NULL)
4831			outstr = "";
4832		else
4833			outstr = zone->zone_bootargs;
4834		size = strlen(outstr) + 1;
4835		if (bufsize > size)
4836			bufsize = size;
4837		if (buf != NULL) {
4838			err = copyoutstr(outstr, buf, bufsize, NULL);
4839			if (err != 0 && err != ENAMETOOLONG)
4840				error = EFAULT;
4841		}
4842		break;
4843	case ZONE_ATTR_PHYS_MCAP:
4844		size = sizeof (zone->zone_phys_mcap);
4845		if (bufsize > size)
4846			bufsize = size;
4847		if (buf != NULL &&
4848		    copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
4849			error = EFAULT;
4850		break;
4851	case ZONE_ATTR_SCHED_CLASS:
4852		mutex_enter(&class_lock);
4853
4854		if (zone->zone_defaultcid >= loaded_classes)
4855			outstr = "";
4856		else
4857			outstr = sclass[zone->zone_defaultcid].cl_name;
4858		size = strlen(outstr) + 1;
4859		if (bufsize > size)
4860			bufsize = size;
4861		if (buf != NULL) {
4862			err = copyoutstr(outstr, buf, bufsize, NULL);
4863			if (err != 0 && err != ENAMETOOLONG)
4864				error = EFAULT;
4865		}
4866
4867		mutex_exit(&class_lock);
4868		break;
4869	case ZONE_ATTR_HOSTID:
4870		if (zone->zone_hostid != HW_INVALID_HOSTID &&
4871		    bufsize == sizeof (zone->zone_hostid)) {
4872			size = sizeof (zone->zone_hostid);
4873			if (buf != NULL && copyout(&zone->zone_hostid, buf,
4874			    bufsize) != 0)
4875				error = EFAULT;
4876		} else {
4877			error = EINVAL;
4878		}
4879		break;
4880	case ZONE_ATTR_FS_ALLOWED:
4881		if (zone->zone_fs_allowed == NULL)
4882			outstr = "";
4883		else
4884			outstr = zone->zone_fs_allowed;
4885		size = strlen(outstr) + 1;
4886		if (bufsize > size)
4887			bufsize = size;
4888		if (buf != NULL) {
4889			err = copyoutstr(outstr, buf, bufsize, NULL);
4890			if (err != 0 && err != ENAMETOOLONG)
4891				error = EFAULT;
4892		}
4893		break;
4894	default:
4895		if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
4896			size = bufsize;
4897			error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
4898		} else {
4899			error = EINVAL;
4900		}
4901	}
4902	zone_rele(zone);
4903
4904	if (error)
4905		return (set_errno(error));
4906	return ((ssize_t)size);
4907}
4908
4909/*
4910 * Systemcall entry point for zone_setattr(2).
4911 */
4912/*ARGSUSED*/
4913static int
4914zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
4915{
4916	zone_t *zone;
4917	zone_status_t zone_status;
4918	int err;
4919
4920	if (secpolicy_zone_config(CRED()) != 0)
4921		return (set_errno(EPERM));
4922
4923	/*
4924	 * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
4925	 * global zone.
4926	 */
4927	if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
4928		return (set_errno(EINVAL));
4929	}
4930
4931	mutex_enter(&zonehash_lock);
4932	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4933		mutex_exit(&zonehash_lock);
4934		return (set_errno(EINVAL));
4935	}
4936	zone_hold(zone);
4937	mutex_exit(&zonehash_lock);
4938
4939	/*
4940	 * At present most attributes can only be set on non-running,
4941	 * non-global zones.
4942	 */
4943	zone_status = zone_status_get(zone);
	if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
		err = EINVAL;
		goto done;
	}
4946
4947	switch (attr) {
4948	case ZONE_ATTR_INITNAME:
4949		err = zone_set_initname(zone, (const char *)buf);
4950		break;
4951	case ZONE_ATTR_BOOTARGS:
4952		err = zone_set_bootargs(zone, (const char *)buf);
4953		break;
4954	case ZONE_ATTR_BRAND:
4955		err = zone_set_brand(zone, (const char *)buf);
4956		break;
4957	case ZONE_ATTR_FS_ALLOWED:
4958		err = zone_set_fs_allowed(zone, (const char *)buf);
4959		break;
4960	case ZONE_ATTR_PHYS_MCAP:
4961		err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
4962		break;
4963	case ZONE_ATTR_SCHED_CLASS:
4964		err = zone_set_sched_class(zone, (const char *)buf);
4965		break;
4966	case ZONE_ATTR_HOSTID:
4967		if (bufsize == sizeof (zone->zone_hostid)) {
4968			if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
4969				err = 0;
4970			else
4971				err = EFAULT;
4972		} else {
4973			err = EINVAL;
4974		}
4975		break;
4976	default:
4977		if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
4978			err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
4979		else
4980			err = EINVAL;
4981	}
4982
4983done:
4984	zone_rele(zone);
4985	return (err != 0 ? set_errno(err) : 0);
4986}
4987
4988/*
 * Return zero if the process has at least one vnode mapped into its
 * address space which shouldn't be allowed to change zones.
 *
 * Also return zero if the process has any shared mappings which reserve
 * swap.  This is because the counting for zone.max-swap does not allow swap
 * reservation to be shared between zones.  Zone swap reservation is counted
 * on zone->zone_max_swap.
4996 */
4997static int
4998as_can_change_zones(void)
4999{
5000	proc_t *pp = curproc;
5001	struct seg *seg;
5002	struct as *as = pp->p_as;
5003	vnode_t *vp;
5004	int allow = 1;
5005
5006	ASSERT(pp->p_as != &kas);
5007	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
5008	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
5009
5010		/*
5011		 * Cannot enter zone with shared anon memory which
5012		 * reserves swap.  See comment above.
5013		 */
5014		if (seg_can_change_zones(seg) == B_FALSE) {
5015			allow = 0;
5016			break;
5017		}
5018		/*
5019		 * if we can't get a backing vnode for this segment then skip
5020		 * it.
5021		 */
5022		vp = NULL;
5023		if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
5024			continue;
5025		if (!vn_can_change_zones(vp)) { /* bail on first match */
5026			allow = 0;
5027			break;
5028		}
5029	}
5030	AS_LOCK_EXIT(as, &as->a_lock);
5031	return (allow);
5032}
5033
5034/*
5035 * Count swap reserved by curproc's address space
5036 */
5037static size_t
5038as_swresv(void)
5039{
5040	proc_t *pp = curproc;
5041	struct seg *seg;
5042	struct as *as = pp->p_as;
5043	size_t swap = 0;
5044
5045	ASSERT(pp->p_as != &kas);
5046	ASSERT(AS_WRITE_HELD(as, &as->a_lock));
5047	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
5048		swap += seg_swresv(seg);
5049
5050	return (swap);
5051}
5052
5053/*
5054 * Systemcall entry point for zone_enter().
5055 *
5056 * The current process is injected into said zone.  In the process
5057 * it will change its project membership, privileges, rootdir/cwd,
5058 * zone-wide rctls, and pool association to match those of the zone.
5059 *
5060 * The first zone_enter() called while the zone is in the ZONE_IS_READY
5061 * state will transition it to ZONE_IS_RUNNING.  Processes may only
5062 * enter a zone that is "ready" or "running".
5063 */
5064static int
5065zone_enter(zoneid_t zoneid)
5066{
5067	zone_t *zone;
5068	vnode_t *vp;
5069	proc_t *pp = curproc;
5070	contract_t *ct;
5071	cont_process_t *ctp;
5072	task_t *tk, *oldtk;
5073	kproject_t *zone_proj0;
5074	cred_t *cr, *newcr;
5075	pool_t *oldpool, *newpool;
5076	sess_t *sp;
5077	uid_t uid;
5078	zone_status_t status;
5079	int err = 0;
5080	rctl_entity_p_t e;
5081	size_t swap;
5082	kthread_id_t t;
5083
5084	if (secpolicy_zone_config(CRED()) != 0)
5085		return (set_errno(EPERM));
5086	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5087		return (set_errno(EINVAL));
5088
5089	/*
5090	 * Stop all lwps so we don't need to hold a lock to look at
5091	 * curproc->p_zone.  This needs to happen before we grab any
5092	 * locks to avoid deadlock (another lwp in the process could
5093	 * be waiting for the held lock).
5094	 */
5095	if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
5096		return (set_errno(EINTR));
5097
5098	/*
	 * Make sure we're not changing zones with files open or mapped
	 * into our address space that shouldn't be allowed to change zones.
5101	 */
5102	if (!files_can_change_zones()) {
5103		err = EBADF;
5104		goto out;
5105	}
5106	if (!as_can_change_zones()) {
5107		err = EFAULT;
5108		goto out;
5109	}
5110
5111	mutex_enter(&zonehash_lock);
5112	if (pp->p_zone != global_zone) {
5113		mutex_exit(&zonehash_lock);
5114		err = EINVAL;
5115		goto out;
5116	}
5117
5118	zone = zone_find_all_by_id(zoneid);
5119	if (zone == NULL) {
5120		mutex_exit(&zonehash_lock);
5121		err = EINVAL;
5122		goto out;
5123	}
5124
5125	/*
5126	 * To prevent processes in a zone from holding contracts on
5127	 * extrazonal resources, and to avoid process contract
5128	 * memberships which span zones, contract holders and processes
5129	 * which aren't the sole members of their encapsulating process
5130	 * contracts are not allowed to zone_enter.
5131	 */
5132	ctp = pp->p_ct_process;
5133	ct = &ctp->conp_contract;
5134	mutex_enter(&ct->ct_lock);
5135	mutex_enter(&pp->p_lock);
5136	if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
5137		mutex_exit(&pp->p_lock);
5138		mutex_exit(&ct->ct_lock);
5139		mutex_exit(&zonehash_lock);
5140		err = EINVAL;
5141		goto out;
5142	}
5143
5144	/*
5145	 * Moreover, we don't allow processes whose encapsulating
5146	 * process contracts have inherited extrazonal contracts.
5147	 * While it would be easier to eliminate all process contracts
5148	 * with inherited contracts, we need to be able to give a
5149	 * restarted init (or other zone-penetrating process) its
5150	 * predecessor's contracts.
5151	 */
5152	if (ctp->conp_ninherited != 0) {
5153		contract_t *next;
5154		for (next = list_head(&ctp->conp_inherited); next;
5155		    next = list_next(&ctp->conp_inherited, next)) {
5156			if (contract_getzuniqid(next) != zone->zone_uniqid) {
5157				mutex_exit(&pp->p_lock);
5158				mutex_exit(&ct->ct_lock);
5159				mutex_exit(&zonehash_lock);
5160				err = EINVAL;
5161				goto out;
5162			}
5163		}
5164	}
5165
5166	mutex_exit(&pp->p_lock);
5167	mutex_exit(&ct->ct_lock);
5168
5169	status = zone_status_get(zone);
5170	if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
5171		/*
5172		 * Can't join
5173		 */
5174		mutex_exit(&zonehash_lock);
5175		err = EINVAL;
5176		goto out;
5177	}
5178
5179	/*
5180	 * Make sure new priv set is within the permitted set for caller
5181	 */
5182	if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
5183		mutex_exit(&zonehash_lock);
5184		err = EPERM;
5185		goto out;
5186	}
5187	/*
5188	 * We want to momentarily drop zonehash_lock while we optimistically
5189	 * bind curproc to the pool it should be running in.  This is safe
5190	 * since the zone can't disappear (we have a hold on it).
5191	 */
5192	zone_hold(zone);
5193	mutex_exit(&zonehash_lock);
5194
5195	/*
5196	 * Grab pool_lock to keep the pools configuration from changing
5197	 * and to stop ourselves from getting rebound to another pool
5198	 * until we join the zone.
5199	 */
5200	if (pool_lock_intr() != 0) {
5201		zone_rele(zone);
5202		err = EINTR;
5203		goto out;
5204	}
5205	ASSERT(secpolicy_pool(CRED()) == 0);
5206	/*
5207	 * Bind ourselves to the pool currently associated with the zone.
5208	 */
5209	oldpool = curproc->p_pool;
5210	newpool = zone_pool_get(zone);
5211	if (pool_state == POOL_ENABLED && newpool != oldpool &&
5212	    (err = pool_do_bind(newpool, P_PID, P_MYID,
5213	    POOL_BIND_ALL)) != 0) {
5214		pool_unlock();
5215		zone_rele(zone);
5216		goto out;
5217	}
5218
5219	/*
5220	 * Grab cpu_lock now; we'll need it later when we call
5221	 * task_join().
5222	 */
5223	mutex_enter(&cpu_lock);
5224	mutex_enter(&zonehash_lock);
5225	/*
5226	 * Make sure the zone hasn't moved on since we dropped zonehash_lock.
5227	 */
5228	if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
5229		/*
5230		 * Can't join anymore.
5231		 */
5232		mutex_exit(&zonehash_lock);
5233		mutex_exit(&cpu_lock);
5234		if (pool_state == POOL_ENABLED &&
5235		    newpool != oldpool)
5236			(void) pool_do_bind(oldpool, P_PID, P_MYID,
5237			    POOL_BIND_ALL);
5238		pool_unlock();
5239		zone_rele(zone);
5240		err = EINVAL;
5241		goto out;
5242	}
5243
5244	/*
	 * a_lock must be held while transferring locked memory and swap
	 * reservation from the global zone to the non-global zone because
	 * asynchronous faults on the process's address space can lock
	 * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
	 * segments, respectively.
5250	 */
	AS_LOCK_ENTER(pp->p_as, &pp->p_as->a_lock, RW_WRITER);
5252	swap = as_swresv();
5253	mutex_enter(&pp->p_lock);
5254	zone_proj0 = zone->zone_zsched->p_task->tk_proj;
	/* verify that we do not exceed any task or lwp limits */
5256	mutex_enter(&zone->zone_nlwps_lock);
5257	/* add new lwps to zone and zone's proj0 */
5258	zone_proj0->kpj_nlwps += pp->p_lwpcnt;
5259	zone->zone_nlwps += pp->p_lwpcnt;
5260	/* add 1 task to zone's proj0 */
5261	zone_proj0->kpj_ntasks += 1;
5262	mutex_exit(&zone->zone_nlwps_lock);
5263
5264	mutex_enter(&zone->zone_mem_lock);
5265	zone->zone_locked_mem += pp->p_locked_mem;
5266	zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
5267	zone->zone_max_swap += swap;
5268	mutex_exit(&zone->zone_mem_lock);
5269
5270	mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock));
5271	zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem;
5272	mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock));
5273
5274	/* remove lwps from proc's old zone and old project */
5275	mutex_enter(&pp->p_zone->zone_nlwps_lock);
5276	pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
5277	pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
5278	mutex_exit(&pp->p_zone->zone_nlwps_lock);
5279
5280	mutex_enter(&pp->p_zone->zone_mem_lock);
5281	pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
5282	pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
5283	pp->p_zone->zone_max_swap -= swap;
5284	mutex_exit(&pp->p_zone->zone_mem_lock);
5285
5286	mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
5287	pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem;
5288	mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
5289
5290	pp->p_flag |= SZONETOP;
5291	pp->p_zone = zone;
5292	mutex_exit(&pp->p_lock);
5293	AS_LOCK_EXIT(pp->p_as, &pp->p_as->a_lock);
5294
5295	/*
5296	 * Joining the zone cannot fail from now on.
5297	 *
5298	 * This means that a lot of the following code can be commonized and
5299	 * shared with zsched().
5300	 */
5301
5302	/*
	 * If the process contract FMRI was inherited, we need to
	 * flag this so that contract status doesn't leak extra zone
	 * information (svc_fmri in this case).
5306	 */
5307	if (ctp->conp_svc_ctid != ct->ct_id) {
5308		mutex_enter(&ct->ct_lock);
5309		ctp->conp_svc_zone_enter = ct->ct_id;
5310		mutex_exit(&ct->ct_lock);
5311	}
5312
5313	/*
5314	 * Reset the encapsulating process contract's zone.
5315	 */
5316	ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
5317	contract_setzuniqid(ct, zone->zone_uniqid);
5318
5319	/*
5320	 * Create a new task and associate the process with the project keyed
5321	 * by (projid,zoneid).
5322	 *
5323	 * We might as well be in project 0; the global zone's projid doesn't
5324	 * make much sense in a zone anyhow.
5325	 *
5326	 * This also increments zone_ntasks, and returns with p_lock held.
5327	 */
5328	tk = task_create(0, zone);
5329	oldtk = task_join(tk, 0);
5330	mutex_exit(&cpu_lock);
5331
5332	/*
5333	 * call RCTLOP_SET functions on this proc
5334	 */
5335	e.rcep_p.zone = zone;
5336	e.rcep_t = RCENTITY_ZONE;
5337	(void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
5338	    RCD_CALLBACK);
5339	mutex_exit(&pp->p_lock);
5340
5341	/*
5342	 * We don't need to hold any of zsched's locks here; not only do we know
5343	 * the process and zone aren't going away, we know its session isn't
5344	 * changing either.
5345	 *
5346	 * By joining zsched's session here, we mimic the behavior in the
5347	 * global zone of init's sid being the pid of sched.  We extend this
5348	 * to all zlogin-like zone_enter()'ing processes as well.
5349	 */
5350	mutex_enter(&pidlock);
5351	sp = zone->zone_zsched->p_sessp;
5352	sess_hold(zone->zone_zsched);
5353	mutex_enter(&pp->p_lock);
5354	pgexit(pp);
5355	sess_rele(pp->p_sessp, B_TRUE);
5356	pp->p_sessp = sp;
5357	pgjoin(pp, zone->zone_zsched->p_pidp);
5358
5359	/*
	 * If any threads are scheduled to be placed on the zone's wait
	 * queue, they should abandon the idea since the wait queue is
	 * changing.  We need to be holding pidlock & p_lock to do this.
5363	 */
5364	if ((t = pp->p_tlist) != NULL) {
5365		do {
5366			thread_lock(t);
5367			/*
			 * Kick this thread so that it doesn't sit
			 * on the wrong wait queue.
5370			 */
5371			if (ISWAITING(t))
5372				setrun_locked(t);
5373
5374			if (t->t_schedflag & TS_ANYWAITQ)
				t->t_schedflag &= ~TS_ANYWAITQ;
5376
5377			thread_unlock(t);
5378		} while ((t = t->t_forw) != pp->p_tlist);
5379	}
5380
5381	/*
5382	 * If there is a default scheduling class for the zone and it is not
5383	 * the class we are currently in, change all of the threads in the
5384	 * process to the new class.  We need to be holding pidlock & p_lock
5385	 * when we call parmsset so this is a good place to do it.
5386	 */
5387	if (zone->zone_defaultcid > 0 &&
5388	    zone->zone_defaultcid != curthread->t_cid) {
5389		pcparms_t pcparms;
5390
5391		pcparms.pc_cid = zone->zone_defaultcid;
5392		pcparms.pc_clparms[0] = 0;
5393
5394		/*
5395		 * If setting the class fails, we still want to enter the zone.
5396		 */
5397		if ((t = pp->p_tlist) != NULL) {
5398			do {
5399				(void) parmsset(&pcparms, t);
5400			} while ((t = t->t_forw) != pp->p_tlist);
5401		}
5402	}
5403
5404	mutex_exit(&pp->p_lock);
5405	mutex_exit(&pidlock);
5406
5407	mutex_exit(&zonehash_lock);
5408	/*
5409	 * We're firmly in the zone; let pools progress.
5410	 */
5411	pool_unlock();
5412	task_rele(oldtk);
5413	/*
5414	 * We don't need to retain a hold on the zone since we already
5415	 * incremented zone_ntasks, so the zone isn't going anywhere.
5416	 */
5417	zone_rele(zone);
5418
5419	/*
5420	 * Chroot
5421	 */
5422	vp = zone->zone_rootvp;
5423	zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
5424	zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
5425
5426	/*
5427	 * Change process credentials
5428	 */
5429	newcr = cralloc();
5430	mutex_enter(&pp->p_crlock);
5431	cr = pp->p_cred;
5432	crcopy_to(cr, newcr);
5433	crsetzone(newcr, zone);
5434	pp->p_cred = newcr;
5435
5436	/*
5437	 * Restrict all process privilege sets to zone limit
5438	 */
5439	priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
5440	priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
5441	priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
5442	priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
5443	mutex_exit(&pp->p_crlock);
5444	crset(pp, newcr);
5445
5446	/*
5447	 * Adjust upcount to reflect zone entry.
5448	 */
5449	uid = crgetruid(newcr);
5450	mutex_enter(&pidlock);
5451	upcount_dec(uid, GLOBAL_ZONEID);
5452	upcount_inc(uid, zoneid);
5453	mutex_exit(&pidlock);
5454
5455	/*
5456	 * Set up core file path and content.
5457	 */
5458	set_core_defaults();
5459
5460out:
5461	/*
5462	 * Let the other lwps continue.
5463	 */
5464	mutex_enter(&pp->p_lock);
5465	if (curthread != pp->p_agenttp)
5466		continuelwps(pp);
5467	mutex_exit(&pp->p_lock);
5468
5469	return (err != 0 ? set_errno(err) : 0);
5470}
5471
5472/*
5473 * Systemcall entry point for zone_list(2).
5474 *
5475 * Processes running in a (non-global) zone only see themselves.
5476 * On labeled systems, they see all zones whose label they dominate.
5477 */
5478static int
5479zone_list(zoneid_t *zoneidlist, uint_t *numzones)
5480{
5481	zoneid_t *zoneids;
5482	zone_t *zone, *myzone;
5483	uint_t user_nzones, real_nzones;
5484	uint_t domi_nzones;
5485	int error;
5486
5487	if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
5488		return (set_errno(EFAULT));
5489
5490	myzone = curproc->p_zone;
5491	if (myzone != global_zone) {
5492		bslabel_t *mybslab;
5493
5494		if (!is_system_labeled()) {
5495			/* just return current zone */
5496			real_nzones = domi_nzones = 1;
5497			zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
5498			zoneids[0] = myzone->zone_id;
5499		} else {
5500			/* return all zones that are dominated */
5501			mutex_enter(&zonehash_lock);
5502			real_nzones = zonecount;
5503			domi_nzones = 0;
5504			if (real_nzones > 0) {
5505				zoneids = kmem_alloc(real_nzones *
5506				    sizeof (zoneid_t), KM_SLEEP);
5507				mybslab = label2bslabel(myzone->zone_slabel);
5508				for (zone = list_head(&zone_active);
5509				    zone != NULL;
5510				    zone = list_next(&zone_active, zone)) {
5511					if (zone->zone_id == GLOBAL_ZONEID)
5512						continue;
5513					if (zone != myzone &&
5514					    (zone->zone_flags & ZF_IS_SCRATCH))
5515						continue;
5516					/*
5517					 * Note that a label always dominates
5518					 * itself, so myzone is always included
5519					 * in the list.
5520					 */
5521					if (bldominates(mybslab,
5522					    label2bslabel(zone->zone_slabel))) {
5523						zoneids[domi_nzones++] =
5524						    zone->zone_id;
5525					}
5526				}
5527			}
5528			mutex_exit(&zonehash_lock);
5529		}
5530	} else {
5531		mutex_enter(&zonehash_lock);
5532		real_nzones = zonecount;
5533		domi_nzones = 0;
5534		if (real_nzones > 0) {
5535			zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
5536			    KM_SLEEP);
5537			for (zone = list_head(&zone_active); zone != NULL;
5538			    zone = list_next(&zone_active, zone))
5539				zoneids[domi_nzones++] = zone->zone_id;
5540			ASSERT(domi_nzones == real_nzones);
5541		}
5542		mutex_exit(&zonehash_lock);
5543	}
5544
5545	/*
	 * If the user has allocated space for fewer entries than we found,
	 * return only up to that limit.  Either way, report exactly how
	 * many we found.
5549	 */
5550	if (domi_nzones < user_nzones)
5551		user_nzones = domi_nzones;
5552	error = 0;
5553	if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
5554		error = EFAULT;
5555	} else if (zoneidlist != NULL && user_nzones != 0) {
5556		if (copyout(zoneids, zoneidlist,
5557		    user_nzones * sizeof (zoneid_t)) != 0)
5558			error = EFAULT;
5559	}
5560
5561	if (real_nzones > 0)
5562		kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
5563
5564	if (error != 0)
5565		return (set_errno(error));
5566	else
5567		return (0);
5568}
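
/*
 * Illustrative sketch, not part of the original source: the sizing
 * protocol above is consumed from userland through the zone_list(2)
 * wrapper declared in <zone.h>.  A caller probes for the count first and
 * then retries with a buffer; since zones can come and go between the two
 * calls, robust callers loop until the reported count fits.  Error
 * handling is abbreviated to "...".
 *
 *	#include <zone.h>
 *	#include <stdlib.h>
 *
 *	uint_t nz = 0;
 *	zoneid_t *ids;
 *
 *	if (zone_list(NULL, &nz) != 0)
 *		...
 *	ids = malloc(nz * sizeof (zoneid_t));
 *	if (ids == NULL || zone_list(ids, &nz) != 0)
 *		...
 *
 * At this point nz is the number of visible zones; at most the first nz
 * entries of ids are valid.
 */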
5569
5570/*
5571 * System call entry point for zone_lookup(2).
5572 *
5573 * Non-global zones are only able to see themselves and (on labeled systems)
5574 * the zones they dominate.
5575 */
5576static zoneid_t
5577zone_lookup(const char *zone_name)
5578{
5579	char *kname;
5580	zone_t *zone;
5581	zoneid_t zoneid;
5582	int err;
5583
5584	if (zone_name == NULL) {
5585		/* return caller's zone id */
5586		return (getzoneid());
5587	}
5588
5589	kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
5590	if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
5591		kmem_free(kname, ZONENAME_MAX);
5592		return (set_errno(err));
5593	}
5594
5595	mutex_enter(&zonehash_lock);
5596	zone = zone_find_all_by_name(kname);
5597	kmem_free(kname, ZONENAME_MAX);
5598	/*
5599	 * A non-global zone can only look up the global zone and its own
5600	 * name.  Under Trusted Extensions, label dominance rules apply.
5601	 */
5602	if (zone == NULL ||
5603	    zone_status_get(zone) < ZONE_IS_READY ||
5604	    !zone_list_access(zone)) {
5605		mutex_exit(&zonehash_lock);
5606		return (set_errno(EINVAL));
5607	} else {
5608		zoneid = zone->zone_id;
5609		mutex_exit(&zonehash_lock);
5610		return (zoneid);
5611	}
5612}
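
/*
 * Illustrative sketch, not part of the original source: userland normally
 * reaches this lookup through getzoneidbyname(3C), declared in <zone.h>,
 * which maps a zone name to a zoneid subject to the visibility rules
 * enforced above (the lookup fails for zones the caller may not see).
 *
 *	#include <zone.h>
 *	#include <stdio.h>
 *
 *	zoneid_t id = getzoneidbyname("myzone");
 *	if (id == (zoneid_t)-1)
 *		perror("getzoneidbyname");
 *	else
 *		(void) printf("zoneid %d\n", (int)id);
 */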
5613
5614static int
5615zone_version(int *version_arg)
5616{
5617	int version = ZONE_SYSCALL_API_VERSION;
5618
5619	if (copyout(&version, version_arg, sizeof (int)) != 0)
5620		return (set_errno(EFAULT));
5621	return (0);
5622}
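
/*
 * Illustrative sketch, not part of the original source: a userland
 * consumer could use this subcommand as a compatibility handshake,
 * assuming direct access to the zone(2) multiplexor via SYS_zone (an
 * assumption; library code normally hides this):
 *
 *	#include <sys/syscall.h>
 *	#include <sys/zone.h>
 *	#include <unistd.h>
 *
 *	int v = 0;
 *
 *	if (syscall(SYS_zone, ZONE_VERSION, &v) != 0 ||
 *	    v != ZONE_SYSCALL_API_VERSION)
 *		refuse to run: kernel and library disagree on the API
 */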
5623
5624/* ARGSUSED */
5625long
5626zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
5627{
5628	zone_def zs;
5629	int err;
5630
5631	switch (cmd) {
5632	case ZONE_CREATE:
5633		if (get_udatamodel() == DATAMODEL_NATIVE) {
5634			if (copyin(arg1, &zs, sizeof (zone_def))) {
5635				return (set_errno(EFAULT));
5636			}
5637		} else {
5638#ifdef _SYSCALL32_IMPL
5639			zone_def32 zs32;
5640
5641			if (copyin(arg1, &zs32, sizeof (zone_def32))) {
5642				return (set_errno(EFAULT));
5643			}
5644			zs.zone_name =
5645			    (const char *)(unsigned long)zs32.zone_name;
5646			zs.zone_root =
5647			    (const char *)(unsigned long)zs32.zone_root;
5648			zs.zone_privs =
5649			    (const struct priv_set *)
5650			    (unsigned long)zs32.zone_privs;
5651			zs.zone_privssz = zs32.zone_privssz;
5652			zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
5653			zs.rctlbufsz = zs32.rctlbufsz;
5654			zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
5655			zs.zfsbufsz = zs32.zfsbufsz;
5656			zs.extended_error =
5657			    (int *)(unsigned long)zs32.extended_error;
5658			zs.match = zs32.match;
5659			zs.doi = zs32.doi;
5660			zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
5661			zs.flags = zs32.flags;
5662#else
5663			panic("get_udatamodel() returned bogus result\n");
5664#endif
5665		}
5666
5667		return (zone_create(zs.zone_name, zs.zone_root,
5668		    zs.zone_privs, zs.zone_privssz,
5669		    (caddr_t)zs.rctlbuf, zs.rctlbufsz,
5670		    (caddr_t)zs.zfsbuf, zs.zfsbufsz,
5671		    zs.extended_error, zs.match, zs.doi,
5672		    zs.label, zs.flags));
5673	case ZONE_BOOT:
5674		return (zone_boot((zoneid_t)(uintptr_t)arg1));
5675	case ZONE_DESTROY:
5676		return (zone_destroy((zoneid_t)(uintptr_t)arg1));
5677	case ZONE_GETATTR:
5678		return (zone_getattr((zoneid_t)(uintptr_t)arg1,
5679		    (int)(uintptr_t)arg2, arg3, (size_t)arg4));
5680	case ZONE_SETATTR:
5681		return (zone_setattr((zoneid_t)(uintptr_t)arg1,
5682		    (int)(uintptr_t)arg2, arg3, (size_t)arg4));
5683	case ZONE_ENTER:
5684		return (zone_enter((zoneid_t)(uintptr_t)arg1));
5685	case ZONE_LIST:
5686		return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
5687	case ZONE_SHUTDOWN:
5688		return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
5689	case ZONE_LOOKUP:
5690		return (zone_lookup((const char *)arg1));
5691	case ZONE_VERSION:
5692		return (zone_version((int *)arg1));
5693	case ZONE_ADD_DATALINK:
5694		return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
5695		    (datalink_id_t)(uintptr_t)arg2));
5696	case ZONE_DEL_DATALINK:
5697		return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
5698		    (datalink_id_t)(uintptr_t)arg2));
5699	case ZONE_CHECK_DATALINK: {
5700		zoneid_t	zoneid;
5701		boolean_t	need_copyout;
5702
5703		if (copyin(arg1, &zoneid, sizeof (zoneid)) != 0)
5704			return (set_errno(EFAULT));
5705		need_copyout = (zoneid == ALL_ZONES);
5706		err = zone_check_datalink(&zoneid,
5707		    (datalink_id_t)(uintptr_t)arg2);
5708		if (err == 0 && need_copyout) {
5709			if (copyout(&zoneid, arg1, sizeof (zoneid)) != 0)
5710				err = EFAULT;
5711		}
5712		return (err == 0 ? 0 : set_errno(err));
5713	}
5714	case ZONE_LIST_DATALINK:
5715		return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
5716		    (int *)arg2, (datalink_id_t *)(uintptr_t)arg3));
5717	default:
5718		return (set_errno(EINVAL));
5719	}
5720}
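
/*
 * Illustrative sketch, not part of the original source: the ZONE_CREATE
 * case above shows the usual idiom for taking a pointer-bearing structure
 * from a 32-bit process in a 64-bit kernel: copy in the ILP32 image, then
 * widen each 32-bit user address through an unsigned intermediate so that
 * it is zero- rather than sign-extended.  A minimal version of the same
 * idiom, with hypothetical types:
 *
 *	struct foo32 { uint32_t fb_buf; uint32_t fb_bufsz; };
 *	struct foo { char *fb_buf; size_t fb_bufsz; };
 *
 *	struct foo32 f32;
 *	struct foo f;
 *
 *	if (copyin(arg, &f32, sizeof (f32)))
 *		return (set_errno(EFAULT));
 *	f.fb_buf = (char *)(uintptr_t)f32.fb_buf;
 *	f.fb_bufsz = (size_t)f32.fb_bufsz;
 */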
5721
5722struct zarg {
5723	zone_t *zone;
5724	zone_cmd_arg_t arg;
5725};
5726
5727static int
5728zone_lookup_door(const char *zone_name, door_handle_t *doorp)
5729{
5730	char *buf;
5731	size_t buflen;
5732	int error;
5733
5734	buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
5735	buf = kmem_alloc(buflen, KM_SLEEP);
5736	(void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
5737	error = door_ki_open(buf, doorp);
5738	kmem_free(buf, buflen);
5739	return (error);
5740}
5741
5742static void
5743zone_release_door(door_handle_t *doorp)
5744{
5745	door_ki_rele(*doorp);
5746	*doorp = NULL;
5747}
5748
5749static void
5750zone_ki_call_zoneadmd(struct zarg *zargp)
5751{
5752	door_handle_t door = NULL;
5753	door_arg_t darg, save_arg;
5754	char *zone_name;
5755	size_t zone_namelen;
5756	zoneid_t zoneid;
5757	zone_t *zone;
5758	zone_cmd_arg_t arg;
5759	uint64_t uniqid;
5760	size_t size;
5761	int error;
5762	int retry;
5763
5764	zone = zargp->zone;
5765	arg = zargp->arg;
5766	kmem_free(zargp, sizeof (*zargp));
5767
5768	zone_namelen = strlen(zone->zone_name) + 1;
5769	zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
5770	bcopy(zone->zone_name, zone_name, zone_namelen);
5771	zoneid = zone->zone_id;
5772	uniqid = zone->zone_uniqid;
5773	/*
5774	 * zoneadmd may be down, but at least we can empty out the zone.
5775	 * We can ignore the return value of zone_empty() since we're called
5776	 * from a kernel thread and know we won't be delivered any signals.
5777	 */
5778	ASSERT(curproc == &p0);
5779	(void) zone_empty(zone);
5780	ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
5781	zone_rele(zone);
5782
5783	size = sizeof (arg);
5784	darg.rbuf = (char *)&arg;
5785	darg.data_ptr = (char *)&arg;
5786	darg.rsize = size;
5787	darg.data_size = size;
5788	darg.desc_ptr = NULL;
5789	darg.desc_num = 0;
5790
5791	save_arg = darg;
5792	/*
5793	 * Since we're not holding a reference to the zone, any number of
5794	 * things can go wrong, including the zone disappearing before we get a
5795	 * chance to talk to zoneadmd.
5796	 */
5797	for (retry = 0; /* forever */; retry++) {
5798		if (door == NULL &&
5799		    (error = zone_lookup_door(zone_name, &door)) != 0) {
5800			goto next;
5801		}
5802		ASSERT(door != NULL);
5803
5804		if ((error = door_ki_upcall_limited(door, &darg, NULL,
5805		    SIZE_MAX, 0)) == 0) {
5806			break;
5807		}
5808		switch (error) {
5809		case EINTR:
5810			/* FALLTHROUGH */
5811		case EAGAIN:	/* process may be forking */
5812			/*
5813			 * Back off for a bit.
5814			 */
5815			break;
5816		case EBADF:
5817			zone_release_door(&door);
5818			if (zone_lookup_door(zone_name, &door) != 0) {
5819				/*
5820				 * zoneadmd may be dead, but it may come back to
5821				 * life later.
5822				 */
5823				break;
5824			}
5825			break;
5826		default:
5827			cmn_err(CE_WARN,
5828			    "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
5829			    error);
5830			goto out;
5831		}
5832next:
5833		/*
5834		 * If this isn't the same zone_t that we originally had in mind,
5835		 * then this is the same as if two kadmin requests come in at
5836		 * the same time: the first one wins.  This means we lose, so we
5837		 * bail.
5838		 */
5839		if ((zone = zone_find_by_id(zoneid)) == NULL) {
5840			/*
5841			 * Problem is solved.
5842			 */
5843			break;
5844		}
5845		if (zone->zone_uniqid != uniqid) {
5846			/*
5847			 * zoneid recycled
5848			 */
5849			zone_rele(zone);
5850			break;
5851		}
5852		/*
5853		 * We could zone_status_timedwait(), but there doesn't seem to
5854		 * be much point in doing that (plus, it would mean that
5855		 * zone_free() isn't called until this thread exits).
5856		 */
5857		zone_rele(zone);
5858		delay(hz);
5859		darg = save_arg;
5860	}
5861out:
5862	if (door != NULL) {
5863		zone_release_door(&door);
5864	}
5865	kmem_free(zone_name, zone_namelen);
5866	thread_exit();
5867}
5868
5869/*
5870 * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
5871 * kadmin().  The caller is a process in the zone.
5872 *
5873 * In order to shut down the zone, we hand off control to zoneadmd
5874 * (running in the global zone) via a door.  We do a half-hearted job of
5875 * killing all processes in the zone, create a kernel thread to contact
5876 * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
5877 * a form of generation number used to let zoneadmd (as well as
5878 * zone_destroy()) know exactly which zone they're talking about.
5879 */
5880int
5881zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
5882{
5883	struct zarg *zargp;
5884	zone_cmd_t zcmd;
5885	zone_t *zone;
5886
5887	zone = curproc->p_zone;
5888	ASSERT(getzoneid() != GLOBAL_ZONEID);
5889
5890	switch (cmd) {
5891	case A_SHUTDOWN:
5892		switch (fcn) {
5893		case AD_HALT:
5894		case AD_POWEROFF:
5895			zcmd = Z_HALT;
5896			break;
5897		case AD_BOOT:
5898			zcmd = Z_REBOOT;
5899			break;
5900		case AD_IBOOT:
5901		case AD_SBOOT:
5902		case AD_SIBOOT:
5903		case AD_NOSYNC:
5904			return (ENOTSUP);
5905		default:
5906			return (EINVAL);
5907		}
5908		break;
5909	case A_REBOOT:
5910		zcmd = Z_REBOOT;
5911		break;
5912	case A_FTRACE:
5913	case A_REMOUNT:
5914	case A_FREEZE:
5915	case A_DUMP:
5916	case A_CONFIG:
5917		return (ENOTSUP);
5918	default:
5919		ASSERT(cmd != A_SWAPCTL);	/* handled by uadmin() */
5920		return (EINVAL);
5921	}
5922
5923	if (secpolicy_zone_admin(credp, B_FALSE))
5924		return (EPERM);
5925	mutex_enter(&zone_status_lock);
5926
5927	/*
5928	 * zone_status can't be ZONE_IS_EMPTY or higher since curproc
5929	 * is in the zone.
5930	 */
5931	ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
5932	if (zone_status_get(zone) > ZONE_IS_RUNNING) {
5933		/*
5934		 * This zone is already on its way down.
5935		 */
5936		mutex_exit(&zone_status_lock);
5937		return (0);
5938	}
5939	/*
5940	 * Prevent future zone_enter()s
5941	 */
5942	zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
5943	mutex_exit(&zone_status_lock);
5944
5945	/*
5946	 * Kill everyone now and call zoneadmd later.
5947	 * zone_ki_call_zoneadmd() will do a more thorough job of the killing
5948	 * when it runs.
5949	 */
5950	killall(zone->zone_id);
5951	/*
5952	 * Now, create the thread to contact zoneadmd and do the rest of the
5953	 * work.  This thread can't be created in our zone; otherwise
5954	 * zone_destroy() would deadlock.
5955	 */
5956	zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
5957	zargp->arg.cmd = zcmd;
5958	zargp->arg.uniqid = zone->zone_uniqid;
5959	zargp->zone = zone;
5960	(void) strcpy(zargp->arg.locale, "C");
5961	/* mdep was already copied in for us by uadmin */
5962	if (mdep != NULL)
5963		(void) strlcpy(zargp->arg.bootbuf, mdep,
5964		    sizeof (zargp->arg.bootbuf));
5965	zone_hold(zone);
5966
5967	(void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
5968	    TS_RUN, minclsyspri);
5969	exit(CLD_EXITED, 0);
5970
5971	return (EINVAL);
5972}
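
/*
 * Illustrative sketch, not part of the original source: from inside a
 * non-global zone the path into zone_kadmin() is an ordinary uadmin(2)
 * call, so a zone reboot is simply:
 *
 *	#include <sys/uadmin.h>
 *
 *	if (uadmin(A_SHUTDOWN, AD_BOOT, 0) == -1)
 *		failed; EPERM unless the caller has zone admin rights
 *
 * On success the call never returns: the calling process exits along with
 * the rest of the zone while the kernel thread created above carries on.
 */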
5973
5974/*
5975 * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
5976 * status to ZONE_IS_SHUTTING_DOWN.
5977 *
5978 * This function also shuts down all running zones to ensure that they won't
5979 * fork new processes.
5980 */
5981void
5982zone_shutdown_global(void)
5983{
5984	zone_t *current_zonep;
5985
5986	ASSERT(INGLOBALZONE(curproc));
5987	mutex_enter(&zonehash_lock);
5988	mutex_enter(&zone_status_lock);
5989
5990	/* Modify the global zone's status first. */
5991	ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
5992	zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
5993
5994	/*
5995	 * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN.
5996	 * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so
5997	 * could cause assertions to fail (e.g., assertions about a zone's
5998	 * state during initialization, readying, or booting) or produce races.
5999	 * We'll let threads continue to initialize and ready new zones: they'll
6000	 * fail to boot the new zones when they see that the global zone is
6001	 * shutting down.
6002	 */
6003	for (current_zonep = list_head(&zone_active); current_zonep != NULL;
6004	    current_zonep = list_next(&zone_active, current_zonep)) {
6005		if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
6006			zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
6007	}
6008	mutex_exit(&zone_status_lock);
6009	mutex_exit(&zonehash_lock);
6010}
6011
6012/*
6013 * Returns true if the named dataset is visible in the current zone.
6014 * The 'write' parameter is set to 1 if the dataset is also writable.
6015 */
6016int
6017zone_dataset_visible(const char *dataset, int *write)
6018{
6019	static int zfstype = -1;
6020	zone_dataset_t *zd;
6021	size_t len;
6022	zone_t *zone = curproc->p_zone;
6023	const char *name = NULL;
6024	vfs_t *vfsp = NULL;
6025
6026	if (dataset[0] == '\0')
6027		return (0);
6028
6029	/*
6030	 * Walk the list once, looking for datasets which match exactly, or
6031	 * specify a dataset underneath an exported dataset.  If found, return
6032	 * true and note that it is writable.
6033	 */
6034	for (zd = list_head(&zone->zone_datasets); zd != NULL;
6035	    zd = list_next(&zone->zone_datasets, zd)) {
6036
6037		len = strlen(zd->zd_dataset);
6038		if (strlen(dataset) >= len &&
6039		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
6040		    (dataset[len] == '\0' || dataset[len] == '/' ||
6041		    dataset[len] == '@')) {
6042			if (write)
6043				*write = 1;
6044			return (1);
6045		}
6046	}
6047
6048	/*
6049	 * Walk the list a second time, searching for datasets which are parents
6050	 * of exported datasets.  These should be visible, but read-only.
6051	 *
6052	 * Note that we also have to support forms such as 'pool/dataset/', with
6053	 * a trailing slash.
6054	 */
6055	for (zd = list_head(&zone->zone_datasets); zd != NULL;
6056	    zd = list_next(&zone->zone_datasets, zd)) {
6057
6058		len = strlen(dataset);
6059		if (dataset[len - 1] == '/')
6060			len--;	/* Ignore trailing slash */
6061		if (len < strlen(zd->zd_dataset) &&
6062		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
6063		    zd->zd_dataset[len] == '/') {
6064			if (write)
6065				*write = 0;
6066			return (1);
6067		}
6068	}
6069
6070	/*
6071	 * We reach here if the given dataset was not found on the zone_dataset
6072	 * list.  Check whether it was added as a filesystem (i.e., "add fs")
6073	 * instead of being delegated.  For this we search for the dataset in
6074	 * the zone_vfslist of this zone.  If found, return true and note that
6075	 * it is not writable.
6076	 */
6077
6078	/*
6079	 * Initialize zfstype if it is not initialized yet.
6080	 */
6081	if (zfstype == -1) {
6082		struct vfssw *vswp = vfs_getvfssw("zfs");
6083		zfstype = vswp - vfssw;
6084		vfs_unrefvfssw(vswp);
6085	}
6086
6087	vfs_list_read_lock();
6088	vfsp = zone->zone_vfslist;
6089	do {
6090		ASSERT(vfsp);
6091		if (vfsp->vfs_fstype == zfstype) {
6092			name = refstr_value(vfsp->vfs_resource);
6093
6094			/*
6095			 * Check if we have an exact match.
6096			 */
6097			if (strcmp(dataset, name) == 0) {
6098				vfs_list_unlock();
6099				if (write)
6100					*write = 0;
6101				return (1);
6102			}
6103			/*
6104			 * We need to check if we are looking for parents of
6105			 * a dataset. These should be visible, but read-only.
6106			 */
6107			len = strlen(dataset);
6108			if (dataset[len - 1] == '/')
6109				len--;
6110
6111			if (len < strlen(name) &&
6112			    bcmp(dataset, name, len) == 0 && name[len] == '/') {
6113				vfs_list_unlock();
6114				if (write)
6115					*write = 0;
6116				return (1);
6117			}
6118		}
6119		vfsp = vfsp->vfs_zone_next;
6120	} while (vfsp != zone->zone_vfslist);
6121
6122	vfs_list_unlock();
6123	return (0);
6124}
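
/*
 * Illustrative worked example, not part of the original source: with a
 * single delegated dataset "tank/zones/z1" on the zone_datasets list, the
 * rules above yield:
 *
 *	zone_dataset_visible("tank/zones/z1", &w)	1, w == 1  (exact)
 *	zone_dataset_visible("tank/zones/z1/a", &w)	1, w == 1  (child)
 *	zone_dataset_visible("tank/zones/z1@s", &w)	1, w == 1  (snapshot)
 *	zone_dataset_visible("tank/zones", &w)		1, w == 0  (parent,
 *							read-only)
 *	zone_dataset_visible("tank/other", &w)		0  (not visible)
 */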
6125
6126/*
6127 * zone_find_by_any_path() -
6128 *
6129 * kernel-private routine similar to zone_find_by_path(), but which
6130 * effectively compares against zone paths rather than zonerootpath
6131 * (i.e., the last component of zonerootpaths, which should be "root/",
6132 * is not compared).  This is done in order to accurately identify all
6133 * paths, whether zone-visible or not, including those which are parallel
6134 * to /root/, such as /dev/, /home/, etc...
6135 *
6136 * If the specified path does not fall under any zone path, the global
6137 * zone is returned.
6138 *
6139 * The treat_abs parameter indicates whether the path should be treated as
6140 * an absolute path although it does not begin with "/".  (This supports
6141 * nfs mount syntax such as host:any/path.)
6142 *
6143 * The caller is responsible for zone_rele of the returned zone.
6144 */
6145zone_t *
6146zone_find_by_any_path(const char *path, boolean_t treat_abs)
6147{
6148	zone_t *zone;
6149	int path_offset = 0;
6150
6151	if (path == NULL) {
6152		zone_hold(global_zone);
6153		return (global_zone);
6154	}
6155
6156	if (*path != '/') {
6157		ASSERT(treat_abs);
6158		path_offset = 1;
6159	}
6160
6161	mutex_enter(&zonehash_lock);
6162	for (zone = list_head(&zone_active); zone != NULL;
6163	    zone = list_next(&zone_active, zone)) {
6164		char	*c;
6165		size_t	pathlen;
6166		char *rootpath_start;
6167
6168		if (zone == global_zone)	/* skip global zone */
6169			continue;
6170
6171		/* scan backwards to find start of last component */
6172		c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
6173		do {
6174			c--;
6175		} while (*c != '/');
6176
6177		pathlen = c - zone->zone_rootpath + 1 - path_offset;
6178		rootpath_start = (zone->zone_rootpath + path_offset);
6179		if (strncmp(path, rootpath_start, pathlen) == 0)
6180			break;
6181	}
6182	if (zone == NULL)
6183		zone = global_zone;
6184	zone_hold(zone);
6185	mutex_exit(&zonehash_lock);
6186	return (zone);
6187}
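
/*
 * Illustrative worked example, not part of the original source: for a
 * zone z1 whose zone_rootpath is "/zones/z1/root/", the loop above strips
 * the trailing "root/" component and matches on the "/zones/z1/" prefix,
 * so all of
 *
 *	/zones/z1/root/etc/passwd	(zone-visible)
 *	/zones/z1/dev/null		(parallel to root/)
 *	/zones/z1/			(the zonepath itself)
 *
 * resolve to z1, while "/export/home/foo" falls through to the global
 * zone.  With treat_abs == B_TRUE, "zones/z1/root/x" (no leading slash,
 * as in NFS host:path syntax) matches as well.
 */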
6188
6189/*
6190 * Finds a zone_dl_t with the given linkid in the given zone.  Returns the
6191 * zone_dl_t pointer if found, and NULL otherwise.
6192 */
6193static zone_dl_t *
6194zone_find_dl(zone_t *zone, datalink_id_t linkid)
6195{
6196	zone_dl_t *zdl;
6197
6198	ASSERT(mutex_owned(&zone->zone_lock));
6199	for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
6200	    zdl = list_next(&zone->zone_dl_list, zdl)) {
6201		if (zdl->zdl_id == linkid)
6202			break;
6203	}
6204	return (zdl);
6205}
6206
6207static boolean_t
6208zone_dl_exists(zone_t *zone, datalink_id_t linkid)
6209{
6210	boolean_t exists;
6211
6212	mutex_enter(&zone->zone_lock);
6213	exists = (zone_find_dl(zone, linkid) != NULL);
6214	mutex_exit(&zone->zone_lock);
6215	return (exists);
6216}
6217
6218/*
6219 * Assign a datalink ID to a zone.
6220 */
6221static int
6222zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)
6223{
6224	zone_dl_t *zdl;
6225	zone_t *zone;
6226	zone_t *thiszone;
6227
6228	if ((thiszone = zone_find_by_id(zoneid)) == NULL)
6229		return (set_errno(ENXIO));
6230
6231	/* Verify that the datalink ID doesn't already belong to a zone. */
6232	mutex_enter(&zonehash_lock);
6233	for (zone = list_head(&zone_active); zone != NULL;
6234	    zone = list_next(&zone_active, zone)) {
6235		if (zone_dl_exists(zone, linkid)) {
6236			mutex_exit(&zonehash_lock);
6237			zone_rele(thiszone);
6238			return (set_errno((zone == thiszone) ? EEXIST : EPERM));
6239		}
6240	}
6241
6242	zdl = kmem_zalloc(sizeof (*zdl), KM_SLEEP);
6243	zdl->zdl_id = linkid;
6244	mutex_enter(&thiszone->zone_lock);
6245	list_insert_head(&thiszone->zone_dl_list, zdl);
6246	mutex_exit(&thiszone->zone_lock);
6247	mutex_exit(&zonehash_lock);
6248	zone_rele(thiszone);
6249	return (0);
6250}
6251
6252static int
6253zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
6254{
6255	zone_dl_t *zdl;
6256	zone_t *zone;
6257	int err = 0;
6258
6259	if ((zone = zone_find_by_id(zoneid)) == NULL)
6260		return (set_errno(EINVAL));
6261
6262	mutex_enter(&zone->zone_lock);
6263	if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
6264		err = ENXIO;
6265	} else {
6266		list_remove(&zone->zone_dl_list, zdl);
6267		kmem_free(zdl, sizeof (zone_dl_t));
6268	}
6269	mutex_exit(&zone->zone_lock);
6270	zone_rele(zone);
6271	return (err == 0 ? 0 : set_errno(err));
6272}
6273
6274/*
6275 * If *zoneidp is ALL_ZONES, look up which zone (if any) has been
6276 * assigned the linkid, and return that zone's ID in *zoneidp.  Otherwise,
6277 * just check whether the specified zone has been assigned the linkid.
6278 */
6279int
6280zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
6281{
6282	zone_t *zone;
6283	int err = ENXIO;
6284
6285	if (*zoneidp != ALL_ZONES) {
6286		if ((zone = zone_find_by_id(*zoneidp)) != NULL) {
6287			if (zone_dl_exists(zone, linkid))
6288				err = 0;
6289			zone_rele(zone);
6290		}
6291		return (err);
6292	}
6293
6294	mutex_enter(&zonehash_lock);
6295	for (zone = list_head(&zone_active); zone != NULL;
6296	    zone = list_next(&zone_active, zone)) {
6297		if (zone_dl_exists(zone, linkid)) {
6298			*zoneidp = zone->zone_id;
6299			err = 0;
6300			break;
6301		}
6302	}
6303	mutex_exit(&zonehash_lock);
6304	return (err);
6305}
6306
6307/*
6308 * Get the list of datalink IDs assigned to a zone.
6309 *
6310 * On input, *nump is the number of datalink IDs that can fit in the supplied
6311 * idarray.  Upon return, *nump is either set to the number of datalink IDs
6312 * that were placed in the array if the array was large enough, or to the
6313 * number of datalink IDs that the function needs to place in the array if the
6314 * array is too small.
6315 */
6316static int
6317zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
6318{
6319	uint_t num, dlcount;
6320	zone_t *zone;
6321	zone_dl_t *zdl;
6322	datalink_id_t *idptr = idarray;
6323
6324	if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
6325		return (set_errno(EFAULT));
6326	if ((zone = zone_find_by_id(zoneid)) == NULL)
6327		return (set_errno(ENXIO));
6328
6329	num = 0;
6330	mutex_enter(&zone->zone_lock);
6331	for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
6332	    zdl = list_next(&zone->zone_dl_list, zdl)) {
6333		/*
6334		 * If the list is bigger than what the caller supplied, just
6335		 * count; don't do the copyout.
6336		 */
6337		if (++num > dlcount)
6338			continue;
6339		if (copyout(&zdl->zdl_id, idptr, sizeof (*idptr)) != 0) {
6340			mutex_exit(&zone->zone_lock);
6341			zone_rele(zone);
6342			return (set_errno(EFAULT));
6343		}
6344		idptr++;
6345	}
6346	mutex_exit(&zone->zone_lock);
6347	zone_rele(zone);
6348
6349	/* Whether the count grew or shrank, report the actual number back. */
6350	if (num != dlcount) {
6351		if (copyout(&num, nump, sizeof (num)) != 0)
6352			return (set_errno(EFAULT));
6353	}
6354	return (0);
6355}
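
/*
 * Illustrative sketch, not part of the original source: like zone_list(),
 * this uses a two-call sizing protocol.  A hypothetical userland caller
 * going straight through the zone(2) multiplexor (assuming SYS_zone is
 * available; "zoneid" is a local) would do:
 *
 *	int num = 0;
 *	datalink_id_t *ids;
 *
 *	First call with num == 0 only reports the count back in num.
 *	(void) syscall(SYS_zone, ZONE_LIST_DATALINK, zoneid, &num, NULL);
 *	ids = malloc(num * sizeof (datalink_id_t));
 *	Second call fills ids; if the list grew in the meantime, num
 *	comes back larger and the caller should retry.
 *	(void) syscall(SYS_zone, ZONE_LIST_DATALINK, zoneid, &num, ids);
 */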
6356
6357/*
6358 * Public interface for looking up a zone by zoneid. It's a customized version
6359 * for netstack_zone_create().  It may only be called from the ZSD create
6360 * callbacks, since it doesn't take a reference on the zone structure;
6361 * if it were called elsewhere, the zone could disappear once the
6362 * zonehash_lock is dropped.
6363 *
6364 * Furthermore it:
6365 * 1. Doesn't check the status of the zone.
6366 * 2. May be called even before zone_init(); in that case the address of
6367 *    zone0 is returned directly, and netstack_zone_create() will only
6368 *    assign a value to zone0.zone_netstack, which won't break anything.
6369 * 3. Returns without the zone being held.
6370 */
6371zone_t *
6372zone_find_by_id_nolock(zoneid_t zoneid)
6373{
6374	zone_t *zone;
6375
6376	mutex_enter(&zonehash_lock);
6377	if (zonehashbyid == NULL)
6378		zone = &zone0;
6379	else
6380		zone = zone_find_all_by_id(zoneid);
6381	mutex_exit(&zonehash_lock);
6382	return (zone);
6383}
6384
6385/*
6386 * Walk the datalinks for a given zone
6387 */
6388int
6389zone_datalink_walk(zoneid_t zoneid, int (*cb)(datalink_id_t, void *),
6390    void *data)
6391{
6392	zone_t		*zone;
6393	zone_dl_t	*zdl;
6394	datalink_id_t	*idarray;
6395	uint_t		idcount = 0;
6396	int		i, ret = 0;
6397
6398	if ((zone = zone_find_by_id(zoneid)) == NULL)
6399		return (ENOENT);
6400
6401	/*
6402	 * We first build an array of linkids so that we can walk them and
6403	 * execute the callback with the zone_lock dropped.
6404	 */
6405	mutex_enter(&zone->zone_lock);
6406	for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
6407	    zdl = list_next(&zone->zone_dl_list, zdl)) {
6408		idcount++;
6409	}
6410
6411	if (idcount == 0) {
6412		mutex_exit(&zone->zone_lock);
6413		zone_rele(zone);
6414		return (0);
6415	}
6416
6417	idarray = kmem_alloc(sizeof (datalink_id_t) * idcount, KM_NOSLEEP);
6418	if (idarray == NULL) {
6419		mutex_exit(&zone->zone_lock);
6420		zone_rele(zone);
6421		return (ENOMEM);
6422	}
6423
6424	for (i = 0, zdl = list_head(&zone->zone_dl_list); zdl != NULL;
6425	    i++, zdl = list_next(&zone->zone_dl_list, zdl)) {
6426		idarray[i] = zdl->zdl_id;
6427	}
6428
6429	mutex_exit(&zone->zone_lock);
6430
6431	for (i = 0; i < idcount; i++) {
6432		if ((ret = (*cb)(idarray[i], data)) != 0)
6433			break;
6434	}
6435
6436	zone_rele(zone);
6437	kmem_free(idarray, sizeof (datalink_id_t) * idcount);
6438	return (ret);
6439}
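
/*
 * Illustrative sketch, not part of the original source: a minimal
 * consumer of zone_datalink_walk().  The callback returns 0 to continue
 * and nonzero to stop the walk and propagate that value; count_cb and
 * nlinks below are hypothetical.
 *
 *	static int
 *	count_cb(datalink_id_t linkid, void *arg)
 *	{
 *		(*(uint_t *)arg)++;
 *		return (0);
 *	}
 *
 *	uint_t nlinks = 0;
 *	int err = zone_datalink_walk(zoneid, count_cb, &nlinks);
 */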
6440