27c478bdstevel@tonic-gate * CDDL HEADER START
37c478bdstevel@tonic-gate *
47c478bdstevel@tonic-gate * The contents of this file are subject to the terms of the
545916cdjpk * Common Development and Distribution License (the "License").
645916cdjpk * You may not use this file except in compliance with the License.
77c478bdstevel@tonic-gate *
87c478bdstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bdstevel@tonic-gate * or http://www.opensolaris.org/os/licensing.
107c478bdstevel@tonic-gate * See the License for the specific language governing permissions
117c478bdstevel@tonic-gate * and limitations under the License.
127c478bdstevel@tonic-gate *
137c478bdstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each
147c478bdstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bdstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the
167c478bdstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying
177c478bdstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bdstevel@tonic-gate *
197c478bdstevel@tonic-gate * CDDL HEADER END
207c478bdstevel@tonic-gate */
23134a1f4Casper H.S. Dik * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24a48d812Jerry Jelinek * Copyright 2015, Joyent Inc. All rights reserved.
2548bbca8Daniel Hoffman * Copyright (c) 2016 by Delphix. All rights reserved.
2666d7818Andy Fiddaman * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
277c478bdstevel@tonic-gate */
307c478bdstevel@tonic-gate * Zones
317c478bdstevel@tonic-gate *
327c478bdstevel@tonic-gate *   A zone is a named collection of processes, namespace constraints,
337c478bdstevel@tonic-gate *   and other system resources which comprise a secure and manageable
347c478bdstevel@tonic-gate *   application containment facility.
357c478bdstevel@tonic-gate *
367c478bdstevel@tonic-gate *   Zones (represented by the reference counted zone_t) are tracked in
377c478bdstevel@tonic-gate *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
387c478bdstevel@tonic-gate *   (zoneid_t) are used to track zone association.  Zone IDs are
397c478bdstevel@tonic-gate *   dynamically generated when the zone is created; if a persistent
407c478bdstevel@tonic-gate *   identifier is needed (core files, accounting logs, audit trail,
417c478bdstevel@tonic-gate *   etc.), the zone name should be used.
427c478bdstevel@tonic-gate *
437c478bdstevel@tonic-gate *
447c478bdstevel@tonic-gate *   Global Zone:
457c478bdstevel@tonic-gate *
467c478bdstevel@tonic-gate *   The global zone (zoneid 0) is automatically associated with all
477c478bdstevel@tonic-gate *   system resources that have not been bound to a user-created zone.
487c478bdstevel@tonic-gate *   This means that even systems where zones are not in active use
497c478bdstevel@tonic-gate *   have a global zone, and all processes, mounts, etc. are
507c478bdstevel@tonic-gate *   associated with that zone.  The global zone is generally
517c478bdstevel@tonic-gate *   unconstrained in terms of privileges and access, though the usual
527c478bdstevel@tonic-gate *   credential and privilege based restrictions apply.
537c478bdstevel@tonic-gate *
547c478bdstevel@tonic-gate *
557c478bdstevel@tonic-gate *   Zone States:
567c478bdstevel@tonic-gate *
577c478bdstevel@tonic-gate *   The states in which a zone may be and the transitions are as
587c478bdstevel@tonic-gate *   follows:
597c478bdstevel@tonic-gate *
607c478bdstevel@tonic-gate *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
617c478bdstevel@tonic-gate *   initialized zone is added to the list of active zones on the system but
627c478bdstevel@tonic-gate *   isn't accessible.
637c478bdstevel@tonic-gate *
64bd41d0anordmark *   ZONE_IS_INITIALIZED: Initialization complete except the ZSD callbacks are
65bd41d0anordmark *   not yet completed. Not possible to enter the zone, but attributes can
66bd41d0anordmark *   be retrieved.
67bd41d0anordmark *
687c478bdstevel@tonic-gate *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
697c478bdstevel@tonic-gate *   ready.  The zone is made visible after the ZSD constructor callbacks are
707c478bdstevel@tonic-gate *   executed.  A zone remains in this state until it transitions into
717c478bdstevel@tonic-gate *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
727c478bdstevel@tonic-gate *
737c478bdstevel@tonic-gate *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
747c478bdstevel@tonic-gate *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
757c478bdstevel@tonic-gate *   state.
767c478bdstevel@tonic-gate *
777c478bdstevel@tonic-gate *   ZONE_IS_RUNNING: The zone is open for business: zsched has
787c478bdstevel@tonic-gate *   successfully started init.   A zone remains in this state until
797c478bdstevel@tonic-gate *   zone_shutdown() is called.
807c478bdstevel@tonic-gate *
817c478bdstevel@tonic-gate *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
827c478bdstevel@tonic-gate *   killing all processes running in the zone. The zone remains
837c478bdstevel@tonic-gate *   in this state until there are no more user processes running in the zone.
847c478bdstevel@tonic-gate *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
857c478bdstevel@tonic-gate *   Since zone_shutdown() is restartable, it may be called successfully
867c478bdstevel@tonic-gate *   multiple times for the same zone_t.  Setting of the zone's state to
877c478bdstevel@tonic-gate *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
887c478bdstevel@tonic-gate *   the zone's status without worrying about it being a moving target.
897c478bdstevel@tonic-gate *
907c478bdstevel@tonic-gate *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
917c478bdstevel@tonic-gate *   are no more user processes in the zone.  The zone remains in this
927c478bdstevel@tonic-gate *   state until there are no more kernel threads associated with the
937c478bdstevel@tonic-gate *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
947c478bdstevel@tonic-gate *   fail.
957c478bdstevel@tonic-gate *
967c478bdstevel@tonic-gate *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
977c478bdstevel@tonic-gate *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
987c478bdstevel@tonic-gate *   join the zone or create kernel threads therein.
997c478bdstevel@tonic-gate *
1007c478bdstevel@tonic-gate *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
1017c478bdstevel@tonic-gate *   remains in this state until zsched exits.  Calls to zone_find_by_*()
1027c478bdstevel@tonic-gate *   return NULL from now on.
1037c478bdstevel@tonic-gate *
1047c478bdstevel@tonic-gate *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
1057c478bdstevel@tonic-gate *   processes or threads doing work on behalf of the zone.  The zone is
1067c478bdstevel@tonic-gate *   removed from the list of active zones.  zone_destroy() returns, and
1077c478bdstevel@tonic-gate *   the zone can be recreated.
1087c478bdstevel@tonic-gate *
1097c478bdstevel@tonic-gate *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
1107c478bdstevel@tonic-gate *   callbacks are executed, and all memory associated with the zone is
1117c478bdstevel@tonic-gate *   freed.
1127c478bdstevel@tonic-gate *
1137c478bdstevel@tonic-gate *   Threads can wait for the zone to enter a requested state by using
1147c478bdstevel@tonic-gate *   zone_status_wait() or zone_status_timedwait() with the desired
1157c478bdstevel@tonic-gate *   state passed in as an argument.  Zone state transitions are
1167c478bdstevel@tonic-gate *   uni-directional; it is not possible to move back to an earlier state.
1177c478bdstevel@tonic-gate *
1187c478bdstevel@tonic-gate *
1197c478bdstevel@tonic-gate *   Zone-Specific Data:
1207c478bdstevel@tonic-gate *
1217c478bdstevel@tonic-gate *   Subsystems needing to maintain zone-specific data can store that
1227c478bdstevel@tonic-gate *   data using the ZSD mechanism.  This provides a zone-specific data
1237c478bdstevel@tonic-gate *   store, similar to thread-specific data (see pthread_getspecific(3C)
1247c478bdstevel@tonic-gate *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
1257c478bdstevel@tonic-gate *   to register callbacks to be invoked when a zone is created, shut
1267c478bdstevel@tonic-gate *   down, or destroyed.  This can be used to initialize zone-specific
1277c478bdstevel@tonic-gate *   data for new zones and to clean up when zones go away.
1287c478bdstevel@tonic-gate *
1297c478bdstevel@tonic-gate *
1307c478bdstevel@tonic-gate *   Data Structures:
1317c478bdstevel@tonic-gate *
1327c478bdstevel@tonic-gate *   The per-zone structure (zone_t) is reference counted, and freed
1337c478bdstevel@tonic-gate *   when all references are released.  zone_hold and zone_rele can be
1347c478bdstevel@tonic-gate *   used to adjust the reference count.  In addition, reference counts
1357c478bdstevel@tonic-gate *   associated with the cred_t structure are tracked separately using
1367c478bdstevel@tonic-gate *   zone_cred_hold and zone_cred_rele.
1377c478bdstevel@tonic-gate *
1387c478bdstevel@tonic-gate *   Pointers to active zone_t's are stored in two hash tables; one
1397c478bdstevel@tonic-gate *   for searching by id, the other for searching by name.  Lookups
1407c478bdstevel@tonic-gate *   can be performed on either basis, using zone_find_by_id and
1417c478bdstevel@tonic-gate *   zone_find_by_name.  Both return zone_t pointers with the zone
1427c478bdstevel@tonic-gate *   held, so zone_rele should be called when the pointer is no longer
1437c478bdstevel@tonic-gate *   needed.  Zones can also be searched by path; zone_find_by_path
1447c478bdstevel@tonic-gate *   returns the zone with which a path name is associated (global
1457c478bdstevel@tonic-gate *   zone if the path is not within some other zone's file system
1467c478bdstevel@tonic-gate *   hierarchy).  This currently requires iterating through each zone,
1477c478bdstevel@tonic-gate *   so it is slower than an id or name search via a hash table.
1487c478bdstevel@tonic-gate *
1497c478bdstevel@tonic-gate *
1507c478bdstevel@tonic-gate *   Locking:
1517c478bdstevel@tonic-gate *
1527c478bdstevel@tonic-gate *   zonehash_lock: This is a top-level global lock used to protect the
1537c478bdstevel@tonic-gate *       zone hash tables and lists.  Zones cannot be created or destroyed
1547c478bdstevel@tonic-gate *       while this lock is held.
1557c478bdstevel@tonic-gate *   zone_status_lock: This is a global lock protecting zone state.
1567c478bdstevel@tonic-gate *       Zones cannot change state while this lock is held.  It also
1577c478bdstevel@tonic-gate *       protects the list of kernel threads associated with a zone.
1587c478bdstevel@tonic-gate *   zone_lock: This is a per-zone lock used to protect several fields of
1597c478bdstevel@tonic-gate *       the zone_t (see <sys/zone.h> for details).  In addition, holding
1607c478bdstevel@tonic-gate *       this lock means that the zone cannot go away.
1610209230gjelinek *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
1620209230gjelinek *	 related to the zone.max-lwps rctl.
1630209230gjelinek *   zone_mem_lock: This is a per-zone lock used to protect the fields
1640209230gjelinek *	 related to the zone.max-locked-memory and zone.max-swap rctls.
1650fbb751John Levon *   zone_rctl_lock: This is a per-zone lock used to protect other rctls,
1660fbb751John Levon *       currently just max_lofi
1677c478bdstevel@tonic-gate *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
1687c478bdstevel@tonic-gate *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
1697c478bdstevel@tonic-gate *       list (a list of zones in the ZONE_IS_DEAD state).
1707c478bdstevel@tonic-gate *
1717c478bdstevel@tonic-gate *   Ordering requirements:
1727c478bdstevel@tonic-gate *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
1732918c4aJohn Levon *       zone_lock --> zsd_key_lock --> pidlock --> p_lock
1747c478bdstevel@tonic-gate *
1750209230gjelinek *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
1760209230gjelinek *	zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
177ff19e02Menno Lageman *	zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
1780209230gjelinek *
1797c478bdstevel@tonic-gate *   Blocking memory allocations are permitted while holding any of the
1807c478bdstevel@tonic-gate *   zone locks.
1817c478bdstevel@tonic-gate *
1827c478bdstevel@tonic-gate *
1837c478bdstevel@tonic-gate *   System Call Interface:
1847c478bdstevel@tonic-gate *
1857c478bdstevel@tonic-gate *   The zone subsystem can be managed and queried from user level with
1867c478bdstevel@tonic-gate *   the following system calls (all subcodes of the primary "zone"
1877c478bdstevel@tonic-gate *   system call):
1887c478bdstevel@tonic-gate *   - zone_create: creates a zone with selected attributes (name,
189fa9e406ahrens *     root path, privileges, resource controls, ZFS datasets)
1907c478bdstevel@tonic-gate *   - zone_enter: allows the current process to enter a zone
1917c478bdstevel@tonic-gate *   - zone_getattr: reports attributes of a zone
1923f2f09cdp *   - zone_setattr: set attributes of a zone
1933f2f09cdp *   - zone_boot: set 'init' running for the zone
1947c478bdstevel@tonic-gate *   - zone_list: lists all zones active in the system
1957c478bdstevel@tonic-gate *   - zone_lookup: looks up zone id based on name
1967c478bdstevel@tonic-gate *   - zone_shutdown: initiates shutdown process (see states above)
1977c478bdstevel@tonic-gate *   - zone_destroy: completes shutdown process (see states above)
1987c478bdstevel@tonic-gate *
1997c478bdstevel@tonic-gate */
2017c478bdstevel@tonic-gate#include <sys/priv_impl.h>
2027c478bdstevel@tonic-gate#include <sys/cred.h>
2037c478bdstevel@tonic-gate#include <c2/audit.h>
2047c478bdstevel@tonic-gate#include <sys/debug.h>
2057c478bdstevel@tonic-gate#include <sys/file.h>
2067c478bdstevel@tonic-gate#include <sys/kmem.h>
2070209230gjelinek#include <sys/kstat.h>
2087c478bdstevel@tonic-gate#include <sys/mutex.h>
20945916cdjpk#include <sys/note.h>
2107c478bdstevel@tonic-gate#include <sys/pathname.h>
2117c478bdstevel@tonic-gate#include <sys/proc.h>
2127c478bdstevel@tonic-gate#include <sys/project.h>
213cf8f45cdstaff#include <sys/sysevent.h>
2147c478bdstevel@tonic-gate#include <sys/task.h>
2157c478bdstevel@tonic-gate#include <sys/systm.h>
2167c478bdstevel@tonic-gate#include <sys/types.h>
2177c478bdstevel@tonic-gate#include <sys/utsname.h>
2187c478bdstevel@tonic-gate#include <sys/vnode.h>
2197c478bdstevel@tonic-gate#include <sys/vfs.h>
2207c478bdstevel@tonic-gate#include <sys/systeminfo.h>
2217c478bdstevel@tonic-gate#include <sys/policy.h>
2227c478bdstevel@tonic-gate#include <sys/cred_impl.h>
2237c478bdstevel@tonic-gate#include <sys/contract_impl.h>
2247c478bdstevel@tonic-gate#include <sys/contract/process_impl.h>
2257c478bdstevel@tonic-gate#include <sys/class.h>
2267c478bdstevel@tonic-gate#include <sys/pool.h>
2277c478bdstevel@tonic-gate#include <sys/pool_pset.h>
2287c478bdstevel@tonic-gate#include <sys/pset.h>
229a19609fjv#include <sys/strlog.h>
2307c478bdstevel@tonic-gate#include <sys/sysmacros.h>
2317c478bdstevel@tonic-gate#include <sys/callb.h>
2327c478bdstevel@tonic-gate#include <sys/vmparam.h>
2337c478bdstevel@tonic-gate#include <sys/corectl.h>
234824c205ml#include <sys/ipc_impl.h>
235134a1f4Casper H.S. Dik#include <sys/klpd.h>
2377c478bdstevel@tonic-gate#include <sys/door.h>
2387c478bdstevel@tonic-gate#include <sys/cpuvar.h>
239bd41d0anordmark#include <sys/sdt.h>
2417c478bdstevel@tonic-gate#include <sys/uadmin.h>
2427c478bdstevel@tonic-gate#include <sys/session.h>
2437c478bdstevel@tonic-gate#include <sys/cmn_err.h>
2447c478bdstevel@tonic-gate#include <sys/modhash.h>
2453f2f09cdp#include <sys/sunddi.h>
2467c478bdstevel@tonic-gate#include <sys/nvpair.h>
2477c478bdstevel@tonic-gate#include <sys/rctl.h>
2487c478bdstevel@tonic-gate#include <sys/fss.h>
2499acbbeann#include <sys/brand.h>
2507c478bdstevel@tonic-gate#include <sys/zone.h>
251f4b3ec6dh#include <net/if.h>
252c97ad5cakolb#include <sys/cpucaps.h>
2530209230gjelinek#include <vm/seg.h>
2542b24ab6Sebastien Roy#include <sys/mac.h>
2552b24ab6Sebastien Roy
257a19609fjv * This constant specifies the number of seconds that threads waiting for
258a19609fjv * subsystems to release a zone's general-purpose references will wait before
259a19609fjv * they log the zone's reference counts.  The constant's value shouldn't
260a19609fjv * be so small that reference counts are unnecessarily reported for zones
261a19609fjv * whose references are slowly released.  On the other hand, it shouldn't be so
262a19609fjv * large that users reboot their systems out of frustration over hung zones
263a19609fjv * before the system logs the zones' reference counts.
264a19609fjv */
265a19609fjv#define	ZONE_DESTROY_TIMEOUT_SECS	60
2672b24ab6Sebastien Roy/* List of data link IDs which are accessible from the zone */
2682b24ab6Sebastien Roytypedef struct zone_dl {
2692b24ab6Sebastien Roy	datalink_id_t	zdl_id;
270550b6e4Sowmini Varadhan	nvlist_t	*zdl_net;
2712b24ab6Sebastien Roy	list_node_t	zdl_linkage;
2722b24ab6Sebastien Roy} zone_dl_t;
2757c478bdstevel@tonic-gate * cv used to signal that all references to the zone have been released.  This
2767c478bdstevel@tonic-gate * needs to be global since there may be multiple waiters, and the first to
2777c478bdstevel@tonic-gate * wake up will free the zone_t, hence we cannot use zone->zone_cv.
2787c478bdstevel@tonic-gate */
2797c478bdstevel@tonic-gatestatic kcondvar_t zone_destroy_cv;
2817c478bdstevel@tonic-gate * Lock used to serialize access to zone_cv.  This could have been per-zone,
2827c478bdstevel@tonic-gate * but then we'd need another lock for zone_destroy_cv, and why bother?
2837c478bdstevel@tonic-gate */
2847c478bdstevel@tonic-gatestatic kmutex_t zone_status_lock;
2877c478bdstevel@tonic-gate * ZSD-related global variables.
2887c478bdstevel@tonic-gate */
2897c478bdstevel@tonic-gatestatic kmutex_t zsd_key_lock;	/* protects the following two */
2917c478bdstevel@tonic-gate * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
2927c478bdstevel@tonic-gate */
2937c478bdstevel@tonic-gatestatic zone_key_t zsd_keyval = 0;
2957c478bdstevel@tonic-gate * Global list of registered keys.  We use this when a new zone is created.
2967c478bdstevel@tonic-gate */
2977c478bdstevel@tonic-gatestatic list_t zsd_registered_keys;
2997c478bdstevel@tonic-gateint zone_hash_size = 256;
30045916cdjpkstatic mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
3017c478bdstevel@tonic-gatestatic kmutex_t zonehash_lock;
3027c478bdstevel@tonic-gatestatic uint_t zonecount;
3037c478bdstevel@tonic-gatestatic id_space_t *zoneid_space;
3067c478bdstevel@tonic-gate * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
3077c478bdstevel@tonic-gate * kernel proper runs, and which manages all other zones.
3087c478bdstevel@tonic-gate *
3097c478bdstevel@tonic-gate * Although not declared as static, the variable "zone0" should not be used
3107c478bdstevel@tonic-gate * except by code that needs to reference the global zone early on in boot,
3117c478bdstevel@tonic-gate * before it is fully initialized.  All other consumers should use
3127c478bdstevel@tonic-gate * 'global_zone'.
3137c478bdstevel@tonic-gate */
3147c478bdstevel@tonic-gatezone_t zone0;
3157c478bdstevel@tonic-gatezone_t *global_zone = NULL;	/* Set when the global zone is initialized */
3187c478bdstevel@tonic-gate * List of active zones, protected by zonehash_lock.
3197c478bdstevel@tonic-gate */
3207c478bdstevel@tonic-gatestatic list_t zone_active;
3237c478bdstevel@tonic-gate * List of destroyed zones that still have outstanding cred references.
3247c478bdstevel@tonic-gate * Used for debugging.  Uses a separate lock to avoid lock ordering
3257c478bdstevel@tonic-gate * problems in zone_free.
3267c478bdstevel@tonic-gate */
3277c478bdstevel@tonic-gatestatic list_t zone_deathrow;
3287c478bdstevel@tonic-gatestatic kmutex_t zone_deathrow_lock;
3307c478bdstevel@tonic-gate/* number of zones is limited by virtual interface limit in IP */
3317c478bdstevel@tonic-gateuint_t maxzones = 8192;
333cf8f45cdstaff/* Event channel to send zone state change notifications */
334cf8f45cdstaffevchan_t *zone_event_chan;
337cf8f45cdstaff * This table holds the mapping from kernel zone states to
338cf8f45cdstaff * states visible in the state notification API.
339cf8f45cdstaff * The idea is that we only expose "obvious" states and
340cf8f45cdstaff * do not expose states which are just implementation details.
341cf8f45cdstaff */
342cf8f45cdstaffconst char  *zone_status_table[] = {
343cf8f45cdstaff	ZONE_EVENT_UNINITIALIZED,	/* uninitialized */
344bd41d0anordmark	ZONE_EVENT_INITIALIZED,		/* initialized */
345cf8f45cdstaff	ZONE_EVENT_READY,		/* ready */
346cf8f45cdstaff	ZONE_EVENT_READY,		/* booting */
347cf8f45cdstaff	ZONE_EVENT_RUNNING,		/* running */
348cf8f45cdstaff	ZONE_EVENT_SHUTTING_DOWN,	/* shutting_down */
349cf8f45cdstaff	ZONE_EVENT_SHUTTING_DOWN,	/* empty */
350cf8f45cdstaff	ZONE_EVENT_SHUTTING_DOWN,	/* down */
351cf8f45cdstaff	ZONE_EVENT_SHUTTING_DOWN,	/* dying */
352cf8f45cdstaff	ZONE_EVENT_UNINITIALIZED,	/* dead */
/*
 * This array contains the names of the subsystems listed in zone_ref_subsys_t
 * (see sys/zone.h).  The order of the entries must match the order of the
 * enumerators named in the trailing comments.
 */
static char *zone_ref_subsys_names[] = {
	"NFS",		/* ZONE_REF_NFS */
	"NFSv4",	/* ZONE_REF_NFSV4 */
	"SMBFS",	/* ZONE_REF_SMBFS */
	"MNTFS",	/* ZONE_REF_MNTFS */
	"LOFI",		/* ZONE_REF_LOFI */
	"VFS",		/* ZONE_REF_VFS */
	"IPC"		/* ZONE_REF_IPC */
};
3707c478bdstevel@tonic-gate * This isn't static so lint doesn't complain.
3717c478bdstevel@tonic-gate */
3727c478bdstevel@tonic-gaterctl_hndl_t rc_zone_cpu_shares;
373c693965slrctl_hndl_t rc_zone_locked_mem;
3740209230gjelinekrctl_hndl_t rc_zone_max_swap;
3750fbb751John Levonrctl_hndl_t rc_zone_max_lofi;
376c97ad5cakolbrctl_hndl_t rc_zone_cpu_cap;
3777c478bdstevel@tonic-gaterctl_hndl_t rc_zone_nlwps;
378ff19e02Menno Lagemanrctl_hndl_t rc_zone_nprocs;
379824c205mlrctl_hndl_t rc_zone_shmmax;
380824c205mlrctl_hndl_t rc_zone_shmmni;
381824c205mlrctl_hndl_t rc_zone_semmni;
382824c205mlrctl_hndl_t rc_zone_msgmni;
3843f2f09cdpconst char * const zone_default_initname = "/sbin/init";
38545916cdjpkstatic char * const zone_prefix = "/zone/";
3867c478bdstevel@tonic-gatestatic int zone_shutdown(zoneid_t zoneid);
3872b24ab6Sebastien Roystatic int zone_add_datalink(zoneid_t, datalink_id_t);
3882b24ab6Sebastien Roystatic int zone_remove_datalink(zoneid_t, datalink_id_t);
3892b24ab6Sebastien Roystatic int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
390550b6e4Sowmini Varadhanstatic int zone_set_network(zoneid_t, zone_net_data_t *);
391550b6e4Sowmini Varadhanstatic int zone_get_network(zoneid_t, zone_net_data_t *);
393bd41d0anordmarktypedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);
395bd41d0anordmarkstatic void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
396bd41d0anordmarkstatic void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
397bd41d0anordmarkstatic boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
398bd41d0anordmarkstatic boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
399bd41d0anordmark    zone_key_t);
400bd41d0anordmarkstatic boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
401bd41d0anordmarkstatic boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
402bd41d0anordmark    kmutex_t *);
403bd41d0anordmarkstatic boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
404bd41d0anordmark    kmutex_t *);
407821c4a9dp * Bump this number when you alter the zone syscall interfaces; this is
408821c4a9dp * because we need to have support for previous API versions in libc
409821c4a9dp * to support patching; libc calls into the kernel to determine this number.
410821c4a9dp *
411821c4a9dp * Version 1 of the API is the version originally shipped with Solaris 10
412821c4a9dp * Version 2 alters the zone_create system call in order to support more
413821c4a9dp *     arguments by moving the args into a structure; and to do better
414821c4a9dp *     error reporting when zone_create() fails.
415821c4a9dp * Version 3 alters the zone_create system call in order to support the
416821c4a9dp *     import of ZFS datasets to zones.
41745916cdjpk * Version 4 alters the zone_create system call in order to support
41845916cdjpk *     Trusted Extensions.
4193f2f09cdp * Version 5 alters the zone_boot system call, and converts its old
4203f2f09cdp *     bootargs parameter to be set by the zone_setattr API instead.
421f4b3ec6dh * Version 6 adds the flag argument to zone_create.
422821c4a9dp */
423f4b3ec6dhstatic const int ZONE_SYSCALL_API_VERSION = 6;
4267c478bdstevel@tonic-gate * Certain filesystems (such as NFS and autofs) need to know which zone
4277c478bdstevel@tonic-gate * the mount is being placed in.  Because of this, we need to be able to
4285fd5c68Jerry Jelinek * ensure that a zone isn't in the process of being created/destroyed such
4295fd5c68Jerry Jelinek * that nfs_mount() thinks it is in the global/NGZ zone, while by the time
4305fd5c68Jerry Jelinek * it gets added to the list of mounted zones, it ends up on the wrong zone's
4315fd5c68Jerry Jelinek * mount list. Since a zone can't reside on an NFS file system, we don't
4325fd5c68Jerry Jelinek * have to worry about the zonepath itself.
4337c478bdstevel@tonic-gate *
4347c478bdstevel@tonic-gate * The following functions: block_mounts()/resume_mounts() and
4357c478bdstevel@tonic-gate * mount_in_progress()/mount_completed() are used by zones and the VFS
4365fd5c68Jerry Jelinek * layer (respectively) to synchronize zone state transitions and new
4375fd5c68Jerry Jelinek * mounts within a zone. This synchronization is on a per-zone basis, so
4385fd5c68Jerry Jelinek * activity for one zone will not interfere with activity for another zone.
4397c478bdstevel@tonic-gate *
4407c478bdstevel@tonic-gate * The semantics are like a reader-reader lock such that there may
4415fd5c68Jerry Jelinek * either be multiple mounts (or zone state transitions, if that weren't
4427c478bdstevel@tonic-gate * serialized by zonehash_lock) in progress at the same time, but not
4437c478bdstevel@tonic-gate * both.
4447c478bdstevel@tonic-gate *
4457c478bdstevel@tonic-gate * We use cv's so the user can ctrl-C out of the operation if it's
4467c478bdstevel@tonic-gate * taking too long.
4477c478bdstevel@tonic-gate *
4487c478bdstevel@tonic-gate * The semantics are such that there is unfair bias towards the
4495fd5c68Jerry Jelinek * "current" operation.  This means that zone halt may starve if
4505fd5c68Jerry Jelinek * there is a rapid succession of new mounts coming in to the zone.
4517c478bdstevel@tonic-gate */
4537c478bdstevel@tonic-gate * Prevent new mounts from progressing to the point of calling
4547c478bdstevel@tonic-gate * VFS_MOUNT().  If there are already mounts in this "region", wait for
4557c478bdstevel@tonic-gate * them to complete.
4567c478bdstevel@tonic-gate */
4577c478bdstevel@tonic-gatestatic int
4585fd5c68Jerry Jelinekblock_mounts(zone_t *zp)
4607c478bdstevel@tonic-gate	int retval = 0;
4627c478bdstevel@tonic-gate	/*
4637c478bdstevel@tonic-gate	 * Since it may block for a long time, block_mounts() shouldn't be
4647c478bdstevel@tonic-gate	 * called with zonehash_lock held.
4657c478bdstevel@tonic-gate	 */
4667c478bdstevel@tonic-gate	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
4675fd5c68Jerry Jelinek	mutex_enter(&zp->zone_mount_lock);
4685fd5c68Jerry Jelinek	while (zp->zone_mounts_in_progress > 0) {
4695fd5c68Jerry Jelinek		if (cv_wait_sig(&zp->zone_mount_cv, &zp->zone_mount_lock) == 0)
4707c478bdstevel@tonic-gate			goto signaled;
4717c478bdstevel@tonic-gate	}
4727c478bdstevel@tonic-gate	/*
4737c478bdstevel@tonic-gate	 * A negative value of mounts_in_progress indicates that mounts
4745fd5c68Jerry Jelinek	 * have been blocked by (-mounts_in_progress) different callers
4755fd5c68Jerry Jelinek	 * (remotely possible if two threads enter zone_shutdown at the same
4765fd5c68Jerry Jelinek	 * time).
4777c478bdstevel@tonic-gate	 */
4785fd5c68Jerry Jelinek	zp->zone_mounts_in_progress--;
4797c478bdstevel@tonic-gate	retval = 1;
4815fd5c68Jerry Jelinek	mutex_exit(&zp->zone_mount_lock);
4827c478bdstevel@tonic-gate	return (retval);
4867c478bdstevel@tonic-gate * The VFS layer may progress with new mounts as far as we're concerned.
4877c478bdstevel@tonic-gate * Allow them to progress if we were the last obstacle.
4887c478bdstevel@tonic-gate */
4897c478bdstevel@tonic-gatestatic void
4905fd5c68Jerry Jelinekresume_mounts(zone_t *zp)
4925fd5c68Jerry Jelinek	mutex_enter(&zp->zone_mount_lock);
4935fd5c68Jerry Jelinek	if (++zp->zone_mounts_in_progress == 0)
4945fd5c68Jerry Jelinek		cv_broadcast(&zp->zone_mount_cv);
4955fd5c68Jerry Jelinek	mutex_exit(&zp->zone_mount_lock);
4995fd5c68Jerry Jelinek * The VFS layer is busy with a mount; this zone should wait until all
5005fd5c68Jerry Jelinek * of its mounts are completed to progress.
5017c478bdstevel@tonic-gate */
5035fd5c68Jerry Jelinekmount_in_progress(zone_t *zp)
5055fd5c68Jerry Jelinek	mutex_enter(&zp->zone_mount_lock);
5065fd5c68Jerry Jelinek	while (zp->zone_mounts_in_progress < 0)
5075fd5c68Jerry Jelinek		cv_wait(&zp->zone_mount_cv, &zp->zone_mount_lock);
5085fd5c68Jerry Jelinek	zp->zone_mounts_in_progress++;
5095fd5c68Jerry Jelinek	mutex_exit(&zp->zone_mount_lock);
5137c478bdstevel@tonic-gate * VFS is done with one mount; wake up any waiting block_mounts()
5147c478bdstevel@tonic-gate * callers if this is the last mount.
5157c478bdstevel@tonic-gate */
5175fd5c68Jerry Jelinekmount_completed(zone_t *zp)
5195fd5c68Jerry Jelinek	mutex_enter(&zp->zone_mount_lock);
5205fd5c68Jerry Jelinek	if (--zp->zone_mounts_in_progress == 0)
5215fd5c68Jerry Jelinek		cv_broadcast(&zp->zone_mount_cv);
5225fd5c68Jerry Jelinek	mutex_exit(&zp->zone_mount_lock);
5267c478bdstevel@tonic-gate * ZSD routines.
5277c478bdstevel@tonic-gate *
5287c478bdstevel@tonic-gate * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
5297c478bdstevel@tonic-gate * defined by the pthread_key_create() and related interfaces.
5307c478bdstevel@tonic-gate *
5317c478bdstevel@tonic-gate * Kernel subsystems may register one or more data items and/or
5327c478bdstevel@tonic-gate * callbacks to be executed when a zone is created, shutdown, or
5337c478bdstevel@tonic-gate * destroyed.
5347c478bdstevel@tonic-gate *
5357c478bdstevel@tonic-gate * Unlike the thread counterpart, destructor callbacks will be executed
5367c478bdstevel@tonic-gate * even if the data pointer is NULL and/or there are no constructor
5377c478bdstevel@tonic-gate * callbacks, so it is the responsibility of such callbacks to check for
5387c478bdstevel@tonic-gate * NULL data values if necessary.
5397c478bdstevel@tonic-gate *
5407c478bdstevel@tonic-gate * The locking strategy and overall picture is as follows:
5417c478bdstevel@tonic-gate *
5427c478bdstevel@tonic-gate * When someone calls zone_key_create(), a template ZSD entry is added to the
543bd41d0anordmark * global list "zsd_registered_keys", protected by zsd_key_lock.  While
544bd41d0anordmark * holding that lock all the existing zones are marked as
545bd41d0anordmark * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
546bd41d0anordmark * zone_zsd list (protected by zone_lock). The global list is updated first
547bd41d0anordmark * (under zsd_key_lock) to make sure that newly created zones use the
548bd41d0anordmark * most recent list of keys. Then under zonehash_lock we walk the zones
549bd41d0anordmark * and mark them.  Similar locking is used in zone_key_delete().
5507c478bdstevel@tonic-gate *
551bd41d0anordmark * The actual create, shutdown, and destroy callbacks are done without
552bd41d0anordmark * holding any lock. And zsd_flags are used to ensure that the operations
553bd41d0anordmark * completed so that when zone_key_create (and zone_create) is done, as well as
554bd41d0anordmark * zone_key_delete (and zone_destroy) is done, all the necessary callbacks
555bd41d0anordmark * are completed.
5567c478bdstevel@tonic-gate *
5577c478bdstevel@tonic-gate * When new zones are created constructor callbacks for all registered ZSD
558bd41d0anordmark * entries will be called. That also uses the above two phases of marking
559bd41d0anordmark * what needs to be done, and then running the callbacks without holding
560bd41d0anordmark * any locks.
5617c478bdstevel@tonic-gate *
5627c478bdstevel@tonic-gate * The framework does not provide any locking around zone_getspecific() and
5637c478bdstevel@tonic-gate * zone_setspecific() apart from that needed for internal consistency, so
5647c478bdstevel@tonic-gate * callers interested in atomic "test-and-set" semantics will need to provide
5657c478bdstevel@tonic-gate * their own locking.
5667c478bdstevel@tonic-gate */
569bd41d0anordmark * Helper function to find the zsd_entry associated with the key in the
570bd41d0anordmark * given list.
571bd41d0anordmark */
572bd41d0anordmarkstatic struct zsd_entry *
573bd41d0anordmarkzsd_find(list_t *l, zone_key_t key)
575bd41d0anordmark	struct zsd_entry *zsd;
577bd41d0anordmark	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
578bd41d0anordmark		if (zsd->zsd_key == key) {
579bd41d0anordmark			return (zsd);
5807c478bdstevel@tonic-gate		}
5817c478bdstevel@tonic-gate	}
582bd41d0anordmark	return (NULL);
5867c478bdstevel@tonic-gate * Helper function to find the zsd_entry associated with the key in the
587bd41d0anordmark * given list. Move it to the front of the list.
5887c478bdstevel@tonic-gate */
5897c478bdstevel@tonic-gatestatic struct zsd_entry *
590bd41d0anordmarkzsd_find_mru(list_t *l, zone_key_t key)
5927c478bdstevel@tonic-gate	struct zsd_entry *zsd;
5947c478bdstevel@tonic-gate	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
5957c478bdstevel@tonic-gate		if (zsd->zsd_key == key) {
5967c478bdstevel@tonic-gate			/*
5977c478bdstevel@tonic-gate			 * Move to head of list to keep list in MRU order.
5987c478bdstevel@tonic-gate			 */
5997c478bdstevel@tonic-gate			if (zsd != list_head(l)) {
6007c478bdstevel@tonic-gate				list_remove(l, zsd);
6017c478bdstevel@tonic-gate				list_insert_head(l, zsd);
6027c478bdstevel@tonic-gate			}
6037c478bdstevel@tonic-gate			return (zsd);
6047c478bdstevel@tonic-gate		}
6057c478bdstevel@tonic-gate	}
6067c478bdstevel@tonic-gate	return (NULL);
610bd41d0anordmarkzone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
611bd41d0anordmark    void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
613bd41d0anordmark	struct zsd_entry *zsdp;
614bd41d0anordmark	struct zsd_entry *t;
615bd41d0anordmark	struct zone *zone;
616bd41d0anordmark	zone_key_t  key;
618bd41d0anordmark	zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
619bd41d0anordmark	zsdp->zsd_data = NULL;
620bd41d0anordmark	zsdp->zsd_create = create;
621bd41d0anordmark	zsdp->zsd_shutdown = shutdown;
622bd41d0anordmark	zsdp->zsd_destroy = destroy;
624bd41d0anordmark	/*
625bd41d0anordmark	 * Insert in global list of callbacks. Makes future zone creations
626bd41d0anordmark	 * see it.
627bd41d0anordmark	 */
628bd41d0anordmark	mutex_enter(&zsd_key_lock);
629fe16170Pramod Batni	key = zsdp->zsd_key = ++zsd_keyval;
630bd41d0anordmark	ASSERT(zsd_keyval != 0);
631bd41d0anordmark	list_insert_tail(&zsd_registered_keys, zsdp);
632bd41d0anordmark	mutex_exit(&zsd_key_lock);
634bd41d0anordmark	/*
635bd41d0anordmark	 * Insert for all existing zones and mark them as needing
636bd41d0anordmark	 * a create callback.
637bd41d0anordmark	 */
638bd41d0anordmark	mutex_enter(&zonehash_lock);	/* stop the world */
639bd41d0anordmark	for (zone = list_head(&zone_active); zone != NULL;
640bd41d0anordmark	    zone = list_next(&zone_active, zone)) {
641bd41d0anordmark		zone_status_t status;
643bd41d0anordmark		mutex_enter(&zone->zone_lock);
645bd41d0anordmark		/* Skip zones that are on the way down or not yet up */
646bd41d0anordmark		status = zone_status_get(zone);
647bd41d0anordmark		if (status >= ZONE_IS_DOWN ||
648bd41d0anordmark		    status == ZONE_IS_UNINITIALIZED) {
649bd41d0anordmark			mutex_exit(&zone->zone_lock);
650bd41d0anordmark			continue;
651bd41d0anordmark		}
653bd41d0anordmark		t = zsd_find_mru(&zone->zone_zsd, key);
654bd41d0anordmark		if (t != NULL) {
655bd41d0anordmark			/*
656bd41d0anordmark			 * A zsd_configure already inserted it after
657bd41d0anordmark			 * we dropped zsd_key_lock above.
658bd41d0anordmark			 */
659bd41d0anordmark			mutex_exit(&zone->zone_lock);
660bd41d0anordmark			continue;
661bd41d0anordmark		}
662bd41d0anordmark		t = kmem_zalloc(sizeof (*t), KM_SLEEP);
663bd41d0anordmark		t->zsd_key = key;
664bd41d0anordmark		t->zsd_create = create;
665bd41d0anordmark		t->zsd_shutdown = shutdown;
666bd41d0anordmark		t->zsd_destroy = destroy;
667bd41d0anordmark		if (create != NULL) {
668bd41d0anordmark			t->zsd_flags = ZSD_CREATE_NEEDED;
669bd41d0anordmark			DTRACE_PROBE2(zsd__create__needed,
670bd41d0anordmark			    zone_t *, zone, zone_key_t, key);
671bd41d0anordmark		}
672bd41d0anordmark		list_insert_tail(&zone->zone_zsd, t);
673bd41d0anordmark		mutex_exit(&zone->zone_lock);
674bd41d0anordmark	}
675bd41d0anordmark	mutex_exit(&zonehash_lock);
677bd41d0anordmark	if (create != NULL) {
678bd41d0anordmark		/* Now call the create callback for this key */
679bd41d0anordmark		zsd_apply_all_zones(zsd_apply_create, key);
680bd41d0anordmark	}
681fe16170Pramod Batni	/*
682835ee21Robert Harris	 * It is safe for consumers to use the key now, make it
683835ee21Robert Harris	 * globally visible. Specifically zone_getspecific() will
684835ee21Robert Harris	 * always successfully return the zone specific data associated
685835ee21Robert Harris	 * with the key.
686835ee21Robert Harris	 */
687fe16170Pramod Batni	*keyp = key;
688fe16170Pramod Batni
6927c478bdstevel@tonic-gate * Function called when a module is being unloaded, or otherwise wishes
6937c478bdstevel@tonic-gate * to unregister its ZSD key and callbacks.
694bd41d0anordmark *
695bd41d0anordmark * Remove from the global list and determine the functions that need to
696bd41d0anordmark * be called under a global lock. Then call the functions without
697bd41d0anordmark * holding any locks. Finally free up the zone_zsd entries. (The apply
698bd41d0anordmark * functions need to access the zone_zsd entries to find zsd_data etc.)
6997c478bdstevel@tonic-gate */
7017c478bdstevel@tonic-gatezone_key_delete(zone_key_t key)
7037c478bdstevel@tonic-gate	struct zsd_entry *zsdp = NULL;
7047c478bdstevel@tonic-gate	zone_t *zone;
7067c478bdstevel@tonic-gate	mutex_enter(&zsd_key_lock);
707bd41d0anordmark	zsdp = zsd_find_mru(&zsd_registered_keys, key);
708bd41d0anordmark	if (zsdp == NULL) {
709bd41d0anordmark		mutex_exit(&zsd_key_lock);
710bd41d0anordmark		return (-1);
711bd41d0anordmark	}
7127c478bdstevel@tonic-gate	list_remove(&zsd_registered_keys, zsdp);
7137c478bdstevel@tonic-gate	mutex_exit(&zsd_key_lock);
715bd41d0anordmark	mutex_enter(&zonehash_lock);
7167c478bdstevel@tonic-gate	for (zone = list_head(&zone_active); zone != NULL;
7177c478bdstevel@tonic-gate	    zone = list_next(&zone_active, zone)) {
7187c478bdstevel@tonic-gate		struct zsd_entry *del;
720bd41d0anordmark		mutex_enter(&zone->zone_lock);
721bd41d0anordmark		del = zsd_find_mru(&zone->zone_zsd, key);
722bd41d0anordmark		if (del == NULL) {
723bd41d0anordmark			/*
724bd41d0anordmark			 * Somebody else got here first e.g the zone going
725bd41d0anordmark			 * away.
726bd41d0anordmark			 */
727bd41d0anordmark			mutex_exit(&zone->zone_lock);
728bd41d0anordmark			continue;
729bd41d0anordmark		}
730bd41d0anordmark		ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
731bd41d0anordmark		ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
732bd41d0anordmark		if (del->zsd_shutdown != NULL &&
733bd41d0anordmark		    (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
734bd41d0anordmark			del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
735bd41d0anordmark			DTRACE_PROBE2(zsd__shutdown__needed,
736bd41d0anordmark			    zone_t *, zone, zone_key_t, key);
737bd41d0anordmark		}
738bd41d0anordmark		if (del->zsd_destroy != NULL &&
739bd41d0anordmark		    (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
740bd41d0anordmark			del->zsd_flags |= ZSD_DESTROY_NEEDED;
741bd41d0anordmark			DTRACE_PROBE2(zsd__destroy__needed,
742bd41d0anordmark			    zone_t *, zone, zone_key_t, key);
7437c478bdstevel@tonic-gate		}
7447c478bdstevel@tonic-gate		mutex_exit(&zone->zone_lock);
7457c478bdstevel@tonic-gate	}
7467c478bdstevel@tonic-gate	mutex_exit(&zonehash_lock);
7477c478bdstevel@tonic-gate	kmem_free(zsdp, sizeof (*zsdp));
749bd41d0anordmark	/* Now call the shutdown and destroy callback for this key */
750bd41d0anordmark	zsd_apply_all_zones(zsd_apply_shutdown, key);
751bd41d0anordmark	zsd_apply_all_zones(zsd_apply_destroy, key);
753bd41d0anordmark	/* Now we can free up the zsdp structures in each zone */
754bd41d0anordmark	mutex_enter(&zonehash_lock);
7557c478bdstevel@tonic-gate	for (zone = list_head(&zone_active); zone != NULL;
756bd41d0anordmark	    zone = list_next(&zone_active, zone)) {
757bd41d0anordmark		struct zsd_entry *del;
759bd41d0anordmark		mutex_enter(&zone->zone_lock);
760bd41d0anordmark		del = zsd_find(&zone->zone_zsd, key);
761bd41d0anordmark		if (del != NULL) {
762bd41d0anordmark			list_remove(&zone->zone_zsd, del);
763bd41d0anordmark			ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
764bd41d0anordmark			kmem_free(del, sizeof (*del));
765bd41d0anordmark		}
7667c478bdstevel@tonic-gate		mutex_exit(&zone->zone_lock);
767bd41d0anordmark	}
7687c478bdstevel@tonic-gate	mutex_exit(&zonehash_lock);
770bd41d0anordmark	return (0);
7747c478bdstevel@tonic-gate * ZSD counterpart of pthread_setspecific().
775bd41d0anordmark *
776bd41d0anordmark * Since all zsd callbacks, including those with no create function,
777bd41d0anordmark * have an entry in zone_zsd, if the key is registered it is part of
778bd41d0anordmark * the zone_zsd list.
779bd41d0anordmark * Return an error if the key wasn't registerd.
7807c478bdstevel@tonic-gate */
7827c478bdstevel@tonic-gatezone_setspecific(zone_key_t key, zone_t *zone, const void *data)
7847c478bdstevel@tonic-gate	struct zsd_entry *t;
7867c478bdstevel@tonic-gate	mutex_enter(&zone->zone_lock);
787bd41d0anordmark	t = zsd_find_mru(&zone->zone_zsd, key);
7887c478bdstevel@tonic-gate	if (t != NULL) {
7897c478bdstevel@tonic-gate		/*
7907c478bdstevel@tonic-gate		 * Replace old value with new
7917c478bdstevel@tonic-gate		 */
7927c478bdstevel@tonic-gate		t->zsd_data = (void *)data;
7937c478bdstevel@tonic-gate		mutex_exit(&zone->zone_lock);
7947c478bdstevel@tonic-gate		return (0);
7957c478bdstevel@tonic-gate	}
7967c478bdstevel@tonic-gate	mutex_exit(&zone->zone_lock);
797bd41d0anordmark	return (-1);
8017c478bdstevel@tonic-gate * ZSD counterpart of pthread_getspecific().
8027c478bdstevel@tonic-gate */
8037c478bdstevel@tonic-gatevoid *
8047c478bdstevel@tonic-gatezone_getspecific(zone_key_t key, zone_t *zone)
8067c478bdstevel@tonic-gate	struct zsd_entry *t;
8077c478bdstevel@tonic-gate	void *data;
8097c478bdstevel@tonic-gate	mutex_enter(&zone->zone_lock);
810bd41d0anordmark	t = zsd_find_mru(&zone->zone_zsd, key);
8117c478bdstevel@tonic-gate	data = (t == NULL ? NULL : t->zsd_data);
8127c478bdstevel@tonic-gate	mutex_exit(&zone->zone_lock);
8137c478bdstevel@tonic-gate	return (data);
8177c478bdstevel@tonic-gate * Function used to initialize a zone's list of ZSD callbacks and data
8187c478bdstevel@tonic-gate * when the zone is being created.  The callbacks are initialized from
819bd41d0anordmark * the template list (zsd_registered_keys). The constructor callback is
820bd41d0anordmark * executed later (once the zone exists and with locks dropped).
8217c478bdstevel@tonic-gate */
8227c478bdstevel@tonic-gatestatic void
8237c478bdstevel@tonic-gatezone_zsd_configure(zone_t *zone)
8257c478bdstevel@tonic-gate	struct zsd_entry *zsdp;
8267c478bdstevel@tonic-gate	struct zsd_entry *t;
8287c478bdstevel@tonic-gate	ASSERT(MUTEX_HELD(&zonehash_lock));
8297c478bdstevel@tonic-gate	ASSERT(list_head(&zone->zone_zsd) == NULL);
830bd41d0anordmark	mutex_enter(&zone->zone_lock);
8317c478bdstevel@tonic-gate	mutex_enter(&zsd_key_lock);
8327c478bdstevel@tonic-gate	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
8337c478bdstevel@tonic-gate	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
834bd41d0anordmark		/*
835bd41d0anordmark		 * Since this zone is ZONE_IS_UNCONFIGURED, zone_key_create
836bd41d0anordmark		 * should not have added anything to it.
837bd41d0anordmark		 */
838bd41d0anordmark		ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);
840bd41d0anordmark		t = kmem_zalloc(sizeof (*t), KM_SLEEP);
841bd41d0anordmark		t->zsd_key = zsdp->zsd_key;
842bd41d0anordmark		t->zsd_create = zsdp->zsd_create;
843bd41d0anordmark		t->zsd_shutdown = zsdp->zsd_shutdown;
844bd41d0anordmark		t->zsd_destroy = zsdp->zsd_destroy;
8457c478bdstevel@tonic-gate		if (zsdp->zsd_create != NULL) {
846bd41d0anordmark			t->zsd_flags = ZSD_CREATE_NEEDED;
847bd41d0anordmark			DTRACE_PROBE2(zsd__create__needed,
848bd41d0anordmark			    zone_t *, zone, zone_key_t, zsdp->zsd_key);
8497c478bdstevel@tonic-gate		}
850bd41d0anordmark		list_insert_tail(&zone->zone_zsd, t);
8517c478bdstevel@tonic-gate	}
8527c478bdstevel@tonic-gate	mutex_exit(&zsd_key_lock);
853bd41d0anordmark	mutex_exit(&zone->zone_lock);
/* Selects which class of ZSD callback a helper should act on. */
enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };
8597c478bdstevel@tonic-gate * Helper function to execute shutdown or destructor callbacks.
8607c478bdstevel@tonic-gate */
8617c478bdstevel@tonic-gatestatic void
8627c478bdstevel@tonic-gatezone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
8647c478bdstevel@tonic-gate	struct zsd_entry *t;
8667c478bdstevel@tonic-gate	ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
8677c478bdstevel@tonic-gate	ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
8687c478bdstevel@tonic-gate	ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);
8707c478bdstevel@tonic-gate	/*
871bd41d0anordmark	 * Run the callback solely based on what is registered for the zone
872bd41d0anordmark	 * in zone_zsd. The global list can change independently of this
873bd41d0anordmark	 * as keys are registered and unregistered and we don't register new
874bd41d0anordmark	 * callbacks for a zone that is in the process of going away.
8757c478bdstevel@tonic-gate	 */
876bd41d0anordmark	mutex_enter(&zone->zone_lock);
877bd41d0anordmark	for (t = list_head(&zone->zone_zsd); t != NULL;
878bd41d0anordmark	    t = list_next(&zone->zone_zsd, t)) {
879bd41d0anordmark		zone_key_t key = t->zsd_key;
8817c478bdstevel@tonic-gate		/* Skip if no callbacks registered */
883bd41d0anordmark		if (ct == ZSD_SHUTDOWN) {
884bd41d0anordmark			if (t->zsd_shutdown != NULL &&
885bd41d0anordmark			    (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
886bd41d0anordmark				t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
887bd41d0anordmark				DTRACE_PROBE2(zsd__shutdown__needed,
888bd41d0anordmark				    zone_t *, zone, zone_key_t, key);
8897c478bdstevel@tonic-gate			}
8907c478bdstevel@tonic-gate		} else {
891bd41d0anordmark			if (t->zsd_destroy != NULL &&
892bd41d0anordmark			    (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
893bd41d0anordmark				t->zsd_flags |= ZSD_DESTROY_NEEDED;
894bd41d0anordmark				DTRACE_PROBE2(zsd__destroy__needed,
895bd41d0anordmark				    zone_t *, zone, zone_key_t, key);
8967c478bdstevel@tonic-gate			}
8977c478bdstevel@tonic-gate		}
8987c478bdstevel@tonic-gate	}
899bd41d0anordmark	mutex_exit(&zone->zone_lock);
901bd41d0anordmark	/* Now call the shutdown and destroy callback for this key */
902bd41d0anordmark	zsd_apply_all_keys(zsd_apply_shutdown, zone);
903bd41d0anordmark	zsd_apply_all_keys(zsd_apply_destroy, zone);
9087c478bdstevel@tonic-gate * Called when the zone is going away; free ZSD-related memory, and
9097c478bdstevel@tonic-gate * destroy the zone_zsd list.
9107c478bdstevel@tonic-gate */
9117c478bdstevel@tonic-gatestatic void
9127c478bdstevel@tonic-gatezone_free_zsd(zone_t *zone)
9147c478bdstevel@tonic-gate	struct zsd_entry *t, *next;
9167c478bdstevel@tonic-gate	/*
9177c478bdstevel@tonic-gate	 * Free all the zsd_entry's we had on this zone.
9187c478bdstevel@tonic-gate	 */
919bd41d0anordmark	mutex_enter(&zone->zone_lock);
9207c478bdstevel@tonic-gate	for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
9217c478bdstevel@tonic-gate		next = list_next(&zone->zone_zsd, t);
9227c478bdstevel@tonic-gate		list_remove(&zone->zone_zsd, t);
923bd41d0anordmark		ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
9247c478bdstevel@tonic-gate		kmem_free(t, sizeof (*t));
9257c478bdstevel@tonic-gate	}
9267c478bdstevel@tonic-gate	list_destroy(&zone->zone_zsd);
927bd41d0anordmark	mutex_exit(&zone->zone_lock);
932bd41d0anordmark * Apply a function to all zones for particular key value.
933bd41d0anordmark *
934bd41d0anordmark * The applyfn has to drop zonehash_lock if it does some work, and
935bd41d0anordmark * then reacquire it before it returns.
936bd41d0anordmark * When the lock is dropped we don't follow list_next even
937bd41d0anordmark * if it is possible to do so without any hazards. This is
938bd41d0anordmark * because we want the design to allow for the list of zones
939bd41d0anordmark * to change in any arbitrary way during the time the
940bd41d0anordmark * lock was dropped.
941bd41d0anordmark *
942bd41d0anordmark * It is safe to restart the loop at list_head since the applyfn
943bd41d0anordmark * changes the zsd_flags as it does work, so a subsequent
944bd41d0anordmark * pass through will have no effect in applyfn, hence the loop will terminate
945bd41d0anordmark * in at worst O(N^2).
946bd41d0anordmark */
947bd41d0anordmarkstatic void
948bd41d0anordmarkzsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
950bd41d0anordmark	zone_t *zone;
952bd41d0anordmark	mutex_enter(&zonehash_lock);
953bd41d0anordmark	zone = list_head(&zone_active);
954bd41d0anordmark	while (zone != NULL) {
955bd41d0anordmark		if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
956bd41d0anordmark			/* Lock dropped - restart at head */
957bd41d0anordmark			zone = list_head(&zone_active);
958bd41d0anordmark		} else {
959bd41d0anordmark			zone = list_next(&zone_active, zone);
960bd41d0anordmark		}
961bd41d0anordmark	}
962bd41d0anordmark	mutex_exit(&zonehash_lock);
966bd41d0anordmark * Apply a function to all keys for a particular zone.
967bd41d0anordmark *
968bd41d0anordmark * The applyfn has to drop zonehash_lock if it does some work, and
969bd41d0anordmark * then reacquire it before it returns.
970bd41d0anordmark * When the lock is dropped we don't follow list_next even
971bd41d0anordmark * if it is possible to do so without any hazards. This is
972bd41d0anordmark * because we want the design to allow for the list of zsd callbacks
973bd41d0anordmark * to change in any arbitrary way during the time the
974bd41d0anordmark * lock was dropped.
975bd41d0anordmark *
976bd41d0anordmark * It is safe to restart the loop at list_head since the applyfn
977bd41d0anordmark * changes the zsd_flags as it does work, so a subsequent
978bd41d0anordmark * pass through will have no effect in applyfn, hence the loop will terminate
979bd41d0anordmark * in at worst O(N^2).
980bd41d0anordmark */
981bd41d0anordmarkstatic void
982bd41d0anordmarkzsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
984bd41d0anordmark	struct zsd_entry *t;
986bd41d0anordmark	mutex_enter(&zone->zone_lock);
987bd41d0anordmark	t = list_head(&zone->zone_zsd);
988bd41d0anordmark	while (t != NULL) {
989bd41d0anordmark		if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
990bd41d0anordmark			/* Lock dropped - restart at head */
991bd41d0anordmark			t = list_head(&zone->zone_zsd);
992bd41d0anordmark		} else {
993bd41d0anordmark			t = list_next(&zone->zone_zsd, t);
994bd41d0anordmark		}
995bd41d0anordmark	}
996bd41d0anordmark	mutex_exit(&zone->zone_lock);
1000bd41d0anordmark * Call the create function for the zone and key if CREATE_NEEDED
1001bd41d0anordmark * is set.
1002bd41d0anordmark * If some other thread gets here first and sets CREATE_INPROGRESS, then
1003bd41d0anordmark * we wait for that thread to complete so that we can ensure that
1004bd41d0anordmark * all the callbacks are done when we've looped over all zones/keys.
1005bd41d0anordmark *
1006bd41d0anordmark * When we call the create function, we drop the global held by the
1007bd41d0anordmark * caller, and return true to tell the caller it needs to re-evalute the
1008bd41d0anordmark * state.
1009bd41d0anordmark * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1010bd41d0anordmark * remains held on exit.
1011bd41d0anordmark */
1012bd41d0anordmarkstatic boolean_t
1013bd41d0anordmarkzsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
1014bd41d0anordmark    zone_t *zone, zone_key_t key)
1016bd41d0anordmark	void *result;
1017bd41d0anordmark	struct zsd_entry *t;
1018bd41d0anordmark	boolean_t dropped;
1020bd41d0anordmark	if (lockp != NULL) {
1021bd41d0anordmark		ASSERT(MUTEX_HELD(lockp));
1022bd41d0anordmark	}
1023bd41d0anordmark	if (zone_lock_held) {
1024bd41d0anordmark		ASSERT(MUTEX_HELD(&zone->zone_lock));
1025bd41d0anordmark	} else {
1026bd41d0anordmark		mutex_enter(&zone->zone_lock);
1027bd41d0anordmark	}
1029bd41d0anordmark	t = zsd_find(&zone->zone_zsd, key);
1030bd41d0anordmark	if (t == NULL) {
1031bd41d0anordmark		/*
1032bd41d0anordmark		 * Somebody else got here first e.g the zone going
1033bd41d0anordmark		 * away.
1034bd41d0anordmark		 */
1035bd41d0anordmark		if (!zone_lock_held)
1036bd41d0anordmark			mutex_exit(&zone->zone_lock);
1037bd41d0anordmark		return (B_FALSE);
1038bd41d0anordmark	}
1039bd41d0anordmark	dropped = B_FALSE;
1040bd41d0anordmark	if (zsd_wait_for_inprogress(zone, t, lockp))
1041bd41d0anordmark		dropped = B_TRUE;
1043bd41d0anordmark	if (t->zsd_flags & ZSD_CREATE_NEEDED) {
1044bd41d0anordmark		t->zsd_flags &= ~ZSD_CREATE_NEEDED;
1045bd41d0anordmark		t->zsd_flags |= ZSD_CREATE_INPROGRESS;
1046bd41d0anordmark		DTRACE_PROBE2(zsd__create__inprogress,
1047bd41d0anordmark		    zone_t *, zone, zone_key_t, key);
1048bd41d0anordmark		mutex_exit(&zone->zone_lock);
1049bd41d0anordmark		if (lockp != NULL)
1050bd41d0anordmark			mutex_exit(lockp);
1052bd41d0anordmark		dropped = B_TRUE;
1053bd41d0anordmark		ASSERT(t->zsd_create != NULL);
1054bd41d0anordmark		DTRACE_PROBE2(zsd__create__start,
1055bd41d0anordmark		    zone_t *, zone, zone_key_t, key);
1057bd41d0anordmark		result = (*t->zsd_create)(zone->zone_id);
1059bd41d0anordmark		DTRACE_PROBE2(zsd__create__end,
1060bd41d0anordmark		    zone_t *, zone, voidn *, result);
1062bd41d0anordmark		ASSERT(result != NULL);
1063bd41d0anordmark		if (lockp != NULL)
1064bd41d0anordmark			mutex_enter(lockp);
1065bd41d0anordmark		mutex_enter(&zone->zone_lock);
1066bd41d0anordmark		t->zsd_data = result;
1067bd41d0anordmark		t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
1068bd41d0anordmark		t->zsd_flags |= ZSD_CREATE_COMPLETED;
1069bd41d0anordmark		cv_broadcast(&t->zsd_cv);
1070bd41d0anordmark		DTRACE_PROBE2(zsd__create__completed,
1071bd41d0anordmark		    zone_t *, zone, zone_key_t, key);
1072bd41d0anordmark	}
1073bd41d0anordmark	if (!zone_lock_held)
1074bd41d0anordmark		mutex_exit(&zone->zone_lock);
1075bd41d0anordmark	return (dropped);
1079bd41d0anordmark * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
1080bd41d0anordmark * is set.
1081bd41d0anordmark * If some other thread gets here first and sets *_INPROGRESS, then
1082bd41d0anordmark * we wait for that thread to complete so that we can ensure that
1083bd41d0anordmark * all the callbacks are done when we've looped over all zones/keys.
1084bd41d0anordmark *
1085bd41d0anordmark * When we call the shutdown function, we drop the global held by the
1086bd41d0anordmark * caller, and return true to tell the caller it needs to re-evalute the
1087bd41d0anordmark * state.
1088bd41d0anordmark * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1089bd41d0anordmark * remains held on exit.
1090bd41d0anordmark */
1091bd41d0anordmarkstatic boolean_t
1092bd41d0anordmarkzsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
1093bd41d0anordmark    zone_t *zone, zone_key_t key)
1095bd41d0anordmark	struct zsd_entry *t;
1096bd41d0anordmark	void *data;
1097bd41d0anordmark	boolean_t dropped;
1099bd41d0anordmark	if (lockp != NULL) {
1100bd41d0anordmark		ASSERT(MUTEX_HELD(lockp));
1101bd41d0anordmark	}
1102bd41d0anordmark	if (zone_lock_held) {
1103bd41d0anordmark		ASSERT(MUTEX_HELD(&zone->zone_lock));
1104bd41d0anordmark	} else {
1105bd41d0anordmark		mutex_enter(&zone->zone_lock);
1106bd41d0anordmark	}
1108bd41d0anordmark	t = zsd_find(&zone->zone_zsd, key);
1109bd41d0anordmark	if (t == NULL) {
1110bd41d0anordmark		/*
1111bd41d0anordmark		 * Somebody else got here first e.g the zone going
1112bd41d0anordmark		 * away.
1113bd41d0anordmark		 */
1114bd41d0anordmark		if (!zone_lock_held)
1115bd41d0anordmark			mutex_exit(&zone->zone_lock);
1116bd41d0anordmark		return (B_FALSE);
1117bd41d0anordmark	}
1118bd41d0anordmark	dropped = B_FALSE;
1119bd41d0anordmark	if (zsd_wait_for_creator(zone, t, lockp))
1120bd41d0anordmark		dropped = B_TRUE;
1122bd41d0anordmark	if (zsd_wait_for_inprogress(zone, t, lockp))
1123bd41d0anordmark		dropped = B_TRUE;
1125bd41d0anordmark	if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
1126bd41d0anordmark		t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
1127bd41d0anordmark		t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
1128bd41d0anordmark		DTRACE_PROBE2(zsd__shutdown__inprogress,
1129bd41d0anordmark		    zone_t *, zone, zone_key_t, key);
1130bd41d0anordmark		mutex_exit(&zone->zone_lock);
1131bd41d0anordmark		if (lockp != NULL)
1132bd41d0anordmark			mutex_exit(lockp);
1133bd41d0anordmark		dropped = B_TRUE;
1135bd41d0anordmark		ASSERT(t->zsd_shutdown != NULL);
1136bd41d0anordmark		data = t->zsd_data;
1138bd41d0anordmark		DTRACE_PROBE2(zsd__shutdown__start,
1139bd41d0anordmark		    zone_t *, zone, zone_key_t, key);
1141bd41d0anordmark		(t->zsd_shutdown)(zone->zone_id, data);
1142bd41d0anordmark		DTRACE_PROBE2(zsd__shutdown__end,
1143bd41d0anordmark		    zone_t *, zone, zone_key_t, key);
1145bd41d0anordmark		if (lockp != NULL)
1146bd41d0anordmark			mutex_enter(lockp);
1147bd41d0anordmark		mutex_enter(&zone->zone_lock);
1148bd41d0anordmark		t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
1149bd41d0anordmark		t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
1150bd41d0anordmark		cv_broadcast(&t->zsd_cv);
1151bd41d0anordmark		DTRACE_PROBE2(zsd__shutdown__completed,
1152bd41d0anordmark		    zone_t *, zone, zone_key_t, key);
1153bd41d0anordmark	}
1154bd41d0anordmark	if (!zone_lock_held)
1155bd41d0anordmark		mutex_exit(&zone->zone_lock);
1156bd41d0anordmark	return (dropped);
1160bd41d0anordmark * Call the destroy function for the zone and key if DESTROY_NEEDED
1161bd41d0anordmark * is set.
1162bd41d0anordmark * If some other thread gets here first and sets *_INPROGRESS, then
1163bd41d0anordmark * we wait for that thread to complete so that we can ensure that
1164bd41d0anordmark * all the callbacks are done when we've looped over all zones/keys.
1165bd41d0anordmark *
1166bd41d0anordmark * When we call the destroy function, we drop the global held by the
1167bd41d0anordmark * caller, and return true to tell the caller it needs to re-evalute the
1168bd41d0anordmark * state.
1169bd41d0anordmark * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1170bd41d0anordmark * remains held on exit.
1171bd41d0anordmark */
1172bd41d0anordmarkstatic boolean_t
1173bd41d0anordmarkzsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
1174bd41d0anordmark    zone_t *zone, zone_key_t key)
1176bd41d0anordmark	struct zsd_entry *t;
1177bd41d0anordmark	void *data;
1178bd41d0anordmark	boolean_t dropped;
1180bd41d0anordmark	if (lockp != NULL) {
1181bd41d0anordmark		ASSERT(MUTEX_HELD(lockp));
1182bd41d0anordmark	}
1183bd41d0anordmark	if (zone_lock_held) {
1184bd41d0anordmark		ASSERT(MUTEX_HELD(&zone->zone_lock));
1185bd41d0anordmark	} else {
1186bd41d0anordmark		mutex_enter(&zone->zone_lock);
1187bd41d0anordmark	}
1189bd41d0anordmark	t = zsd_find(&zone->zone_zsd, key);
1190bd41d0anordmark	if (t == NULL) {
1191bd41d0anordmark		/*
1192bd41d0anordmark		 * Somebody else got here first e.g the zone going
1193bd41d0anordmark		 * away.
1194bd41d0anordmark		 */
1195bd41d0anordmark		if (!zone_lock_held)
1196bd41d0anordmark			mutex_exit(&zone->zone_lock);
1197bd41d0anordmark		return (B_FALSE);
1198bd41d0anordmark	}
1199bd41d0anordmark	dropped = B_FALSE;
1200bd41d0anordmark	if (zsd_wait_for_creator(zone, t, lockp))
1201bd41d0anordmark		dropped = B_TRUE;
1203bd41d0anordmark	if (zsd_wait_for_inprogress(zone, t, lockp))
1204bd41d0anordmark		dropped = B_TRUE;
1206bd41d0anordmark	if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
1207bd41d0anordmark		t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
1208bd41d0anordmark		t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
1209bd41d0anordmark		DTRACE_PROBE2(zsd__destroy__inprogress,
1210bd41d0anordmark		    zone_t *, zone, zone_key_t, key);
1211bd41d0anordmark		mutex_exit(&zone->zone_lock);
1212bd41d0anordmark		if (lockp != NULL)
1213bd41d0anordmark			mutex_exit(lockp);
1214bd41d0anordmark		dropped = B_TRUE;
1216bd41d0anordmark		ASSERT(t->zsd_destroy != NULL);
1217bd41d0anordmark		data = t->zsd_data;
1218bd41d0anordmark		DTRACE_PROBE2(zsd__destroy__start,
1219bd41d0anordmark		    zone_t *, zone, zone_key_t, key);
1221bd41d0anordmark		(t->zsd_destroy)(zone->zone_id, data);
1222bd41d0anordmark		DTRACE_PROBE2(zsd__destroy__end,
1223bd41d0anordmark		    zone_t *, zone, zone_key_t, key);
1225bd41d0anordmark		if (lockp != NULL)
1226bd41d0anordmark			mutex_enter(lockp);
1227bd41d0anordmark		mutex_enter(&zone->zone_lock);
1228bd41d0anordmark		t->zsd_data = NULL;
1229bd41d0anordmark		t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
1230bd41d0anordmark		t->zsd_flags |= ZSD_DESTROY_COMPLETED;
1231bd41d0anordmark		cv_broadcast(&t->zsd_cv);
1232bd41d0anordmark		DTRACE_PROBE2(zsd__destroy__completed,
1233bd41d0anordmark		    zone_t *, zone, zone_key_t, key);
1234bd41d0anordmark	}
1235bd41d0anordmark	if (!zone_lock_held)
1236bd41d0anordmark		mutex_exit(&zone->zone_lock);
1237bd41d0anordmark	return (dropped);
1241bd41d0anordmark * Wait for any CREATE_NEEDED flag to be cleared.
1242bd41d0anordmark * Returns true if lockp was temporarily dropped while waiting.
1243bd41d0anordmark */
1244bd41d0anordmarkstatic boolean_t
1245bd41d0anordmarkzsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1247bd41d0anordmark	boolean_t dropped = B_FALSE;
1249bd41d0anordmark	while (t->zsd_flags & ZSD_CREATE_NEEDED) {
1250bd41d0anordmark		DTRACE_PROBE2(zsd__wait__for__creator,
1251bd41d0anordmark		    zone_t *, zone, struct zsd_entry *, t);
1252bd41d0anordmark		if (lockp != NULL) {
1253bd41d0anordmark			dropped = B_TRUE;
1254bd41d0anordmark			mutex_exit(lockp);
1255bd41d0anordmark		}
1256bd41d0anordmark		cv_wait(&t->zsd_cv, &zone->zone_lock);
1257bd41d0anordmark		if (lockp != NULL) {
1258bd41d0anordmark			/* First drop zone_lock to preserve order */
1259bd41d0anordmark			mutex_exit(&zone->zone_lock);
1260bd41d0anordmark			mutex_enter(lockp);
1261bd41d0anordmark			mutex_enter(&zone->zone_lock);
1262bd41d0anordmark		}
1263bd41d0anordmark	}
1264bd41d0anordmark	return (dropped);
1268bd41d0anordmark * Wait for any INPROGRESS flag to be cleared.
1269bd41d0anordmark * Returns true if lockp was temporarily dropped while waiting.
1270bd41d0anordmark */
1271bd41d0anordmarkstatic boolean_t
1272bd41d0anordmarkzsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1274bd41d0anordmark	boolean_t dropped = B_FALSE;
1276bd41d0anordmark	while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
1277bd41d0anordmark		DTRACE_PROBE2(zsd__wait__for__inprogress,
1278bd41d0anordmark		    zone_t *, zone, struct zsd_entry *, t);
1279bd41d0anordmark		if (lockp != NULL) {
1280bd41d0anordmark			dropped = B_TRUE;
1281bd41d0anordmark			mutex_exit(lockp);
1282bd41d0anordmark		}
1283bd41d0anordmark		cv_wait(&t->zsd_cv, &zone->zone_lock);
1284bd41d0anordmark		if (lockp != NULL) {
1285bd41d0anordmark			/* First drop zone_lock to preserve order */
1286bd41d0anordmark			mutex_exit(&zone->zone_lock);
1287bd41d0anordmark			mutex_enter(lockp);
1288bd41d0anordmark			mutex_enter(&zone->zone_lock);
1289bd41d0anordmark		}
1290bd41d0anordmark	}
1291bd41d0anordmark	return (dropped);
1295fa9e406ahrens * Frees memory associated with the zone dataset list.
1296fa9e406ahrens */
1297fa9e406ahrensstatic void
1298fa9e406ahrenszone_free_datasets(zone_t *zone)
1300fa9e406ahrens	zone_dataset_t *t, *next;
1302fa9e406ahrens	for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
1303fa9e406ahrens		next = list_next(&zone->zone_datasets, t);
1304fa9e406ahrens		list_remove(&zone->zone_datasets, t);
1305fa9e406ahrens		kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
1306fa9e406ahrens		kmem_free(t, sizeof (*t));
1307fa9e406ahrens	}
1308fa9e406ahrens	list_destroy(&zone->zone_datasets);
/*
 * zone.cpu-shares resource control support.
 */
13157c478bdstevel@tonic-gatestatic rctl_qty_t
13167c478bdstevel@tonic-gatezone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
13187c478bdstevel@tonic-gate	ASSERT(MUTEX_HELD(&p->p_lock));
13197c478bdstevel@tonic-gate	return (p->p_zone->zone_shares);
13237c478bdstevel@tonic-gatestatic int
13247c478bdstevel@tonic-gatezone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
13257c478bdstevel@tonic-gate    rctl_qty_t nv)
13277c478bdstevel@tonic-gate	ASSERT(MUTEX_HELD(&p->p_lock));
13287c478bdstevel@tonic-gate	ASSERT(e->rcep_t == RCENTITY_ZONE);
13297c478bdstevel@tonic-gate	if (e->rcep_p.zone == NULL)
13307c478bdstevel@tonic-gate		return (0);
13327c478bdstevel@tonic-gate	e->rcep_p.zone->zone_shares = nv;
13337c478bdstevel@tonic-gate	return (0);
13367c478bdstevel@tonic-gatestatic rctl_ops_t zone_cpu_shares_ops = {
13377c478bdstevel@tonic-gate	rcop_no_action,
13387c478bdstevel@tonic-gate	zone_cpu_shares_usage,
13397c478bdstevel@tonic-gate	zone_cpu_shares_set,
13407c478bdstevel@tonic-gate	rcop_no_test
/*
 * zone.cpu-cap resource control support.
 */
1347c97ad5cakolbstatic rctl_qty_t
1348c97ad5cakolbzone_cpu_cap_get(rctl_t *rctl, struct proc *p)
1350c97ad5cakolb	ASSERT(MUTEX_HELD(&p->p_lock));
1351c97ad5cakolb	return (cpucaps_zone_get(p->p_zone));
1355c97ad5cakolbstatic int
1356c97ad5cakolbzone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1357c97ad5cakolb    rctl_qty_t nv)
1359c97ad5cakolb	zone_t *zone = e->rcep_p.zone;
1361c97ad5cakolb	ASSERT(MUTEX_HELD(&p->p_lock));
1362c97ad5cakolb	ASSERT(e->rcep_t == RCENTITY_ZONE);
1364c97ad5cakolb	if (zone == NULL)
1365c97ad5cakolb		return (0);
1367c97ad5cakolb	/*
1368c97ad5cakolb	 * set cap to the new value.
1369c97ad5cakolb	 */
1370c97ad5cakolb	return (cpucaps_zone_set(zone, nv));
1373c97ad5cakolbstatic rctl_ops_t zone_cpu_cap_ops = {
1374c97ad5cakolb	rcop_no_action,
1375c97ad5cakolb	zone_cpu_cap_get,
1376c97ad5cakolb	zone_cpu_cap_set,
1377c97ad5cakolb	rcop_no_test
13817c478bdstevel@tonic-gatestatic rctl_qty_t
13827c478bdstevel@tonic-gatezone_lwps_usage(rctl_t *r, proc_t *p)
13847c478bdstevel@tonic-gate	rctl_qty_t nlwps;
13857c478bdstevel@tonic-gate	zone_t *zone = p->p_zone;
13877c478bdstevel@tonic-gate	ASSERT(MUTEX_HELD(&p->p_lock));
13897c478bdstevel@tonic-gate	mutex_enter(&zone->zone_nlwps_lock);
13907c478bdstevel@tonic-gate	nlwps = zone->zone_nlwps;
13917c478bdstevel@tonic-gate	mutex_exit(&zone->zone_nlwps_lock);
13937c478bdstevel@tonic-gate	return (nlwps);
13977c478bdstevel@tonic-gatestatic int
13987c478bdstevel@tonic-gatezone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
13997c478bdstevel@tonic-gate    rctl_qty_t incr, uint_t flags)
14017c478bdstevel@tonic-gate	rctl_qty_t nlwps;
14037c478bdstevel@tonic-gate	ASSERT(MUTEX_HELD(&p->p_lock));
14047c478bdstevel@tonic-gate	ASSERT(e->rcep_t == RCENTITY_ZONE);
14057c478bdstevel@tonic-gate	if (e->rcep_p.zone == NULL)
14067c478bdstevel@tonic-gate		return (0);
14077c478bdstevel@tonic-gate	ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
14087c478bdstevel@tonic-gate	nlwps = e->rcep_p.zone->zone_nlwps;
14107c478bdstevel@tonic-gate	if (nlwps + incr > rcntl->rcv_value)
14117c478bdstevel@tonic-gate		return (1);
14137c478bdstevel@tonic-gate	return (0);
14177c478bdstevel@tonic-gatestatic int
1418c693965slzone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
14207c478bdstevel@tonic-gate	ASSERT(MUTEX_HELD(&p->p_lock));
14217c478bdstevel@tonic-gate	ASSERT(e->rcep_t == RCENTITY_ZONE);
14227c478bdstevel@tonic-gate	if (e->rcep_p.zone == NULL)
14237c478bdstevel@tonic-gate		return (0);
14247c478bdstevel@tonic-gate	e->rcep_p.zone->zone_nlwps_ctl = nv;
14257c478bdstevel@tonic-gate	return (0);
14287c478bdstevel@tonic-gatestatic rctl_ops_t zone_lwps_ops = {
14297c478bdstevel@tonic-gate	rcop_no_action,
14307c478bdstevel@tonic-gate	zone_lwps_usage,
14317c478bdstevel@tonic-gate	zone_lwps_set,
14327c478bdstevel@tonic-gate	zone_lwps_test,
1436ff19e02Menno Lagemanstatic rctl_qty_t
1437ff19e02Menno Lagemanzone_procs_usage(rctl_t *r, proc_t *p)
1438ff19e02Menno Lageman{
1439ff19e02Menno Lageman	rctl_qty_t nprocs;
1440ff19e02Menno Lageman	zone_t *zone = p->p_zone;
1441ff19e02Menno Lageman
1442ff19e02Menno Lageman	ASSERT(MUTEX_HELD(&p->p_lock));
1443ff19e02Menno Lageman
1444ff19e02Menno Lageman	mutex_enter(&zone->zone_nlwps_lock);
1445ff19e02Menno Lageman	nprocs = zone->zone_nprocs;
1446ff19e02Menno Lageman	mutex_exit(&zone->zone_nlwps_lock);
1447ff19e02Menno Lageman
1448ff19e02Menno Lageman	return (nprocs);
1449ff19e02Menno Lageman}
1450ff19e02Menno Lageman
1451ff19e02Menno Lageman/*ARGSUSED*/
1452ff19e02Menno Lagemanstatic int
1453ff19e02Menno Lagemanzone_procs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1454ff19e02Menno Lageman    rctl_qty_t incr, uint_t flags)
1455ff19e02Menno Lageman{
1456ff19e02Menno Lageman	rctl_qty_t nprocs;
1457ff19e02Menno Lageman
1458ff19e02Menno Lageman	ASSERT(MUTEX_HELD(&p->p_lock));
1459ff19e02Menno Lageman	ASSERT(e->rcep_t == RCENTITY_ZONE);
1460ff19e02Menno Lageman	if (e->rcep_p.zone == NULL)
1461ff19e02Menno Lageman		return (0);
1462ff19e02Menno Lageman	ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1463ff19e02Menno Lageman	nprocs = e->rcep_p.zone->zone_nprocs;
1464ff19e02Menno Lageman
1465ff19e02Menno Lageman	if (nprocs + incr > rcntl->rcv_value)
1466ff19e02Menno Lageman		return (1);
1467ff19e02Menno Lageman
1468ff19e02Menno Lageman	return (0);
1469ff19e02Menno Lageman}
1470ff19e02Menno Lageman
1471ff19e02Menno Lageman/*ARGSUSED*/
1472ff19e02Menno Lagemanstatic int
1473ff19e02Menno Lagemanzone_procs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1474ff19e02Menno Lageman{
1475ff19e02Menno Lageman	ASSERT(MUTEX_HELD(&p->p_lock));
1476ff19e02Menno Lageman	ASSERT(e->rcep_t == RCENTITY_ZONE);
1477ff19e02Menno Lageman	if (e->rcep_p.zone == NULL)
1478ff19e02Menno Lageman		return (0);
1479ff19e02Menno Lageman	e->rcep_p.zone->zone_nprocs_ctl = nv;
1480ff19e02Menno Lageman	return (0);
1481ff19e02Menno Lageman}
1482ff19e02Menno Lageman
1483ff19e02Menno Lagemanstatic rctl_ops_t zone_procs_ops = {
1484ff19e02Menno Lageman	rcop_no_action,
1485ff19e02Menno Lageman	zone_procs_usage,
1486ff19e02Menno Lageman	zone_procs_set,
1487ff19e02Menno Lageman	zone_procs_test,
1488ff19e02Menno Lageman};
1489ff19e02Menno Lageman
1490ff19e02Menno Lageman/*ARGSUSED*/
149108c359eJerry Jelinekstatic rctl_qty_t
149208c359eJerry Jelinekzone_shmmax_usage(rctl_t *rctl, struct proc *p)
149308c359eJerry Jelinek{
149408c359eJerry Jelinek	ASSERT(MUTEX_HELD(&p->p_lock));
149508c359eJerry Jelinek	return (p->p_zone->zone_shmmax);
149608c359eJerry Jelinek}
149708c359eJerry Jelinek
149808c359eJerry Jelinek/*ARGSUSED*/
1499824c205mlstatic int
1500824c205mlzone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1501824c205ml    rctl_qty_t incr, uint_t flags)
1503824c205ml	rctl_qty_t v;
1504824c205ml	ASSERT(MUTEX_HELD(&p->p_lock));
1505824c205ml	ASSERT(e->rcep_t == RCENTITY_ZONE);
1506824c205ml	v = e->rcep_p.zone->zone_shmmax + incr;
1507824c205ml	if (v > rval->rcv_value)
1508824c205ml		return (1);
1509824c205ml	return (0);
1512824c205mlstatic rctl_ops_t zone_shmmax_ops = {
1513824c205ml	rcop_no_action,
151408c359eJerry Jelinek	zone_shmmax_usage,
1515824c205ml	rcop_no_set,
1516824c205ml	zone_shmmax_test
152008c359eJerry Jelinekstatic rctl_qty_t
152108c359eJerry Jelinekzone_shmmni_usage(rctl_t *rctl, struct proc *p)
152208c359eJerry Jelinek{
152308c359eJerry Jelinek	ASSERT(MUTEX_HELD(&p->p_lock));
152408c359eJerry Jelinek	return (p->p_zone->zone_ipc.ipcq_shmmni);
152508c359eJerry Jelinek}
152608c359eJerry Jelinek
152708c359eJerry Jelinek/*ARGSUSED*/
1528824c205mlstatic int
1529824c205mlzone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1530824c205ml    rctl_qty_t incr, uint_t flags)
1532824c205ml	rctl_qty_t v;
1533824c205ml	ASSERT(MUTEX_HELD(&p->p_lock));
1534824c205ml	ASSERT(e->rcep_t == RCENTITY_ZONE);
1535824c205ml	v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
1536824c205ml	if (v > rval->rcv_value)
1537824c205ml		return (1);
1538824c205ml	return (0);
1541824c205mlstatic rctl_ops_t zone_shmmni_ops = {
1542824c205ml	rcop_no_action,
154308c359eJerry Jelinek	zone_shmmni_usage,
1544824c205ml	rcop_no_set,
1545824c205ml	zone_shmmni_test
154908c359eJerry Jelinekstatic rctl_qty_t
155008c359eJerry Jelinekzone_semmni_usage(rctl_t *rctl, struct proc *p)
155108c359eJerry Jelinek{
155208c359eJerry Jelinek	ASSERT(MUTEX_HELD(&p->p_lock));
155308c359eJerry Jelinek	return (p->p_zone->zone_ipc.ipcq_semmni);
155408c359eJerry Jelinek}
155508c359eJerry Jelinek
155608c359eJerry Jelinek/*ARGSUSED*/
1557824c205mlstatic int
1558824c205mlzone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1559824c205ml    rctl_qty_t incr, uint_t flags)
1561824c205ml	rctl_qty_t v;
1562824c205ml	ASSERT(MUTEX_HELD(&p->p_lock));