xref: /illumos-gate/usr/src/uts/common/os/zone.c (revision 55fcd84f)
17c478bd9Sstevel@tonic-gate /*
27c478bd9Sstevel@tonic-gate  * CDDL HEADER START
37c478bd9Sstevel@tonic-gate  *
47c478bd9Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
545916cd2Sjpk  * Common Development and Distribution License (the "License").
645916cd2Sjpk  * You may not use this file except in compliance with the License.
77c478bd9Sstevel@tonic-gate  *
87c478bd9Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bd9Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
107c478bd9Sstevel@tonic-gate  * See the License for the specific language governing permissions
117c478bd9Sstevel@tonic-gate  * and limitations under the License.
127c478bd9Sstevel@tonic-gate  *
137c478bd9Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
147c478bd9Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bd9Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
167c478bd9Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
177c478bd9Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bd9Sstevel@tonic-gate  *
197c478bd9Sstevel@tonic-gate  * CDDL HEADER END
207c478bd9Sstevel@tonic-gate  */
2197eda132Sraf 
227c478bd9Sstevel@tonic-gate /*
23134a1f4eSCasper H.S. Dik  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24a48d8120SJerry Jelinek  * Copyright 2015, Joyent Inc. All rights reserved.
2548bbca81SDaniel Hoffman  * Copyright (c) 2016 by Delphix. All rights reserved.
2666d7818bSAndy Fiddaman  * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
277c478bd9Sstevel@tonic-gate  */
287c478bd9Sstevel@tonic-gate 
297c478bd9Sstevel@tonic-gate /*
307c478bd9Sstevel@tonic-gate  * Zones
317c478bd9Sstevel@tonic-gate  *
327c478bd9Sstevel@tonic-gate  *   A zone is a named collection of processes, namespace constraints,
337c478bd9Sstevel@tonic-gate  *   and other system resources which comprise a secure and manageable
347c478bd9Sstevel@tonic-gate  *   application containment facility.
357c478bd9Sstevel@tonic-gate  *
367c478bd9Sstevel@tonic-gate  *   Zones (represented by the reference counted zone_t) are tracked in
377c478bd9Sstevel@tonic-gate  *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
387c478bd9Sstevel@tonic-gate  *   (zoneid_t) are used to track zone association.  Zone IDs are
397c478bd9Sstevel@tonic-gate  *   dynamically generated when the zone is created; if a persistent
407c478bd9Sstevel@tonic-gate  *   identifier is needed (core files, accounting logs, audit trail,
417c478bd9Sstevel@tonic-gate  *   etc.), the zone name should be used.
427c478bd9Sstevel@tonic-gate  *
437c478bd9Sstevel@tonic-gate  *
447c478bd9Sstevel@tonic-gate  *   Global Zone:
457c478bd9Sstevel@tonic-gate  *
467c478bd9Sstevel@tonic-gate  *   The global zone (zoneid 0) is automatically associated with all
477c478bd9Sstevel@tonic-gate  *   system resources that have not been bound to a user-created zone.
487c478bd9Sstevel@tonic-gate  *   This means that even systems where zones are not in active use
497c478bd9Sstevel@tonic-gate  *   have a global zone, and all processes, mounts, etc. are
507c478bd9Sstevel@tonic-gate  *   associated with that zone.  The global zone is generally
517c478bd9Sstevel@tonic-gate  *   unconstrained in terms of privileges and access, though the usual
527c478bd9Sstevel@tonic-gate  *   credential and privilege based restrictions apply.
537c478bd9Sstevel@tonic-gate  *
547c478bd9Sstevel@tonic-gate  *
557c478bd9Sstevel@tonic-gate  *   Zone States:
567c478bd9Sstevel@tonic-gate  *
577c478bd9Sstevel@tonic-gate  *   The states a zone may be in, and the transitions between them, are as
587c478bd9Sstevel@tonic-gate  *   follows:
597c478bd9Sstevel@tonic-gate  *
607c478bd9Sstevel@tonic-gate  *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
617c478bd9Sstevel@tonic-gate  *   initialized zone is added to the list of active zones on the system but
627c478bd9Sstevel@tonic-gate  *   isn't accessible.
637c478bd9Sstevel@tonic-gate  *
64bd41d0a8Snordmark  *   ZONE_IS_INITIALIZED: Initialization complete except the ZSD callbacks are
65bd41d0a8Snordmark  *   not yet completed. Not possible to enter the zone, but attributes can
66bd41d0a8Snordmark  *   be retrieved.
67bd41d0a8Snordmark  *
687c478bd9Sstevel@tonic-gate  *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
697c478bd9Sstevel@tonic-gate  *   ready.  The zone is made visible after the ZSD constructor callbacks are
707c478bd9Sstevel@tonic-gate  *   executed.  A zone remains in this state until it transitions into
717c478bd9Sstevel@tonic-gate  *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
727c478bd9Sstevel@tonic-gate  *
737c478bd9Sstevel@tonic-gate  *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
747c478bd9Sstevel@tonic-gate  *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
757c478bd9Sstevel@tonic-gate  *   state.
767c478bd9Sstevel@tonic-gate  *
777c478bd9Sstevel@tonic-gate  *   ZONE_IS_RUNNING: The zone is open for business: zsched has
787c478bd9Sstevel@tonic-gate  *   successfully started init.   A zone remains in this state until
797c478bd9Sstevel@tonic-gate  *   zone_shutdown() is called.
807c478bd9Sstevel@tonic-gate  *
817c478bd9Sstevel@tonic-gate  *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
827c478bd9Sstevel@tonic-gate  *   killing all processes running in the zone. The zone remains
837c478bd9Sstevel@tonic-gate  *   in this state until there are no more user processes running in the zone.
847c478bd9Sstevel@tonic-gate  *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
857c478bd9Sstevel@tonic-gate  *   Since zone_shutdown() is restartable, it may be called successfully
867c478bd9Sstevel@tonic-gate  *   multiple times for the same zone_t.  Setting of the zone's state to
877c478bd9Sstevel@tonic-gate  *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
887c478bd9Sstevel@tonic-gate  *   the zone's status without worrying about it being a moving target.
897c478bd9Sstevel@tonic-gate  *
907c478bd9Sstevel@tonic-gate  *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
917c478bd9Sstevel@tonic-gate  *   are no more user processes in the zone.  The zone remains in this
927c478bd9Sstevel@tonic-gate  *   state until there are no more kernel threads associated with the
937c478bd9Sstevel@tonic-gate  *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
947c478bd9Sstevel@tonic-gate  *   fail.
957c478bd9Sstevel@tonic-gate  *
967c478bd9Sstevel@tonic-gate  *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
977c478bd9Sstevel@tonic-gate  *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
987c478bd9Sstevel@tonic-gate  *   join the zone or create kernel threads therein.
997c478bd9Sstevel@tonic-gate  *
1007c478bd9Sstevel@tonic-gate  *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
1017c478bd9Sstevel@tonic-gate  *   remains in this state until zsched exits.  Calls to zone_find_by_*()
1027c478bd9Sstevel@tonic-gate  *   return NULL from now on.
1037c478bd9Sstevel@tonic-gate  *
1047c478bd9Sstevel@tonic-gate  *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
1057c478bd9Sstevel@tonic-gate  *   processes or threads doing work on behalf of the zone.  The zone is
1067c478bd9Sstevel@tonic-gate  *   removed from the list of active zones.  zone_destroy() returns, and
1077c478bd9Sstevel@tonic-gate  *   the zone can be recreated.
1087c478bd9Sstevel@tonic-gate  *
1097c478bd9Sstevel@tonic-gate  *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
1107c478bd9Sstevel@tonic-gate  *   callbacks are executed, and all memory associated with the zone is
1117c478bd9Sstevel@tonic-gate  *   freed.
1127c478bd9Sstevel@tonic-gate  *
1137c478bd9Sstevel@tonic-gate  *   Threads can wait for the zone to enter a requested state by using
1147c478bd9Sstevel@tonic-gate  *   zone_status_wait() or zone_status_timedwait() with the desired
1157c478bd9Sstevel@tonic-gate  *   state passed in as an argument.  Zone state transitions are
1167c478bd9Sstevel@tonic-gate  *   uni-directional; it is not possible to move back to an earlier state.
1177c478bd9Sstevel@tonic-gate  *
1187c478bd9Sstevel@tonic-gate  *
1197c478bd9Sstevel@tonic-gate  *   Zone-Specific Data:
1207c478bd9Sstevel@tonic-gate  *
1217c478bd9Sstevel@tonic-gate  *   Subsystems needing to maintain zone-specific data can store that
1227c478bd9Sstevel@tonic-gate  *   data using the ZSD mechanism.  This provides a zone-specific data
1237c478bd9Sstevel@tonic-gate  *   store, similar to thread-specific data (see pthread_getspecific(3C)
1247c478bd9Sstevel@tonic-gate  *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
1257c478bd9Sstevel@tonic-gate  *   to register callbacks to be invoked when a zone is created, shut
1267c478bd9Sstevel@tonic-gate  *   down, or destroyed.  This can be used to initialize zone-specific
1277c478bd9Sstevel@tonic-gate  *   data for new zones and to clean up when zones go away.
1287c478bd9Sstevel@tonic-gate  *
1297c478bd9Sstevel@tonic-gate  *
1307c478bd9Sstevel@tonic-gate  *   Data Structures:
1317c478bd9Sstevel@tonic-gate  *
1327c478bd9Sstevel@tonic-gate  *   The per-zone structure (zone_t) is reference counted, and freed
1337c478bd9Sstevel@tonic-gate  *   when all references are released.  zone_hold and zone_rele can be
1347c478bd9Sstevel@tonic-gate  *   used to adjust the reference count.  In addition, reference counts
1357c478bd9Sstevel@tonic-gate  *   associated with the cred_t structure are tracked separately using
1367c478bd9Sstevel@tonic-gate  *   zone_cred_hold and zone_cred_rele.
1377c478bd9Sstevel@tonic-gate  *
1387c478bd9Sstevel@tonic-gate  *   Pointers to active zone_t's are stored in two hash tables; one
1397c478bd9Sstevel@tonic-gate  *   for searching by id, the other for searching by name.  Lookups
1407c478bd9Sstevel@tonic-gate  *   can be performed on either basis, using zone_find_by_id and
1417c478bd9Sstevel@tonic-gate  *   zone_find_by_name.  Both return zone_t pointers with the zone
1427c478bd9Sstevel@tonic-gate  *   held, so zone_rele should be called when the pointer is no longer
1437c478bd9Sstevel@tonic-gate  *   needed.  Zones can also be searched by path; zone_find_by_path
1447c478bd9Sstevel@tonic-gate  *   returns the zone with which a path name is associated (global
1457c478bd9Sstevel@tonic-gate  *   zone if the path is not within some other zone's file system
1467c478bd9Sstevel@tonic-gate  *   hierarchy).  This currently requires iterating through each zone,
1477c478bd9Sstevel@tonic-gate  *   so it is slower than an id or name search via a hash table.
1487c478bd9Sstevel@tonic-gate  *
1497c478bd9Sstevel@tonic-gate  *
1507c478bd9Sstevel@tonic-gate  *   Locking:
1517c478bd9Sstevel@tonic-gate  *
1527c478bd9Sstevel@tonic-gate  *   zonehash_lock: This is a top-level global lock used to protect the
1537c478bd9Sstevel@tonic-gate  *       zone hash tables and lists.  Zones cannot be created or destroyed
1547c478bd9Sstevel@tonic-gate  *       while this lock is held.
1557c478bd9Sstevel@tonic-gate  *   zone_status_lock: This is a global lock protecting zone state.
1567c478bd9Sstevel@tonic-gate  *       Zones cannot change state while this lock is held.  It also
1577c478bd9Sstevel@tonic-gate  *       protects the list of kernel threads associated with a zone.
1587c478bd9Sstevel@tonic-gate  *   zone_lock: This is a per-zone lock used to protect several fields of
1597c478bd9Sstevel@tonic-gate  *       the zone_t (see <sys/zone.h> for details).  In addition, holding
1607c478bd9Sstevel@tonic-gate  *       this lock means that the zone cannot go away.
1610209230bSgjelinek  *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
1620209230bSgjelinek  *	 related to the zone.max-lwps rctl.
1630209230bSgjelinek  *   zone_mem_lock: This is a per-zone lock used to protect the fields
1640209230bSgjelinek  *	 related to the zone.max-locked-memory and zone.max-swap rctls.
1650fbb751dSJohn Levon  *   zone_rctl_lock: This is a per-zone lock used to protect other rctls,
1660fbb751dSJohn Levon  *       currently just max_lofi
1677c478bd9Sstevel@tonic-gate  *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
1687c478bd9Sstevel@tonic-gate  *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
1697c478bd9Sstevel@tonic-gate  *       list (a list of zones in the ZONE_IS_DEAD state).
1707c478bd9Sstevel@tonic-gate  *
1717c478bd9Sstevel@tonic-gate  *   Ordering requirements:
1727c478bd9Sstevel@tonic-gate  *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
1732918c4a3SJohn Levon  *       zone_lock --> zsd_key_lock --> pidlock --> p_lock
1747c478bd9Sstevel@tonic-gate  *
1750209230bSgjelinek  *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
1760209230bSgjelinek  *	zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
177ff19e029SMenno Lageman  *	zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
1780209230bSgjelinek  *
1797c478bd9Sstevel@tonic-gate  *   Blocking memory allocations are permitted while holding any of the
1807c478bd9Sstevel@tonic-gate  *   zone locks.
1817c478bd9Sstevel@tonic-gate  *
1827c478bd9Sstevel@tonic-gate  *
1837c478bd9Sstevel@tonic-gate  *   System Call Interface:
1847c478bd9Sstevel@tonic-gate  *
1857c478bd9Sstevel@tonic-gate  *   The zone subsystem can be managed and queried from user level with
1867c478bd9Sstevel@tonic-gate  *   the following system calls (all subcodes of the primary "zone"
1877c478bd9Sstevel@tonic-gate  *   system call):
1887c478bd9Sstevel@tonic-gate  *   - zone_create: creates a zone with selected attributes (name,
189fa9e4066Sahrens  *     root path, privileges, resource controls, ZFS datasets)
1907c478bd9Sstevel@tonic-gate  *   - zone_enter: allows the current process to enter a zone
1917c478bd9Sstevel@tonic-gate  *   - zone_getattr: reports attributes of a zone
1923f2f09c1Sdp  *   - zone_setattr: set attributes of a zone
1933f2f09c1Sdp  *   - zone_boot: set 'init' running for the zone
1947c478bd9Sstevel@tonic-gate  *   - zone_list: lists all zones active in the system
1957c478bd9Sstevel@tonic-gate  *   - zone_lookup: looks up zone id based on name
1967c478bd9Sstevel@tonic-gate  *   - zone_shutdown: initiates shutdown process (see states above)
1977c478bd9Sstevel@tonic-gate  *   - zone_destroy: completes shutdown process (see states above)
1987c478bd9Sstevel@tonic-gate  *
1997c478bd9Sstevel@tonic-gate  */
2007c478bd9Sstevel@tonic-gate 
2017c478bd9Sstevel@tonic-gate #include <sys/priv_impl.h>
2027c478bd9Sstevel@tonic-gate #include <sys/cred.h>
2037c478bd9Sstevel@tonic-gate #include <c2/audit.h>
2047c478bd9Sstevel@tonic-gate #include <sys/debug.h>
2057c478bd9Sstevel@tonic-gate #include <sys/file.h>
2067c478bd9Sstevel@tonic-gate #include <sys/kmem.h>
2070209230bSgjelinek #include <sys/kstat.h>
2087c478bd9Sstevel@tonic-gate #include <sys/mutex.h>
20945916cd2Sjpk #include <sys/note.h>
2107c478bd9Sstevel@tonic-gate #include <sys/pathname.h>
2117c478bd9Sstevel@tonic-gate #include <sys/proc.h>
2127c478bd9Sstevel@tonic-gate #include <sys/project.h>
213cf8f45c7Sdstaff #include <sys/sysevent.h>
2147c478bd9Sstevel@tonic-gate #include <sys/task.h>
2157c478bd9Sstevel@tonic-gate #include <sys/systm.h>
2167c478bd9Sstevel@tonic-gate #include <sys/types.h>
2177c478bd9Sstevel@tonic-gate #include <sys/utsname.h>
2187c478bd9Sstevel@tonic-gate #include <sys/vnode.h>
2197c478bd9Sstevel@tonic-gate #include <sys/vfs.h>
2207c478bd9Sstevel@tonic-gate #include <sys/systeminfo.h>
2217c478bd9Sstevel@tonic-gate #include <sys/policy.h>
2227c478bd9Sstevel@tonic-gate #include <sys/cred_impl.h>
2237c478bd9Sstevel@tonic-gate #include <sys/contract_impl.h>
2247c478bd9Sstevel@tonic-gate #include <sys/contract/process_impl.h>
2257c478bd9Sstevel@tonic-gate #include <sys/class.h>
2267c478bd9Sstevel@tonic-gate #include <sys/pool.h>
2277c478bd9Sstevel@tonic-gate #include <sys/pool_pset.h>
2287c478bd9Sstevel@tonic-gate #include <sys/pset.h>
229a19609f8Sjv #include <sys/strlog.h>
2307c478bd9Sstevel@tonic-gate #include <sys/sysmacros.h>
2317c478bd9Sstevel@tonic-gate #include <sys/callb.h>
2327c478bd9Sstevel@tonic-gate #include <sys/vmparam.h>
2337c478bd9Sstevel@tonic-gate #include <sys/corectl.h>
234824c205fSml #include <sys/ipc_impl.h>
235134a1f4eSCasper H.S. Dik #include <sys/klpd.h>
2367c478bd9Sstevel@tonic-gate 
2377c478bd9Sstevel@tonic-gate #include <sys/door.h>
2387c478bd9Sstevel@tonic-gate #include <sys/cpuvar.h>
239bd41d0a8Snordmark #include <sys/sdt.h>
2407c478bd9Sstevel@tonic-gate 
2417c478bd9Sstevel@tonic-gate #include <sys/uadmin.h>
2427c478bd9Sstevel@tonic-gate #include <sys/session.h>
2437c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h>
2447c478bd9Sstevel@tonic-gate #include <sys/modhash.h>
2453f2f09c1Sdp #include <sys/sunddi.h>
2467c478bd9Sstevel@tonic-gate #include <sys/nvpair.h>
2477c478bd9Sstevel@tonic-gate #include <sys/rctl.h>
2487c478bd9Sstevel@tonic-gate #include <sys/fss.h>
2499acbbeafSnn #include <sys/brand.h>
2507c478bd9Sstevel@tonic-gate #include <sys/zone.h>
251f4b3ec61Sdh #include <net/if.h>
252c97ad5cdSakolb #include <sys/cpucaps.h>
2530209230bSgjelinek #include <vm/seg.h>
2542b24ab6bSSebastien Roy #include <sys/mac.h>
2552b24ab6bSSebastien Roy 
256a19609f8Sjv /*
257a19609f8Sjv  * This constant specifies the number of seconds that threads waiting for
258a19609f8Sjv  * subsystems to release a zone's general-purpose references will wait before
259a19609f8Sjv  * they log the zone's reference counts.  The constant's value shouldn't
260a19609f8Sjv  * be so small that reference counts are unnecessarily reported for zones
261a19609f8Sjv  * whose references are slowly released.  On the other hand, it shouldn't be so
262a19609f8Sjv  * large that users reboot their systems out of frustration over hung zones
263a19609f8Sjv  * before the system logs the zones' reference counts.
264a19609f8Sjv  */
265a19609f8Sjv #define	ZONE_DESTROY_TIMEOUT_SECS	60
266a19609f8Sjv 
2672b24ab6bSSebastien Roy /* Entry in the list of data link IDs which are accessible from the zone */
2682b24ab6bSSebastien Roy typedef struct zone_dl {
2692b24ab6bSSebastien Roy 	datalink_id_t	zdl_id;		/* ID of the data link */
	/* NOTE(review): presumably per-link network data (see */
	/* zone_set_network()/zone_get_network() prototypes) -- confirm */
270550b6e40SSowmini Varadhan 	nvlist_t	*zdl_net;
2712b24ab6bSSebastien Roy 	list_node_t	zdl_linkage;	/* list linkage for this entry */
2722b24ab6bSSebastien Roy } zone_dl_t;
2730209230bSgjelinek 
2747c478bd9Sstevel@tonic-gate /*
2757c478bd9Sstevel@tonic-gate  * cv used to signal that all references to the zone have been released.  This
2767c478bd9Sstevel@tonic-gate  * needs to be global since there may be multiple waiters, and the first to
2777c478bd9Sstevel@tonic-gate  * wake up will free the zone_t, hence we cannot use zone->zone_cv.
2787c478bd9Sstevel@tonic-gate  */
2797c478bd9Sstevel@tonic-gate static kcondvar_t zone_destroy_cv;
2807c478bd9Sstevel@tonic-gate /*
2817c478bd9Sstevel@tonic-gate  * Lock used to serialize access to zone_cv.  This could have been per-zone,
2827c478bd9Sstevel@tonic-gate  * but then we'd need another lock for zone_destroy_cv, and why bother?
2837c478bd9Sstevel@tonic-gate  */
2847c478bd9Sstevel@tonic-gate static kmutex_t zone_status_lock;
2857c478bd9Sstevel@tonic-gate 
2867c478bd9Sstevel@tonic-gate /*
2877c478bd9Sstevel@tonic-gate  * ZSD-related global variables.
2887c478bd9Sstevel@tonic-gate  */
2897c478bd9Sstevel@tonic-gate static kmutex_t zsd_key_lock;	/* protects the following two */
2907c478bd9Sstevel@tonic-gate /*
2917c478bd9Sstevel@tonic-gate  * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
2927c478bd9Sstevel@tonic-gate  */
2937c478bd9Sstevel@tonic-gate static zone_key_t zsd_keyval = 0;
2947c478bd9Sstevel@tonic-gate /*
2957c478bd9Sstevel@tonic-gate  * Global list of registered keys.  We use this when a new zone is created.
2967c478bd9Sstevel@tonic-gate  */
2977c478bd9Sstevel@tonic-gate static list_t zsd_registered_keys;
2987c478bd9Sstevel@tonic-gate 
2997c478bd9Sstevel@tonic-gate int zone_hash_size = 256;
30045916cd2Sjpk static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
3017c478bd9Sstevel@tonic-gate static kmutex_t zonehash_lock;
3027c478bd9Sstevel@tonic-gate static uint_t zonecount;
3037c478bd9Sstevel@tonic-gate static id_space_t *zoneid_space;
3047c478bd9Sstevel@tonic-gate 
3057c478bd9Sstevel@tonic-gate /*
3067c478bd9Sstevel@tonic-gate  * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
3077c478bd9Sstevel@tonic-gate  * kernel proper runs, and which manages all other zones.
3087c478bd9Sstevel@tonic-gate  *
3097c478bd9Sstevel@tonic-gate  * Although not declared as static, the variable "zone0" should not be used
3107c478bd9Sstevel@tonic-gate  * except for by code that needs to reference the global zone early on in boot,
3117c478bd9Sstevel@tonic-gate  * before it is fully initialized.  All other consumers should use
3127c478bd9Sstevel@tonic-gate  * 'global_zone'.
3137c478bd9Sstevel@tonic-gate  */
3147c478bd9Sstevel@tonic-gate zone_t zone0;
3157c478bd9Sstevel@tonic-gate zone_t *global_zone = NULL;	/* Set when the global zone is initialized */
3167c478bd9Sstevel@tonic-gate 
3177c478bd9Sstevel@tonic-gate /*
3187c478bd9Sstevel@tonic-gate  * List of active zones, protected by zonehash_lock.
3197c478bd9Sstevel@tonic-gate  */
3207c478bd9Sstevel@tonic-gate static list_t zone_active;
3217c478bd9Sstevel@tonic-gate 
3227c478bd9Sstevel@tonic-gate /*
3237c478bd9Sstevel@tonic-gate  * List of destroyed zones that still have outstanding cred references.
3247c478bd9Sstevel@tonic-gate  * Used for debugging.  Uses a separate lock to avoid lock ordering
3257c478bd9Sstevel@tonic-gate  * problems in zone_free.
3267c478bd9Sstevel@tonic-gate  */
3277c478bd9Sstevel@tonic-gate static list_t zone_deathrow;
3287c478bd9Sstevel@tonic-gate static kmutex_t zone_deathrow_lock;
3297c478bd9Sstevel@tonic-gate 
3307c478bd9Sstevel@tonic-gate /* number of zones is limited by virtual interface limit in IP */
3317c478bd9Sstevel@tonic-gate uint_t maxzones = 8192;
3327c478bd9Sstevel@tonic-gate 
333cf8f45c7Sdstaff /* Event channel to send zone state change notifications */
334cf8f45c7Sdstaff evchan_t *zone_event_chan;
335cf8f45c7Sdstaff 
336cf8f45c7Sdstaff /*
337cf8f45c7Sdstaff  * This table holds the mapping from kernel zone states to
338cf8f45c7Sdstaff  * states visible in the state notification API.
339cf8f45c7Sdstaff  * The idea is that we only expose "obvious" states and
340cf8f45c7Sdstaff  * do not expose states which are just implementation details.
341cf8f45c7Sdstaff  */
342cf8f45c7Sdstaff const char  *zone_status_table[] = {
	/*
	 * One entry per kernel zone state, in state order.  Several kernel
	 * states share one event string, so observers only see the coarser
	 * externally meaningful states.
	 */
343cf8f45c7Sdstaff 	ZONE_EVENT_UNINITIALIZED,	/* uninitialized */
344bd41d0a8Snordmark 	ZONE_EVENT_INITIALIZED,		/* initialized */
345cf8f45c7Sdstaff 	ZONE_EVENT_READY,		/* ready */
346cf8f45c7Sdstaff 	ZONE_EVENT_READY,		/* booting: reported as ready */
347cf8f45c7Sdstaff 	ZONE_EVENT_RUNNING,		/* running */
348cf8f45c7Sdstaff 	ZONE_EVENT_SHUTTING_DOWN,	/* shutting_down */
349cf8f45c7Sdstaff 	ZONE_EVENT_SHUTTING_DOWN,	/* empty: reported as shutting down */
350cf8f45c7Sdstaff 	ZONE_EVENT_SHUTTING_DOWN,	/* down: reported as shutting down */
351cf8f45c7Sdstaff 	ZONE_EVENT_SHUTTING_DOWN,	/* dying: reported as shutting down */
352cf8f45c7Sdstaff 	ZONE_EVENT_UNINITIALIZED,	/* dead: reported as uninitialized */
353cf8f45c7Sdstaff };
354cf8f45c7Sdstaff 
355a19609f8Sjv /*
356a19609f8Sjv  * This array contains the names of the subsystems listed in zone_ref_subsys_t
357a19609f8Sjv  * (see sys/zone.h).
358a19609f8Sjv  */
359a19609f8Sjv static char *zone_ref_subsys_names[] = {
	/*
	 * NOTE(review): entries presumably must stay in the same order as
	 * the zone_ref_subsys_t enumerators named alongside them -- confirm
	 * against sys/zone.h before adding or reordering.
	 */
360a19609f8Sjv 	"NFS",		/* ZONE_REF_NFS */
361a19609f8Sjv 	"NFSv4",	/* ZONE_REF_NFSV4 */
362a19609f8Sjv 	"SMBFS",	/* ZONE_REF_SMBFS */
363a19609f8Sjv 	"MNTFS",	/* ZONE_REF_MNTFS */
364a19609f8Sjv 	"LOFI",		/* ZONE_REF_LOFI */
365a19609f8Sjv 	"VFS",		/* ZONE_REF_VFS */
366a19609f8Sjv 	"IPC"		/* ZONE_REF_IPC */
367a19609f8Sjv };
368a19609f8Sjv 
3697c478bd9Sstevel@tonic-gate /*
3707c478bd9Sstevel@tonic-gate  * This isn't static so lint doesn't complain.
3717c478bd9Sstevel@tonic-gate  */
3727c478bd9Sstevel@tonic-gate rctl_hndl_t rc_zone_cpu_shares;
373c6939658Ssl rctl_hndl_t rc_zone_locked_mem;
3740209230bSgjelinek rctl_hndl_t rc_zone_max_swap;
3750fbb751dSJohn Levon rctl_hndl_t rc_zone_max_lofi;
376c97ad5cdSakolb rctl_hndl_t rc_zone_cpu_cap;
3777c478bd9Sstevel@tonic-gate rctl_hndl_t rc_zone_nlwps;
378ff19e029SMenno Lageman rctl_hndl_t rc_zone_nprocs;
379824c205fSml rctl_hndl_t rc_zone_shmmax;
380824c205fSml rctl_hndl_t rc_zone_shmmni;
381824c205fSml rctl_hndl_t rc_zone_semmni;
382824c205fSml rctl_hndl_t rc_zone_msgmni;
3837c478bd9Sstevel@tonic-gate 
3843f2f09c1Sdp const char * const zone_default_initname = "/sbin/init";
38545916cd2Sjpk static char * const zone_prefix = "/zone/";
3867c478bd9Sstevel@tonic-gate static int zone_shutdown(zoneid_t zoneid);
3872b24ab6bSSebastien Roy static int zone_add_datalink(zoneid_t, datalink_id_t);
3882b24ab6bSSebastien Roy static int zone_remove_datalink(zoneid_t, datalink_id_t);
3892b24ab6bSSebastien Roy static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
390550b6e40SSowmini Varadhan static int zone_set_network(zoneid_t, zone_net_data_t *);
391550b6e40SSowmini Varadhan static int zone_get_network(zoneid_t, zone_net_data_t *);
3927c478bd9Sstevel@tonic-gate 
393bd41d0a8Snordmark typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);
394bd41d0a8Snordmark 
395bd41d0a8Snordmark static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
396bd41d0a8Snordmark static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
397bd41d0a8Snordmark static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
398bd41d0a8Snordmark static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
399bd41d0a8Snordmark     zone_key_t);
400bd41d0a8Snordmark static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
401bd41d0a8Snordmark static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
402bd41d0a8Snordmark     kmutex_t *);
403bd41d0a8Snordmark static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
404bd41d0a8Snordmark     kmutex_t *);
405bd41d0a8Snordmark 
406821c4a97Sdp /*
407821c4a97Sdp  * Bump this number when you alter the zone syscall interfaces; this is
408821c4a97Sdp  * because we need to have support for previous API versions in libc
409821c4a97Sdp  * to support patching; libc calls into the kernel to determine this number.
410821c4a97Sdp  *
411821c4a97Sdp  * Version 1 of the API is the version originally shipped with Solaris 10
412821c4a97Sdp  * Version 2 alters the zone_create system call in order to support more
413821c4a97Sdp  *     arguments by moving the args into a structure; and to do better
414821c4a97Sdp  *     error reporting when zone_create() fails.
415821c4a97Sdp  * Version 3 alters the zone_create system call in order to support the
416821c4a97Sdp  *     import of ZFS datasets to zones.
41745916cd2Sjpk  * Version 4 alters the zone_create system call in order to support
41845916cd2Sjpk  *     Trusted Extensions.
4193f2f09c1Sdp  * Version 5 alters the zone_boot system call, and converts its old
4203f2f09c1Sdp  *     bootargs parameter to be set by the zone_setattr API instead.
421f4b3ec61Sdh  * Version 6 adds the flag argument to zone_create.
422821c4a97Sdp  */
423f4b3ec61Sdh static const int ZONE_SYSCALL_API_VERSION = 6;
424821c4a97Sdp 
4257c478bd9Sstevel@tonic-gate /*
4267c478bd9Sstevel@tonic-gate  * Certain filesystems (such as NFS and autofs) need to know which zone
4277c478bd9Sstevel@tonic-gate  * the mount is being placed in.  Because of this, we need to be able to
4285fd5c689SJerry Jelinek  * ensure that a zone isn't in the process of being created/destroyed such
4295fd5c689SJerry Jelinek  * that nfs_mount() thinks it is in the global/NGZ zone, while by the time
4305fd5c689SJerry Jelinek  * it gets added to the list of mounted zones, it ends up on the wrong zone's
4315fd5c689SJerry Jelinek  * mount list. Since a zone can't reside on an NFS file system, we don't
4325fd5c689SJerry Jelinek  * have to worry about the zonepath itself.
4337c478bd9Sstevel@tonic-gate  *
4347c478bd9Sstevel@tonic-gate  * The following functions: block_mounts()/resume_mounts() and
4357c478bd9Sstevel@tonic-gate  * mount_in_progress()/mount_completed() are used by zones and the VFS
4365fd5c689SJerry Jelinek  * layer (respectively) to synchronize zone state transitions and new
4375fd5c689SJerry Jelinek  * mounts within a zone. This synchronization is on a per-zone basis, so
4385fd5c689SJerry Jelinek  * activity for one zone will not interfere with activity for another zone.
4397c478bd9Sstevel@tonic-gate  *
4407c478bd9Sstevel@tonic-gate  * The semantics are like a reader-reader lock such that there may
4415fd5c689SJerry Jelinek  * either be multiple mounts (or zone state transitions, if that weren't
4427c478bd9Sstevel@tonic-gate  * serialized by zonehash_lock) in progress at the same time, but not
4437c478bd9Sstevel@tonic-gate  * both.
4447c478bd9Sstevel@tonic-gate  *
4457c478bd9Sstevel@tonic-gate  * We use cv's so the user can ctrl-C out of the operation if it's
4467c478bd9Sstevel@tonic-gate  * taking too long.
4477c478bd9Sstevel@tonic-gate  *
4487c478bd9Sstevel@tonic-gate  * The semantics are such that there is unfair bias towards the
4495fd5c689SJerry Jelinek  * "current" operation.  This means that zone halt may starve if
4505fd5c689SJerry Jelinek  * there is a rapid succession of new mounts coming in to the zone.
4517c478bd9Sstevel@tonic-gate  */
4527c478bd9Sstevel@tonic-gate /*
4537c478bd9Sstevel@tonic-gate  * Prevent new mounts from progressing to the point of calling
4547c478bd9Sstevel@tonic-gate  * VFS_MOUNT().  If there are already mounts in this "region", wait for
4557c478bd9Sstevel@tonic-gate  * them to complete.
4567c478bd9Sstevel@tonic-gate  */
4577c478bd9Sstevel@tonic-gate static int
block_mounts(zone_t * zp)4585fd5c689SJerry Jelinek block_mounts(zone_t *zp)
4597c478bd9Sstevel@tonic-gate {
4607c478bd9Sstevel@tonic-gate 	int retval = 0;
4617c478bd9Sstevel@tonic-gate 
4627c478bd9Sstevel@tonic-gate 	/*
4637c478bd9Sstevel@tonic-gate 	 * Since it may block for a long time, block_mounts() shouldn't be
4647c478bd9Sstevel@tonic-gate 	 * called with zonehash_lock held.
4657c478bd9Sstevel@tonic-gate 	 */
4667c478bd9Sstevel@tonic-gate 	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
4675fd5c689SJerry Jelinek 	mutex_enter(&zp->zone_mount_lock);
4685fd5c689SJerry Jelinek 	while (zp->zone_mounts_in_progress > 0) {
4695fd5c689SJerry Jelinek 		if (cv_wait_sig(&zp->zone_mount_cv, &zp->zone_mount_lock) == 0)
4707c478bd9Sstevel@tonic-gate 			goto signaled;
4717c478bd9Sstevel@tonic-gate 	}
4727c478bd9Sstevel@tonic-gate 	/*
4737c478bd9Sstevel@tonic-gate 	 * A negative value of mounts_in_progress indicates that mounts
4745fd5c689SJerry Jelinek 	 * have been blocked by (-mounts_in_progress) different callers
4755fd5c689SJerry Jelinek 	 * (remotely possible if two threads enter zone_shutdown at the same
4765fd5c689SJerry Jelinek 	 * time).
4777c478bd9Sstevel@tonic-gate 	 */
4785fd5c689SJerry Jelinek 	zp->zone_mounts_in_progress--;
4797c478bd9Sstevel@tonic-gate 	retval = 1;
4807c478bd9Sstevel@tonic-gate signaled:
4815fd5c689SJerry Jelinek 	mutex_exit(&zp->zone_mount_lock);
4827c478bd9Sstevel@tonic-gate 	return (retval);
4837c478bd9Sstevel@tonic-gate }
4847c478bd9Sstevel@tonic-gate 
4857c478bd9Sstevel@tonic-gate /*
4867c478bd9Sstevel@tonic-gate  * The VFS layer may progress with new mounts as far as we're concerned.
4877c478bd9Sstevel@tonic-gate  * Allow them to progress if we were the last obstacle.
4887c478bd9Sstevel@tonic-gate  */
4897c478bd9Sstevel@tonic-gate static void
resume_mounts(zone_t * zp)4905fd5c689SJerry Jelinek resume_mounts(zone_t *zp)
4917c478bd9Sstevel@tonic-gate {
4925fd5c689SJerry Jelinek 	mutex_enter(&zp->zone_mount_lock);
4935fd5c689SJerry Jelinek 	if (++zp->zone_mounts_in_progress == 0)
4945fd5c689SJerry Jelinek 		cv_broadcast(&zp->zone_mount_cv);
4955fd5c689SJerry Jelinek 	mutex_exit(&zp->zone_mount_lock);
4967c478bd9Sstevel@tonic-gate }
4977c478bd9Sstevel@tonic-gate 
4987c478bd9Sstevel@tonic-gate /*
4995fd5c689SJerry Jelinek  * The VFS layer is busy with a mount; this zone should wait until all
5005fd5c689SJerry Jelinek  * of its mounts are completed to progress.
5017c478bd9Sstevel@tonic-gate  */
5027c478bd9Sstevel@tonic-gate void
mount_in_progress(zone_t * zp)5035fd5c689SJerry Jelinek mount_in_progress(zone_t *zp)
5047c478bd9Sstevel@tonic-gate {
5055fd5c689SJerry Jelinek 	mutex_enter(&zp->zone_mount_lock);
5065fd5c689SJerry Jelinek 	while (zp->zone_mounts_in_progress < 0)
5075fd5c689SJerry Jelinek 		cv_wait(&zp->zone_mount_cv, &zp->zone_mount_lock);
5085fd5c689SJerry Jelinek 	zp->zone_mounts_in_progress++;
5095fd5c689SJerry Jelinek 	mutex_exit(&zp->zone_mount_lock);
5107c478bd9Sstevel@tonic-gate }
5117c478bd9Sstevel@tonic-gate 
5127c478bd9Sstevel@tonic-gate /*
5137c478bd9Sstevel@tonic-gate  * VFS is done with one mount; wake up any waiting block_mounts()
5147c478bd9Sstevel@tonic-gate  * callers if this is the last mount.
5157c478bd9Sstevel@tonic-gate  */
5167c478bd9Sstevel@tonic-gate void
mount_completed(zone_t * zp)5175fd5c689SJerry Jelinek mount_completed(zone_t *zp)
5187c478bd9Sstevel@tonic-gate {
5195fd5c689SJerry Jelinek 	mutex_enter(&zp->zone_mount_lock);
5205fd5c689SJerry Jelinek 	if (--zp->zone_mounts_in_progress == 0)
5215fd5c689SJerry Jelinek 		cv_broadcast(&zp->zone_mount_cv);
5225fd5c689SJerry Jelinek 	mutex_exit(&zp->zone_mount_lock);
5237c478bd9Sstevel@tonic-gate }
5247c478bd9Sstevel@tonic-gate 
5257c478bd9Sstevel@tonic-gate /*
5267c478bd9Sstevel@tonic-gate  * ZSD routines.
5277c478bd9Sstevel@tonic-gate  *
5287c478bd9Sstevel@tonic-gate  * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
5297c478bd9Sstevel@tonic-gate  * defined by the pthread_key_create() and related interfaces.
5307c478bd9Sstevel@tonic-gate  *
5317c478bd9Sstevel@tonic-gate  * Kernel subsystems may register one or more data items and/or
5327c478bd9Sstevel@tonic-gate  * callbacks to be executed when a zone is created, shutdown, or
5337c478bd9Sstevel@tonic-gate  * destroyed.
5347c478bd9Sstevel@tonic-gate  *
5357c478bd9Sstevel@tonic-gate  * Unlike the thread counterpart, destructor callbacks will be executed
5367c478bd9Sstevel@tonic-gate  * even if the data pointer is NULL and/or there are no constructor
5377c478bd9Sstevel@tonic-gate  * callbacks, so it is the responsibility of such callbacks to check for
5387c478bd9Sstevel@tonic-gate  * NULL data values if necessary.
5397c478bd9Sstevel@tonic-gate  *
5407c478bd9Sstevel@tonic-gate  * The locking strategy and overall picture is as follows:
5417c478bd9Sstevel@tonic-gate  *
5427c478bd9Sstevel@tonic-gate  * When someone calls zone_key_create(), a template ZSD entry is added to the
543bd41d0a8Snordmark  * global list "zsd_registered_keys", protected by zsd_key_lock.  While
544bd41d0a8Snordmark  * holding that lock all the existing zones are marked as
545bd41d0a8Snordmark  * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
546bd41d0a8Snordmark  * zone_zsd list (protected by zone_lock). The global list is updated first
547bd41d0a8Snordmark  * (under zsd_key_lock) to make sure that newly created zones use the
548bd41d0a8Snordmark  * most recent list of keys. Then under zonehash_lock we walk the zones
549bd41d0a8Snordmark  * and mark them.  Similar locking is used in zone_key_delete().
5507c478bd9Sstevel@tonic-gate  *
551bd41d0a8Snordmark  * The actual create, shutdown, and destroy callbacks are done without
552bd41d0a8Snordmark  * holding any lock. And zsd_flags are used to ensure that the operations
553bd41d0a8Snordmark  * completed so that when zone_key_create (and zone_create) is done, as well as
554bd41d0a8Snordmark  * zone_key_delete (and zone_destroy) is done, all the necessary callbacks
555bd41d0a8Snordmark  * are completed.
5567c478bd9Sstevel@tonic-gate  *
5577c478bd9Sstevel@tonic-gate  * When new zones are created constructor callbacks for all registered ZSD
558bd41d0a8Snordmark  * entries will be called. That also uses the above two phases of marking
559bd41d0a8Snordmark  * what needs to be done, and then running the callbacks without holding
560bd41d0a8Snordmark  * any locks.
5617c478bd9Sstevel@tonic-gate  *
5627c478bd9Sstevel@tonic-gate  * The framework does not provide any locking around zone_getspecific() and
5637c478bd9Sstevel@tonic-gate  * zone_setspecific() apart from that needed for internal consistency, so
5647c478bd9Sstevel@tonic-gate  * callers interested in atomic "test-and-set" semantics will need to provide
5657c478bd9Sstevel@tonic-gate  * their own locking.
5667c478bd9Sstevel@tonic-gate  */
5677c478bd9Sstevel@tonic-gate 
568bd41d0a8Snordmark /*
569bd41d0a8Snordmark  * Helper function to find the zsd_entry associated with the key in the
570bd41d0a8Snordmark  * given list.
571bd41d0a8Snordmark  */
572bd41d0a8Snordmark static struct zsd_entry *
zsd_find(list_t * l,zone_key_t key)573bd41d0a8Snordmark zsd_find(list_t *l, zone_key_t key)
574bd41d0a8Snordmark {
575bd41d0a8Snordmark 	struct zsd_entry *zsd;
5767c478bd9Sstevel@tonic-gate 
577bd41d0a8Snordmark 	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
578bd41d0a8Snordmark 		if (zsd->zsd_key == key) {
579bd41d0a8Snordmark 			return (zsd);
5807c478bd9Sstevel@tonic-gate 		}
5817c478bd9Sstevel@tonic-gate 	}
582bd41d0a8Snordmark 	return (NULL);
5837c478bd9Sstevel@tonic-gate }
5847c478bd9Sstevel@tonic-gate 
5857c478bd9Sstevel@tonic-gate /*
5867c478bd9Sstevel@tonic-gate  * Helper function to find the zsd_entry associated with the key in the
587bd41d0a8Snordmark  * given list. Move it to the front of the list.
5887c478bd9Sstevel@tonic-gate  */
5897c478bd9Sstevel@tonic-gate static struct zsd_entry *
zsd_find_mru(list_t * l,zone_key_t key)590bd41d0a8Snordmark zsd_find_mru(list_t *l, zone_key_t key)
5917c478bd9Sstevel@tonic-gate {
5927c478bd9Sstevel@tonic-gate 	struct zsd_entry *zsd;
5937c478bd9Sstevel@tonic-gate 
5947c478bd9Sstevel@tonic-gate 	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
5957c478bd9Sstevel@tonic-gate 		if (zsd->zsd_key == key) {
5967c478bd9Sstevel@tonic-gate 			/*
5977c478bd9Sstevel@tonic-gate 			 * Move to head of list to keep list in MRU order.
5987c478bd9Sstevel@tonic-gate 			 */
5997c478bd9Sstevel@tonic-gate 			if (zsd != list_head(l)) {
6007c478bd9Sstevel@tonic-gate 				list_remove(l, zsd);
6017c478bd9Sstevel@tonic-gate 				list_insert_head(l, zsd);
6027c478bd9Sstevel@tonic-gate 			}
6037c478bd9Sstevel@tonic-gate 			return (zsd);
6047c478bd9Sstevel@tonic-gate 		}
6057c478bd9Sstevel@tonic-gate 	}
6067c478bd9Sstevel@tonic-gate 	return (NULL);
6077c478bd9Sstevel@tonic-gate }
6087c478bd9Sstevel@tonic-gate 
609bd41d0a8Snordmark void
zone_key_create(zone_key_t * keyp,void * (* create)(zoneid_t),void (* shutdown)(zoneid_t,void *),void (* destroy)(zoneid_t,void *))610bd41d0a8Snordmark zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
611bd41d0a8Snordmark     void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
612bd41d0a8Snordmark {
613bd41d0a8Snordmark 	struct zsd_entry *zsdp;
614bd41d0a8Snordmark 	struct zsd_entry *t;
615bd41d0a8Snordmark 	struct zone *zone;
616bd41d0a8Snordmark 	zone_key_t  key;
617bd41d0a8Snordmark 
618bd41d0a8Snordmark 	zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
619bd41d0a8Snordmark 	zsdp->zsd_data = NULL;
620bd41d0a8Snordmark 	zsdp->zsd_create = create;
621bd41d0a8Snordmark 	zsdp->zsd_shutdown = shutdown;
622bd41d0a8Snordmark 	zsdp->zsd_destroy = destroy;
623bd41d0a8Snordmark 
624bd41d0a8Snordmark 	/*
625bd41d0a8Snordmark 	 * Insert in global list of callbacks. Makes future zone creations
626bd41d0a8Snordmark 	 * see it.
627bd41d0a8Snordmark 	 */
628bd41d0a8Snordmark 	mutex_enter(&zsd_key_lock);
629fe16170aSPramod Batni 	key = zsdp->zsd_key = ++zsd_keyval;
630bd41d0a8Snordmark 	ASSERT(zsd_keyval != 0);
631bd41d0a8Snordmark 	list_insert_tail(&zsd_registered_keys, zsdp);
632bd41d0a8Snordmark 	mutex_exit(&zsd_key_lock);
633bd41d0a8Snordmark 
634bd41d0a8Snordmark 	/*
635bd41d0a8Snordmark 	 * Insert for all existing zones and mark them as needing
636bd41d0a8Snordmark 	 * a create callback.
637bd41d0a8Snordmark 	 */
638bd41d0a8Snordmark 	mutex_enter(&zonehash_lock);	/* stop the world */
639bd41d0a8Snordmark 	for (zone = list_head(&zone_active); zone != NULL;
640bd41d0a8Snordmark 	    zone = list_next(&zone_active, zone)) {
641bd41d0a8Snordmark 		zone_status_t status;
642bd41d0a8Snordmark 
643bd41d0a8Snordmark 		mutex_enter(&zone->zone_lock);
644bd41d0a8Snordmark 
645bd41d0a8Snordmark 		/* Skip zones that are on the way down or not yet up */
646bd41d0a8Snordmark 		status = zone_status_get(zone);
647bd41d0a8Snordmark 		if (status >= ZONE_IS_DOWN ||
648bd41d0a8Snordmark 		    status == ZONE_IS_UNINITIALIZED) {
649bd41d0a8Snordmark 			mutex_exit(&zone->zone_lock);
650bd41d0a8Snordmark 			continue;
651bd41d0a8Snordmark 		}
652bd41d0a8Snordmark 
653bd41d0a8Snordmark 		t = zsd_find_mru(&zone->zone_zsd, key);
654bd41d0a8Snordmark 		if (t != NULL) {
655bd41d0a8Snordmark 			/*
656bd41d0a8Snordmark 			 * A zsd_configure already inserted it after
657bd41d0a8Snordmark 			 * we dropped zsd_key_lock above.
658bd41d0a8Snordmark 			 */
659bd41d0a8Snordmark 			mutex_exit(&zone->zone_lock);
660bd41d0a8Snordmark 			continue;
661bd41d0a8Snordmark 		}
662bd41d0a8Snordmark 		t = kmem_zalloc(sizeof (*t), KM_SLEEP);
663bd41d0a8Snordmark 		t->zsd_key = key;
664bd41d0a8Snordmark 		t->zsd_create = create;
665bd41d0a8Snordmark 		t->zsd_shutdown = shutdown;
666bd41d0a8Snordmark 		t->zsd_destroy = destroy;
667bd41d0a8Snordmark 		if (create != NULL) {
668bd41d0a8Snordmark 			t->zsd_flags = ZSD_CREATE_NEEDED;
669bd41d0a8Snordmark 			DTRACE_PROBE2(zsd__create__needed,
670bd41d0a8Snordmark 			    zone_t *, zone, zone_key_t, key);
671bd41d0a8Snordmark 		}
672bd41d0a8Snordmark 		list_insert_tail(&zone->zone_zsd, t);
673bd41d0a8Snordmark 		mutex_exit(&zone->zone_lock);
674bd41d0a8Snordmark 	}
675bd41d0a8Snordmark 	mutex_exit(&zonehash_lock);
676bd41d0a8Snordmark 
677bd41d0a8Snordmark 	if (create != NULL) {
678bd41d0a8Snordmark 		/* Now call the create callback for this key */
679bd41d0a8Snordmark 		zsd_apply_all_zones(zsd_apply_create, key);
680bd41d0a8Snordmark 	}
681fe16170aSPramod Batni 	/*
682835ee219SRobert Harris 	 * It is safe for consumers to use the key now, make it
683835ee219SRobert Harris 	 * globally visible. Specifically zone_getspecific() will
684835ee219SRobert Harris 	 * always successfully return the zone specific data associated
685835ee219SRobert Harris 	 * with the key.
686835ee219SRobert Harris 	 */
687fe16170aSPramod Batni 	*keyp = key;
688fe16170aSPramod Batni 
689bd41d0a8Snordmark }
690bd41d0a8Snordmark 
6917c478bd9Sstevel@tonic-gate /*
6927c478bd9Sstevel@tonic-gate  * Function called when a module is being unloaded, or otherwise wishes
6937c478bd9Sstevel@tonic-gate  * to unregister its ZSD key and callbacks.
694bd41d0a8Snordmark  *
695bd41d0a8Snordmark  * Remove from the global list and determine the functions that need to
696bd41d0a8Snordmark  * be called under a global lock. Then call the functions without
697bd41d0a8Snordmark  * holding any locks. Finally free up the zone_zsd entries. (The apply
698bd41d0a8Snordmark  * functions need to access the zone_zsd entries to find zsd_data etc.)
6997c478bd9Sstevel@tonic-gate  */
7007c478bd9Sstevel@tonic-gate int
zone_key_delete(zone_key_t key)7017c478bd9Sstevel@tonic-gate zone_key_delete(zone_key_t key)
7027c478bd9Sstevel@tonic-gate {
7037c478bd9Sstevel@tonic-gate 	struct zsd_entry *zsdp = NULL;
7047c478bd9Sstevel@tonic-gate 	zone_t *zone;
7057c478bd9Sstevel@tonic-gate 
7067c478bd9Sstevel@tonic-gate 	mutex_enter(&zsd_key_lock);
707bd41d0a8Snordmark 	zsdp = zsd_find_mru(&zsd_registered_keys, key);
708bd41d0a8Snordmark 	if (zsdp == NULL) {
709bd41d0a8Snordmark 		mutex_exit(&zsd_key_lock);
710bd41d0a8Snordmark 		return (-1);
711bd41d0a8Snordmark 	}
7127c478bd9Sstevel@tonic-gate 	list_remove(&zsd_registered_keys, zsdp);
7137c478bd9Sstevel@tonic-gate 	mutex_exit(&zsd_key_lock);
7147c478bd9Sstevel@tonic-gate 
715bd41d0a8Snordmark 	mutex_enter(&zonehash_lock);
7167c478bd9Sstevel@tonic-gate 	for (zone = list_head(&zone_active); zone != NULL;
7177c478bd9Sstevel@tonic-gate 	    zone = list_next(&zone_active, zone)) {
7187c478bd9Sstevel@tonic-gate 		struct zsd_entry *del;
719bd41d0a8Snordmark 
720bd41d0a8Snordmark 		mutex_enter(&zone->zone_lock);
721bd41d0a8Snordmark 		del = zsd_find_mru(&zone->zone_zsd, key);
722bd41d0a8Snordmark 		if (del == NULL) {
723bd41d0a8Snordmark 			/*
724bd41d0a8Snordmark 			 * Somebody else got here first e.g the zone going
725bd41d0a8Snordmark 			 * away.
726bd41d0a8Snordmark 			 */
727bd41d0a8Snordmark 			mutex_exit(&zone->zone_lock);
728bd41d0a8Snordmark 			continue;
729bd41d0a8Snordmark 		}
730bd41d0a8Snordmark 		ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
731bd41d0a8Snordmark 		ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
732bd41d0a8Snordmark 		if (del->zsd_shutdown != NULL &&
733bd41d0a8Snordmark 		    (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
734bd41d0a8Snordmark 			del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
735bd41d0a8Snordmark 			DTRACE_PROBE2(zsd__shutdown__needed,
736bd41d0a8Snordmark 			    zone_t *, zone, zone_key_t, key);
737bd41d0a8Snordmark 		}
738bd41d0a8Snordmark 		if (del->zsd_destroy != NULL &&
739bd41d0a8Snordmark 		    (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
740bd41d0a8Snordmark 			del->zsd_flags |= ZSD_DESTROY_NEEDED;
741bd41d0a8Snordmark 			DTRACE_PROBE2(zsd__destroy__needed,
742bd41d0a8Snordmark 			    zone_t *, zone, zone_key_t, key);
7437c478bd9Sstevel@tonic-gate 		}
7447c478bd9Sstevel@tonic-gate 		mutex_exit(&zone->zone_lock);
7457c478bd9Sstevel@tonic-gate 	}
7467c478bd9Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
7477c478bd9Sstevel@tonic-gate 	kmem_free(zsdp, sizeof (*zsdp));
7487c478bd9Sstevel@tonic-gate 
749bd41d0a8Snordmark 	/* Now call the shutdown and destroy callback for this key */
750bd41d0a8Snordmark 	zsd_apply_all_zones(zsd_apply_shutdown, key);
751bd41d0a8Snordmark 	zsd_apply_all_zones(zsd_apply_destroy, key);
752bd41d0a8Snordmark 
753bd41d0a8Snordmark 	/* Now we can free up the zsdp structures in each zone */
754bd41d0a8Snordmark 	mutex_enter(&zonehash_lock);
7557c478bd9Sstevel@tonic-gate 	for (zone = list_head(&zone_active); zone != NULL;
756bd41d0a8Snordmark 	    zone = list_next(&zone_active, zone)) {
757bd41d0a8Snordmark 		struct zsd_entry *del;
758bd41d0a8Snordmark 
759bd41d0a8Snordmark 		mutex_enter(&zone->zone_lock);
760bd41d0a8Snordmark 		del = zsd_find(&zone->zone_zsd, key);
761bd41d0a8Snordmark 		if (del != NULL) {
762bd41d0a8Snordmark 			list_remove(&zone->zone_zsd, del);
763bd41d0a8Snordmark 			ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
764bd41d0a8Snordmark 			kmem_free(del, sizeof (*del));
765bd41d0a8Snordmark 		}
7667c478bd9Sstevel@tonic-gate 		mutex_exit(&zone->zone_lock);
767bd41d0a8Snordmark 	}
7687c478bd9Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
769bd41d0a8Snordmark 
770bd41d0a8Snordmark 	return (0);
7717c478bd9Sstevel@tonic-gate }
7727c478bd9Sstevel@tonic-gate 
7737c478bd9Sstevel@tonic-gate /*
7747c478bd9Sstevel@tonic-gate  * ZSD counterpart of pthread_setspecific().
775bd41d0a8Snordmark  *
776bd41d0a8Snordmark  * Since all zsd callbacks, including those with no create function,
777bd41d0a8Snordmark  * have an entry in zone_zsd, if the key is registered it is part of
778bd41d0a8Snordmark  * the zone_zsd list.
779bd41d0a8Snordmark  * Return an error if the key wasn't registerd.
7807c478bd9Sstevel@tonic-gate  */
7817c478bd9Sstevel@tonic-gate int
zone_setspecific(zone_key_t key,zone_t * zone,const void * data)7827c478bd9Sstevel@tonic-gate zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
7837c478bd9Sstevel@tonic-gate {
7847c478bd9Sstevel@tonic-gate 	struct zsd_entry *t;
7857c478bd9Sstevel@tonic-gate 
7867c478bd9Sstevel@tonic-gate 	mutex_enter(&zone->zone_lock);
787bd41d0a8Snordmark 	t = zsd_find_mru(&zone->zone_zsd, key);
7887c478bd9Sstevel@tonic-gate 	if (t != NULL) {
7897c478bd9Sstevel@tonic-gate 		/*
7907c478bd9Sstevel@tonic-gate 		 * Replace old value with new
7917c478bd9Sstevel@tonic-gate 		 */
7927c478bd9Sstevel@tonic-gate 		t->zsd_data = (void *)data;
7937c478bd9Sstevel@tonic-gate 		mutex_exit(&zone->zone_lock);
7947c478bd9Sstevel@tonic-gate 		return (0);
7957c478bd9Sstevel@tonic-gate 	}
7967c478bd9Sstevel@tonic-gate 	mutex_exit(&zone->zone_lock);
797bd41d0a8Snordmark 	return (-1);
7987c478bd9Sstevel@tonic-gate }
7997c478bd9Sstevel@tonic-gate 
8007c478bd9Sstevel@tonic-gate /*
8017c478bd9Sstevel@tonic-gate  * ZSD counterpart of pthread_getspecific().
8027c478bd9Sstevel@tonic-gate  */
8037c478bd9Sstevel@tonic-gate void *
zone_getspecific(zone_key_t key,zone_t * zone)8047c478bd9Sstevel@tonic-gate zone_getspecific(zone_key_t key, zone_t *zone)
8057c478bd9Sstevel@tonic-gate {
8067c478bd9Sstevel@tonic-gate 	struct zsd_entry *t;
8077c478bd9Sstevel@tonic-gate 	void *data;
8087c478bd9Sstevel@tonic-gate 
8097c478bd9Sstevel@tonic-gate 	mutex_enter(&zone->zone_lock);
810bd41d0a8Snordmark 	t = zsd_find_mru(&zone->zone_zsd, key);
8117c478bd9Sstevel@tonic-gate 	data = (t == NULL ? NULL : t->zsd_data);
8127c478bd9Sstevel@tonic-gate 	mutex_exit(&zone->zone_lock);
8137c478bd9Sstevel@tonic-gate 	return (data);
8147c478bd9Sstevel@tonic-gate }
8157c478bd9Sstevel@tonic-gate 
8167c478bd9Sstevel@tonic-gate /*
8177c478bd9Sstevel@tonic-gate  * Function used to initialize a zone's list of ZSD callbacks and data
8187c478bd9Sstevel@tonic-gate  * when the zone is being created.  The callbacks are initialized from
819bd41d0a8Snordmark  * the template list (zsd_registered_keys). The constructor callback is
820bd41d0a8Snordmark  * executed later (once the zone exists and with locks dropped).
8217c478bd9Sstevel@tonic-gate  */
8227c478bd9Sstevel@tonic-gate static void
zone_zsd_configure(zone_t * zone)8237c478bd9Sstevel@tonic-gate zone_zsd_configure(zone_t *zone)
8247c478bd9Sstevel@tonic-gate {
8257c478bd9Sstevel@tonic-gate 	struct zsd_entry *zsdp;
8267c478bd9Sstevel@tonic-gate 	struct zsd_entry *t;
8277c478bd9Sstevel@tonic-gate 
8287c478bd9Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&zonehash_lock));
8297c478bd9Sstevel@tonic-gate 	ASSERT(list_head(&zone->zone_zsd) == NULL);
830bd41d0a8Snordmark 	mutex_enter(&zone->zone_lock);
8317c478bd9Sstevel@tonic-gate 	mutex_enter(&zsd_key_lock);
8327c478bd9Sstevel@tonic-gate 	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
8337c478bd9Sstevel@tonic-gate 	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
834bd41d0a8Snordmark 		/*
835bd41d0a8Snordmark 		 * Since this zone is ZONE_IS_UNCONFIGURED, zone_key_create
836bd41d0a8Snordmark 		 * should not have added anything to it.
837bd41d0a8Snordmark 		 */
838bd41d0a8Snordmark 		ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);
839bd41d0a8Snordmark 
840bd41d0a8Snordmark 		t = kmem_zalloc(sizeof (*t), KM_SLEEP);
841bd41d0a8Snordmark 		t->zsd_key = zsdp->zsd_key;
842bd41d0a8Snordmark 		t->zsd_create = zsdp->zsd_create;
843bd41d0a8Snordmark 		t->zsd_shutdown = zsdp->zsd_shutdown;
844bd41d0a8Snordmark 		t->zsd_destroy = zsdp->zsd_destroy;
8457c478bd9Sstevel@tonic-gate 		if (zsdp->zsd_create != NULL) {
846bd41d0a8Snordmark 			t->zsd_flags = ZSD_CREATE_NEEDED;
847bd41d0a8Snordmark 			DTRACE_PROBE2(zsd__create__needed,
848bd41d0a8Snordmark 			    zone_t *, zone, zone_key_t, zsdp->zsd_key);
8497c478bd9Sstevel@tonic-gate 		}
850bd41d0a8Snordmark 		list_insert_tail(&zone->zone_zsd, t);
8517c478bd9Sstevel@tonic-gate 	}
8527c478bd9Sstevel@tonic-gate 	mutex_exit(&zsd_key_lock);
853bd41d0a8Snordmark 	mutex_exit(&zone->zone_lock);
8547c478bd9Sstevel@tonic-gate }
8557c478bd9Sstevel@tonic-gate 
8567c478bd9Sstevel@tonic-gate enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };
8577c478bd9Sstevel@tonic-gate 
8587c478bd9Sstevel@tonic-gate /*
8597c478bd9Sstevel@tonic-gate  * Helper function to execute shutdown or destructor callbacks.
8607c478bd9Sstevel@tonic-gate  */
8617c478bd9Sstevel@tonic-gate static void
zone_zsd_callbacks(zone_t * zone,enum zsd_callback_type ct)8627c478bd9Sstevel@tonic-gate zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
8637c478bd9Sstevel@tonic-gate {
8647c478bd9Sstevel@tonic-gate 	struct zsd_entry *t;
8657c478bd9Sstevel@tonic-gate 
8667c478bd9Sstevel@tonic-gate 	ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
8677c478bd9Sstevel@tonic-gate 	ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
8687c478bd9Sstevel@tonic-gate 	ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);
8697c478bd9Sstevel@tonic-gate 
8707c478bd9Sstevel@tonic-gate 	/*
871bd41d0a8Snordmark 	 * Run the callback solely based on what is registered for the zone
872bd41d0a8Snordmark 	 * in zone_zsd. The global list can change independently of this
873bd41d0a8Snordmark 	 * as keys are registered and unregistered and we don't register new
874bd41d0a8Snordmark 	 * callbacks for a zone that is in the process of going away.
8757c478bd9Sstevel@tonic-gate 	 */
876bd41d0a8Snordmark 	mutex_enter(&zone->zone_lock);
877bd41d0a8Snordmark 	for (t = list_head(&zone->zone_zsd); t != NULL;
878bd41d0a8Snordmark 	    t = list_next(&zone->zone_zsd, t)) {
879bd41d0a8Snordmark 		zone_key_t key = t->zsd_key;
8807c478bd9Sstevel@tonic-gate 
8817c478bd9Sstevel@tonic-gate 		/* Skip if no callbacks registered */
882bd41d0a8Snordmark 
883bd41d0a8Snordmark 		if (ct == ZSD_SHUTDOWN) {
884bd41d0a8Snordmark 			if (t->zsd_shutdown != NULL &&
885bd41d0a8Snordmark 			    (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
886bd41d0a8Snordmark 				t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
887bd41d0a8Snordmark 				DTRACE_PROBE2(zsd__shutdown__needed,
888bd41d0a8Snordmark 				    zone_t *, zone, zone_key_t, key);
8897c478bd9Sstevel@tonic-gate 			}
8907c478bd9Sstevel@tonic-gate 		} else {
891bd41d0a8Snordmark 			if (t->zsd_destroy != NULL &&
892bd41d0a8Snordmark 			    (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
893bd41d0a8Snordmark 				t->zsd_flags |= ZSD_DESTROY_NEEDED;
894bd41d0a8Snordmark 				DTRACE_PROBE2(zsd__destroy__needed,
895bd41d0a8Snordmark 				    zone_t *, zone, zone_key_t, key);
8967c478bd9Sstevel@tonic-gate 			}
8977c478bd9Sstevel@tonic-gate 		}
8987c478bd9Sstevel@tonic-gate 	}
899bd41d0a8Snordmark 	mutex_exit(&zone->zone_lock);
900bd41d0a8Snordmark 
901bd41d0a8Snordmark 	/* Now call the shutdown and destroy callback for this key */
902bd41d0a8Snordmark 	zsd_apply_all_keys(zsd_apply_shutdown, zone);
903bd41d0a8Snordmark 	zsd_apply_all_keys(zsd_apply_destroy, zone);
904bd41d0a8Snordmark 
9057c478bd9Sstevel@tonic-gate }
9067c478bd9Sstevel@tonic-gate 
9077c478bd9Sstevel@tonic-gate /*
9087c478bd9Sstevel@tonic-gate  * Called when the zone is going away; free ZSD-related memory, and
9097c478bd9Sstevel@tonic-gate  * destroy the zone_zsd list.
9107c478bd9Sstevel@tonic-gate  */
9117c478bd9Sstevel@tonic-gate static void
zone_free_zsd(zone_t * zone)9127c478bd9Sstevel@tonic-gate zone_free_zsd(zone_t *zone)
9137c478bd9Sstevel@tonic-gate {
9147c478bd9Sstevel@tonic-gate 	struct zsd_entry *t, *next;
9157c478bd9Sstevel@tonic-gate 
9167c478bd9Sstevel@tonic-gate 	/*
9177c478bd9Sstevel@tonic-gate 	 * Free all the zsd_entry's we had on this zone.
9187c478bd9Sstevel@tonic-gate 	 */
919bd41d0a8Snordmark 	mutex_enter(&zone->zone_lock);
9207c478bd9Sstevel@tonic-gate 	for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
9217c478bd9Sstevel@tonic-gate 		next = list_next(&zone->zone_zsd, t);
9227c478bd9Sstevel@tonic-gate 		list_remove(&zone->zone_zsd, t);
923bd41d0a8Snordmark 		ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
9247c478bd9Sstevel@tonic-gate 		kmem_free(t, sizeof (*t));
9257c478bd9Sstevel@tonic-gate 	}
9267c478bd9Sstevel@tonic-gate 	list_destroy(&zone->zone_zsd);
927bd41d0a8Snordmark 	mutex_exit(&zone->zone_lock);
928bd41d0a8Snordmark 
929bd41d0a8Snordmark }
930bd41d0a8Snordmark 
931bd41d0a8Snordmark /*
932bd41d0a8Snordmark  * Apply a function to all zones for particular key value.
933bd41d0a8Snordmark  *
934bd41d0a8Snordmark  * The applyfn has to drop zonehash_lock if it does some work, and
935bd41d0a8Snordmark  * then reacquire it before it returns.
936bd41d0a8Snordmark  * When the lock is dropped we don't follow list_next even
937bd41d0a8Snordmark  * if it is possible to do so without any hazards. This is
938bd41d0a8Snordmark  * because we want the design to allow for the list of zones
939bd41d0a8Snordmark  * to change in any arbitrary way during the time the
940bd41d0a8Snordmark  * lock was dropped.
941bd41d0a8Snordmark  *
942bd41d0a8Snordmark  * It is safe to restart the loop at list_head since the applyfn
943bd41d0a8Snordmark  * changes the zsd_flags as it does work, so a subsequent
944bd41d0a8Snordmark  * pass through will have no effect in applyfn, hence the loop will terminate
945bd41d0a8Snordmark  * in at worst O(N^2).
946bd41d0a8Snordmark  */
947bd41d0a8Snordmark static void
zsd_apply_all_zones(zsd_applyfn_t * applyfn,zone_key_t key)948bd41d0a8Snordmark zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
949bd41d0a8Snordmark {
950bd41d0a8Snordmark 	zone_t *zone;
951bd41d0a8Snordmark 
952bd41d0a8Snordmark 	mutex_enter(&zonehash_lock);
953bd41d0a8Snordmark 	zone = list_head(&zone_active);
954bd41d0a8Snordmark 	while (zone != NULL) {
955bd41d0a8Snordmark 		if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
956bd41d0a8Snordmark 			/* Lock dropped - restart at head */
957bd41d0a8Snordmark 			zone = list_head(&zone_active);
958bd41d0a8Snordmark 		} else {
959bd41d0a8Snordmark 			zone = list_next(&zone_active, zone);
960bd41d0a8Snordmark 		}
961bd41d0a8Snordmark 	}
962bd41d0a8Snordmark 	mutex_exit(&zonehash_lock);
963bd41d0a8Snordmark }
964bd41d0a8Snordmark 
965bd41d0a8Snordmark /*
966bd41d0a8Snordmark  * Apply a function to all keys for a particular zone.
967bd41d0a8Snordmark  *
968bd41d0a8Snordmark  * The applyfn has to drop zonehash_lock if it does some work, and
969bd41d0a8Snordmark  * then reacquire it before it returns.
970bd41d0a8Snordmark  * When the lock is dropped we don't follow list_next even
971bd41d0a8Snordmark  * if it is possible to do so without any hazards. This is
972bd41d0a8Snordmark  * because we want the design to allow for the list of zsd callbacks
973bd41d0a8Snordmark  * to change in any arbitrary way during the time the
974bd41d0a8Snordmark  * lock was dropped.
975bd41d0a8Snordmark  *
976bd41d0a8Snordmark  * It is safe to restart the loop at list_head since the applyfn
977bd41d0a8Snordmark  * changes the zsd_flags as it does work, so a subsequent
978bd41d0a8Snordmark  * pass through will have no effect in applyfn, hence the loop will terminate
979bd41d0a8Snordmark  * in at worst O(N^2).
980bd41d0a8Snordmark  */
981bd41d0a8Snordmark static void
zsd_apply_all_keys(zsd_applyfn_t * applyfn,zone_t * zone)982bd41d0a8Snordmark zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
983bd41d0a8Snordmark {
984bd41d0a8Snordmark 	struct zsd_entry *t;
985bd41d0a8Snordmark 
986bd41d0a8Snordmark 	mutex_enter(&zone->zone_lock);
987bd41d0a8Snordmark 	t = list_head(&zone->zone_zsd);
988bd41d0a8Snordmark 	while (t != NULL) {
989bd41d0a8Snordmark 		if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
990bd41d0a8Snordmark 			/* Lock dropped - restart at head */
991bd41d0a8Snordmark 			t = list_head(&zone->zone_zsd);
992bd41d0a8Snordmark 		} else {
993bd41d0a8Snordmark 			t = list_next(&zone->zone_zsd, t);
994bd41d0a8Snordmark 		}
995bd41d0a8Snordmark 	}
996bd41d0a8Snordmark 	mutex_exit(&zone->zone_lock);
997bd41d0a8Snordmark }
998bd41d0a8Snordmark 
999bd41d0a8Snordmark /*
1000bd41d0a8Snordmark  * Call the create function for the zone and key if CREATE_NEEDED
1001bd41d0a8Snordmark  * is set.
1002bd41d0a8Snordmark  * If some other thread gets here first and sets CREATE_INPROGRESS, then
1003bd41d0a8Snordmark  * we wait for that thread to complete so that we can ensure that
1004bd41d0a8Snordmark  * all the callbacks are done when we've looped over all zones/keys.
1005bd41d0a8Snordmark  *
1006bd41d0a8Snordmark  * When we call the create function, we drop the global lock held by the
1007bd41d0a8Snordmark  * caller, and return true to tell the caller it needs to re-evaluate the
1008bd41d0a8Snordmark  * state.
1009bd41d0a8Snordmark  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1010bd41d0a8Snordmark  * remains held on exit.
1011bd41d0a8Snordmark  */
1012bd41d0a8Snordmark static boolean_t
zsd_apply_create(kmutex_t * lockp,boolean_t zone_lock_held,zone_t * zone,zone_key_t key)1013bd41d0a8Snordmark zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
1014bd41d0a8Snordmark     zone_t *zone, zone_key_t key)
1015bd41d0a8Snordmark {
1016bd41d0a8Snordmark 	void *result;
1017bd41d0a8Snordmark 	struct zsd_entry *t;
1018bd41d0a8Snordmark 	boolean_t dropped;
1019bd41d0a8Snordmark 
1020bd41d0a8Snordmark 	if (lockp != NULL) {
1021bd41d0a8Snordmark 		ASSERT(MUTEX_HELD(lockp));
1022bd41d0a8Snordmark 	}
1023bd41d0a8Snordmark 	if (zone_lock_held) {
1024bd41d0a8Snordmark 		ASSERT(MUTEX_HELD(&zone->zone_lock));
1025bd41d0a8Snordmark 	} else {
1026bd41d0a8Snordmark 		mutex_enter(&zone->zone_lock);
1027bd41d0a8Snordmark 	}
1028bd41d0a8Snordmark 
1029bd41d0a8Snordmark 	t = zsd_find(&zone->zone_zsd, key);
1030bd41d0a8Snordmark 	if (t == NULL) {
1031bd41d0a8Snordmark 		/*
1032bd41d0a8Snordmark 		 * Somebody else got here first e.g the zone going
1033bd41d0a8Snordmark 		 * away.
1034bd41d0a8Snordmark 		 */
1035bd41d0a8Snordmark 		if (!zone_lock_held)
1036bd41d0a8Snordmark 			mutex_exit(&zone->zone_lock);
1037bd41d0a8Snordmark 		return (B_FALSE);
1038bd41d0a8Snordmark 	}
1039bd41d0a8Snordmark 	dropped = B_FALSE;
1040bd41d0a8Snordmark 	if (zsd_wait_for_inprogress(zone, t, lockp))
1041bd41d0a8Snordmark 		dropped = B_TRUE;
1042bd41d0a8Snordmark 
1043bd41d0a8Snordmark 	if (t->zsd_flags & ZSD_CREATE_NEEDED) {
1044bd41d0a8Snordmark 		t->zsd_flags &= ~ZSD_CREATE_NEEDED;
1045bd41d0a8Snordmark 		t->zsd_flags |= ZSD_CREATE_INPROGRESS;
1046bd41d0a8Snordmark 		DTRACE_PROBE2(zsd__create__inprogress,
1047bd41d0a8Snordmark 		    zone_t *, zone, zone_key_t, key);
1048bd41d0a8Snordmark 		mutex_exit(&zone->zone_lock);
1049bd41d0a8Snordmark 		if (lockp != NULL)
1050bd41d0a8Snordmark 			mutex_exit(lockp);
1051bd41d0a8Snordmark 
1052bd41d0a8Snordmark 		dropped = B_TRUE;
1053bd41d0a8Snordmark 		ASSERT(t->zsd_create != NULL);
1054bd41d0a8Snordmark 		DTRACE_PROBE2(zsd__create__start,
1055bd41d0a8Snordmark 		    zone_t *, zone, zone_key_t, key);
1056bd41d0a8Snordmark 
1057bd41d0a8Snordmark 		result = (*t->zsd_create)(zone->zone_id);
1058bd41d0a8Snordmark 
1059bd41d0a8Snordmark 		DTRACE_PROBE2(zsd__create__end,
1060bd41d0a8Snordmark 		    zone_t *, zone, voidn *, result);
1061bd41d0a8Snordmark 
1062bd41d0a8Snordmark 		ASSERT(result != NULL);
1063bd41d0a8Snordmark 		if (lockp != NULL)
1064bd41d0a8Snordmark 			mutex_enter(lockp);
1065bd41d0a8Snordmark 		mutex_enter(&zone->zone_lock);
1066bd41d0a8Snordmark 		t->zsd_data = result;
1067bd41d0a8Snordmark 		t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
1068