zone.c revision 550b6e4083768ca350e9e7c3a1ebbf720b23dcad
27c478bdstevel@tonic-gate * CDDL HEADER START
37c478bdstevel@tonic-gate *
47c478bdstevel@tonic-gate * The contents of this file are subject to the terms of the
545916cdjpk * Common Development and Distribution License (the "License").
645916cdjpk * You may not use this file except in compliance with the License.
77c478bdstevel@tonic-gate *
87c478bdstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bdstevel@tonic-gate * or http://www.opensolaris.org/os/licensing.
107c478bdstevel@tonic-gate * See the License for the specific language governing permissions
117c478bdstevel@tonic-gate * and limitations under the License.
127c478bdstevel@tonic-gate *
137c478bdstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each
147c478bdstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bdstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the
167c478bdstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying
177c478bdstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bdstevel@tonic-gate *
197c478bdstevel@tonic-gate * CDDL HEADER END
207c478bdstevel@tonic-gate */
23134a1f4Casper H.S. Dik * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
247c478bdstevel@tonic-gate */
277c478bdstevel@tonic-gate * Zones
287c478bdstevel@tonic-gate *
297c478bdstevel@tonic-gate *   A zone is a named collection of processes, namespace constraints,
307c478bdstevel@tonic-gate *   and other system resources which comprise a secure and manageable
317c478bdstevel@tonic-gate *   application containment facility.
327c478bdstevel@tonic-gate *
337c478bdstevel@tonic-gate *   Zones (represented by the reference counted zone_t) are tracked in
347c478bdstevel@tonic-gate *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
357c478bdstevel@tonic-gate *   (zoneid_t) are used to track zone association.  Zone IDs are
367c478bdstevel@tonic-gate *   dynamically generated when the zone is created; if a persistent
377c478bdstevel@tonic-gate *   identifier is needed (core files, accounting logs, audit trail,
387c478bdstevel@tonic-gate *   etc.), the zone name should be used.
397c478bdstevel@tonic-gate *
407c478bdstevel@tonic-gate *
417c478bdstevel@tonic-gate *   Global Zone:
427c478bdstevel@tonic-gate *
437c478bdstevel@tonic-gate *   The global zone (zoneid 0) is automatically associated with all
447c478bdstevel@tonic-gate *   system resources that have not been bound to a user-created zone.
457c478bdstevel@tonic-gate *   This means that even systems where zones are not in active use
467c478bdstevel@tonic-gate *   have a global zone, and all processes, mounts, etc. are
477c478bdstevel@tonic-gate *   associated with that zone.  The global zone is generally
487c478bdstevel@tonic-gate *   unconstrained in terms of privileges and access, though the usual
497c478bdstevel@tonic-gate *   credential and privilege based restrictions apply.
507c478bdstevel@tonic-gate *
517c478bdstevel@tonic-gate *
527c478bdstevel@tonic-gate *   Zone States:
537c478bdstevel@tonic-gate *
547c478bdstevel@tonic-gate *   The states a zone may be in, and the transitions between them,
557c478bdstevel@tonic-gate *   are as follows:
567c478bdstevel@tonic-gate *
577c478bdstevel@tonic-gate *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
587c478bdstevel@tonic-gate *   initialized zone is added to the list of active zones on the system but
597c478bdstevel@tonic-gate *   isn't accessible.
607c478bdstevel@tonic-gate *
61bd41d0anordmark *   ZONE_IS_INITIALIZED: Initialization complete except the ZSD callbacks are
62bd41d0anordmark *   not yet completed. Not possible to enter the zone, but attributes can
63bd41d0anordmark *   be retrieved.
64bd41d0anordmark *
657c478bdstevel@tonic-gate *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
667c478bdstevel@tonic-gate *   ready.  The zone is made visible after the ZSD constructor callbacks are
677c478bdstevel@tonic-gate *   executed.  A zone remains in this state until it transitions into
687c478bdstevel@tonic-gate *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
697c478bdstevel@tonic-gate *
707c478bdstevel@tonic-gate *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
717c478bdstevel@tonic-gate *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
727c478bdstevel@tonic-gate *   state.
737c478bdstevel@tonic-gate *
747c478bdstevel@tonic-gate *   ZONE_IS_RUNNING: The zone is open for business: zsched has
757c478bdstevel@tonic-gate *   successfully started init.   A zone remains in this state until
767c478bdstevel@tonic-gate *   zone_shutdown() is called.
777c478bdstevel@tonic-gate *
787c478bdstevel@tonic-gate *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
797c478bdstevel@tonic-gate *   killing all processes running in the zone. The zone remains
807c478bdstevel@tonic-gate *   in this state until there are no more user processes running in the zone.
817c478bdstevel@tonic-gate *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
827c478bdstevel@tonic-gate *   Since zone_shutdown() is restartable, it may be called successfully
837c478bdstevel@tonic-gate *   multiple times for the same zone_t.  Setting of the zone's state to
847c478bdstevel@tonic-gate *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
857c478bdstevel@tonic-gate *   the zone's status without worrying about it being a moving target.
867c478bdstevel@tonic-gate *
877c478bdstevel@tonic-gate *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
887c478bdstevel@tonic-gate *   are no more user processes in the zone.  The zone remains in this
897c478bdstevel@tonic-gate *   state until there are no more kernel threads associated with the
907c478bdstevel@tonic-gate *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
917c478bdstevel@tonic-gate *   fail.
927c478bdstevel@tonic-gate *
937c478bdstevel@tonic-gate *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
947c478bdstevel@tonic-gate *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
957c478bdstevel@tonic-gate *   join the zone or create kernel threads therein.
967c478bdstevel@tonic-gate *
977c478bdstevel@tonic-gate *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
987c478bdstevel@tonic-gate *   remains in this state until zsched exits.  Calls to zone_find_by_*()
997c478bdstevel@tonic-gate *   return NULL from now on.
1007c478bdstevel@tonic-gate *
1017c478bdstevel@tonic-gate *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
1027c478bdstevel@tonic-gate *   processes or threads doing work on behalf of the zone.  The zone is
1037c478bdstevel@tonic-gate *   removed from the list of active zones.  zone_destroy() returns, and
1047c478bdstevel@tonic-gate *   the zone can be recreated.
1057c478bdstevel@tonic-gate *
1067c478bdstevel@tonic-gate *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
1077c478bdstevel@tonic-gate *   callbacks are executed, and all memory associated with the zone is
1087c478bdstevel@tonic-gate *   freed.
1097c478bdstevel@tonic-gate *
1107c478bdstevel@tonic-gate *   Threads can wait for the zone to enter a requested state by using
1117c478bdstevel@tonic-gate *   zone_status_wait() or zone_status_timedwait() with the desired
1127c478bdstevel@tonic-gate *   state passed in as an argument.  Zone state transitions are
1137c478bdstevel@tonic-gate *   uni-directional; it is not possible to move back to an earlier state.
1147c478bdstevel@tonic-gate *
1157c478bdstevel@tonic-gate *
1167c478bdstevel@tonic-gate *   Zone-Specific Data:
1177c478bdstevel@tonic-gate *
1187c478bdstevel@tonic-gate *   Subsystems needing to maintain zone-specific data can store that
1197c478bdstevel@tonic-gate *   data using the ZSD mechanism.  This provides a zone-specific data
1207c478bdstevel@tonic-gate *   store, similar to thread-specific data (see pthread_getspecific(3C)
1217c478bdstevel@tonic-gate *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
1227c478bdstevel@tonic-gate *   to register callbacks to be invoked when a zone is created, shut
1237c478bdstevel@tonic-gate *   down, or destroyed.  This can be used to initialize zone-specific
1247c478bdstevel@tonic-gate *   data for new zones and to clean up when zones go away.
1257c478bdstevel@tonic-gate *
1267c478bdstevel@tonic-gate *
1277c478bdstevel@tonic-gate *   Data Structures:
1287c478bdstevel@tonic-gate *
1297c478bdstevel@tonic-gate *   The per-zone structure (zone_t) is reference counted, and freed
1307c478bdstevel@tonic-gate *   when all references are released.  zone_hold and zone_rele can be
1317c478bdstevel@tonic-gate *   used to adjust the reference count.  In addition, reference counts
1327c478bdstevel@tonic-gate *   associated with the cred_t structure are tracked separately using
1337c478bdstevel@tonic-gate *   zone_cred_hold and zone_cred_rele.
1347c478bdstevel@tonic-gate *
1357c478bdstevel@tonic-gate *   Pointers to active zone_t's are stored in two hash tables; one
1367c478bdstevel@tonic-gate *   for searching by id, the other for searching by name.  Lookups
1377c478bdstevel@tonic-gate *   can be performed on either basis, using zone_find_by_id and
1387c478bdstevel@tonic-gate *   zone_find_by_name.  Both return zone_t pointers with the zone
1397c478bdstevel@tonic-gate *   held, so zone_rele should be called when the pointer is no longer
1407c478bdstevel@tonic-gate *   needed.  Zones can also be searched by path; zone_find_by_path
1417c478bdstevel@tonic-gate *   returns the zone with which a path name is associated (global
1427c478bdstevel@tonic-gate *   zone if the path is not within some other zone's file system
1437c478bdstevel@tonic-gate *   hierarchy).  This currently requires iterating through each zone,
1447c478bdstevel@tonic-gate *   so it is slower than an id or name search via a hash table.
1457c478bdstevel@tonic-gate *
1467c478bdstevel@tonic-gate *
1477c478bdstevel@tonic-gate *   Locking:
1487c478bdstevel@tonic-gate *
1497c478bdstevel@tonic-gate *   zonehash_lock: This is a top-level global lock used to protect the
1507c478bdstevel@tonic-gate *       zone hash tables and lists.  Zones cannot be created or destroyed
1517c478bdstevel@tonic-gate *       while this lock is held.
1527c478bdstevel@tonic-gate *   zone_status_lock: This is a global lock protecting zone state.
1537c478bdstevel@tonic-gate *       Zones cannot change state while this lock is held.  It also
1547c478bdstevel@tonic-gate *       protects the list of kernel threads associated with a zone.
1557c478bdstevel@tonic-gate *   zone_lock: This is a per-zone lock used to protect several fields of
1567c478bdstevel@tonic-gate *       the zone_t (see <sys/zone.h> for details).  In addition, holding
1577c478bdstevel@tonic-gate *       this lock means that the zone cannot go away.
1580209230gjelinek *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
1590209230gjelinek *	 related to the zone.max-lwps rctl.
1600209230gjelinek *   zone_mem_lock: This is a per-zone lock used to protect the fields
1610209230gjelinek *	 related to the zone.max-locked-memory and zone.max-swap rctls.
1620fbb751John Levon *   zone_rctl_lock: This is a per-zone lock used to protect other rctls,
1630fbb751John Levon *       currently just max_lofi
1647c478bdstevel@tonic-gate *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
1657c478bdstevel@tonic-gate *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
1667c478bdstevel@tonic-gate *       list (a list of zones in the ZONE_IS_DEAD state).
1677c478bdstevel@tonic-gate *
1687c478bdstevel@tonic-gate *   Ordering requirements:
1697c478bdstevel@tonic-gate *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
1707c478bdstevel@tonic-gate *       	zone_lock --> zsd_key_lock --> pidlock --> p_lock
1717c478bdstevel@tonic-gate *
1720209230gjelinek *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
1730209230gjelinek *	zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
174ff19e02Menno Lageman *	zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
1750209230gjelinek *
1767c478bdstevel@tonic-gate *   Blocking memory allocations are permitted while holding any of the
1777c478bdstevel@tonic-gate *   zone locks.
1787c478bdstevel@tonic-gate *
1797c478bdstevel@tonic-gate *
1807c478bdstevel@tonic-gate *   System Call Interface:
1817c478bdstevel@tonic-gate *
1827c478bdstevel@tonic-gate *   The zone subsystem can be managed and queried from user level with
1837c478bdstevel@tonic-gate *   the following system calls (all subcodes of the primary "zone"
1847c478bdstevel@tonic-gate *   system call):
1857c478bdstevel@tonic-gate *   - zone_create: creates a zone with selected attributes (name,
186fa9e406ahrens *     root path, privileges, resource controls, ZFS datasets)
1877c478bdstevel@tonic-gate *   - zone_enter: allows the current process to enter a zone
1887c478bdstevel@tonic-gate *   - zone_getattr: reports attributes of a zone
1893f2f09cdp *   - zone_setattr: set attributes of a zone
1903f2f09cdp *   - zone_boot: set 'init' running for the zone
1917c478bdstevel@tonic-gate *   - zone_list: lists all zones active in the system
1927c478bdstevel@tonic-gate *   - zone_lookup: looks up zone id based on name
1937c478bdstevel@tonic-gate *   - zone_shutdown: initiates shutdown process (see states above)
1947c478bdstevel@tonic-gate *   - zone_destroy: completes shutdown process (see states above)
1957c478bdstevel@tonic-gate *
1967c478bdstevel@tonic-gate */
1987c478bdstevel@tonic-gate#include <sys/priv_impl.h>
1997c478bdstevel@tonic-gate#include <sys/cred.h>
2007c478bdstevel@tonic-gate#include <c2/audit.h>
2017c478bdstevel@tonic-gate#include <sys/debug.h>
2027c478bdstevel@tonic-gate#include <sys/file.h>
2037c478bdstevel@tonic-gate#include <sys/kmem.h>
2040209230gjelinek#include <sys/kstat.h>
2057c478bdstevel@tonic-gate#include <sys/mutex.h>
20645916cdjpk#include <sys/note.h>
2077c478bdstevel@tonic-gate#include <sys/pathname.h>
2087c478bdstevel@tonic-gate#include <sys/proc.h>
2097c478bdstevel@tonic-gate#include <sys/project.h>
210cf8f45cdstaff#include <sys/sysevent.h>
2117c478bdstevel@tonic-gate#include <sys/task.h>
2127c478bdstevel@tonic-gate#include <sys/systm.h>
2137c478bdstevel@tonic-gate#include <sys/types.h>
2147c478bdstevel@tonic-gate#include <sys/utsname.h>
2157c478bdstevel@tonic-gate#include <sys/vnode.h>
2167c478bdstevel@tonic-gate#include <sys/vfs.h>
2177c478bdstevel@tonic-gate#include <sys/systeminfo.h>
2187c478bdstevel@tonic-gate#include <sys/policy.h>
2197c478bdstevel@tonic-gate#include <sys/cred_impl.h>
2207c478bdstevel@tonic-gate#include <sys/contract_impl.h>
2217c478bdstevel@tonic-gate#include <sys/contract/process_impl.h>
2227c478bdstevel@tonic-gate#include <sys/class.h>
2237c478bdstevel@tonic-gate#include <sys/pool.h>
2247c478bdstevel@tonic-gate#include <sys/pool_pset.h>
2257c478bdstevel@tonic-gate#include <sys/pset.h>
2267c478bdstevel@tonic-gate#include <sys/sysmacros.h>
2277c478bdstevel@tonic-gate#include <sys/callb.h>
2287c478bdstevel@tonic-gate#include <sys/vmparam.h>
2297c478bdstevel@tonic-gate#include <sys/corectl.h>
230824c205ml#include <sys/ipc_impl.h>
231134a1f4Casper H.S. Dik#include <sys/klpd.h>
2337c478bdstevel@tonic-gate#include <sys/door.h>
2347c478bdstevel@tonic-gate#include <sys/cpuvar.h>
235bd41d0anordmark#include <sys/sdt.h>
2377c478bdstevel@tonic-gate#include <sys/uadmin.h>
2387c478bdstevel@tonic-gate#include <sys/session.h>
2397c478bdstevel@tonic-gate#include <sys/cmn_err.h>
2407c478bdstevel@tonic-gate#include <sys/modhash.h>
2413f2f09cdp#include <sys/sunddi.h>
2427c478bdstevel@tonic-gate#include <sys/nvpair.h>
2437c478bdstevel@tonic-gate#include <sys/rctl.h>
2447c478bdstevel@tonic-gate#include <sys/fss.h>
2459acbbeann#include <sys/brand.h>
2467c478bdstevel@tonic-gate#include <sys/zone.h>
247f4b3ec6dh#include <net/if.h>
248c97ad5cakolb#include <sys/cpucaps.h>
2490209230gjelinek#include <vm/seg.h>
2502b24ab6Sebastien Roy#include <sys/mac.h>
2512b24ab6Sebastien Roy
2522b24ab6Sebastien Roy/* Entry in the list of data link IDs which are accessible from the zone */
2532b24ab6Sebastien Roytypedef struct zone_dl {
2542b24ab6Sebastien Roy	datalink_id_t	zdl_id;		/* ID of the data link */
255550b6e4Sowmini Varadhan	nvlist_t	*zdl_net;	/* NOTE(review): presumably per-link network data handled by zone_set_network()/zone_get_network() -- confirm */
2562b24ab6Sebastien Roy	list_node_t	zdl_linkage;	/* list node; NOTE(review): owning list is not visible here -- confirm */
2572b24ab6Sebastien Roy} zone_dl_t;
2607c478bdstevel@tonic-gate * cv used to signal that all references to the zone have been released.  This
2617c478bdstevel@tonic-gate * needs to be global since there may be multiple waiters, and the first to
2627c478bdstevel@tonic-gate * wake up will free the zone_t, hence we cannot use zone->zone_cv.
2637c478bdstevel@tonic-gate */
2647c478bdstevel@tonic-gatestatic kcondvar_t zone_destroy_cv;
2667c478bdstevel@tonic-gate * Lock used to serialize access to zone_cv.  This could have been per-zone,
2677c478bdstevel@tonic-gate * but then we'd need another lock for zone_destroy_cv, and why bother?
2687c478bdstevel@tonic-gate */
2697c478bdstevel@tonic-gatestatic kmutex_t zone_status_lock;
2727c478bdstevel@tonic-gate * ZSD-related global variables.
2737c478bdstevel@tonic-gate */
2747c478bdstevel@tonic-gatestatic kmutex_t zsd_key_lock;	/* protects the following two */
2767c478bdstevel@tonic-gate * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
2777c478bdstevel@tonic-gate */
2787c478bdstevel@tonic-gatestatic zone_key_t zsd_keyval = 0;
2807c478bdstevel@tonic-gate * Global list of registered keys.  We use this when a new zone is created.
2817c478bdstevel@tonic-gate */
2827c478bdstevel@tonic-gatestatic list_t zsd_registered_keys;
2847c478bdstevel@tonic-gateint zone_hash_size = 256;
28545916cdjpkstatic mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
2867c478bdstevel@tonic-gatestatic kmutex_t zonehash_lock;
2877c478bdstevel@tonic-gatestatic uint_t zonecount;
2887c478bdstevel@tonic-gatestatic id_space_t *zoneid_space;
2917c478bdstevel@tonic-gate * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
2927c478bdstevel@tonic-gate * kernel proper runs, and which manages all other zones.
2937c478bdstevel@tonic-gate *
2947c478bdstevel@tonic-gate * Although not declared as static, the variable "zone0" should not be used
2957c478bdstevel@tonic-gate * except for by code that needs to reference the global zone early on in boot,
2967c478bdstevel@tonic-gate * before it is fully initialized.  All other consumers should use
2977c478bdstevel@tonic-gate * 'global_zone'.
2987c478bdstevel@tonic-gate */
2997c478bdstevel@tonic-gatezone_t zone0;
3007c478bdstevel@tonic-gatezone_t *global_zone = NULL;	/* Set when the global zone is initialized */
3037c478bdstevel@tonic-gate * List of active zones, protected by zonehash_lock.
3047c478bdstevel@tonic-gate */
3057c478bdstevel@tonic-gatestatic list_t zone_active;
3087c478bdstevel@tonic-gate * List of destroyed zones that still have outstanding cred references.
3097c478bdstevel@tonic-gate * Used for debugging.  Uses a separate lock to avoid lock ordering
3107c478bdstevel@tonic-gate * problems in zone_free.
3117c478bdstevel@tonic-gate */
3127c478bdstevel@tonic-gatestatic list_t zone_deathrow;
3137c478bdstevel@tonic-gatestatic kmutex_t zone_deathrow_lock;
3157c478bdstevel@tonic-gate/* number of zones is limited by virtual interface limit in IP */
3167c478bdstevel@tonic-gateuint_t maxzones = 8192;
318cf8f45cdstaff/* Event channel to send zone state change notifications */
319cf8f45cdstaffevchan_t *zone_event_chan;
322cf8f45cdstaff * This table holds the mapping from kernel zone states to
323cf8f45cdstaff * states visible in the state notification API.
324cf8f45cdstaff * The idea is that we only expose "obvious" states and
325cf8f45cdstaff * do not expose states which are just implementation details.
326cf8f45cdstaff */
327cf8f45cdstaffconst char  *zone_status_table[] = {
328cf8f45cdstaff	ZONE_EVENT_UNINITIALIZED,	/* uninitialized */
329bd41d0anordmark	ZONE_EVENT_INITIALIZED,		/* initialized */
330cf8f45cdstaff	ZONE_EVENT_READY,		/* ready */
331cf8f45cdstaff	ZONE_EVENT_READY,		/* booting */
332cf8f45cdstaff	ZONE_EVENT_RUNNING,		/* running */
333cf8f45cdstaff	ZONE_EVENT_SHUTTING_DOWN,	/* shutting_down */
334cf8f45cdstaff	ZONE_EVENT_SHUTTING_DOWN,	/* empty */
335cf8f45cdstaff	ZONE_EVENT_SHUTTING_DOWN,	/* down */
336cf8f45cdstaff	ZONE_EVENT_SHUTTING_DOWN,	/* dying */
337cf8f45cdstaff	ZONE_EVENT_UNINITIALIZED,	/* dead */
3417c478bdstevel@tonic-gate * This isn't static so lint doesn't complain.
3427c478bdstevel@tonic-gate */
3437c478bdstevel@tonic-gaterctl_hndl_t rc_zone_cpu_shares;
344c693965slrctl_hndl_t rc_zone_locked_mem;
3450209230gjelinekrctl_hndl_t rc_zone_max_swap;
3460fbb751John Levonrctl_hndl_t rc_zone_max_lofi;
347c97ad5cakolbrctl_hndl_t rc_zone_cpu_cap;
3487c478bdstevel@tonic-gaterctl_hndl_t rc_zone_nlwps;
349ff19e02Menno Lagemanrctl_hndl_t rc_zone_nprocs;
350824c205mlrctl_hndl_t rc_zone_shmmax;
351824c205mlrctl_hndl_t rc_zone_shmmni;
352824c205mlrctl_hndl_t rc_zone_semmni;
353824c205mlrctl_hndl_t rc_zone_msgmni;
3557c478bdstevel@tonic-gate * Synchronization primitives used to synchronize between mounts and zone
3567c478bdstevel@tonic-gate * creation/destruction.
3577c478bdstevel@tonic-gate */
3587c478bdstevel@tonic-gatestatic int mounts_in_progress;
3597c478bdstevel@tonic-gatestatic kcondvar_t mount_cv;
3607c478bdstevel@tonic-gatestatic kmutex_t mount_lock;
3623f2f09cdpconst char * const zone_default_initname = "/sbin/init";
36345916cdjpkstatic char * const zone_prefix = "/zone/";
3647c478bdstevel@tonic-gatestatic int zone_shutdown(zoneid_t zoneid);
3652b24ab6Sebastien Roystatic int zone_add_datalink(zoneid_t, datalink_id_t);
3662b24ab6Sebastien Roystatic int zone_remove_datalink(zoneid_t, datalink_id_t);
3672b24ab6Sebastien Roystatic int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
368550b6e4Sowmini Varadhanstatic int zone_set_network(zoneid_t, zone_net_data_t *);
369550b6e4Sowmini Varadhanstatic int zone_get_network(zoneid_t, zone_net_data_t *);
371bd41d0anordmarktypedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);
373bd41d0anordmarkstatic void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
374bd41d0anordmarkstatic void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
375bd41d0anordmarkstatic boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
376bd41d0anordmarkstatic boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
377bd41d0anordmark    zone_key_t);
378bd41d0anordmarkstatic boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
379bd41d0anordmarkstatic boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
380bd41d0anordmark    kmutex_t *);
381bd41d0anordmarkstatic boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
382bd41d0anordmark    kmutex_t *);
385821c4a9dp * Bump this number when you alter the zone syscall interfaces; this is
386821c4a9dp * because we need to have support for previous API versions in libc
387821c4a9dp * to support patching; libc calls into the kernel to determine this number.
388821c4a9dp *
389821c4a9dp * Version 1 of the API is the version originally shipped with Solaris 10
390821c4a9dp * Version 2 alters the zone_create system call in order to support more
391821c4a9dp *     arguments by moving the args into a structure; and to do better
392821c4a9dp *     error reporting when zone_create() fails.
393821c4a9dp * Version 3 alters the zone_create system call in order to support the
394821c4a9dp *     import of ZFS datasets to zones.
39545916cdjpk * Version 4 alters the zone_create system call in order to support
39645916cdjpk *     Trusted Extensions.
3973f2f09cdp * Version 5 alters the zone_boot system call, and converts its old
3983f2f09cdp *     bootargs parameter to be set by the zone_setattr API instead.
399f4b3ec6dh * Version 6 adds the flag argument to zone_create.
400821c4a9dp */
401f4b3ec6dhstatic const int ZONE_SYSCALL_API_VERSION = 6;
4047c478bdstevel@tonic-gate * Certain filesystems (such as NFS and autofs) need to know which zone
4057c478bdstevel@tonic-gate * the mount is being placed in.  Because of this, we need to be able to
4067c478bdstevel@tonic-gate * ensure that a zone isn't in the process of being created such that
4077c478bdstevel@tonic-gate * nfs_mount() thinks it is in the global zone, while by the time it
4087c478bdstevel@tonic-gate * gets added to the list of mounted zones, it ends up on zoneA's mount
4097c478bdstevel@tonic-gate * list.
4107c478bdstevel@tonic-gate *
4117c478bdstevel@tonic-gate * The following functions: block_mounts()/resume_mounts() and
4127c478bdstevel@tonic-gate * mount_in_progress()/mount_completed() are used by zones and the VFS
4137c478bdstevel@tonic-gate * layer (respectively) to synchronize zone creation and new mounts.
4147c478bdstevel@tonic-gate *
4157c478bdstevel@tonic-gate * The semantics are like a reader-reader lock such that there may
4167c478bdstevel@tonic-gate * either be multiple mounts (or zone creations, if that weren't
4177c478bdstevel@tonic-gate * serialized by zonehash_lock) in progress at the same time, but not
4187c478bdstevel@tonic-gate * both.
4197c478bdstevel@tonic-gate *
4207c478bdstevel@tonic-gate * We use cv's so the user can ctrl-C out of the operation if it's
4217c478bdstevel@tonic-gate * taking too long.
4227c478bdstevel@tonic-gate *
4237c478bdstevel@tonic-gate * The semantics are such that there is unfair bias towards the
4247c478bdstevel@tonic-gate * "current" operation.  This means that zone creations may starve if
4257c478bdstevel@tonic-gate * there is a rapid succession of new mounts coming in to the system, or
4267c478bdstevel@tonic-gate * there is a remote possibility that zones will be created at such a
4277c478bdstevel@tonic-gate * rate that new mounts will not be able to proceed.
4287c478bdstevel@tonic-gate */
4307c478bdstevel@tonic-gate * Prevent new mounts from progressing to the point of calling
4317c478bdstevel@tonic-gate * VFS_MOUNT().  If there are already mounts in this "region", wait for
4327c478bdstevel@tonic-gate * them to complete.
4337c478bdstevel@tonic-gate */
4347c478bdstevel@tonic-gatestatic int
4377c478bdstevel@tonic-gate	int retval = 0;
4397c478bdstevel@tonic-gate	/*
4407c478bdstevel@tonic-gate	 * Since it may block for a long time, block_mounts() shouldn't be
4417c478bdstevel@tonic-gate	 * called with zonehash_lock held.
4427c478bdstevel@tonic-gate	 */
4437c478bdstevel@tonic-gate	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
4447c478bdstevel@tonic-gate	mutex_enter(&mount_lock);
4457c478bdstevel@tonic-gate	while (mounts_in_progress > 0) {
4467c478bdstevel@tonic-gate		if (cv_wait_sig(&mount_cv, &mount_lock) == 0)
4477c478bdstevel@tonic-gate			goto signaled;
4487c478bdstevel@tonic-gate	}
4497c478bdstevel@tonic-gate	/*
4507c478bdstevel@tonic-gate	 * A negative value of mounts_in_progress indicates that mounts
4517c478bdstevel@tonic-gate	 * have been blocked by (-mounts_in_progress) different callers.
4527c478bdstevel@tonic-gate	 */
4537c478bdstevel@tonic-gate	mounts_in_progress--;
4547c478bdstevel@tonic-gate	retval = 1;
4567c478bdstevel@tonic-gate	mutex_exit(&mount_lock);
4577c478bdstevel@tonic-gate	return (retval);
4617c478bdstevel@tonic-gate * The VFS layer may progress with new mounts as far as we're concerned.
4627c478bdstevel@tonic-gate * Allow them to progress if we were the last obstacle.
4637c478bdstevel@tonic-gate */
4647c478bdstevel@tonic-gatestatic void
4677c478bdstevel@tonic-gate	mutex_enter(&mount_lock);
4687c478bdstevel@tonic-gate	if (++mounts_in_progress == 0)
4697c478bdstevel@tonic-gate		cv_broadcast(&mount_cv);
4707c478bdstevel@tonic-gate	mutex_exit(&mount_lock);
4747c478bdstevel@tonic-gate * The VFS layer is busy with a mount; zones should wait until all
4757c478bdstevel@tonic-gate * mounts are completed to progress.
4767c478bdstevel@tonic-gate */
4807c478bdstevel@tonic-gate	mutex_enter(&mount_lock);
4817c478bdstevel@tonic-gate	while (mounts_in_progress < 0)
4827c478bdstevel@tonic-gate		cv_wait(&mount_cv, &mount_lock);
4837c478bdstevel@tonic-gate	mounts_in_progress++;
4847c478bdstevel@tonic-gate	mutex_exit(&mount_lock);
4887c478bdstevel@tonic-gate * VFS is done with one mount; wake up any waiting block_mounts()
4897c478bdstevel@tonic-gate * callers if this is the last mount.
4907c478bdstevel@tonic-gate */
4947c478bdstevel@tonic-gate	mutex_enter(&mount_lock);
4957c478bdstevel@tonic-gate	if (--mounts_in_progress == 0)
4967c478bdstevel@tonic-gate		cv_broadcast(&mount_cv);
4977c478bdstevel@tonic-gate	mutex_exit(&mount_lock);
5017c478bdstevel@tonic-gate * ZSD routines.
5027c478bdstevel@tonic-gate *
5037c478bdstevel@tonic-gate * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
5047c478bdstevel@tonic-gate * defined by the pthread_key_create() and related interfaces.
5057c478bdstevel@tonic-gate *
5067c478bdstevel@tonic-gate * Kernel subsystems may register one or more data items and/or
5077c478bdstevel@tonic-gate * callbacks to be executed when a zone is created, shutdown, or
5087c478bdstevel@tonic-gate * destroyed.
5097c478bdstevel@tonic-gate *
5107c478bdstevel@tonic-gate * Unlike the thread counterpart, destructor callbacks will be executed
5117c478bdstevel@tonic-gate * even if the data pointer is NULL and/or there are no constructor
5127c478bdstevel@tonic-gate * callbacks, so it is the responsibility of such callbacks to check for
5137c478bdstevel@tonic-gate * NULL data values if necessary.
5147c478bdstevel@tonic-gate *
5157c478bdstevel@tonic-gate * The locking strategy and overall picture is as follows:
5167c478bdstevel@tonic-gate *
5177c478bdstevel@tonic-gate * When someone calls zone_key_create(), a template ZSD entry is added to the
518bd41d0anordmark * global list "zsd_registered_keys", protected by zsd_key_lock.  While
519bd41d0anordmark * holding that lock all the existing zones are marked as
520bd41d0anordmark * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
521bd41d0anordmark * zone_zsd list (protected by zone_lock). The global list is updated first
522bd41d0anordmark * (under zsd_key_lock) to make sure that newly created zones use the
523bd41d0anordmark * most recent list of keys. Then under zonehash_lock we walk the zones
524bd41d0anordmark * and mark them.  Similar locking is used in zone_key_delete().
5257c478bdstevel@tonic-gate *
526bd41d0anordmark * The actual create, shutdown, and destroy callbacks are run without
527bd41d0anordmark * holding any lock.  The zsd_flags are used to track which operations have
528bd41d0anordmark * completed, so that by the time zone_key_create (and zone_create), or
529bd41d0anordmark * zone_key_delete (and zone_destroy), is done, all the necessary callbacks
530bd41d0anordmark * have completed.
5317c478bdstevel@tonic-gate *
5327c478bdstevel@tonic-gate * When new zones are created constructor callbacks for all registered ZSD
533bd41d0anordmark * entries will be called. That also uses the above two phases of marking
534bd41d0anordmark * what needs to be done, and then running the callbacks without holding
535bd41d0anordmark * any locks.
5367c478bdstevel@tonic-gate *
5377c478bdstevel@tonic-gate * The framework does not provide any locking around zone_getspecific() and
5387c478bdstevel@tonic-gate * zone_setspecific() apart from that needed for internal consistency, so
5397c478bdstevel@tonic-gate * callers interested in atomic "test-and-set" semantics will need to provide
5407c478bdstevel@tonic-gate * their own locking.
5417c478bdstevel@tonic-gate */
544bd41d0anordmark * Helper function to find the zsd_entry associated with the key in the
545bd41d0anordmark * given list.
546bd41d0anordmark */
547bd41d0anordmarkstatic struct zsd_entry *
548bd41d0anordmarkzsd_find(list_t *l, zone_key_t key)
550bd41d0anordmark	struct zsd_entry *zsd;
552bd41d0anordmark	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
553bd41d0anordmark		if (zsd->zsd_key == key) {
554bd41d0anordmark			return (zsd);
5557c478bdstevel@tonic-gate		}
5567c478bdstevel@tonic-gate	}
557bd41d0anordmark	return (NULL);
5617c478bdstevel@tonic-gate * Helper function to find the zsd_entry associated with the key in the
562bd41d0anordmark * given list. Move it to the front of the list.
5637c478bdstevel@tonic-gate */
5647c478bdstevel@tonic-gatestatic struct zsd_entry *
565bd41d0anordmarkzsd_find_mru(list_t *l, zone_key_t key)
5677c478bdstevel@tonic-gate	struct zsd_entry *zsd;
5697c478bdstevel@tonic-gate	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
5707c478bdstevel@tonic-gate		if (zsd->zsd_key == key) {
5717c478bdstevel@tonic-gate			/*
5727c478bdstevel@tonic-gate			 * Move to head of list to keep list in MRU order.
5737c478bdstevel@tonic-gate			 */
5747c478bdstevel@tonic-gate			if (zsd != list_head(l)) {
5757c478bdstevel@tonic-gate				list_remove(l, zsd);
5767c478bdstevel@tonic-gate				list_insert_head(l, zsd);
5777c478bdstevel@tonic-gate			}
5787c478bdstevel@tonic-gate			return (zsd);
5797c478bdstevel@tonic-gate		}
5807c478bdstevel@tonic-gate	}
5817c478bdstevel@tonic-gate	return (NULL);
585bd41d0anordmarkzone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
586bd41d0anordmark    void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
588bd41d0anordmark	struct zsd_entry *zsdp;
589bd41d0anordmark	struct zsd_entry *t;
590bd41d0anordmark	struct zone *zone;
591bd41d0anordmark	zone_key_t  key;
593bd41d0anordmark	zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
594bd41d0anordmark	zsdp->zsd_data = NULL;
595bd41d0anordmark	zsdp->zsd_create = create;
596bd41d0anordmark	zsdp->zsd_shutdown = shutdown;
597bd41d0anordmark	zsdp->zsd_destroy = destroy;
599bd41d0anordmark	/*
600bd41d0anordmark	 * Insert in global list of callbacks. Makes future zone creations
601bd41d0anordmark	 * see it.
602bd41d0anordmark	 */
603bd41d0anordmark	mutex_enter(&zsd_key_lock);
604fe16170Pramod Batni	key = zsdp->zsd_key = ++zsd_keyval;
605bd41d0anordmark	ASSERT(zsd_keyval != 0);
606bd41d0anordmark	list_insert_tail(&zsd_registered_keys, zsdp);
607bd41d0anordmark	mutex_exit(&zsd_key_lock);
609bd41d0anordmark	/*
610bd41d0anordmark	 * Insert for all existing zones and mark them as needing
611bd41d0anordmark	 * a create callback.
612bd41d0anordmark	 */
613bd41d0anordmark	mutex_enter(&zonehash_lock);	/* stop the world */
614bd41d0anordmark	for (zone = list_head(&zone_active); zone != NULL;
615bd41d0anordmark	    zone = list_next(&zone_active, zone)) {
616bd41d0anordmark		zone_status_t status;
618bd41d0anordmark		mutex_enter(&zone->zone_lock);
620bd41d0anordmark		/* Skip zones that are on the way down or not yet up */
621bd41d0anordmark		status = zone_status_get(zone);
622bd41d0anordmark		if (status >= ZONE_IS_DOWN ||
623bd41d0anordmark		    status == ZONE_IS_UNINITIALIZED) {
624bd41d0anordmark			mutex_exit(&zone->zone_lock);
625bd41d0anordmark			continue;
626bd41d0anordmark		}
628bd41d0anordmark		t = zsd_find_mru(&zone->zone_zsd, key);
629bd41d0anordmark		if (t != NULL) {
630bd41d0anordmark			/*
631bd41d0anordmark			 * A zsd_configure already inserted it after
632bd41d0anordmark			 * we dropped zsd_key_lock above.
633bd41d0anordmark			 */
634bd41d0anordmark			mutex_exit(&zone->zone_lock);
635bd41d0anordmark			continue;
636bd41d0anordmark		}
637bd41d0anordmark		t = kmem_zalloc(sizeof (*t), KM_SLEEP);
638bd41d0anordmark		t->zsd_key = key;
639bd41d0anordmark		t->zsd_create = create;
640bd41d0anordmark		t->zsd_shutdown = shutdown;
641bd41d0anordmark		t->zsd_destroy = destroy;
642bd41d0anordmark		if (create != NULL) {
643bd41d0anordmark			t->zsd_flags = ZSD_CREATE_NEEDED;
644bd41d0anordmark			DTRACE_PROBE2(zsd__create__needed,
645bd41d0anordmark			    zone_t *, zone, zone_key_t, key);
646bd41d0anordmark		}
647bd41d0anordmark		list_insert_tail(&zone->zone_zsd, t);
648bd41d0anordmark		mutex_exit(&zone->zone_lock);
649bd41d0anordmark	}
650bd41d0anordmark	mutex_exit(&zonehash_lock);
652bd41d0anordmark	if (create != NULL) {
653bd41d0anordmark		/* Now call the create callback for this key */
654bd41d0anordmark		zsd_apply_all_zones(zsd_apply_create, key);
655bd41d0anordmark	}
656fe16170Pramod Batni	/*
657835ee21Robert Harris	 * It is safe for consumers to use the key now, make it
658835ee21Robert Harris	 * globally visible. Specifically zone_getspecific() will
659835ee21Robert Harris	 * always successfully return the zone specific data associated
660835ee21Robert Harris	 * with the key.
661835ee21Robert Harris	 */
662fe16170Pramod Batni	*keyp = key;
663fe16170Pramod Batni
6677c478bdstevel@tonic-gate * Function called when a module is being unloaded, or otherwise wishes
6687c478bdstevel@tonic-gate * to unregister its ZSD key and callbacks.
669bd41d0anordmark *
670bd41d0anordmark * Remove from the global list and determine the functions that need to
671bd41d0anordmark * be called under a global lock. Then call the functions without
672bd41d0anordmark * holding any locks. Finally free up the zone_zsd entries. (The apply
673bd41d0anordmark * functions need to access the zone_zsd entries to find zsd_data etc.)
6747c478bdstevel@tonic-gate */
6767c478bdstevel@tonic-gatezone_key_delete(zone_key_t key)
6787c478bdstevel@tonic-gate	struct zsd_entry *zsdp = NULL;
6797c478bdstevel@tonic-gate	zone_t *zone;
6817c478bdstevel@tonic-gate	mutex_enter(&zsd_key_lock);
682bd41d0anordmark	zsdp = zsd_find_mru(&zsd_registered_keys, key);
683bd41d0anordmark	if (zsdp == NULL) {
684bd41d0anordmark		mutex_exit(&zsd_key_lock);
685bd41d0anordmark		return (-1);
686bd41d0anordmark	}
6877c478bdstevel@tonic-gate	list_remove(&zsd_registered_keys, zsdp);
6887c478bdstevel@tonic-gate	mutex_exit(&zsd_key_lock);
690bd41d0anordmark	mutex_enter(&zonehash_lock);
6917c478bdstevel@tonic-gate	for (zone = list_head(&zone_active); zone != NULL;
6927c478bdstevel@tonic-gate	    zone = list_next(&zone_active, zone)) {
6937c478bdstevel@tonic-gate		struct zsd_entry *del;
695bd41d0anordmark		mutex_enter(&zone->zone_lock);
696bd41d0anordmark		del = zsd_find_mru(&zone->zone_zsd, key);
697bd41d0anordmark		if (del == NULL) {
698bd41d0anordmark			/*
699bd41d0anordmark			 * Somebody else got here first e.g the zone going
700bd41d0anordmark			 * away.
701bd41d0anordmark			 */
702bd41d0anordmark			mutex_exit(&zone->zone_lock);
703bd41d0anordmark			continue;
704bd41d0anordmark		}
705bd41d0anordmark		ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
706bd41d0anordmark		ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
707bd41d0anordmark		if (del->zsd_shutdown != NULL &&
708bd41d0anordmark		    (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
709bd41d0anordmark			del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
710bd41d0anordmark			DTRACE_PROBE2(zsd__shutdown__needed,
711bd41d0anordmark			    zone_t *, zone, zone_key_t, key);
712bd41d0anordmark		}
713bd41d0anordmark		if (del->zsd_destroy != NULL &&
714bd41d0anordmark		    (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
715bd41d0anordmark			del->zsd_flags |= ZSD_DESTROY_NEEDED;
716bd41d0anordmark			DTRACE_PROBE2(zsd__destroy__needed,
717bd41d0anordmark			    zone_t *, zone, zone_key_t, key);
7187c478bdstevel@tonic-gate		}
7197c478bdstevel@tonic-gate		mutex_exit(&zone->zone_lock);
7207c478bdstevel@tonic-gate	}
7217c478bdstevel@tonic-gate	mutex_exit(&zonehash_lock);
7227c478bdstevel@tonic-gate	kmem_free(zsdp, sizeof (*zsdp));
724bd41d0anordmark	/* Now call the shutdown and destroy callback for this key */
725bd41d0anordmark	zsd_apply_all_zones(zsd_apply_shutdown, key);
726bd41d0anordmark	zsd_apply_all_zones(zsd_apply_destroy, key);
728bd41d0anordmark	/* Now we can free up the zsdp structures in each zone */
729bd41d0anordmark	mutex_enter(&zonehash_lock);
7307c478bdstevel@tonic-gate	for (zone = list_head(&zone_active); zone != NULL;
731bd41d0anordmark	    zone = list_next(&zone_active, zone)) {
732bd41d0anordmark		struct zsd_entry *del;
734bd41d0anordmark		mutex_enter(&zone->zone_lock);
735bd41d0anordmark		del = zsd_find(&zone->zone_zsd, key);
736bd41d0anordmark		if (del != NULL) {
737bd41d0anordmark			list_remove(&zone->zone_zsd, del);
738bd41d0anordmark			ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
739bd41d0anordmark			kmem_free(del, sizeof (*del));
740bd41d0anordmark		}
7417c478bdstevel@tonic-gate		mutex_exit(&zone->zone_lock);
742bd41d0anordmark	}
7437c478bdstevel@tonic-gate	mutex_exit(&zonehash_lock);
745bd41d0anordmark	return (0);
7497c478bdstevel@tonic-gate * ZSD counterpart of pthread_setspecific().
750bd41d0anordmark *
751bd41d0anordmark * Since all zsd callbacks, including those with no create function,
752bd41d0anordmark * have an entry in zone_zsd, if the key is registered it is part of
753bd41d0anordmark * the zone_zsd list.
754bd41d0anordmark * Return an error if the key wasn't registerd.
7557c478bdstevel@tonic-gate */
7577c478bdstevel@tonic-gatezone_setspecific(zone_key_t key, zone_t *zone, const void *data)
7597c478bdstevel@tonic-gate	struct zsd_entry *t;
7617c478bdstevel@tonic-gate	mutex_enter(&zone->zone_lock);
762bd41d0anordmark	t = zsd_find_mru(&zone->zone_zsd, key);
7637c478bdstevel@tonic-gate	if (t != NULL) {
7647c478bdstevel@tonic-gate		/*
7657c478bdstevel@tonic-gate		 * Replace old value with new
7667c478bdstevel@tonic-gate		 */
7677c478bdstevel@tonic-gate		t->zsd_data = (void *)data;
7687c478bdstevel@tonic-gate		mutex_exit(&zone->zone_lock);
7697c478bdstevel@tonic-gate		return (0);
7707c478bdstevel@tonic-gate	}
7717c478bdstevel@tonic-gate	mutex_exit(&zone->zone_lock);
772bd41d0anordmark	return (-1);
7767c478bdstevel@tonic-gate * ZSD counterpart of pthread_getspecific().
7777c478bdstevel@tonic-gate */
7787c478bdstevel@tonic-gatevoid *
7797c478bdstevel@tonic-gatezone_getspecific(zone_key_t key, zone_t *zone)
7817c478bdstevel@tonic-gate	struct zsd_entry *t;
7827c478bdstevel@tonic-gate	void *data;
7847c478bdstevel@tonic-gate	mutex_enter(&zone->zone_lock);
785bd41d0anordmark	t = zsd_find_mru(&zone->zone_zsd, key);
7867c478bdstevel@tonic-gate	data = (t == NULL ? NULL : t->zsd_data);
7877c478bdstevel@tonic-gate	mutex_exit(&zone->zone_lock);
7887c478bdstevel@tonic-gate	return (data);
7927c478bdstevel@tonic-gate * Function used to initialize a zone's list of ZSD callbacks and data
7937c478bdstevel@tonic-gate * when the zone is being created.  The callbacks are initialized from
794bd41d0anordmark * the template list (zsd_registered_keys). The constructor callback is
795bd41d0anordmark * executed later (once the zone exists and with locks dropped).
7967c478bdstevel@tonic-gate */
7977c478bdstevel@tonic-gatestatic void
7987c478bdstevel@tonic-gatezone_zsd_configure(zone_t *zone)
8007c478bdstevel@tonic-gate	struct zsd_entry *zsdp;
8017c478bdstevel@tonic-gate	struct zsd_entry *t;
8037c478bdstevel@tonic-gate	ASSERT(MUTEX_HELD(&zonehash_lock));
8047c478bdstevel@tonic-gate	ASSERT(list_head(&zone->zone_zsd) == NULL);
805bd41d0anordmark	mutex_enter(&zone->zone_lock);
8067c478bdstevel@tonic-gate	mutex_enter(&zsd_key_lock);
8077c478bdstevel@tonic-gate	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
8087c478bdstevel@tonic-gate	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
809bd41d0anordmark		/*
810bd41d0anordmark		 * Since this zone is ZONE_IS_UNCONFIGURED, zone_key_create
811bd41d0anordmark		 * should not have added anything to it.
812bd41d0anordmark		 */
813bd41d0anordmark		ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);
815bd41d0anordmark		t = kmem_zalloc(sizeof (*t), KM_SLEEP);
816bd41d0anordmark		t->zsd_key = zsdp->zsd_key;
817bd41d0anordmark		t->zsd_create = zsdp->zsd_create;
818bd41d0anordmark		t->zsd_shutdown = zsdp->zsd_shutdown;
819bd41d0anordmark		t->zsd_destroy = zsdp->zsd_destroy;
8207c478bdstevel@tonic-gate		if (zsdp->zsd_create != NULL) {
821bd41d0anordmark			t->zsd_flags = ZSD_CREATE_NEEDED;
822bd41d0anordmark			DTRACE_PROBE2(zsd__create__needed,
823bd41d0anordmark			    zone_t *, zone, zone_key_t, zsdp->zsd_key);
8247c478bdstevel@tonic-gate		}
825bd41d0anordmark		list_insert_tail(&zone->zone_zsd, t);
8267c478bdstevel@tonic-gate	}
8277c478bdstevel@tonic-gate	mutex_exit(&zsd_key_lock);
828bd41d0anordmark	mutex_exit(&zone->zone_lock);
/* Which kind of ZSD callback is being requested. */
enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };

8347c478bdstevel@tonic-gate * Helper function to execute shutdown or destructor callbacks.
8357c478bdstevel@tonic-gate */
8367c478bdstevel@tonic-gatestatic void
8377c478bdstevel@tonic-gatezone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
8397c478bdstevel@tonic-gate	struct zsd_entry *t;
8417c478bdstevel@tonic-gate	ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
8427c478bdstevel@tonic-gate	ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
8437c478bdstevel@tonic-gate	ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);
8457c478bdstevel@tonic-gate	/*
846bd41d0anordmark	 * Run the callback solely based on what is registered for the zone
847bd41d0anordmark	 * in zone_zsd. The global list can change independently of this
848bd41d0anordmark	 * as keys are registered and unregistered and we don't register new
849bd41d0anordmark	 * callbacks for a zone that is in the process of going away.
8507c478bdstevel@tonic-gate	 */
851bd41d0anordmark	mutex_enter(&zone->zone_lock);
852bd41d0anordmark	for (t = list_head(&zone->zone_zsd); t != NULL;
853bd41d0anordmark	    t = list_next(&zone->zone_zsd, t)) {
854bd41d0anordmark		zone_key_t key = t->zsd_key;
8567c478bdstevel@tonic-gate		/* Skip if no callbacks registered */
858bd41d0anordmark		if (ct == ZSD_SHUTDOWN) {
859bd41d0anordmark			if (t->zsd_shutdown != NULL &&
860bd41d0anordmark			    (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
861bd41d0anordmark				t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
862bd41d0anordmark				DTRACE_PROBE2(zsd__shutdown__needed,
863bd41d0anordmark				    zone_t *, zone, zone_key_t, key);
8647c478bdstevel@tonic-gate			}
8657c478bdstevel@tonic-gate		} else {
866bd41d0anordmark			if (t->zsd_destroy != NULL &&
867bd41d0anordmark			    (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
868bd41d0anordmark				t->zsd_flags |= ZSD_DESTROY_NEEDED;
869bd41d0anordmark				DTRACE_PROBE2(zsd__destroy__needed,
870bd41d0anordmark				    zone_t *, zone, zone_key_t, key);
8717c478bdstevel@tonic-gate			}
8727c478bdstevel@tonic-gate		}
8737c478bdstevel@tonic-gate	}
874bd41d0anordmark	mutex_exit(&zone->zone_lock);
876bd41d0anordmark	/* Now call the shutdown and destroy callback for this key */
877bd41d0anordmark	zsd_apply_all_keys(zsd_apply_shutdown, zone);
878bd41d0anordmark	zsd_apply_all_keys(zsd_apply_destroy, zone);
8837c478bdstevel@tonic-gate * Called when the zone is going away; free ZSD-related memory, and
8847c478bdstevel@tonic-gate * destroy the zone_zsd list.
8857c478bdstevel@tonic-gate */
8867c478bdstevel@tonic-gatestatic void
8877c478bdstevel@tonic-gatezone_free_zsd(zone_t *zone)
8897c478bdstevel@tonic-gate	struct zsd_entry *t, *next;
8917c478bdstevel@tonic-gate	/*
8927c478bdstevel@tonic-gate	 * Free all the zsd_entry's we had on this zone.
8937c478bdstevel@tonic-gate	 */
894bd41d0anordmark	mutex_enter(&zone->zone_lock);
8957c478bdstevel@tonic-gate	for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
8967c478bdstevel@tonic-gate		next = list_next(&zone->zone_zsd, t);
8977c478bdstevel@tonic-gate		list_remove(&zone->zone_zsd, t);
898bd41d0anordmark		ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
8997c478bdstevel@tonic-gate		kmem_free(t, sizeof (*t));
9007c478bdstevel@tonic-gate	}
9017c478bdstevel@tonic-gate	list_destroy(&zone->zone_zsd);
902bd41d0anordmark	mutex_exit(&zone->zone_lock);
907bd41d0anordmark * Apply a function to all zones for particular key value.
908bd41d0anordmark *
909bd41d0anordmark * The applyfn has to drop zonehash_lock if it does some work, and
910bd41d0anordmark * then reacquire it before it returns.
911bd41d0anordmark * When the lock is dropped we don't follow list_next even
912bd41d0anordmark * if it is possible to do so without any hazards. This is
913bd41d0anordmark * because we want the design to allow for the list of zones
914bd41d0anordmark * to change in any arbitrary way during the time the
915bd41d0anordmark * lock was dropped.
916bd41d0anordmark *
917bd41d0anordmark * It is safe to restart the loop at list_head since the applyfn
918bd41d0anordmark * changes the zsd_flags as it does work, so a subsequent
919bd41d0anordmark * pass through will have no effect in applyfn, hence the loop will terminate
920bd41d0anordmark * in at worst O(N^2).
921bd41d0anordmark */
922bd41d0anordmarkstatic void
923bd41d0anordmarkzsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
925bd41d0anordmark	zone_t *zone;
927bd41d0anordmark	mutex_enter(&zonehash_lock);
928bd41d0anordmark	zone = list_head(&zone_active);
929bd41d0anordmark	while (zone != NULL) {
930bd41d0anordmark		if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
931bd41d0anordmark			/* Lock dropped - restart at head */
932bd41d0anordmark			zone = list_head(&zone_active);
933bd41d0anordmark		} else {
934bd41d0anordmark			zone = list_next(&zone_active, zone);
935bd41d0anordmark		}
936bd41d0anordmark	}
937bd41d0anordmark	mutex_exit(&zonehash_lock);
941bd41d0anordmark * Apply a function to all keys for a particular zone.
942bd41d0anordmark *
943bd41d0anordmark * The applyfn has to drop zonehash_lock if it does some work, and
944bd41d0anordmark * then reacquire it before it returns.
945bd41d0anordmark * When the lock is dropped we don't follow list_next even
946bd41d0anordmark * if it is possible to do so without any hazards. This is
947bd41d0anordmark * because we want the design to allow for the list of zsd callbacks
948bd41d0anordmark * to change in any arbitrary way during the time the
949bd41d0anordmark * lock was dropped.
950bd41d0anordmark *
951bd41d0anordmark * It is safe to restart the loop at list_head since the applyfn
952bd41d0anordmark * changes the zsd_flags as it does work, so a subsequent
953bd41d0anordmark * pass through will have no effect in applyfn, hence the loop will terminate
954bd41d0anordmark * in at worst O(N^2).
955bd41d0anordmark */
956bd41d0anordmarkstatic void
957bd41d0anordmarkzsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
959bd41d0anordmark	struct zsd_entry *t;
961bd41d0anordmark	mutex_enter(&zone->zone_lock);
962bd41d0anordmark	t = list_head(&zone->zone_zsd);
963bd41d0anordmark	while (t != NULL) {
964bd41d0anordmark		if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
965bd41d0anordmark			/* Lock dropped - restart at head */
966bd41d0anordmark			t = list_head(&zone->zone_zsd);
967bd41d0anordmark		} else {
968bd41d0anordmark			t = list_next(&zone->zone_zsd, t);
969bd41d0anordmark		}
970bd41d0anordmark	}
971bd41d0anordmark	mutex_exit(&zone->zone_lock);
975bd41d0anordmark * Call the create function for the zone and key if CREATE_NEEDED
976bd41d0anordmark * is set.
977bd41d0anordmark * If some other thread gets here first and sets CREATE_INPROGRESS, then
978bd41d0anordmark * we wait for that thread to complete so that we can ensure that
979bd41d0anordmark * all the callbacks are done when we've looped over all zones/keys.
980bd41d0anordmark *
981bd41d0anordmark * When we call the create function, we drop the global held by the
982bd41d0anordmark * caller, and return true to tell the caller it needs to re-evalute the
983bd41d0anordmark * state.
984bd41d0anordmark * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
985bd41d0anordmark * remains held on exit.
986bd41d0anordmark */
987bd41d0anordmarkstatic boolean_t
988bd41d0anordmarkzsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
989bd41d0anordmark    zone_t *zone, zone_key_t key)
991bd41d0anordmark	void *result;
992bd41d0anordmark	struct zsd_entry *t;
993bd41d0anordmark	boolean_t dropped;
995bd41d0anordmark	if (lockp != NULL) {
996bd41d0anordmark		ASSERT(MUTEX_HELD(lockp));
997bd41d0anordmark	}
998bd41d0anordmark	if (zone_lock_held) {
999bd41d0anordmark		ASSERT(MUTEX_HELD(&zone->zone_lock));
1000bd41d0anordmark	} else {
1001bd41d0anordmark		mutex_enter(&zone->zone_lock);
1002bd41d0anordmark	}
1004bd41d0anordmark	t = zsd_find(&zone->zone_zsd, key);
1005bd41d0anordmark	if (t == NULL) {
1006bd41d0anordmark		/*
1007bd41d0anordmark		 * Somebody else got here first e.g the zone going
1008bd41d0anordmark		 * away.
1009bd41d0anordmark		 */
1010bd41d0anordmark		if (!zone_lock_held)
1011bd41d0anordmark			mutex_exit(&zone->zone_lock);
1012bd41d0anordmark		return (B_FALSE);
1013bd41d0anordmark	}
1014bd41d0anordmark	dropped = B_FALSE;
1015bd41d0anordmark	if (zsd_wait_for_inprogress(zone, t, lockp))
1016bd41d0anordmark		dropped = B_TRUE;
1018bd41d0anordmark	if (t->zsd_flags & ZSD_CREATE_NEEDED) {
1019bd41d0anordmark		t->zsd_flags &= ~ZSD_CREATE_NEEDED;
1020bd41d0anordmark		t->zsd_flags |= ZSD_CREATE_INPROGRESS;
1021bd41d0anordmark		DTRACE_PROBE2(zsd__create__inprogress,
1022bd41d0anordmark		    zone_t *, zone, zone_key_t, key);
1023bd41d0anordmark		mutex_exit(&zone->zone_lock);
1024bd41d0anordmark		if (lockp != NULL)
1025bd41d0anordmark			mutex_exit(lockp);
1027bd41d0anordmark		dropped = B_TRUE;
1028bd41d0anordmark		ASSERT(t->zsd_create != NULL);
1029bd41d0anordmark		DTRACE_PROBE2(zsd__create__start,
1030bd41d0anordmark		    zone_t *, zone, zone_key_t, key);
1032bd41d0anordmark		result = (*t->zsd_create)(zone->zone_id);
1034bd41d0anordmark		DTRACE_PROBE2(zsd__create__end,
1035bd41d0anordmark		    zone_t *, zone, voidn *, result);
1037bd41d0anordmark		ASSERT(result != NULL);
1038bd41d0anordmark		if (lockp != NULL)
1039bd41d0anordmark			mutex_enter(lockp);
1040bd41d0anordmark		mutex_enter(&zone->zone_lock);
1041bd41d0anordmark		t->zsd_data = result;
1042bd41d0anordmark		t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
1043bd41d0anordmark		t->zsd_flags |= ZSD_CREATE_COMPLETED;
1044bd41d0anordmark		cv_broadcast(&t->zsd_cv);
1045bd41d0anordmark		DTRACE_PROBE2(zsd__create__completed,
1046bd41d0anordmark		    zone_t *, zone, zone_key_t, key);
1047bd41d0anordmark	}
1048bd41d0anordmark	if (!zone_lock_held)
1049bd41d0anordmark		mutex_exit(&zone->zone_lock);
1050bd41d0anordmark	return (dropped);
1054bd41d0anordmark * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
1055bd41d0anordmark * is set.
1056bd41d0anordmark * If some other thread gets here first and sets *_INPROGRESS, then
1057bd41d0anordmark * we wait for that thread to complete so that we can ensure that
1058bd41d0anordmark * all the callbacks are done when we've looped over all zones/keys.
1059bd41d0anordmark *
1060bd41d0anordmark * When we call the shutdown function, we drop the global held by the
1061bd41d0anordmark * caller, and return true to tell the caller it needs to re-evalute the
1062bd41d0anordmark * state.
1063bd41d0anordmark * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1064bd41d0anordmark * remains held on exit.
1065bd41d0anordmark */
1066bd41d0anordmarkstatic boolean_t
1067bd41d0anordmarkzsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
1068bd41d0anordmark    zone_t *zone, zone_key_t key)
1070bd41d0anordmark	struct zsd_entry *t;
1071bd41d0anordmark	void *data;
1072bd41d0anordmark	boolean_t dropped;
1074bd41d0anordmark	if (lockp != NULL) {
1075bd41d0anordmark		ASSERT(MUTEX_HELD(lockp));
1076bd41d0anordmark	}
1077bd41d0anordmark	if (zone_lock_held) {
1078bd41d0anordmark		ASSERT(MUTEX_HELD(&zone->zone_lock));
1079bd41d0anordmark	} else {
1080bd41d0anordmark		mutex_enter(&zone->zone_lock);
1081bd41d0anordmark	}
1083bd41d0anordmark	t = zsd_find(&zone->zone_zsd, key);
1084bd41d0anordmark	if (t == NULL) {
1085bd41d0anordmark		/*
1086bd41d0anordmark		 * Somebody else got here first e.g the zone going
1087bd41d0anordmark		 * away.
1088bd41d0anordmark		 */
1089bd41d0anordmark		if (!zone_lock_held)
1090bd41d0anordmark			mutex_exit(&zone->zone_lock);
1091bd41d0anordmark		return (B_FALSE);
1092bd41d0anordmark	}
1093bd41d0anordmark	dropped = B_FALSE;
1094bd41d0anordmark	if (zsd_wait_for_creator(zone, t, lockp))
1095bd41d0anordmark		dropped = B_TRUE;
1097bd41d0anordmark	if (zsd_wait_for_inprogress(zone, t, lockp))
1098bd41d0anordmark		dropped = B_TRUE;
1100bd41d0anordmark	if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
1101bd41d0anordmark		t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
1102bd41d0anordmark		t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
1103bd41d0anordmark		DTRACE_PROBE2(zsd__shutdown__inprogress,
1104bd41d0anordmark		    zone_t *, zone, zone_key_t, key);
1105bd41d0anordmark		mutex_exit(&zone->zone_lock);
1106bd41d0anordmark		if (lockp != NULL)
1107bd41d0anordmark			mutex_exit(lockp);
1108bd41d0anordmark		dropped = B_TRUE;
1110bd41d0anordmark		ASSERT(t->zsd_shutdown != NULL);
1111bd41d0anordmark		data = t->zsd_data;
1113bd41d0anordmark		DTRACE_PROBE2(zsd__shutdown__start,
1114bd41d0anordmark		    zone_t *, zone, zone_key_t, key);
1116bd41d0anordmark		(t->zsd_shutdown)(zone->zone_id, data);
1117bd41d0anordmark		DTRACE_PROBE2(zsd__shutdown__end,
1118bd41d0anordmark		    zone_t *, zone, zone_key_t, key);
1120bd41d0anordmark		if (lockp != NULL)
1121bd41d0anordmark			mutex_enter(lockp);
1122bd41d0anordmark		mutex_enter(&zone->zone_lock);
1123bd41d0anordmark		t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
1124bd41d0anordmark		t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
1125bd41d0anordmark		cv_broadcast(&t->zsd_cv);
1126bd41d0anordmark		DTRACE_PROBE2(zsd__shutdown__completed,
1127bd41d0anordmark		    zone_t *, zone, zone_key_t, key);
1128bd41d0anordmark	}
1129bd41d0anordmark	if (!zone_lock_held)
1130bd41d0anordmark		mutex_exit(&zone->zone_lock);
1131bd41d0anordmark	return (dropped);
1135bd41d0anordmark * Call the destroy function for the zone and key if DESTROY_NEEDED
1136bd41d0anordmark * is set.
1137bd41d0anordmark * If some other thread gets here first and sets *_INPROGRESS, then
1138bd41d0anordmark * we wait for that thread to complete so that we can ensure that
1139bd41d0anordmark * all the callbacks are done when we've looped over all zones/keys.
1140bd41d0anordmark *
1141bd41d0anordmark * When we call the destroy function, we drop the global held by the
1142bd41d0anordmark * caller, and return true to tell the caller it needs to re-evalute the
1143bd41d0anordmark * state.
1144bd41d0anordmark * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1145bd41d0anordmark * remains held on exit.
1146bd41d0anordmark */
1147bd41d0anordmarkstatic boolean_t
1148bd41d0anordmarkzsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
1149bd41d0anordmark    zone_t *zone, zone_key_t key)
1151bd41d0anordmark	struct zsd_entry *t;
1152bd41d0anordmark	void *data;
1153bd41d0anordmark	boolean_t dropped;
1155bd41d0anordmark	if (lockp != NULL) {
1156bd41d0anordmark		ASSERT(MUTEX_HELD(lockp));
1157bd41d0anordmark	}
1158bd41d0anordmark	if (zone_lock_held) {
1159bd41d0anordmark		ASSERT(MUTEX_HELD(&zone->zone_lock));
1160bd41d0anordmark	} else {
1161bd41d0anordmark		mutex_enter(&zone->zone_lock);
1162bd41d0anordmark	}
1164bd41d0anordmark	t = zsd_find(&zone->zone_zsd, key);
1165bd41d0anordmark	if (t == NULL) {
1166bd41d0anordmark		/*
1167bd41d0anordmark		 * Somebody else got here first e.g the zone going
1168bd41d0anordmark		 * away.
1169bd41d0anordmark		 */
1170bd41d0anordmark		if (!zone_lock_held)
1171bd41d0anordmark			mutex_exit(&zone->zone_lock);
1172bd41d0anordmark		return (B_FALSE);
1173bd41d0anordmark	}
1174bd41d0anordmark	dropped = B_FALSE;
1175bd41d0anordmark	if (zsd_wait_for_creator(zone, t, lockp))
1176bd41d0anordmark		dropped = B_TRUE;
1178bd41d0anordmark	if (zsd_wait_for_inprogress(zone, t, lockp))
1179bd41d0anordmark		dropped = B_TRUE;
1181bd41d0anordmark	if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
1182bd41d0anordmark		t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
1183bd41d0anordmark		t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
1184bd41d0anordmark		DTRACE_PROBE2(zsd__destroy__inprogress,
1185bd41d0anordmark		    zone_t *, zone, zone_key_t, key);
1186bd41d0anordmark		mutex_exit(&zone->zone_lock);
1187bd41d0anordmark		if (lockp != NULL)
1188bd41d0anordmark			mutex_exit(lockp);
1189bd41d0anordmark		dropped = B_TRUE;
1191bd41d0anordmark		ASSERT(t->zsd_destroy != NULL);
1192bd41d0anordmark		data = t->zsd_data;
1193bd41d0anordmark		DTRACE_PROBE2(zsd__destroy__start,
1194bd41d0anordmark		    zone_t *, zone, zone_key_t, key);
1196bd41d0anordmark		(t->zsd_destroy)(zone->zone_id, data);
1197bd41d0anordmark		DTRACE_PROBE2(zsd__destroy__end,
1198bd41d0anordmark		    zone_t *, zone, zone_key_t, key);
1200bd41d0anordmark		if (lockp != NULL)
1201bd41d0anordmark			mutex_enter(lockp);
1202bd41d0anordmark		mutex_enter(&zone->zone_lock);
1203bd41d0anordmark		t->zsd_data = NULL;
1204bd41d0anordmark		t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
1205bd41d0anordmark		t->zsd_flags |= ZSD_DESTROY_COMPLETED;
1206bd41d0anordmark		cv_broadcast(&t->zsd_cv);
1207bd41d0anordmark		DTRACE_PROBE2(zsd__destroy__completed,
1208bd41d0anordmark		    zone_t *, zone, zone_key_t, key);
1209bd41d0anordmark	}
1210bd41d0anordmark	if (!zone_lock_held)
1211bd41d0anordmark		mutex_exit(&zone->zone_lock);
1212bd41d0anordmark	return (dropped);
1216bd41d0anordmark * Wait for any CREATE_NEEDED flag to be cleared.
1217bd41d0anordmark * Returns true if lockp was temporarily dropped while waiting.
1218bd41d0anordmark */
1219bd41d0anordmarkstatic boolean_t
1220bd41d0anordmarkzsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1222bd41d0anordmark	boolean_t dropped = B_FALSE;
1224bd41d0anordmark	while (t->zsd_flags & ZSD_CREATE_NEEDED) {
1225bd41d0anordmark		DTRACE_PROBE2(zsd__wait__for__creator,
1226bd41d0anordmark		    zone_t *, zone, struct zsd_entry *, t);
1227bd41d0anordmark		if (lockp != NULL) {
1228bd41d0anordmark			dropped = B_TRUE;
1229bd41d0anordmark			mutex_exit(lockp);
1230bd41d0anordmark		}
1231bd41d0anordmark		cv_wait(&t->zsd_cv, &zone->zone_lock);
1232bd41d0anordmark		if (lockp != NULL) {
1233bd41d0anordmark			/* First drop zone_lock to preserve order */
1234bd41d0anordmark			mutex_exit(&zone->zone_lock);
1235bd41d0anordmark			mutex_enter(lockp);
1236bd41d0anordmark			mutex_enter(&zone->zone_lock);
1237bd41d0anordmark		}
1238bd41d0anordmark	}
1239bd41d0anordmark	return (dropped);
1243bd41d0anordmark * Wait for any INPROGRESS flag to be cleared.
1244bd41d0anordmark * Returns true if lockp was temporarily dropped while waiting.
1245bd41d0anordmark */
1246bd41d0anordmarkstatic boolean_t
1247bd41d0anordmarkzsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1249bd41d0anordmark	boolean_t dropped = B_FALSE;
1251bd41d0anordmark	while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
1252bd41d0anordmark		DTRACE_PROBE2(zsd__wait__for__inprogress,
1253bd41d0anordmark		    zone_t *, zone, struct zsd_entry *, t);
1254bd41d0anordmark		if (lockp != NULL) {
1255bd41d0anordmark			dropped = B_TRUE;
1256bd41d0anordmark			mutex_exit(lockp);
1257bd41d0anordmark		}
1258bd41d0anordmark		cv_wait(&t->zsd_cv, &zone->zone_lock);
1259bd41d0anordmark		if (lockp != NULL) {
1260bd41d0anordmark			/* First drop zone_lock to preserve order */
1261bd41d0anordmark			mutex_exit(&zone->zone_lock);
1262bd41d0anordmark			mutex_enter(lockp);
1263bd41d0anordmark			mutex_enter(&zone->zone_lock);
1264bd41d0anordmark		}
1265bd41d0anordmark	}
1266bd41d0anordmark	return (dropped);
1270fa9e406ahrens * Frees memory associated with the zone dataset list.
1271fa9e406ahrens */
1272fa9e406ahrensstatic void
1273fa9e406ahrenszone_free_datasets(zone_t *zone)
1275fa9e406ahrens	zone_dataset_t *t, *next;
1277fa9e406ahrens	for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
1278fa9e406ahrens		next = list_next(&zone->zone_datasets, t);
1279fa9e406ahrens		list_remove(&zone->zone_datasets, t);
1280fa9e406ahrens		kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
1281fa9e406ahrens		kmem_free(t, sizeof (*t));
1282fa9e406ahrens	}
1283fa9e406ahrens	list_destroy(&zone->zone_datasets);
12877c478bdstevel@tonic-gate * zone.cpu-shares resource control support.
12887c478bdstevel@tonic-gate */
12907c478bdstevel@tonic-gatestatic rctl_qty_t
12917c478bdstevel@tonic-gatezone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
12937c478bdstevel@tonic-gate	ASSERT(MUTEX_HELD(&p->p_lock));
12947c478bdstevel@tonic-gate	return (p->p_zone->zone_shares);
12987c478bdstevel@tonic-gatestatic int
12997c478bdstevel@tonic-gatezone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
13007c478bdstevel@tonic-gate    rctl_qty_t nv)
13027c478bdstevel@tonic-gate	ASSERT(MUTEX_HELD(&p->p_lock));
13037c478bdstevel@tonic-gate	ASSERT(e->rcep_t == RCENTITY_ZONE);
13047c478bdstevel@tonic-gate	if (e-> == NULL)
13057c478bdstevel@tonic-gate		return (0);
13077c478bdstevel@tonic-gate	e->>zone_shares = nv;
13087c478bdstevel@tonic-gate	return (0);
13117c478bdstevel@tonic-gatestatic rctl_ops_t zone_cpu_shares_ops = {
13127c478bdstevel@tonic-gate	rcop_no_action,
13137c478bdstevel@tonic-gate	zone_cpu_shares_usage,
13147c478bdstevel@tonic-gate	zone_cpu_shares_set,
13157c478bdstevel@tonic-gate	rcop_no_test
1319c97ad5cakolb * zone.cpu-cap resource control support.
1320c97ad5cakolb */
1322c97ad5cakolbstatic rctl_qty_t
1323c97ad5cakolbzone_cpu_cap_get(rctl_t *rctl, struct proc *p)
1325c97ad5cakolb	ASSERT(MUTEX_HELD(&p->p_lock));
1326c97ad5cakolb	return (cpucaps_zone_get(p->p_zone));
1330c97ad5cakolbstatic int
1331c97ad5cakolbzone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1332c97ad5cakolb    rctl_qty_t nv)
1334c97ad5cakolb	zone_t *zone = e->;
1336c97ad5cakolb	ASSERT(MUTEX_HELD(&p->p_lock));
1337c97ad5cakolb	ASSERT(e->rcep_t == RCENTITY_ZONE);
1339c97ad5cakolb	if (zone == NULL)
1340c97ad5cakolb		return (0);
1342c97ad5cakolb	/*
1343c97ad5cakolb	 * set cap to the new value.
1344c97ad5cakolb	 */
1345c97ad5cakolb	return (cpucaps_zone_set(zone, nv));
1348c97ad5cakolbstatic rctl_ops_t zone_cpu_cap_ops = {
1349c97ad5cakolb	rcop_no_action,
1350c97ad5cakolb	zone_cpu_cap_get,
1351c97ad5cakolb	zone_cpu_cap_set,
1352c97ad5cakolb	rcop_no_test
13567c478bdstevel@tonic-gatestatic rctl_qty_t
13577c478bdstevel@tonic-gatezone_lwps_usage(rctl_t *r, proc_t *p)
13597c478bdstevel@tonic-gate	rctl_qty_t nlwps;
13607c478bdstevel@tonic-gate	zone_t *zone = p->p_zone;
13627c478bdstevel@tonic-gate	ASSERT(MUTEX_HELD(&p->p_lock));
13647c478bdstevel@tonic-gate	mutex_enter(&zone->zone_nlwps_lock);
13657c478bdstevel@tonic-gate	nlwps = zone->zone_nlwps;
13667c478bdstevel@tonic-gate	mutex_exit(&zone->zone_nlwps_lock);
13687c478bdstevel@tonic-gate	return (nlwps);
13727c478bdstevel@tonic-gatestatic int
13737c478bdstevel@tonic-gatezone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
13747c478bdstevel@tonic-gate    rctl_qty_t incr, uint_t flags)
13767c478bdstevel@tonic-gate	rctl_qty_t nlwps;
13787c478bdstevel@tonic-gate	ASSERT(MUTEX_HELD(&p->p_lock));
13797c478bdstevel@tonic-gate	ASSERT(e->rcep_t == RCENTITY_ZONE);
13807c478bdstevel@tonic-gate	if (e-> == NULL)
13817c478bdstevel@tonic-gate		return (0);
13827c478bdstevel@tonic-gate	ASSERT(MUTEX_HELD(&(e->>zone_nlwps_lock)));
13837c478bdstevel@tonic-gate	nlwps = e->>zone_nlwps;
13857c478bdstevel@tonic-gate	if (nlwps + incr > rcntl->rcv_value)
13867c478bdstevel@tonic-gate		return (1);
13887c478bdstevel@tonic-gate	return (0);
13927c478bdstevel@tonic-gatestatic int
1393c693965slzone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
13957c478bdstevel@tonic-gate	ASSERT(MUTEX_HELD(&p->p_lock));
13967c478bdstevel@tonic-gate	ASSERT(e->rcep_t == RCENTITY_ZONE);
13977c478bdstevel@tonic-gate	if (e-> == NULL)
13987c478bdstevel@tonic-gate		return (0);
13997c478bdstevel@tonic-gate	e->>zone_nlwps_ctl = nv;
14007c478bdstevel@tonic-gate	return (0);
14037c478bdstevel@tonic-gatestatic rctl_ops_t zone_lwps_ops = {
14047c478bdstevel@tonic-gate	rcop_no_action,
14057c478bdstevel@tonic-gate	zone_lwps_usage,
14067c478bdstevel@tonic-gate	zone_lwps_set,
14077c478bdstevel@tonic-gate	zone_lwps_test,
1411ff19e02Menno Lagemanstatic rctl_qty_t
1412ff19e02Menno Lagemanzone_procs_usage(rctl_t *r, proc_t *p)
1413ff19e02Menno Lageman{
1414ff19e02Menno Lageman	rctl_qty_t nprocs;
1415ff19e02Menno Lageman	zone_t *zone = p->p_zone;
1416ff19e02Menno Lageman
1417ff19e02Menno Lageman	ASSERT(MUTEX_HELD(&p->p_lock));
1418ff19e02Menno Lageman
1419ff19e02Menno Lageman	mutex_enter(&zone->zone_nlwps_lock);
1420ff19e02Menno Lageman	nprocs = zone->zone_nprocs;
1421ff19e02Menno Lageman	mutex_exit(&zone->zone_nlwps_lock);
1422ff19e02Menno Lageman
1423ff19e02Menno Lageman	return (nprocs);
1424ff19e02Menno Lageman}
1425ff19e02Menno Lageman
1426ff19e02Menno Lageman/*ARGSUSED*/
1427ff19e02Menno Lagemanstatic int
1428ff19e02Menno Lagemanzone_procs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1429ff19e02Menno Lageman    rctl_qty_t incr, uint_t flags)
1430ff19e02Menno Lageman{
1431ff19e02Menno Lageman	rctl_qty_t nprocs;
1432ff19e02Menno Lageman
1433ff19e02Menno Lageman	ASSERT(MUTEX_HELD(&p->p_lock));
1434ff19e02Menno Lageman	ASSERT(e->rcep_t == RCENTITY_ZONE);
1435ff19e02Menno Lageman	if (e-> == NULL)
1436ff19e02Menno Lageman		return (0);
1437ff19e02Menno Lageman	ASSERT(MUTEX_HELD(&(e->>zone_nlwps_lock)));
1438ff19e02Menno Lageman	nprocs = e->>zone_nprocs;
1439ff19e02Menno Lageman
1440ff19e02Menno Lageman	if (nprocs + incr > rcntl->rcv_value)
1441ff19e02Menno Lageman		return (1);
1442ff19e02Menno Lageman
1443ff19e02Menno Lageman	return (0);
1444ff19e02Menno Lageman}
1445ff19e02Menno Lageman
1446ff19e02Menno Lageman/*ARGSUSED*/
1447ff19e02Menno Lagemanstatic int
1448ff19e02Menno Lagemanzone_procs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1449ff19e02Menno Lageman{
1450ff19e02Menno Lageman	ASSERT(MUTEX_HELD(&p->p_lock));
1451ff19e02Menno Lageman	ASSERT(e->rcep_t == RCENTITY_ZONE);
1452ff19e02Menno Lageman	if (e-> == NULL)
1453ff19e02Menno Lageman		return (0);
1454ff19e02Menno Lageman	e->>zone_nprocs_ctl = nv;
1455ff19e02Menno Lageman	return (0);
1456ff19e02Menno Lageman}
1457ff19e02Menno Lageman
1458ff19e02Menno Lagemanstatic rctl_ops_t zone_procs_ops = {
1459ff19e02Menno Lageman	rcop_no_action,
1460ff19e02Menno Lageman	zone_procs_usage,
1461ff19e02Menno Lageman	zone_procs_set,
1462ff19e02Menno Lageman	zone_procs_test,
1463ff19e02Menno Lageman};
1464ff19e02Menno Lageman
1465ff19e02Menno Lageman/*ARGSUSED*/
1466824c205mlstatic int
1467824c205mlzone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1468824c205ml    rctl_qty_t incr, uint_t flags)
1470824c205ml	rctl_qty_t v;
1471824c205ml	ASSERT(MUTEX_HELD(&p->p_lock));
1472824c205ml	ASSERT(e->rcep_t == RCENTITY_ZONE);
1473824c205ml	v = e->>zone_shmmax + incr;
1474824c205ml	if (v > rval->rcv_value)
1475824c205ml		return (1);
1476824c205ml	return (0);
1479824c205mlstatic rctl_ops_t zone_shmmax_ops = {
1480824c205ml	rcop_no_action,
1481824c205ml	rcop_no_usage,
1482824c205ml	rcop_no_set,
1483824c205ml	zone_shmmax_test
1487824c205mlstatic int
1488824c205mlzone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1489824c205ml    rctl_qty_t incr, uint_t flags)
1491824c205ml	rctl_qty_t v;
1492824c205ml	ASSERT(MUTEX_HELD(&p->p_lock));
1493824c205ml	ASSERT(e->rcep_t == RCENTITY_ZONE);
1494824c205ml	v = e->>zone_ipc.ipcq_shmmni + incr;
1495824c205ml	if (v > rval->rcv_value)
1496824c205ml		return (1);
1497824c205ml	return (0);
1500824c205mlstatic rctl_ops_t zone_shmmni_ops = {
1501824c205ml	rcop_no_action,
1502824c205ml	rcop_no_usage,
1503824c205ml	rcop_no_set,
1504824c205ml	zone_shmmni_test
1508824c205mlstatic int
1509824c205mlzone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1510824c205ml    rctl_qty_t incr, uint_t flags)
1512824c205ml	rctl_qty_t v;
1513824c205ml	ASSERT(MUTEX_HELD(&p->p_lock));
1514824c205ml	ASSERT(e->rcep_t == RCENTITY_ZONE);
1515824c205ml	v = e->>zone_ipc.ipcq_semmni + incr;
1516824c205ml	if (v > rval->rcv_value)
1517824c205ml		return (1);
1518824c205ml	return (0);
1521824c205mlstatic rctl_ops_t zone_semmni_ops = {
1522824c205ml	rcop_no_action,
1523824c205ml	rcop_no_usage,
1524824c205ml	rcop_no_set,
1525824c205ml	zone_semmni_test
1529824c205mlstatic int
1530824c205mlzone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1531824c205ml    rctl_qty_t incr, uint_t flags)
1533824c205ml	rctl_qty_t v;
1534824c205ml	ASSERT(MUTEX_HELD(&p->p_lock));
1535824c205ml	ASSERT(e->rcep_t == RCENTITY_ZONE);
1536824c205ml	v = e->>zone_ipc.ipcq_msgmni + incr;
1537824c205ml	if (v > rval->rcv_value)
1538824c205ml		return (1);
1539824c205ml	return (0);
1542824c205mlstatic rctl_ops_t zone_msgmni_ops = {
1543824c205ml	rcop_no_action,
1544824c205ml	rcop_no_usage,
1545824c205ml	rcop_no_set,
1546824c205ml	zone_msgmni_test
1550c693965slstatic rctl_qty_t
1551c693965slzone_locked_mem_usage(rctl_t *rctl, struct proc *p)
1553c693965sl	rctl_qty_t q;
1554c693965sl	ASSERT(MUTEX_HELD(&p->p_lock));
15550209230gjelinek	mutex_enter(&p->p_zone->zone_mem_lock);
1556c693965sl	q = p->p_zone->zone_locked_mem;
15570209230gjelinek	mutex_exit(&p->p_zone->zone_mem_lock);
1558c693965sl	return (q);
1562c693965slstatic int
1563c693965slzone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1564c693965sl    rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1566c693965sl	rctl_qty_t q;
15670209230gjelinek	zone_t *z;
15690209230gjelinek	z = e->;
1570c693965sl	ASSERT(MUTEX_HELD(&p->p_lock));
15710209230gjelinek	ASSERT(MUTEX_HELD(&z->zone_mem_lock));
15720209230gjelinek	q = z->zone_locked_mem;
1573c693965sl	if (q + incr > rcntl->rcv_value)
1574c693965sl		return (1);
1575c693965sl	return (0);
1579c693965slstatic int
1580c693965slzone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1581c693965sl    rctl_qty_t nv)
1583c693965sl	ASSERT(MUTEX_HELD(&p->p_lock));
1584c693965sl	ASSERT(e->rcep_t == RCENTITY_ZONE);
1585c693965sl	if (e-> == NULL)
1586c693965sl		return (0);
1587c693965sl	e->>zone_locked_mem_ctl = nv;
1588c693965sl	return (0);