1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25/*
26 * Copyright (c) 2017 by Delphix. All rights reserved.
27 */
28
29/*
30 * Contracts
31 * ---------
32 *
33 * Contracts are a primitive which enrich the relationships between
34 * processes and system resources.  The primary purpose of contracts is
35 * to provide a means for the system to negotiate the departure from a
36 * binding relationship (e.g. pages locked in memory or a thread bound
37 * to processor), but they can also be used as a purely asynchronous
38 * error reporting mechanism as they are with process contracts.
39 *
40 * More information on how one interfaces with contracts and what
41 * contracts can do for you can be found in:
42 *   PSARC 2003/193 Solaris Contracts
43 *   PSARC 2004/460 Contracts addendum
44 *
45 * This file contains the core contracts framework.  By itself it is
46 * useless: it depends the contracts filesystem (ctfs) to provide an
47 * interface to user processes and individual contract types to
48 * implement the process/resource relationships.
49 *
50 * Data structure overview
51 * -----------------------
52 *
53 * A contract is represented by a contract_t, which itself points to an
54 * encapsulating contract-type specific contract object.  A contract_t
55 * contains the contract's static identity (including its terms), its
56 * linkage to various bookkeeping structures, the contract-specific
57 * event queue, and a reference count.
58 *
59 * A contract template is represented by a ct_template_t, which, like a
60 * contract, points to an encapsulating contract-type specific template
61 * object.  A ct_template_t contains the template's terms.
62 *
63 * An event queue is represented by a ct_equeue_t, and consists of a
64 * list of events, a list of listeners, and a list of listeners who are
65 * waiting for new events (affectionately referred to as "tail
66 * listeners").  There are three queue types, defined by ct_listnum_t
67 * (an enum).  An event may be on one of each type of queue
68 * simultaneously; the list linkage used by a queue is determined by
69 * its type.
70 *
71 * An event is represented by a ct_kevent_t, which contains mostly
72 * static event data (e.g. id, payload).  It also has an array of
73 * ct_member_t structures, each of which contains a list_node_t and
74 * represent the event's linkage in a specific event queue.
75 *
76 * Each open of an event endpoint results in the creation of a new
77 * listener, represented by a ct_listener_t.  In addition to linkage
78 * into the aforementioned lists in the event_queue, a ct_listener_t
79 * contains a pointer to the ct_kevent_t it is currently positioned at
80 * as well as a set of status flags and other administrative data.
81 *
82 * Each process has a list of contracts it owns, p_ct_held; a pointer
83 * to the process contract it is a member of, p_ct_process; the linkage
84 * for that membership, p_ct_member; and an array of event queue
85 * structures representing the process bundle queues.
86 *
87 * Each LWP has an array of its active templates, lwp_ct_active; and
88 * the most recently created contracts, lwp_ct_latest.
89 *
90 * A process contract has a list of member processes and a list of
91 * inherited contracts.
92 *
93 * There is a system-wide list of all contracts, as well as per-type
94 * lists of contracts.
95 *
96 * Lock ordering overview
97 * ----------------------
98 *
99 * Locks at the top are taken first:
100 *
101 *                   ct_evtlock
102 *                   regent ct_lock
103 *                   member ct_lock
104 *                   pidlock
105 *                   p_lock
106 *    contract ctq_lock         contract_lock
107 *    pbundle ctq_lock
108 *    cte_lock
109 *                   ct_reflock
110 *
111 * contract_lock and ctq_lock/cte_lock are not currently taken at the
112 * same time.
113 *
114 * Reference counting and locking
115 * ------------------------------
116 *
117 * A contract has a reference count, protected by ct_reflock.
118 * (ct_reflock is also used in a couple other places where atomic
119 * access to a variable is needed in an innermost context).  A process
120 * maintains a hold on each contract it owns.  A process contract has a
121 * hold on each contract is has inherited.  Each event has a hold on
122 * the contract which generated it.  Process contract templates have
123 * holds on the contracts referred to by their transfer terms.  CTFS
124 * contract directory nodes have holds on contracts.  Lastly, various
125 * code paths may temporarily take holds on contracts to prevent them
126 * from disappearing while other processing is going on.  It is
127 * important to note that the global contract lists do not hold
128 * references on contracts; a contract is removed from these structures
129 * atomically with the release of its last reference.
130 *
131 * At a given point in time, a contract can either be owned by a
132 * process, inherited by a regent process contract, or orphaned.  A
133 * contract_t's  owner and regent pointers, ct_owner and ct_regent, are
134 * protected by its ct_lock.  The linkage in the holder's (holder =
135 * owner or regent) list of contracts, ct_ctlist, is protected by
136 * whatever lock protects the holder's data structure.  In order for
137 * these two directions to remain consistent, changing the holder of a
138 * contract requires that both locks be held.
139 *
140 * Events also have reference counts.  There is one hold on an event
141 * per queue it is present on, in addition to those needed for the
142 * usual sundry reasons.  Individual listeners are associated with
143 * specific queues, and increase a queue-specific reference count
144 * stored in the ct_member_t structure.
145 *
146 * The dynamic contents of an event (reference count and flags) are
147 * protected by its cte_lock, while the contents of the embedded
148 * ct_member_t structures are protected by the locks of the queues they
149 * are linked into.  A ct_listener_t's contents are also protected by
150 * its event queue's ctq_lock.
151 *
152 * Resource controls
153 * -----------------
154 *
155 * Control:      project.max-contracts (rc_project_contract)
156 * Description:  Maximum number of contracts allowed a project.
157 *
158 *   When a contract is created, the project's allocation is tested and
159 *   (assuming success) increased.  When the last reference to a
160 *   contract is released, the creating project's allocation is
161 *   decreased.
162 */
163
164#include <sys/mutex.h>
165#include <sys/debug.h>
166#include <sys/types.h>
167#include <sys/param.h>
168#include <sys/kmem.h>
169#include <sys/thread.h>
170#include <sys/id_space.h>
171#include <sys/avl.h>
172#include <sys/list.h>
173#include <sys/sysmacros.h>
174#include <sys/proc.h>
175#include <sys/ctfs.h>
176#include <sys/contract_impl.h>
177#include <sys/contract/process_impl.h>
178#include <sys/dditypes.h>
179#include <sys/contract/device_impl.h>
180#include <sys/systm.h>
181#include <sys/atomic.h>
182#include <sys/cmn_err.h>
183#include <sys/model.h>
184#include <sys/policy.h>
185#include <sys/zone.h>
186#include <sys/task.h>
187#include <sys/ddi.h>
188#include <sys/sunddi.h>
189
190extern rctl_hndl_t rc_project_contract;
191
192static id_space_t	*contract_ids;
193static avl_tree_t	contract_avl;
194static kmutex_t		contract_lock;
195
196int			ct_ntypes = CTT_MAXTYPE;
197static ct_type_t	*ct_types_static[CTT_MAXTYPE];
198ct_type_t		**ct_types = ct_types_static;
199int			ct_debug;
200
201static void cte_queue_create(ct_equeue_t *, ct_listnum_t, int, int);
202static void cte_queue_destroy(ct_equeue_t *);
203static void cte_queue_drain(ct_equeue_t *, int);
204static void cte_trim(ct_equeue_t *, contract_t *);
205static void cte_copy(ct_equeue_t *, ct_equeue_t *);
206
207/*
208 * contract_compar
209 *
210 * A contract comparator which sorts on contract ID.
211 */
212int
213contract_compar(const void *x, const void *y)
214{
215	const contract_t *ct1 = x;
216	const contract_t *ct2 = y;
217
218	if (ct1->ct_id < ct2->ct_id)
219		return (-1);
220	if (ct1->ct_id > ct2->ct_id)
221		return (1);
222	return (0);
223}
224
225/*
226 * contract_init
227 *
228 * Initializes the contract subsystem, the specific contract types, and
229 * process 0.
230 */
231void
232contract_init(void)
233{
234	/*
235	 * Initialize contract subsystem.
236	 */
237	contract_ids = id_space_create("contracts", 1, INT_MAX);
238	avl_create(&contract_avl, contract_compar, sizeof (contract_t),
239	    offsetof(contract_t, ct_ctavl));
240	mutex_init(&contract_lock, NULL, MUTEX_DEFAULT, NULL);
241
242	/*
243	 * Initialize contract types.
244	 */
245	contract_process_init();
246	contract_device_init();
247
248	/*
249	 * Initialize p0/lwp0 contract state.
250	 */
251	avl_create(&p0.p_ct_held, contract_compar, sizeof (contract_t),
252	    offsetof(contract_t, ct_ctlist));
253}
254
255/*
256 * contract_dtor
257 *
258 * Performs basic destruction of the common portions of a contract.
259 * Called from the failure path of contract_ctor and from
260 * contract_rele.
261 */
262static void
263contract_dtor(contract_t *ct)
264{
265	cte_queue_destroy(&ct->ct_events);
266	list_destroy(&ct->ct_vnodes);
267	mutex_destroy(&ct->ct_reflock);
268	mutex_destroy(&ct->ct_lock);
269	mutex_destroy(&ct->ct_evtlock);
270}
271
272/*
273 * contract_ctor
274 *
275 * Called by a contract type to initialize a contract.  Fails if the
276 * max-contract resource control would have been exceeded.  After a
277 * successful call to contract_ctor, the contract is unlocked and
278 * visible in all namespaces; any type-specific initialization should
279 * be completed before calling contract_ctor.  Returns 0 on success.
280 *
281 * Because not all callers can tolerate failure, a 0 value for canfail
282 * instructs contract_ctor to ignore the project.max-contracts resource
283 * control.  Obviously, this "out" should only be employed by callers
284 * who are sufficiently constrained in other ways (e.g. newproc).
285 */
286int
287contract_ctor(contract_t *ct, ct_type_t *type, ct_template_t *tmpl, void *data,
288    ctflags_t flags, proc_t *author, int canfail)
289{
290	avl_index_t where;
291	klwp_t *curlwp = ttolwp(curthread);
292
293	ASSERT(author == curproc);
294
295	mutex_init(&ct->ct_lock, NULL, MUTEX_DEFAULT, NULL);
296	mutex_init(&ct->ct_reflock, NULL, MUTEX_DEFAULT, NULL);
297	mutex_init(&ct->ct_evtlock, NULL, MUTEX_DEFAULT, NULL);
298	ct->ct_id = id_alloc(contract_ids);
299
300	cte_queue_create(&ct->ct_events, CTEL_CONTRACT, 20, 0);
301	list_create(&ct->ct_vnodes, sizeof (contract_vnode_t),
302	    offsetof(contract_vnode_t, ctv_node));
303
304	/*
305	 * Instance data
306	 */
307	ct->ct_ref = 2;		/* one for the holder, one for "latest" */
308	ct->ct_cuid = crgetuid(CRED());
309	ct->ct_type = type;
310	ct->ct_data = data;
311	gethrestime(&ct->ct_ctime);
312	ct->ct_state = CTS_OWNED;
313	ct->ct_flags = flags;
314	ct->ct_regent = author->p_ct_process ?
315	    &author->p_ct_process->conp_contract : NULL;
316	ct->ct_ev_info = tmpl->ctmpl_ev_info;
317	ct->ct_ev_crit = tmpl->ctmpl_ev_crit;
318	ct->ct_cookie = tmpl->ctmpl_cookie;
319	ct->ct_owner = author;
320	ct->ct_ntime.ctm_total = -1;
321	ct->ct_qtime.ctm_total = -1;
322	ct->ct_nevent = NULL;
323
324	/*
325	 * Test project.max-contracts.
326	 */
327	mutex_enter(&author->p_lock);
328	mutex_enter(&contract_lock);
329	if (canfail && rctl_test(rc_project_contract,
330	    author->p_task->tk_proj->kpj_rctls, author, 1,
331	    RCA_SAFE) & RCT_DENY) {
332		id_free(contract_ids, ct->ct_id);
333		mutex_exit(&contract_lock);
334		mutex_exit(&author->p_lock);
335		ct->ct_events.ctq_flags |= CTQ_DEAD;
336		contract_dtor(ct);
337		return (1);
338	}
339	ct->ct_proj = author->p_task->tk_proj;
340	ct->ct_proj->kpj_data.kpd_contract++;
341	(void) project_hold(ct->ct_proj);
342	mutex_exit(&contract_lock);
343
344	/*
345	 * Insert into holder's avl of contracts.
346	 * We use an avl not because order is important, but because
347	 * readdir of /proc/contracts requires we be able to use a
348	 * scalar as an index into the process's list of contracts
349	 */
350	ct->ct_zoneid = author->p_zone->zone_id;
351	ct->ct_czuniqid = ct->ct_mzuniqid = author->p_zone->zone_uniqid;
352	VERIFY(avl_find(&author->p_ct_held, ct, &where) == NULL);
353	avl_insert(&author->p_ct_held, ct, where);
354	mutex_exit(&author->p_lock);
355
356	/*
357	 * Insert into global contract AVL
358	 */
359	mutex_enter(&contract_lock);
360	VERIFY(avl_find(&contract_avl, ct, &where) == NULL);
361	avl_insert(&contract_avl, ct, where);
362	mutex_exit(&contract_lock);
363
364	/*
365	 * Insert into type AVL
366	 */
367	mutex_enter(&type->ct_type_lock);
368	VERIFY(avl_find(&type->ct_type_avl, ct, &where) == NULL);
369	avl_insert(&type->ct_type_avl, ct, where);
370	type->ct_type_timestruc = ct->ct_ctime;
371	mutex_exit(&type->ct_type_lock);
372
373	if (curlwp->lwp_ct_latest[type->ct_type_index])
374		contract_rele(curlwp->lwp_ct_latest[type->ct_type_index]);
375	curlwp->lwp_ct_latest[type->ct_type_index] = ct;
376
377	return (0);
378}
379
380/*
381 * contract_rele
382 *
383 * Releases a reference to a contract.  If the caller had the last
384 * reference, the contract is removed from all namespaces, its
385 * allocation against the max-contracts resource control is released,
386 * and the contract type's free entry point is invoked for any
387 * type-specific deconstruction and to (presumably) free the object.
388 */
389void
390contract_rele(contract_t *ct)
391{
392	uint64_t nref;
393
394	mutex_enter(&ct->ct_reflock);
395	ASSERT(ct->ct_ref > 0);
396	nref = --ct->ct_ref;
397	mutex_exit(&ct->ct_reflock);
398	if (nref == 0) {
399		/*
400		 * ct_owner is cleared when it drops its reference.
401		 */
402		ASSERT(ct->ct_owner == NULL);
403		ASSERT(ct->ct_evcnt == 0);
404
405		/*
406		 * Remove from global contract AVL
407		 */
408		mutex_enter(&contract_lock);
409		avl_remove(&contract_avl, ct);
410		mutex_exit(&contract_lock);
411
412		/*
413		 * Remove from type AVL
414		 */
415		mutex_enter(&ct->ct_type->ct_type_lock);
416		avl_remove(&ct->ct_type->ct_type_avl, ct);
417		mutex_exit(&ct->ct_type->ct_type_lock);
418
419		/*
420		 * Release the contract's ID
421		 */
422		id_free(contract_ids, ct->ct_id);
423
424		/*
425		 * Release project hold
426		 */
427		mutex_enter(&contract_lock);
428		ct->ct_proj->kpj_data.kpd_contract--;
429		project_rele(ct->ct_proj);
430		mutex_exit(&contract_lock);
431
432		/*
433		 * Free the contract
434		 */
435		contract_dtor(ct);
436		ct->ct_type->ct_type_ops->contop_free(ct);
437	}
438}
439
440/*
441 * contract_hold
442 *
443 * Adds a reference to a contract
444 */
445void
446contract_hold(contract_t *ct)
447{
448	mutex_enter(&ct->ct_reflock);
449	ASSERT(ct->ct_ref < UINT64_MAX);
450	ct->ct_ref++;
451	mutex_exit(&ct->ct_reflock);
452}
453
454/*
455 * contract_getzuniqid
456 *
457 * Get a contract's zone unique ID.  Needed because 64-bit reads and
458 * writes aren't atomic on x86.  Since there are contexts where we are
459 * unable to take ct_lock, we instead use ct_reflock; in actuality any
460 * lock would do.
461 */
462uint64_t
463contract_getzuniqid(contract_t *ct)
464{
465	uint64_t zuniqid;
466
467	mutex_enter(&ct->ct_reflock);
468	zuniqid = ct->ct_mzuniqid;
469	mutex_exit(&ct->ct_reflock);
470
471	return (zuniqid);
472}
473
474/*
475 * contract_setzuniqid
476 *
477 * Sets a contract's zone unique ID.   See contract_getzuniqid.
478 */
479void
480contract_setzuniqid(contract_t *ct, uint64_t zuniqid)
481{
482	mutex_enter(&ct->ct_reflock);
483	ct->ct_mzuniqid = zuniqid;
484	mutex_exit(&ct->ct_reflock);
485}
486
487/*
488 * contract_abandon
489 *
490 * Abandons the specified contract.  If "explicit" is clear, the
491 * contract was implicitly abandoned (by process exit) and should be
492 * inherited if its terms allow it and its owner was a member of a
493 * regent contract.  Otherwise, the contract type's abandon entry point
494 * is invoked to either destroy or orphan the contract.
495 */
496int
497contract_abandon(contract_t *ct, proc_t *p, int explicit)
498{
499	ct_equeue_t *q = NULL;
500	contract_t *parent = &p->p_ct_process->conp_contract;
501	int inherit = 0;
502
503	VERIFY(p == curproc);
504
505	mutex_enter(&ct->ct_lock);
506
507	/*
508	 * Multiple contract locks are taken contract -> subcontract.
509	 * Check if the contract will be inherited so we can acquire
510	 * all the necessary locks before making sensitive changes.
511	 */
512	if (!explicit && (ct->ct_flags & CTF_INHERIT) &&
513	    contract_process_accept(parent)) {
514		mutex_exit(&ct->ct_lock);
515		mutex_enter(&parent->ct_lock);
516		mutex_enter(&ct->ct_lock);
517		inherit = 1;
518	}
519
520	if (ct->ct_owner != p) {
521		mutex_exit(&ct->ct_lock);
522		if (inherit)
523			mutex_exit(&parent->ct_lock);
524		return (EINVAL);
525	}
526
527	mutex_enter(&p->p_lock);
528	if (explicit)
529		avl_remove(&p->p_ct_held, ct);
530	ct->ct_owner = NULL;
531	mutex_exit(&p->p_lock);
532
533	/*
534	 * Since we can't call cte_trim with the contract lock held,
535	 * we grab the queue pointer here.
536	 */
537	if (p->p_ct_equeue)
538		q = p->p_ct_equeue[ct->ct_type->ct_type_index];
539
540	/*
541	 * contop_abandon may destroy the contract so we rely on it to
542	 * drop ct_lock.  We retain a reference on the contract so that
543	 * the cte_trim which follows functions properly.  Even though
544	 * cte_trim doesn't dereference the contract pointer, it is
545	 * still necessary to retain a reference to the contract so
546	 * that we don't trim events which are sent by a subsequently
547	 * allocated contract infortuitously located at the same address.
548	 */
549	contract_hold(ct);
550
551	if (inherit) {
552		ct->ct_state = CTS_INHERITED;
553		VERIFY(ct->ct_regent == parent);
554		contract_process_take(parent, ct);
555
556		/*
557		 * We are handing off the process's reference to the
558		 * parent contract.  For this reason, the order in
559		 * which we drop the contract locks is also important.
560		 */
561		mutex_exit(&ct->ct_lock);
562		mutex_exit(&parent->ct_lock);
563	} else {
564		ct->ct_regent = NULL;
565		ct->ct_type->ct_type_ops->contop_abandon(ct);
566	}
567
568	/*
569	 * ct_lock has been dropped; we can safely trim the event
570	 * queue now.
571	 */
572	if (q) {
573		mutex_enter(&q->ctq_lock);
574		cte_trim(q, ct);
575		mutex_exit(&q->ctq_lock);
576	}
577
578	contract_rele(ct);
579
580	return (0);
581}
582
/*
 * contract_newct
 *
 * Dispatches to the contract type's "newct" entry point.
 */
int
contract_newct(contract_t *ct)
{
	return (ct->ct_type->ct_type_ops->contop_newct(ct));
}
588
589/*
590 * contract_adopt
591 *
592 * Adopts a contract.  After a successful call to this routine, the
593 * previously inherited contract will belong to the calling process,
594 * and its events will have been appended to its new owner's process
595 * bundle queue.
596 */
597int
598contract_adopt(contract_t *ct, proc_t *p)
599{
600	avl_index_t where;
601	ct_equeue_t *q;
602	contract_t *parent;
603
604	ASSERT(p == curproc);
605
606	/*
607	 * Ensure the process has an event queue.  Checked by ASSERTs
608	 * below.
609	 */
610	(void) contract_type_pbundle(ct->ct_type, p);
611
612	mutex_enter(&ct->ct_lock);
613	parent = ct->ct_regent;
614	if (ct->ct_state != CTS_INHERITED ||
615	    &p->p_ct_process->conp_contract != parent ||
616	    p->p_zone->zone_uniqid != ct->ct_czuniqid) {
617		mutex_exit(&ct->ct_lock);
618		return (EINVAL);
619	}
620
621	/*
622	 * Multiple contract locks are taken contract -> subcontract.
623	 */
624	mutex_exit(&ct->ct_lock);
625	mutex_enter(&parent->ct_lock);
626	mutex_enter(&ct->ct_lock);
627
628	/*
629	 * It is possible that the contract was adopted by someone else
630	 * while its lock was dropped.  It isn't possible for the
631	 * contract to have been inherited by a different regent
632	 * contract.
633	 */
634	if (ct->ct_state != CTS_INHERITED) {
635		mutex_exit(&parent->ct_lock);
636		mutex_exit(&ct->ct_lock);
637		return (EBUSY);
638	}
639	ASSERT(ct->ct_regent == parent);
640
641	ct->ct_state = CTS_OWNED;
642
643	contract_process_adopt(ct, p);
644
645	mutex_enter(&p->p_lock);
646	ct->ct_owner = p;
647	VERIFY(avl_find(&p->p_ct_held, ct, &where) == NULL);
648	avl_insert(&p->p_ct_held, ct, where);
649	mutex_exit(&p->p_lock);
650
651	ASSERT(ct->ct_owner->p_ct_equeue);
652	ASSERT(ct->ct_owner->p_ct_equeue[ct->ct_type->ct_type_index]);
653	q = ct->ct_owner->p_ct_equeue[ct->ct_type->ct_type_index];
654	cte_copy(&ct->ct_events, q);
655	mutex_exit(&ct->ct_lock);
656
657	return (0);
658}
659
660/*
661 * contract_ack
662 *
663 * Acknowledges receipt of a critical event.
664 */
665int
666contract_ack(contract_t *ct, uint64_t evid, int ack)
667{
668	ct_kevent_t *ev;
669	list_t *queue = &ct->ct_events.ctq_events;
670	int error = ESRCH;
671	int nego = 0;
672	uint_t evtype;
673
674	ASSERT(ack == CT_ACK || ack == CT_NACK);
675
676	mutex_enter(&ct->ct_lock);
677	mutex_enter(&ct->ct_events.ctq_lock);
678	/*
679	 * We are probably ACKing something near the head of the queue.
680	 */
681	for (ev = list_head(queue); ev; ev = list_next(queue, ev)) {
682		if (ev->cte_id == evid) {
683			if (ev->cte_flags & CTE_NEG)
684				nego = 1;
685			else if (ack == CT_NACK)
686				break;
687			if ((ev->cte_flags & (CTE_INFO | CTE_ACK)) == 0) {
688				ev->cte_flags |= CTE_ACK;
689				ct->ct_evcnt--;
690				evtype = ev->cte_type;
691				error = 0;
692			}
693			break;
694		}
695	}
696	mutex_exit(&ct->ct_events.ctq_lock);
697	mutex_exit(&ct->ct_lock);
698
699	/*
700	 * Not all critical events are negotiation events, however
701	 * every negotiation event is a critical event. NEGEND events
702	 * are critical events but are not negotiation events
703	 */
704	if (error || !nego)
705		return (error);
706
707	if (ack == CT_ACK)
708		error = ct->ct_type->ct_type_ops->contop_ack(ct, evtype, evid);
709	else
710		error = ct->ct_type->ct_type_ops->contop_nack(ct, evtype, evid);
711
712	return (error);
713}
714
/*
 * contract_ack_inval
 *
 * Stub "ack" entry point for contract types whose events cannot be
 * acknowledged; reaching it indicates a framework bug, so panic.
 */
/*ARGSUSED*/
int
contract_ack_inval(contract_t *ct, uint_t evtype, uint64_t evid)
{
	cmn_err(CE_PANIC, "contract_ack_inval: unsupported call: ctid: %u",
	    ct->ct_id);
	/* NOTREACHED */
	return (ENOSYS);
}
723
724/*ARGSUSED*/
725int
726contract_qack_inval(contract_t *ct, uint_t evtype, uint64_t evid)
727{
728	cmn_err(CE_PANIC, "contract_ack_inval: unsupported call: ctid: %u",
729	    ct->ct_id);
730	return (ENOSYS);
731}
732
/*
 * contract_qack_notsup
 *
 * Stub "qack" entry point for contract types which don't support
 * extending negotiation time quanta; ERANGE tells the caller the
 * extension isn't possible.
 */
/*ARGSUSED*/
int
contract_qack_notsup(contract_t *ct, uint_t evtype, uint64_t evid)
{
	return (ERANGE);
}
739
740/*
741 * contract_qack
742 *
743 * Asks that negotiations be extended by another time quantum
744 */
745int
746contract_qack(contract_t *ct, uint64_t evid)
747{
748	ct_kevent_t *ev;
749	list_t *queue = &ct->ct_events.ctq_events;
750	int nego = 0;
751	uint_t evtype;
752
753	mutex_enter(&ct->ct_lock);
754	mutex_enter(&ct->ct_events.ctq_lock);
755
756	for (ev = list_head(queue); ev; ev = list_next(queue, ev)) {
757		if (ev->cte_id == evid) {
758			if ((ev->cte_flags & (CTE_NEG | CTE_ACK)) == CTE_NEG) {
759				evtype = ev->cte_type;
760				nego = 1;
761			}
762			break;
763		}
764	}
765	mutex_exit(&ct->ct_events.ctq_lock);
766	mutex_exit(&ct->ct_lock);
767
768	/*
769	 * Only a negotiated event (which is by definition also a critical
770	 * event) which has not yet been acknowledged can provide
771	 * time quanta to a negotiating owner process.
772	 */
773	if (!nego)
774		return (ESRCH);
775
776	return (ct->ct_type->ct_type_ops->contop_qack(ct, evtype, evid));
777}
778
779/*
780 * contract_orphan
781 *
782 * Icky-poo.  This is a process-contract special, used to ACK all
783 * critical messages when a contract is orphaned.
784 */
785void
786contract_orphan(contract_t *ct)
787{
788	ct_kevent_t *ev;
789	list_t *queue = &ct->ct_events.ctq_events;
790
791	ASSERT(MUTEX_HELD(&ct->ct_lock));
792	ASSERT(ct->ct_state != CTS_ORPHAN);
793
794	mutex_enter(&ct->ct_events.ctq_lock);
795	ct->ct_state = CTS_ORPHAN;
796	for (ev = list_head(queue); ev; ev = list_next(queue, ev)) {
797		if ((ev->cte_flags & (CTE_INFO | CTE_ACK)) == 0) {
798			ev->cte_flags |= CTE_ACK;
799			ct->ct_evcnt--;
800		}
801	}
802	mutex_exit(&ct->ct_events.ctq_lock);
803
804	ASSERT(ct->ct_evcnt == 0);
805}
806
807/*
808 * contract_destroy
809 *
810 * Explicit contract destruction.  Called when contract is empty.
811 * The contract will actually stick around until all of its events are
812 * removed from the bundle and and process bundle queues, and all fds
813 * which refer to it are closed.  See contract_dtor if you are looking
814 * for what destroys the contract structure.
815 */
816void
817contract_destroy(contract_t *ct)
818{
819	ASSERT(MUTEX_HELD(&ct->ct_lock));
820	ASSERT(ct->ct_state != CTS_DEAD);
821	ASSERT(ct->ct_owner == NULL);
822
823	ct->ct_state = CTS_DEAD;
824	cte_queue_drain(&ct->ct_events, 1);
825	mutex_exit(&ct->ct_lock);
826	mutex_enter(&ct->ct_type->ct_type_events.ctq_lock);
827	cte_trim(&ct->ct_type->ct_type_events, ct);
828	mutex_exit(&ct->ct_type->ct_type_events.ctq_lock);
829	mutex_enter(&ct->ct_lock);
830	ct->ct_type->ct_type_ops->contop_destroy(ct);
831	mutex_exit(&ct->ct_lock);
832	contract_rele(ct);
833}
834
835/*
836 * contract_vnode_get
837 *
838 * Obtains the contract directory vnode for this contract, if there is
839 * one.  The caller must VN_RELE the vnode when they are through using
840 * it.
841 */
842vnode_t *
843contract_vnode_get(contract_t *ct, vfs_t *vfsp)
844{
845	contract_vnode_t *ctv;
846	vnode_t *vp = NULL;
847
848	mutex_enter(&ct->ct_lock);
849	for (ctv = list_head(&ct->ct_vnodes); ctv != NULL;
850	    ctv = list_next(&ct->ct_vnodes, ctv))
851		if (ctv->ctv_vnode->v_vfsp == vfsp) {
852			vp = ctv->ctv_vnode;
853			VN_HOLD(vp);
854			break;
855		}
856	mutex_exit(&ct->ct_lock);
857	return (vp);
858}
859
860/*
861 * contract_vnode_set
862 *
863 * Sets the contract directory vnode for this contract.  We don't hold
864 * a reference on the vnode because we don't want to prevent it from
865 * being freed.  The vnode's inactive entry point will take care of
866 * notifying us when it should be removed.
867 */
868void
869contract_vnode_set(contract_t *ct, contract_vnode_t *ctv, vnode_t *vnode)
870{
871	mutex_enter(&ct->ct_lock);
872	ctv->ctv_vnode = vnode;
873	list_insert_head(&ct->ct_vnodes, ctv);
874	mutex_exit(&ct->ct_lock);
875}
876
877/*
878 * contract_vnode_clear
879 *
880 * Removes this vnode as the contract directory vnode for this
881 * contract.  Called from a contract directory's inactive entry point,
882 * this may return 0 indicating that the vnode gained another reference
883 * because of a simultaneous call to contract_vnode_get.
884 */
885int
886contract_vnode_clear(contract_t *ct, contract_vnode_t *ctv)
887{
888	vnode_t *vp = ctv->ctv_vnode;
889	int result;
890
891	mutex_enter(&ct->ct_lock);
892	mutex_enter(&vp->v_lock);
893	if (vp->v_count == 1) {
894		list_remove(&ct->ct_vnodes, ctv);
895		result = 1;
896	} else {
897		VN_RELE_LOCKED(vp);
898		result = 0;
899	}
900	mutex_exit(&vp->v_lock);
901	mutex_exit(&ct->ct_lock);
902
903	return (result);
904}
905
906/*
907 * contract_exit
908 *
909 * Abandons all contracts held by process p, and drains process p's
910 * bundle queues.  Called on process exit.
911 */
912void
913contract_exit(proc_t *p)
914{
915	contract_t *ct;
916	void *cookie = NULL;
917	int i;
918
919	ASSERT(p == curproc);
920
921	/*
922	 * Abandon held contracts.  contract_abandon knows enough not
923	 * to remove the contract from the list a second time.  We are
924	 * exiting, so no locks are needed here.  But because
925	 * contract_abandon will take p_lock, we need to make sure we
926	 * aren't holding it.
927	 */
928	ASSERT(MUTEX_NOT_HELD(&p->p_lock));
929	while ((ct = avl_destroy_nodes(&p->p_ct_held, &cookie)) != NULL)
930		VERIFY(contract_abandon(ct, p, 0) == 0);
931
932	/*
933	 * Drain pbundles.  Because a process bundle queue could have
934	 * been passed to another process, they may not be freed right
935	 * away.
936	 */
937	if (p->p_ct_equeue) {
938		for (i = 0; i < CTT_MAXTYPE; i++)
939			if (p->p_ct_equeue[i])
940				cte_queue_drain(p->p_ct_equeue[i], 0);
941		kmem_free(p->p_ct_equeue, CTT_MAXTYPE * sizeof (ct_equeue_t *));
942	}
943}
944
945static int
946get_time_left(struct ct_time *t)
947{
948	clock_t ticks_elapsed;
949	int secs_elapsed;
950
951	if (t->ctm_total == -1)
952		return (-1);
953
954	ticks_elapsed = ddi_get_lbolt() - t->ctm_start;
955	secs_elapsed = t->ctm_total - (drv_hztousec(ticks_elapsed)/MICROSEC);
956	return (secs_elapsed > 0 ? secs_elapsed : 0);
957}
958
959/*
960 * contract_status_common
961 *
962 * Populates a ct_status structure.  Used by contract types in their
963 * status entry points and ctfs when only common information is
964 * requested.
965 */
966void
967contract_status_common(contract_t *ct, zone_t *zone, void *status,
968    model_t model)
969{
970	STRUCT_HANDLE(ct_status, lstatus);
971
972	STRUCT_SET_HANDLE(lstatus, model, status);
973	ASSERT(MUTEX_HELD(&ct->ct_lock));
974	if (zone->zone_uniqid == GLOBAL_ZONEUNIQID ||
975	    zone->zone_uniqid == ct->ct_czuniqid) {
976		zone_t *czone;
977		zoneid_t zoneid = -1;
978
979		/*
980		 * Contracts don't have holds on the zones they were
981		 * created by.  If the contract's zone no longer
982		 * exists, we say its zoneid is -1.
983		 */
984		if (zone->zone_uniqid == ct->ct_czuniqid ||
985		    ct->ct_czuniqid == GLOBAL_ZONEUNIQID) {
986			zoneid = ct->ct_zoneid;
987		} else if ((czone = zone_find_by_id(ct->ct_zoneid)) != NULL) {
988			if (czone->zone_uniqid == ct->ct_mzuniqid)
989				zoneid = ct->ct_zoneid;
990			zone_rele(czone);
991		}
992
993		STRUCT_FSET(lstatus, ctst_zoneid, zoneid);
994		STRUCT_FSET(lstatus, ctst_holder,
995		    (ct->ct_state == CTS_OWNED) ? ct->ct_owner->p_pid :
996		    (ct->ct_state == CTS_INHERITED) ? ct->ct_regent->ct_id : 0);
997		STRUCT_FSET(lstatus, ctst_state, ct->ct_state);
998	} else {
999		/*
1000		 * We are looking at a contract which was created by a
1001		 * process outside of our zone.  We provide fake zone,
1002		 * holder, and state information.
1003		 */
1004
1005		STRUCT_FSET(lstatus, ctst_zoneid, zone->zone_id);
1006		/*
1007		 * Since "zone" can't disappear until the calling ctfs
1008		 * is unmounted, zone_zsched must be valid.
1009		 */
1010		STRUCT_FSET(lstatus, ctst_holder, (ct->ct_state < CTS_ORPHAN) ?
1011		    zone->zone_zsched->p_pid : 0);
1012		STRUCT_FSET(lstatus, ctst_state, (ct->ct_state < CTS_ORPHAN) ?
1013		    CTS_OWNED : ct->ct_state);
1014	}
1015	STRUCT_FSET(lstatus, ctst_nevents, ct->ct_evcnt);
1016	STRUCT_FSET(lstatus, ctst_ntime, get_time_left(&ct->ct_ntime));
1017	STRUCT_FSET(lstatus, ctst_qtime, get_time_left(&ct->ct_qtime));
1018	STRUCT_FSET(lstatus, ctst_nevid,
1019	    ct->ct_nevent ? ct->ct_nevent->cte_id : 0);
1020	STRUCT_FSET(lstatus, ctst_critical, ct->ct_ev_crit);
1021	STRUCT_FSET(lstatus, ctst_informative, ct->ct_ev_info);
1022	STRUCT_FSET(lstatus, ctst_cookie, ct->ct_cookie);
1023	STRUCT_FSET(lstatus, ctst_type, ct->ct_type->ct_type_index);
1024	STRUCT_FSET(lstatus, ctst_id, ct->ct_id);
1025}
1026
1027/*
1028 * contract_checkcred
1029 *
1030 * Determines if the specified contract is owned by a process with the
1031 * same effective uid as the specified credential.  The caller must
1032 * ensure that the uid spaces are the same.  Returns 1 on success.
1033 */
1034static int
1035contract_checkcred(contract_t *ct, const cred_t *cr)
1036{
1037	proc_t *p;
1038	int fail = 1;
1039
1040	mutex_enter(&ct->ct_lock);
1041	if ((p = ct->ct_owner) != NULL) {
1042		mutex_enter(&p->p_crlock);
1043		fail = crgetuid(cr) != crgetuid(p->p_cred);
1044		mutex_exit(&p->p_crlock);
1045	}
1046	mutex_exit(&ct->ct_lock);
1047
1048	return (!fail);
1049}
1050
1051/*
1052 * contract_owned
1053 *
1054 * Determines if the specified credential can view an event generated
1055 * by the specified contract.  If locked is set, the contract's ct_lock
1056 * is held and the caller will need to do additional work to determine
1057 * if they truly can see the event.  Returns 1 on success.
1058 */
1059int
1060contract_owned(contract_t *ct, const cred_t *cr, int locked)
1061{
1062	int owner, cmatch, zmatch;
1063	uint64_t zuniqid, mzuniqid;
1064	uid_t euid;
1065
1066	ASSERT(locked || MUTEX_NOT_HELD(&ct->ct_lock));
1067
1068	zuniqid = curproc->p_zone->zone_uniqid;
1069	mzuniqid = contract_getzuniqid(ct);
1070	euid = crgetuid(cr);
1071
1072	/*
1073	 * owner: we own the contract
1074	 * cmatch: we are in the creator's (and holder's) zone and our
1075	 *   uid matches the creator's or holder's
1076	 * zmatch: we are in the effective zone of a contract created
1077	 *   in the global zone, and our uid matches that of the
1078	 *   virtualized holder's (zsched/kcred)
1079	 */
1080	owner = (ct->ct_owner == curproc);
1081	cmatch = (zuniqid == ct->ct_czuniqid) &&
1082	    ((ct->ct_cuid == euid) || (!locked && contract_checkcred(ct, cr)));
1083	zmatch = (ct->ct_czuniqid != mzuniqid) && (zuniqid == mzuniqid) &&
1084	    (crgetuid(kcred) == euid);
1085
1086	return (owner || cmatch || zmatch);
1087}
1088
1089
1090/*
1091 * contract_type_init
1092 *
1093 * Called by contract types to register themselves with the contracts
1094 * framework.
1095 */
1096ct_type_t *
1097contract_type_init(ct_typeid_t type, const char *name, contops_t *ops,
1098    ct_f_default_t *dfault)
1099{
1100	ct_type_t *result;
1101
1102	ASSERT(type < CTT_MAXTYPE);
1103
1104	result = kmem_alloc(sizeof (ct_type_t), KM_SLEEP);
1105
1106	mutex_init(&result->ct_type_lock, NULL, MUTEX_DEFAULT, NULL);
1107	avl_create(&result->ct_type_avl, contract_compar, sizeof (contract_t),
1108	    offsetof(contract_t, ct_cttavl));
1109	cte_queue_create(&result->ct_type_events, CTEL_BUNDLE, 20, 0);
1110	result->ct_type_name = name;
1111	result->ct_type_ops = ops;
1112	result->ct_type_default = dfault;
1113	result->ct_type_evid = 0;
1114	gethrestime(&result->ct_type_timestruc);
1115	result->ct_type_index = type;
1116
1117	ct_types[type] = result;
1118
1119	return (result);
1120}
1121
1122/*
1123 * contract_type_count
1124 *
1125 * Obtains the number of contracts of a particular type.
1126 */
1127int
1128contract_type_count(ct_type_t *type)
1129{
1130	ulong_t count;
1131
1132	mutex_enter(&type->ct_type_lock);
1133	count = avl_numnodes(&type->ct_type_avl);
1134	mutex_exit(&type->ct_type_lock);
1135
1136	return (count);
1137}
1138
1139/*
1140 * contract_type_max
1141 *
1142 * Obtains the maximum contract id of of a particular type.
1143 */
1144ctid_t
1145contract_type_max(ct_type_t *type)
1146{
1147	contract_t *ct;
1148	ctid_t res;
1149
1150	mutex_enter(&type->ct_type_lock);
1151	ct = avl_last(&type->ct_type_avl);
1152	res = ct ? ct->ct_id : -1;
1153	mutex_exit(&type->ct_type_lock);
1154
1155	return (res);
1156}
1157
1158/*
1159 * contract_max
1160 *
1161 * Obtains the maximum contract id.
1162 */
1163ctid_t
1164contract_max(void)
1165{
1166	contract_t *ct;
1167	ctid_t res;
1168
1169	mutex_enter(&contract_lock);
1170	ct = avl_last(&contract_avl);
1171	res = ct ? ct->ct_id : -1;
1172	mutex_exit(&contract_lock);
1173
1174	return (res);
1175}
1176
1177/*
1178 * contract_lookup_common
1179 *
1180 * Common code for contract_lookup and contract_type_lookup.  Takes a
1181 * pointer to an AVL tree to search in.  Should be called with the
1182 * appropriate tree-protecting lock held (unfortunately unassertable).
1183 */
1184static ctid_t
1185contract_lookup_common(avl_tree_t *tree, uint64_t zuniqid, ctid_t current)
1186{
1187	contract_t template, *ct;
1188	avl_index_t where;
1189	ctid_t res;
1190
1191	template.ct_id = current;
1192	ct = avl_find(tree, &template, &where);
1193	if (ct == NULL)
1194		ct = avl_nearest(tree, where, AVL_AFTER);
1195	if (zuniqid != GLOBAL_ZONEUNIQID)
1196		while (ct && (contract_getzuniqid(ct) != zuniqid))
1197			ct = AVL_NEXT(tree, ct);
1198	res = ct ? ct->ct_id : -1;
1199
1200	return (res);
1201}
1202
1203/*
1204 * contract_type_lookup
1205 *
1206 * Returns the next type contract after the specified id, visible from
1207 * the specified zone.
1208 */
1209ctid_t
1210contract_type_lookup(ct_type_t *type, uint64_t zuniqid, ctid_t current)
1211{
1212	ctid_t res;
1213
1214	mutex_enter(&type->ct_type_lock);
1215	res = contract_lookup_common(&type->ct_type_avl, zuniqid, current);
1216	mutex_exit(&type->ct_type_lock);
1217
1218	return (res);
1219}
1220
1221/*
1222 * contract_lookup
1223 *
1224 * Returns the next contract after the specified id, visible from the
1225 * specified zone.
1226 */
1227ctid_t
1228contract_lookup(uint64_t zuniqid, ctid_t current)
1229{
1230	ctid_t res;
1231
1232	mutex_enter(&contract_lock);
1233	res = contract_lookup_common(&contract_avl, zuniqid, current);
1234	mutex_exit(&contract_lock);
1235
1236	return (res);
1237}
1238
1239/*
1240 * contract_plookup
1241 *
1242 * Returns the next contract held by process p after the specified id,
1243 * visible from the specified zone.  Made complicated by the fact that
1244 * contracts visible in a zone but held by processes outside of the
1245 * zone need to appear as being held by zsched to zone members.
1246 */
1247ctid_t
1248contract_plookup(proc_t *p, ctid_t current, uint64_t zuniqid)
1249{
1250	contract_t template, *ct;
1251	avl_index_t where;
1252	ctid_t res;
1253
1254	template.ct_id = current;
1255	if (zuniqid != GLOBAL_ZONEUNIQID &&
1256	    (p->p_flag & (SSYS|SZONETOP)) == (SSYS|SZONETOP)) {
1257		/* This is inelegant. */
1258		mutex_enter(&contract_lock);
1259		ct = avl_find(&contract_avl, &template, &where);
1260		if (ct == NULL)
1261			ct = avl_nearest(&contract_avl, where, AVL_AFTER);
1262		while (ct && !(ct->ct_state < CTS_ORPHAN &&
1263		    contract_getzuniqid(ct) == zuniqid &&
1264		    ct->ct_czuniqid == GLOBAL_ZONEUNIQID))
1265			ct = AVL_NEXT(&contract_avl, ct);
1266		res = ct ? ct->ct_id : -1;
1267		mutex_exit(&contract_lock);
1268	} else {
1269		mutex_enter(&p->p_lock);
1270		ct = avl_find(&p->p_ct_held, &template, &where);
1271		if (ct == NULL)
1272			ct = avl_nearest(&p->p_ct_held, where, AVL_AFTER);
1273		res = ct ? ct->ct_id : -1;
1274		mutex_exit(&p->p_lock);
1275	}
1276
1277	return (res);
1278}
1279
1280/*
1281 * contract_ptr_common
1282 *
1283 * Common code for contract_ptr and contract_type_ptr.  Takes a pointer
1284 * to an AVL tree to search in.  Should be called with the appropriate
1285 * tree-protecting lock held (unfortunately unassertable).
1286 */
1287static contract_t *
1288contract_ptr_common(avl_tree_t *tree, ctid_t id, uint64_t zuniqid)
1289{
1290	contract_t template, *ct;
1291
1292	template.ct_id = id;
1293	ct = avl_find(tree, &template, NULL);
1294	if (ct == NULL || (zuniqid != GLOBAL_ZONEUNIQID &&
1295	    contract_getzuniqid(ct) != zuniqid)) {
1296		return (NULL);
1297	}
1298
1299	/*
1300	 * Check to see if a thread is in the window in contract_rele
1301	 * between dropping the reference count and removing the
1302	 * contract from the type AVL.
1303	 */
1304	mutex_enter(&ct->ct_reflock);
1305	if (ct->ct_ref) {
1306		ct->ct_ref++;
1307		mutex_exit(&ct->ct_reflock);
1308	} else {
1309		mutex_exit(&ct->ct_reflock);
1310		ct = NULL;
1311	}
1312
1313	return (ct);
1314}
1315
1316/*
1317 * contract_type_ptr
1318 *
1319 * Returns a pointer to the contract with the specified id.  The
1320 * contract is held, so the caller needs to release the reference when
1321 * it is through with the contract.
1322 */
1323contract_t *
1324contract_type_ptr(ct_type_t *type, ctid_t id, uint64_t zuniqid)
1325{
1326	contract_t *ct;
1327
1328	mutex_enter(&type->ct_type_lock);
1329	ct = contract_ptr_common(&type->ct_type_avl, id, zuniqid);
1330	mutex_exit(&type->ct_type_lock);
1331
1332	return (ct);
1333}
1334
1335/*
1336 * contract_ptr
1337 *
1338 * Returns a pointer to the contract with the specified id.  The
1339 * contract is held, so the caller needs to release the reference when
1340 * it is through with the contract.
1341 */
1342contract_t *
1343contract_ptr(ctid_t id, uint64_t zuniqid)
1344{
1345	contract_t *ct;
1346
1347	mutex_enter(&contract_lock);
1348	ct = contract_ptr_common(&contract_avl, id, zuniqid);
1349	mutex_exit(&contract_lock);
1350
1351	return (ct);
1352}
1353
1354/*
1355 * contract_type_time
1356 *
1357 * Obtains the last time a contract of a particular type was created.
1358 */
1359void
1360contract_type_time(ct_type_t *type, timestruc_t *time)
1361{
1362	mutex_enter(&type->ct_type_lock);
1363	*time = type->ct_type_timestruc;
1364	mutex_exit(&type->ct_type_lock);
1365}
1366
1367/*
1368 * contract_type_bundle
1369 *
1370 * Obtains a type's bundle queue.
1371 */
1372ct_equeue_t *
1373contract_type_bundle(ct_type_t *type)
1374{
1375	return (&type->ct_type_events);
1376}
1377
1378/*
1379 * contract_type_pbundle
1380 *
1381 * Obtain's a process's bundle queue.  If one doesn't exist, one is
1382 * created.  Often used simply to ensure that a bundle queue is
1383 * allocated.
1384 */
1385ct_equeue_t *
1386contract_type_pbundle(ct_type_t *type, proc_t *pp)
1387{
1388	/*
1389	 * If there isn't an array of bundle queues, allocate one.
1390	 */
1391	if (pp->p_ct_equeue == NULL) {
1392		size_t size = CTT_MAXTYPE * sizeof (ct_equeue_t *);
1393		ct_equeue_t **qa = kmem_zalloc(size, KM_SLEEP);
1394
1395		mutex_enter(&pp->p_lock);
1396		if (pp->p_ct_equeue)
1397			kmem_free(qa, size);
1398		else
1399			pp->p_ct_equeue = qa;
1400		mutex_exit(&pp->p_lock);
1401	}
1402
1403	/*
1404	 * If there isn't a bundle queue of the required type, allocate
1405	 * one.
1406	 */
1407	if (pp->p_ct_equeue[type->ct_type_index] == NULL) {
1408		ct_equeue_t *q = kmem_zalloc(sizeof (ct_equeue_t), KM_SLEEP);
1409		cte_queue_create(q, CTEL_PBUNDLE, 20, 1);
1410
1411		mutex_enter(&pp->p_lock);
1412		if (pp->p_ct_equeue[type->ct_type_index])
1413			cte_queue_drain(q, 0);
1414		else
1415			pp->p_ct_equeue[type->ct_type_index] = q;
1416		mutex_exit(&pp->p_lock);
1417	}
1418
1419	return (pp->p_ct_equeue[type->ct_type_index]);
1420}
1421
1422/*
1423 * ctparam_copyin
1424 *
1425 * copyin a ct_param_t for CT_TSET or CT_TGET commands.
1426 * If ctparam_copyout() is not called after ctparam_copyin(), then
1427 * the caller must kmem_free() the buffer pointed by kparam->ctpm_kbuf.
1428 *
1429 * The copyin/out of ct_param_t is not done in ctmpl_set() and ctmpl_get()
1430 * because prctioctl() calls ctmpl_set() and ctmpl_get() while holding a
1431 * process lock.
1432 */
1433int
1434ctparam_copyin(const void *uaddr, ct_kparam_t *kparam, int flag, int cmd)
1435{
1436	uint32_t size;
1437	void *ubuf;
1438	ct_param_t *param = &kparam->param;
1439	STRUCT_DECL(ct_param, uarg);
1440
1441	STRUCT_INIT(uarg, flag);
1442	if (copyin(uaddr, STRUCT_BUF(uarg), STRUCT_SIZE(uarg)))
1443		return (EFAULT);
1444	size = STRUCT_FGET(uarg, ctpm_size);
1445	ubuf = STRUCT_FGETP(uarg, ctpm_value);
1446
1447	if (size > CT_PARAM_MAX_SIZE || size == 0)
1448		return (EINVAL);
1449
1450	kparam->ctpm_kbuf = kmem_alloc(size, KM_SLEEP);
1451	if (cmd == CT_TSET) {
1452		if (copyin(ubuf, kparam->ctpm_kbuf, size)) {
1453			kmem_free(kparam->ctpm_kbuf, size);
1454			return (EFAULT);
1455		}
1456	}
1457	param->ctpm_id = STRUCT_FGET(uarg, ctpm_id);
1458	param->ctpm_size = size;
1459	param->ctpm_value = ubuf;
1460	kparam->ret_size = 0;
1461
1462	return (0);
1463}
1464
1465/*
1466 * ctparam_copyout
1467 *
1468 * copyout a ct_kparam_t and frees the buffer pointed by the member
1469 * ctpm_kbuf of ct_kparam_t
1470 */
1471int
1472ctparam_copyout(ct_kparam_t *kparam, void *uaddr, int flag)
1473{
1474	int r = 0;
1475	ct_param_t *param = &kparam->param;
1476	STRUCT_DECL(ct_param, uarg);
1477
1478	STRUCT_INIT(uarg, flag);
1479
1480	STRUCT_FSET(uarg, ctpm_id, param->ctpm_id);
1481	STRUCT_FSET(uarg, ctpm_size, kparam->ret_size);
1482	STRUCT_FSETP(uarg, ctpm_value, param->ctpm_value);
1483	if (copyout(STRUCT_BUF(uarg), uaddr, STRUCT_SIZE(uarg))) {
1484		r = EFAULT;
1485		goto error;
1486	}
1487	if (copyout(kparam->ctpm_kbuf, param->ctpm_value,
1488	    MIN(kparam->ret_size, param->ctpm_size))) {
1489		r = EFAULT;
1490	}
1491
1492error:
1493	kmem_free(kparam->ctpm_kbuf, param->ctpm_size);
1494
1495	return (r);
1496}
1497
1498/*
1499 * ctmpl_free
1500 *
1501 * Frees a template.
1502 */
1503void
1504ctmpl_free(ct_template_t *template)
1505{
1506	mutex_destroy(&template->ctmpl_lock);
1507	template->ctmpl_ops->ctop_free(template);
1508}
1509
1510/*
1511 * ctmpl_dup
1512 *
1513 * Creates a copy of a template.
1514 */
1515ct_template_t *
1516ctmpl_dup(ct_template_t *template)
1517{
1518	ct_template_t *new;
1519
1520	if (template == NULL)
1521		return (NULL);
1522
1523	new = template->ctmpl_ops->ctop_dup(template);
1524	/*
1525	 * ctmpl_lock was taken by ctop_dup's call to ctmpl_copy and
1526	 * should have remain held until now.
1527	 */
1528	mutex_exit(&template->ctmpl_lock);
1529
1530	return (new);
1531}
1532
1533/*
1534 * ctmpl_set
1535 *
1536 * Sets the requested terms of a template.
1537 */
1538int
1539ctmpl_set(ct_template_t *template, ct_kparam_t *kparam, const cred_t *cr)
1540{
1541	int result = 0;
1542	ct_param_t *param = &kparam->param;
1543	uint64_t param_value;
1544
1545	param_value = 0;
1546	if (param->ctpm_id == CTP_COOKIE ||
1547	    param->ctpm_id == CTP_EV_INFO ||
1548	    param->ctpm_id == CTP_EV_CRITICAL) {
1549		if (param->ctpm_size < sizeof (uint64_t)) {
1550			return (EINVAL);
1551		} else {
1552			param_value = *(uint64_t *)kparam->ctpm_kbuf;
1553		}
1554	}
1555
1556	mutex_enter(&template->ctmpl_lock);
1557	switch (param->ctpm_id) {
1558	case CTP_COOKIE:
1559		template->ctmpl_cookie = param_value;
1560		break;
1561	case CTP_EV_INFO:
1562		if (param_value & ~(uint64_t)template->ctmpl_ops->allevents)
1563			result = EINVAL;
1564		else
1565			template->ctmpl_ev_info = param_value;
1566		break;
1567	case CTP_EV_CRITICAL:
1568		if (param_value & ~(uint64_t)template->ctmpl_ops->allevents) {
1569			result = EINVAL;
1570			break;
1571		} else if ((~template->ctmpl_ev_crit & param_value) == 0) {
1572			/*
1573			 * Assume that a pure reduction of the critical
1574			 * set is allowed by the contract type.
1575			 */
1576			template->ctmpl_ev_crit = param_value;
1577			break;
1578		}
1579		/*
1580		 * There may be restrictions on what we can make
1581		 * critical, so we defer to the judgement of the
1582		 * contract type.
1583		 */
1584		/* FALLTHROUGH */
1585	default:
1586		result = template->ctmpl_ops->ctop_set(template, kparam, cr);
1587	}
1588	mutex_exit(&template->ctmpl_lock);
1589
1590	return (result);
1591}
1592
1593/*
1594 * ctmpl_get
1595 *
1596 * Obtains the requested terms from a template.
1597 *
1598 * If the term requested is a variable-sized term and the buffer
1599 * provided is too small for the data, we truncate the data and return
1600 * the buffer size necessary to fit the term in kparam->ret_size. If the
1601 * term requested is fix-sized (uint64_t) and the buffer provided is too
1602 * small, we return EINVAL.  This should never happen if you're using
1603 * libcontract(3LIB), only if you call ioctl with a hand constructed
1604 * ct_param_t argument.
1605 *
1606 * Currently, only contract specific parameters have variable-sized
1607 * parameters.
1608 */
1609int
1610ctmpl_get(ct_template_t *template, ct_kparam_t *kparam)
1611{
1612	int result = 0;
1613	ct_param_t *param = &kparam->param;
1614	uint64_t *param_value;
1615
1616	param_value = NULL;
1617	if (param->ctpm_id == CTP_COOKIE ||
1618	    param->ctpm_id == CTP_EV_INFO ||
1619	    param->ctpm_id == CTP_EV_CRITICAL) {
1620		if (param->ctpm_size < sizeof (uint64_t)) {
1621			return (EINVAL);
1622		} else {
1623			param_value = kparam->ctpm_kbuf;
1624			kparam->ret_size = sizeof (uint64_t);
1625		}
1626	}
1627
1628	mutex_enter(&template->ctmpl_lock);
1629	switch (param->ctpm_id) {
1630	case CTP_COOKIE:
1631		if (param_value != NULL)
1632			*param_value = template->ctmpl_cookie;
1633		break;
1634	case CTP_EV_INFO:
1635		if (param_value != NULL)
1636			*param_value = template->ctmpl_ev_info;
1637		break;
1638	case CTP_EV_CRITICAL:
1639		if (param_value != NULL)
1640			*param_value = template->ctmpl_ev_crit;
1641		break;
1642	default:
1643		result = template->ctmpl_ops->ctop_get(template, kparam);
1644	}
1645	mutex_exit(&template->ctmpl_lock);
1646
1647	return (result);
1648}
1649
1650/*
1651 * ctmpl_makecurrent
1652 *
1653 * Used by ctmpl_activate and ctmpl_clear to set the current thread's
1654 * active template.  Frees the old active template, if there was one.
1655 */
1656static void
1657ctmpl_makecurrent(ct_template_t *template, ct_template_t *new)
1658{
1659	klwp_t *curlwp = ttolwp(curthread);
1660	proc_t *p = curproc;
1661	ct_template_t *old;
1662
1663	mutex_enter(&p->p_lock);
1664	old = curlwp->lwp_ct_active[template->ctmpl_type->ct_type_index];
1665	curlwp->lwp_ct_active[template->ctmpl_type->ct_type_index] = new;
1666	mutex_exit(&p->p_lock);
1667
1668	if (old)
1669		ctmpl_free(old);
1670}
1671
1672/*
1673 * ctmpl_activate
1674 *
1675 * Copy the specified template as the current thread's activate
1676 * template of that type.
1677 */
1678void
1679ctmpl_activate(ct_template_t *template)
1680{
1681	ctmpl_makecurrent(template, ctmpl_dup(template));
1682}
1683
1684/*
1685 * ctmpl_clear
1686 *
1687 * Clears the current thread's activate template of the same type as
1688 * the specified template.
1689 */
1690void
1691ctmpl_clear(ct_template_t *template)
1692{
1693	ctmpl_makecurrent(template, NULL);
1694}
1695
1696/*
1697 * ctmpl_create
1698 *
1699 * Creates a new contract using the specified template.
1700 */
1701int
1702ctmpl_create(ct_template_t *template, ctid_t *ctidp)
1703{
1704	return (template->ctmpl_ops->ctop_create(template, ctidp));
1705}
1706
1707/*
1708 * ctmpl_init
1709 *
1710 * Initializes the common portion of a new contract template.
1711 */
1712void
1713ctmpl_init(ct_template_t *new, ctmplops_t *ops, ct_type_t *type, void *data)
1714{
1715	mutex_init(&new->ctmpl_lock, NULL, MUTEX_DEFAULT, NULL);
1716	new->ctmpl_ops = ops;
1717	new->ctmpl_type = type;
1718	new->ctmpl_data = data;
1719	new->ctmpl_ev_info = new->ctmpl_ev_crit = 0;
1720	new->ctmpl_cookie = 0;
1721}
1722
1723/*
1724 * ctmpl_copy
1725 *
1726 * Copies the common portions of a contract template.  Intended for use
1727 * by a contract type's ctop_dup template op.  Returns with the old
1728 * template's lock held, which will should remain held until the
1729 * template op returns (it is dropped by ctmpl_dup).
1730 */
1731void
1732ctmpl_copy(ct_template_t *new, ct_template_t *old)
1733{
1734	mutex_init(&new->ctmpl_lock, NULL, MUTEX_DEFAULT, NULL);
1735	mutex_enter(&old->ctmpl_lock);
1736	new->ctmpl_ops = old->ctmpl_ops;
1737	new->ctmpl_type = old->ctmpl_type;
1738	new->ctmpl_ev_crit = old->ctmpl_ev_crit;
1739	new->ctmpl_ev_info = old->ctmpl_ev_info;
1740	new->ctmpl_cookie = old->ctmpl_cookie;
1741}
1742
1743/*
1744 * ctmpl_create_inval
1745 *
1746 * Returns EINVAL.  Provided for the convenience of those contract
1747 * types which don't support ct_tmpl_create(3contract) and would
1748 * otherwise need to create their own stub for the ctop_create template
1749 * op.
1750 */
1751/*ARGSUSED*/
1752int
1753ctmpl_create_inval(ct_template_t *template, ctid_t *ctidp)
1754{
1755	return (EINVAL);
1756}
1757
1758
1759/*
1760 * cte_queue_create
1761 *
1762 * Initializes a queue of a particular type.  If dynamic is set, the
1763 * queue is to be freed when its last listener is removed after being
1764 * drained.
1765 */
1766static void
1767cte_queue_create(ct_equeue_t *q, ct_listnum_t list, int maxinf, int dynamic)
1768{
1769	mutex_init(&q->ctq_lock, NULL, MUTEX_DEFAULT, NULL);
1770	q->ctq_listno = list;
1771	list_create(&q->ctq_events, sizeof (ct_kevent_t),
1772	    offsetof(ct_kevent_t, cte_nodes[list].ctm_node));
1773	list_create(&q->ctq_listeners, sizeof (ct_listener_t),
1774	    offsetof(ct_listener_t, ctl_allnode));
1775	list_create(&q->ctq_tail, sizeof (ct_listener_t),
1776	    offsetof(ct_listener_t, ctl_tailnode));
1777	gethrestime(&q->ctq_atime);
1778	q->ctq_nlisteners = 0;
1779	q->ctq_nreliable = 0;
1780	q->ctq_ninf = 0;
1781	q->ctq_max = maxinf;
1782
1783	/*
1784	 * Bundle queues and contract queues are embedded in other
1785	 * structures and are implicitly referenced counted by virtue
1786	 * of their vnodes' indirect hold on their contracts.  Process
1787	 * bundle queues are dynamically allocated and may persist
1788	 * after the death of the process, so they must be explicitly
1789	 * reference counted.
1790	 */
1791	q->ctq_flags = dynamic ? CTQ_REFFED : 0;
1792}
1793
1794/*
1795 * cte_queue_destroy
1796 *
1797 * Destroys the specified queue.  The queue is freed if referenced
1798 * counted.
1799 */
1800static void
1801cte_queue_destroy(ct_equeue_t *q)
1802{
1803	ASSERT(q->ctq_flags & CTQ_DEAD);
1804	ASSERT(q->ctq_nlisteners == 0);
1805	ASSERT(q->ctq_nreliable == 0);
1806	list_destroy(&q->ctq_events);
1807	list_destroy(&q->ctq_listeners);
1808	list_destroy(&q->ctq_tail);
1809	mutex_destroy(&q->ctq_lock);
1810	if (q->ctq_flags & CTQ_REFFED)
1811		kmem_free(q, sizeof (ct_equeue_t));
1812}
1813
1814/*
1815 * cte_hold
1816 *
1817 * Takes a hold on the specified event.
1818 */
1819static void
1820cte_hold(ct_kevent_t *e)
1821{
1822	mutex_enter(&e->cte_lock);
1823	ASSERT(e->cte_refs > 0);
1824	e->cte_refs++;
1825	mutex_exit(&e->cte_lock);
1826}
1827
1828/*
1829 * cte_rele
1830 *
1831 * Releases a hold on the specified event.  If the caller had the last
1832 * reference, frees the event and releases its hold on the contract
1833 * that generated it.
1834 */
1835static void
1836cte_rele(ct_kevent_t *e)
1837{
1838	mutex_enter(&e->cte_lock);
1839	ASSERT(e->cte_refs > 0);
1840	if (--e->cte_refs) {
1841		mutex_exit(&e->cte_lock);
1842		return;
1843	}
1844
1845	contract_rele(e->cte_contract);
1846
1847	mutex_destroy(&e->cte_lock);
1848	nvlist_free(e->cte_data);
1849	nvlist_free(e->cte_gdata);
1850	kmem_free(e, sizeof (ct_kevent_t));
1851}
1852
1853/*
1854 * cte_qrele
1855 *
1856 * Remove this listener's hold on the specified event, removing and
1857 * releasing the queue's hold on the event if appropriate.
1858 */
1859static void
1860cte_qrele(ct_equeue_t *q, ct_listener_t *l, ct_kevent_t *e)
1861{
1862	ct_member_t *member = &e->cte_nodes[q->ctq_listno];
1863
1864	ASSERT(MUTEX_HELD(&q->ctq_lock));
1865
1866	if (l->ctl_flags & CTLF_RELIABLE)
1867		member->ctm_nreliable--;
1868	if ((--member->ctm_refs == 0) && member->ctm_trimmed) {
1869		member->ctm_trimmed = 0;
1870		list_remove(&q->ctq_events, e);
1871		cte_rele(e);
1872	}
1873}
1874
1875/*
1876 * cte_qmove
1877 *
1878 * Move this listener to the specified event in the queue.
1879 */
1880static ct_kevent_t *
1881cte_qmove(ct_equeue_t *q, ct_listener_t *l, ct_kevent_t *e)
1882{
1883	ct_kevent_t *olde;
1884
1885	ASSERT(MUTEX_HELD(&q->ctq_lock));
1886	ASSERT(l->ctl_equeue == q);
1887
1888	if ((olde = l->ctl_position) == NULL)
1889		list_remove(&q->ctq_tail, l);
1890
1891	while (e != NULL && e->cte_nodes[q->ctq_listno].ctm_trimmed)
1892		e = list_next(&q->ctq_events, e);
1893
1894	if (e != NULL) {
1895		e->cte_nodes[q->ctq_listno].ctm_refs++;
1896		if (l->ctl_flags & CTLF_RELIABLE)
1897			e->cte_nodes[q->ctq_listno].ctm_nreliable++;
1898	} else {
1899		list_insert_tail(&q->ctq_tail, l);
1900	}
1901
1902	l->ctl_position = e;
1903	if (olde)
1904		cte_qrele(q, l, olde);
1905
1906	return (e);
1907}
1908
1909/*
1910 * cte_checkcred
1911 *
1912 * Determines if the specified event's contract is owned by a process
1913 * with the same effective uid as the specified credential.  Called
1914 * after a failed call to contract_owned with locked set.  Because it
1915 * drops the queue lock, its caller (cte_qreadable) needs to make sure
1916 * we're still in the same place after we return.  Returns 1 on
1917 * success.
1918 */
1919static int
1920cte_checkcred(ct_equeue_t *q, ct_kevent_t *e, const cred_t *cr)
1921{
1922	int result;
1923	contract_t *ct = e->cte_contract;
1924
1925	cte_hold(e);
1926	mutex_exit(&q->ctq_lock);
1927	result = curproc->p_zone->zone_uniqid == ct->ct_czuniqid &&
1928	    contract_checkcred(ct, cr);
1929	mutex_enter(&q->ctq_lock);
1930	cte_rele(e);
1931
1932	return (result);
1933}
1934
1935/*
1936 * cte_qreadable
1937 *
1938 * Ensures that the listener is pointing to a valid event that the
1939 * caller has the credentials to read.  Returns 0 if we can read the
1940 * event we're pointing to.
1941 */
1942static int
1943cte_qreadable(ct_equeue_t *q, ct_listener_t *l, const cred_t *cr,
1944    uint64_t zuniqid, int crit)
1945{
1946	ct_kevent_t *e, *next;
1947	contract_t *ct;
1948
1949	ASSERT(MUTEX_HELD(&q->ctq_lock));
1950	ASSERT(l->ctl_equeue == q);
1951
1952	if (l->ctl_flags & CTLF_COPYOUT)
1953		return (1);
1954
1955	next = l->ctl_position;
1956	while (e = cte_qmove(q, l, next)) {
1957		ct = e->cte_contract;
1958		/*
1959		 * Check obvious things first.  If we are looking for a
1960		 * critical message, is this one?  If we aren't in the
1961		 * global zone, is this message meant for us?
1962		 */
1963		if ((crit && (e->cte_flags & (CTE_INFO | CTE_ACK))) ||
1964		    (cr != NULL && zuniqid != GLOBAL_ZONEUNIQID &&
1965		    zuniqid != contract_getzuniqid(ct))) {
1966
1967			next = list_next(&q->ctq_events, e);
1968
1969		/*
1970		 * Next, see if our effective uid equals that of owner
1971		 * or author of the contract.  Since we are holding the
1972		 * queue lock, contract_owned can't always check if we
1973		 * have the same effective uid as the contract's
1974		 * owner.  If it comes to that, it fails and we take
1975		 * the slow(er) path.
1976		 */
1977		} else if (cr != NULL && !contract_owned(ct, cr, B_TRUE)) {
1978
1979			/*
1980			 * At this point we either don't have any claim
1981			 * to this contract or we match the effective
1982			 * uid of the owner but couldn't tell.  We
1983			 * first test for a NULL holder so that events
1984			 * from orphans and inherited contracts avoid
1985			 * the penalty phase.
1986			 */
1987			if (e->cte_contract->ct_owner == NULL &&
1988			    !secpolicy_contract_observer_choice(cr))
1989				next = list_next(&q->ctq_events, e);
1990
1991			/*
1992			 * cte_checkcred will juggle locks to see if we
1993			 * have the same uid as the event's contract's
1994			 * current owner.  If it succeeds, we have to
1995			 * make sure we are in the same point in the
1996			 * queue.
1997			 */
1998			else if (cte_checkcred(q, e, cr) &&
1999			    l->ctl_position == e)
2000				break;
2001
2002			/*
2003			 * cte_checkcred failed; see if we're in the
2004			 * same place.
2005			 */
2006			else if (l->ctl_position == e)
2007				if (secpolicy_contract_observer_choice(cr))
2008					break;
2009				else
2010					next = list_next(&q->ctq_events, e);
2011
2012			/*
2013			 * cte_checkcred failed, and our position was
2014			 * changed.  Start from there.
2015			 */
2016			else
2017				next = l->ctl_position;
2018		} else {
2019			break;
2020		}
2021	}
2022
2023	/*
2024	 * We check for CTLF_COPYOUT again in case we dropped the queue
2025	 * lock in cte_checkcred.
2026	 */
2027	return ((l->ctl_flags & CTLF_COPYOUT) || (l->ctl_position == NULL));
2028}
2029
2030/*
2031 * cte_qwakeup
2032 *
2033 * Wakes up any waiting listeners and points them at the specified event.
2034 */
2035static void
2036cte_qwakeup(ct_equeue_t *q, ct_kevent_t *e)
2037{
2038	ct_listener_t *l;
2039
2040	ASSERT(MUTEX_HELD(&q->ctq_lock));
2041
2042	while (l = list_head(&q->ctq_tail)) {
2043		list_remove(&q->ctq_tail, l);
2044		e->cte_nodes[q->ctq_listno].ctm_refs++;
2045		if (l->ctl_flags & CTLF_RELIABLE)
2046			e->cte_nodes[q->ctq_listno].ctm_nreliable++;
2047		l->ctl_position = e;
2048		cv_signal(&l->ctl_cv);
2049		pollwakeup(&l->ctl_pollhead, POLLIN);
2050	}
2051}
2052
2053/*
2054 * cte_copy
2055 *
2056 * Copies events from the specified contract event queue to the
2057 * end of the specified process bundle queue.  Only called from
2058 * contract_adopt.
2059 *
2060 * We copy to the end of the target queue instead of mixing the events
2061 * in their proper order because otherwise the act of adopting a
2062 * contract would require a process to reset all process bundle
2063 * listeners it needed to see the new events.  This would, in turn,
2064 * require the process to keep track of which preexisting events had
2065 * already been processed.
2066 */
2067static void
2068cte_copy(ct_equeue_t *q, ct_equeue_t *newq)
2069{
2070	ct_kevent_t *e, *first = NULL;
2071
2072	VERIFY(q->ctq_listno == CTEL_CONTRACT);
2073	VERIFY(newq->ctq_listno == CTEL_PBUNDLE);
2074
2075	mutex_enter(&q->ctq_lock);
2076	mutex_enter(&newq->ctq_lock);
2077
2078	/*
2079	 * For now, only copy critical events.
2080	 */
2081	for (e = list_head(&q->ctq_events); e != NULL;
2082	    e = list_next(&q->ctq_events, e)) {
2083		if ((e->cte_flags & (CTE_INFO | CTE_ACK)) == 0) {
2084			if (first == NULL)
2085				first = e;
2086			/*
2087			 * It is possible for adoption to race with an owner's
2088			 * cte_publish_all(); we must only enqueue events that
2089			 * have not already been enqueued.
2090			 */
2091			if (!list_link_active((list_node_t *)
2092			    ((uintptr_t)e + newq->ctq_events.list_offset))) {
2093				list_insert_tail(&newq->ctq_events, e);
2094				cte_hold(e);
2095			}
2096		}
2097	}
2098
2099	mutex_exit(&q->ctq_lock);
2100
2101	if (first)
2102		cte_qwakeup(newq, first);
2103
2104	mutex_exit(&newq->ctq_lock);
2105}
2106
2107/*
2108 * cte_trim
2109 *
2110 * Trims unneeded events from an event queue.  Algorithm works as
2111 * follows:
2112 *
2113 *   Removes all informative and acknowledged critical events until the
2114 *   first referenced event is found.
2115 *
2116 *   If a contract is specified, removes all events (regardless of
2117 *   acknowledgement) generated by that contract until the first event
2118 *   referenced by a reliable listener is found.  Reference events are
2119 *   removed by marking them "trimmed".  Such events will be removed
2120 *   when the last reference is dropped and will be skipped by future
2121 *   listeners.
2122 *
2123 * This is pretty basic.  Ideally this should remove from the middle of
2124 * the list (i.e. beyond the first referenced event), and even
2125 * referenced events.
2126 */
2127static void
2128cte_trim(ct_equeue_t *q, contract_t *ct)
2129{
2130	ct_kevent_t *e, *next;
2131	int flags, stopper;
2132	int start = 1;
2133
2134	VERIFY(MUTEX_HELD(&q->ctq_lock));
2135
2136	for (e = list_head(&q->ctq_events); e != NULL; e = next) {
2137		next = list_next(&q->ctq_events, e);
2138		flags = e->cte_flags;
2139		stopper = (q->ctq_listno != CTEL_PBUNDLE) &&
2140		    (e->cte_nodes[q->ctq_listno].ctm_nreliable > 0);
2141		if (e->cte_nodes[q->ctq_listno].ctm_refs == 0) {
2142			if ((start && (flags & (CTE_INFO | CTE_ACK))) ||
2143			    (e->cte_contract == ct)) {
2144				/*
2145				 * Toss informative and ACKed critical messages.
2146				 */
2147				list_remove(&q->ctq_events, e);
2148				cte_rele(e);
2149			}
2150		} else if ((e->cte_contract == ct) && !stopper) {
2151			ASSERT(q->ctq_nlisteners != 0);
2152			e->cte_nodes[q->ctq_listno].ctm_trimmed = 1;
2153		} else if (ct && !stopper) {
2154			start = 0;
2155		} else {
2156			/*
2157			 * Don't free messages past the first reader.
2158			 */
2159			break;
2160		}
2161	}
2162}
2163
2164/*
2165 * cte_queue_drain
2166 *
2167 * Drain all events from the specified queue, and mark it dead.  If
2168 * "ack" is set, acknowledge any critical events we find along the
2169 * way.
2170 */
2171static void
2172cte_queue_drain(ct_equeue_t *q, int ack)
2173{
	ct_kevent_t *e, *next;
	ct_listener_t *l;

	mutex_enter(&q->ctq_lock);

	for (e = list_head(&q->ctq_events); e != NULL; e = next) {
		/* Fetch the successor first; e may be freed below. */
		next = list_next(&q->ctq_events, e);
		if (ack && ((e->cte_flags & (CTE_INFO | CTE_ACK)) == 0)) {
			/*
			 * Make sure critical messages are eventually
			 * removed from the bundle queues.
			 */
			mutex_enter(&e->cte_lock);
			e->cte_flags |= CTE_ACK;
			mutex_exit(&e->cte_lock);
			/*
			 * When acking, the caller must already hold
			 * the contract's lock (ct_evcnt is updated
			 * under it).
			 */
			ASSERT(MUTEX_HELD(&e->cte_contract->ct_lock));
			e->cte_contract->ct_evcnt--;
		}
		list_remove(&q->ctq_events, e);
		/* Clear this queue's per-event bookkeeping. */
		e->cte_nodes[q->ctq_listno].ctm_refs = 0;
		e->cte_nodes[q->ctq_listno].ctm_nreliable = 0;
		e->cte_nodes[q->ctq_listno].ctm_trimmed = 0;
		cte_rele(e);
	}

	/*
	 * This is necessary only because of CTEL_PBUNDLE listeners;
	 * the events they point to can move from one pbundle to
	 * another.  Fortunately, this only happens if the contract is
	 * inherited, which (in turn) only happens if the process
	 * exits, which means it's an all-or-nothing deal.  If this
	 * wasn't the case, we would instead need to keep track of
	 * listeners on a per-event basis, not just a per-queue basis.
	 * This would have the side benefit of letting us clean up
	 * trimmed events sooner (i.e. immediately), but would
	 * unfortunately make events even bigger than they already
	 * are.
	 */
	for (l = list_head(&q->ctq_listeners); l;
	    l = list_next(&q->ctq_listeners, l)) {
		l->ctl_flags |= CTLF_DEAD;
		if (l->ctl_position) {
			/*
			 * Detach the listener from the (now-removed)
			 * event it pointed at and park it on the tail
			 * list.
			 */
			l->ctl_position = NULL;
			list_insert_tail(&q->ctq_tail, l);
		}
		/* Wake blocked readers so they observe CTLF_DEAD. */
		cv_broadcast(&l->ctl_cv);
	}

	/*
	 * Disallow events.
	 */
	q->ctq_flags |= CTQ_DEAD;

	/*
	 * If we represent the last reference to a reference counted
	 * process bundle queue, free it.  cte_queue_destroy() disposes
	 * of q (and its lock), so we only mutex_exit on the other path.
	 */
	if ((q->ctq_flags & CTQ_REFFED) && (q->ctq_nlisteners == 0))
		cte_queue_destroy(q);
	else
		mutex_exit(&q->ctq_lock);
}
2236
2237/*
2238 * cte_publish
2239 *
2240 * Publishes an event to a specific queue.  Only called by
2241 * cte_publish_all.
2242 */
2243static void
2244cte_publish(ct_equeue_t *q, ct_kevent_t *e, timespec_t *tsp, boolean_t mayexist)
2245{
2246	ASSERT(MUTEX_HELD(&q->ctq_lock));
2247
2248	q->ctq_atime = *tsp;
2249
2250	/*
2251	 * If this event may already exist on this queue, check to see if it
2252	 * is already there and return if so.
2253	 */
2254	if (mayexist && list_link_active((list_node_t *)((uintptr_t)e +
2255	    q->ctq_events.list_offset))) {
2256		mutex_exit(&q->ctq_lock);
2257		cte_rele(e);
2258		return;
2259	}
2260
2261	/*
2262	 * Don't publish if the event is informative and there aren't
2263	 * any listeners, or if the queue has been shut down.
2264	 */
2265	if (((q->ctq_nlisteners == 0) && (e->cte_flags & (CTE_INFO|CTE_ACK))) ||
2266	    (q->ctq_flags & CTQ_DEAD)) {
2267		mutex_exit(&q->ctq_lock);
2268		cte_rele(e);
2269		return;
2270	}
2271
2272	/*
2273	 * Enqueue event
2274	 */
2275	VERIFY(!list_link_active((list_node_t *)
2276	    ((uintptr_t)e + q->ctq_events.list_offset)));
2277	list_insert_tail(&q->ctq_events, e);
2278
2279	/*
2280	 * Check for waiting listeners
2281	 */
2282	cte_qwakeup(q, e);
2283
2284	/*
2285	 * Trim unnecessary events from the queue.
2286	 */
2287	cte_trim(q, NULL);
2288	mutex_exit(&q->ctq_lock);
2289}
2290
2291/*
2292 * cte_publish_all
2293 *
2294 * Publish an event to all necessary event queues.  The event, e, must
2295 * be zallocated by the caller, and the event's flags and type must be
2296 * set.  The rest of the event's fields are initialized here.
2297 */
2298uint64_t
2299cte_publish_all(contract_t *ct, ct_kevent_t *e, nvlist_t *data, nvlist_t *gdata)
2300{
	ct_equeue_t *q;
	timespec_t ts;
	uint64_t evid;
	ct_kevent_t *negev;
	int negend;

	e->cte_contract = ct;
	e->cte_data = data;
	e->cte_gdata = gdata;
	/*
	 * Three references, one per cte_publish() call below; the
	 * no-owner path releases the third explicitly.
	 */
	e->cte_refs = 3;
	evid = e->cte_id = atomic_inc_64_nv(&ct->ct_type->ct_type_evid);
	contract_hold(ct);

	/*
	 * For a negotiation event we set the ct->ct_nevent field of the
	 * contract for the duration of the negotiation
	 */
	negend = 0;
	if (e->cte_flags & CTE_NEG) {
		cte_hold(e);
		ct->ct_nevent = e;
	} else if (e->cte_type == CT_EV_NEGEND) {
		negend = 1;
	}

	gethrestime(&ts);

	/*
	 * ct_evtlock simply (and only) ensures that two events sent
	 * from the same contract are delivered to all queues in the
	 * same order.
	 */
	mutex_enter(&ct->ct_evtlock);

	/*
	 * CTEL_CONTRACT - First deliver to the contract queue, acking
	 * the event if the contract has been orphaned.
	 */
	mutex_enter(&ct->ct_lock);
	mutex_enter(&ct->ct_events.ctq_lock);
	if ((e->cte_flags & CTE_INFO) == 0) {
		if (ct->ct_state >= CTS_ORPHAN)
			e->cte_flags |= CTE_ACK;
		else
			ct->ct_evcnt++;
	}
	mutex_exit(&ct->ct_lock);
	cte_publish(&ct->ct_events, e, &ts, B_FALSE);

	/*
	 * CTEL_BUNDLE - Next deliver to the contract type's bundle
	 * queue.
	 */
	mutex_enter(&ct->ct_type->ct_type_events.ctq_lock);
	cte_publish(&ct->ct_type->ct_type_events, e, &ts, B_FALSE);

	/*
	 * CTEL_PBUNDLE - Finally, if the contract has an owner,
	 * deliver to the owner's process bundle queue.
	 */
	mutex_enter(&ct->ct_lock);
	if (ct->ct_owner) {
		/*
		 * proc_exit doesn't free event queues until it has
		 * abandoned all contracts.
		 */
		ASSERT(ct->ct_owner->p_ct_equeue);
		ASSERT(ct->ct_owner->p_ct_equeue[ct->ct_type->ct_type_index]);
		q = ct->ct_owner->p_ct_equeue[ct->ct_type->ct_type_index];
		mutex_enter(&q->ctq_lock);
		mutex_exit(&ct->ct_lock);

		/*
		 * It is possible for this code to race with adoption; we
		 * publish the event indicating that the event may already
		 * be enqueued because adoption beat us to it (in which case
		 * cte_publish() does nothing).
		 */
		cte_publish(q, e, &ts, B_TRUE);
	} else {
		/* No owner: drop the reference meant for the pbundle. */
		mutex_exit(&ct->ct_lock);
		cte_rele(e);
	}

	if (negend) {
		/* The negotiation is over; drop the negotiation event. */
		mutex_enter(&ct->ct_lock);
		negev = ct->ct_nevent;
		ct->ct_nevent = NULL;
		cte_rele(negev);
		mutex_exit(&ct->ct_lock);
	}

	mutex_exit(&ct->ct_evtlock);

	return (evid);
}
2397
2398/*
2399 * cte_add_listener
2400 *
2401 * Add a new listener to an event queue.
2402 */
2403void
2404cte_add_listener(ct_equeue_t *q, ct_listener_t *l)
2405{
	/*
	 * Initialize the listener's state before it becomes visible
	 * to anyone traversing the queue's lists.
	 */
	l->ctl_equeue = q;
	l->ctl_position = NULL;
	l->ctl_flags = 0;
	cv_init(&l->ctl_cv, NULL, CV_DEFAULT, NULL);

	/*
	 * Hook the listener onto the queue: it starts out parked on
	 * the tail list (no current event).
	 */
	mutex_enter(&q->ctq_lock);
	list_insert_head(&q->ctq_tail, l);
	list_insert_head(&q->ctq_listeners, l);
	q->ctq_nlisteners++;
	mutex_exit(&q->ctq_lock);
}
2417
2418/*
2419 * cte_remove_listener
2420 *
2421 * Remove a listener from an event queue.  No other queue activities
2422 * (e.g. cte_get event) may be in progress at this endpoint when this
2423 * is called.
2424 */
2425void
2426cte_remove_listener(ct_listener_t *l)
2427{
	ct_equeue_t *q = l->ctl_equeue;
	ct_kevent_t *e;

	mutex_enter(&q->ctq_lock);

	/* No copyout or reset may be in flight on this endpoint. */
	ASSERT((l->ctl_flags & (CTLF_COPYOUT|CTLF_RESET)) == 0);

	/*
	 * Either release the event we are positioned on, or unhook
	 * ourselves from the tail list if we had no position.
	 */
	if ((e = l->ctl_position) != NULL)
		cte_qrele(q, l, e);
	else
		list_remove(&q->ctq_tail, l);
	l->ctl_position = NULL;

	q->ctq_nlisteners--;
	list_remove(&q->ctq_listeners, l);

	if (l->ctl_flags & CTLF_RELIABLE)
		q->ctq_nreliable--;

	/*
	 * If we are the last listener of a dead reference counted
	 * queue (i.e. a process bundle) we free it.  Otherwise we just
	 * trim any events which may have been kept around for our
	 * benefit.
	 */
	if ((q->ctq_flags & CTQ_REFFED) && (q->ctq_flags & CTQ_DEAD) &&
	    (q->ctq_nlisteners == 0)) {
		cte_queue_destroy(q);
	} else {
		cte_trim(q, NULL);
		mutex_exit(&q->ctq_lock);
	}
}
2461
2462/*
2463 * cte_reset_listener
2464 *
2465 * Moves a listener's queue pointer to the beginning of the queue.
2466 */
2467void
2468cte_reset_listener(ct_listener_t *l)
2469{
	ct_equeue_t *q = l->ctl_equeue;

	mutex_enter(&q->ctq_lock);

	/*
	 * We allow an asynchronous reset because it doesn't make a
	 * whole lot of sense to make reset block or fail.  We already
	 * have most of the mechanism needed thanks to queue trimming,
	 * so implementing it isn't a big deal.  If a copyout is in
	 * progress, setting CTLF_RESET tells it not to advance the
	 * position when it finishes.
	 */
	if (l->ctl_flags & CTLF_COPYOUT)
		l->ctl_flags |= CTLF_RESET;

	/* Rewind this listener to the head of the queue. */
	(void) cte_qmove(q, l, list_head(&q->ctq_events));

	/*
	 * Inform blocked readers.
	 */
	cv_broadcast(&l->ctl_cv);
	pollwakeup(&l->ctl_pollhead, POLLIN);
	mutex_exit(&q->ctq_lock);
}
2492
2493/*
2494 * cte_next_event
2495 *
2496 * Moves the event pointer for the specified listener to the next event
2497 * on the queue.  To avoid races, this movement only occurs if the
2498 * specified event id matches that of the current event.  This is used
2499 * primarily to skip events that have been read but whose extended data
2500 * haven't been copied out.
2501 */
2502int
2503cte_next_event(ct_listener_t *l, uint64_t id)
2504{
	ct_equeue_t *q = l->ctl_equeue;
	ct_kevent_t *cur;

	mutex_enter(&q->ctq_lock);

	/*
	 * If a copyout is in progress on this endpoint, flag it so
	 * the copyout path won't also advance the position.
	 */
	if (l->ctl_flags & CTLF_COPYOUT)
		l->ctl_flags |= CTLF_RESET;

	/*
	 * Only advance if we are still positioned on the event the
	 * caller thinks we are; otherwise someone beat us to it.
	 */
	cur = l->ctl_position;
	if (cur != NULL && cur->cte_id == id)
		(void) cte_qmove(q, l, list_next(&q->ctq_events, cur));

	mutex_exit(&q->ctq_lock);

	return (0);
}
2520
2521/*
2522 * cte_get_event
2523 *
2524 * Reads an event from an event endpoint.  If "nonblock" is clear, we
2525 * block until a suitable event is ready.  If "crit" is set, we only
2526 * read critical events.  Note that while "cr" is the caller's cred,
2527 * "zuniqid" is the unique id of the zone the calling contract
2528 * filesystem was mounted in.
2529 */
2530int
2531cte_get_event(ct_listener_t *l, int nonblock, void *uaddr, const cred_t *cr,
2532    uint64_t zuniqid, int crit)
2533{
	ct_equeue_t *q = l->ctl_equeue;
	ct_kevent_t *temp;
	int result = 0;
	int partial = 0;	/* caller's buffer too small for event data */
	size_t size, gsize, len;
	model_t mdl = get_udatamodel();
	STRUCT_DECL(ct_event, ev);
	STRUCT_INIT(ev, mdl);

	/*
	 * cte_qreadable checks for CTLF_COPYOUT as well as ensures
	 * that there exists, and we are pointing to, an appropriate
	 * event.  It may temporarily drop ctq_lock, but that doesn't
	 * really matter to us.
	 */
	mutex_enter(&q->ctq_lock);
	while (cte_qreadable(q, l, cr, zuniqid, crit)) {
		if (nonblock) {
			result = EAGAIN;
			goto error;
		}
		if (q->ctq_flags & CTQ_DEAD) {
			result = EIDRM;
			goto error;
		}
		result = cv_wait_sig(&l->ctl_cv, &q->ctq_lock);
		if (result == 0) {
			/* cv_wait_sig returns 0 if interrupted by a signal */
			result = EINTR;
			goto error;
		}
	}
	temp = l->ctl_position;
	cte_hold(temp);
	/*
	 * Mark the endpoint as mid-copyout; concurrent resets and
	 * skips will set CTLF_RESET rather than moving us, and we
	 * re-validate our position under the lock below.
	 */
	l->ctl_flags |= CTLF_COPYOUT;
	mutex_exit(&q->ctq_lock);

	/*
	 * We now have an event.  Copy in the user event structure to
	 * see how much space we have to work with.
	 */
	result = copyin(uaddr, STRUCT_BUF(ev), STRUCT_SIZE(ev));
	if (result)
		goto copyerr;

	/*
	 * Determine what data we have and what the user should be
	 * allowed to see.  Global-zone event data is only visible to
	 * endpoints mounted in the global zone.
	 */
	size = gsize = 0;
	if (temp->cte_data) {
		VERIFY(nvlist_size(temp->cte_data, &size,
		    NV_ENCODE_NATIVE) == 0);
		ASSERT(size != 0);
	}
	if (zuniqid == GLOBAL_ZONEUNIQID && temp->cte_gdata) {
		VERIFY(nvlist_size(temp->cte_gdata, &gsize,
		    NV_ENCODE_NATIVE) == 0);
		ASSERT(gsize != 0);
	}

	/*
	 * If we have enough space, copy out the extended event data.
	 */
	len = size + gsize;
	if (len) {
		if (STRUCT_FGET(ev, ctev_nbytes) >= len) {
			char *buf = kmem_alloc(len, KM_SLEEP);

			if (size)
				VERIFY(nvlist_pack(temp->cte_data, &buf, &size,
				    NV_ENCODE_NATIVE, KM_SLEEP) == 0);
			if (gsize) {
				/* Global data is packed after the data. */
				char *tmp = buf + size;

				VERIFY(nvlist_pack(temp->cte_gdata, &tmp,
				    &gsize, NV_ENCODE_NATIVE, KM_SLEEP) == 0);
			}

			/* This shouldn't have changed */
			ASSERT(size + gsize == len);
			result = copyout(buf, STRUCT_FGETP(ev, ctev_buffer),
			    len);
			kmem_free(buf, len);
			if (result)
				goto copyerr;
		} else {
			/*
			 * Not enough room; report the sizes so the
			 * caller can retry, but don't advance.
			 */
			partial = 1;
		}
	}

	/*
	 * Copy out the common event data.
	 */
	STRUCT_FSET(ev, ctev_id, temp->cte_contract->ct_id);
	STRUCT_FSET(ev, ctev_evid, temp->cte_id);
	STRUCT_FSET(ev, ctev_cttype,
	    temp->cte_contract->ct_type->ct_type_index);
	STRUCT_FSET(ev, ctev_flags, temp->cte_flags &
	    (CTE_ACK|CTE_INFO|CTE_NEG));
	STRUCT_FSET(ev, ctev_type, temp->cte_type);
	STRUCT_FSET(ev, ctev_nbytes, len);
	STRUCT_FSET(ev, ctev_goffset, size);
	result = copyout(STRUCT_BUF(ev), uaddr, STRUCT_SIZE(ev));

copyerr:
	/*
	 * Only move our location in the queue if all copyouts were
	 * successful, the caller provided enough space for the entire
	 * event, and our endpoint wasn't reset or otherwise moved by
	 * another thread.
	 */
	mutex_enter(&q->ctq_lock);
	if (result)
		result = EFAULT;
	else if (!partial && ((l->ctl_flags & CTLF_RESET) == 0) &&
	    (l->ctl_position == temp))
		(void) cte_qmove(q, l, list_next(&q->ctq_events, temp));
	l->ctl_flags &= ~(CTLF_COPYOUT|CTLF_RESET);
	/*
	 * Signal any readers blocked on our CTLF_COPYOUT.
	 */
	cv_signal(&l->ctl_cv);
	cte_rele(temp);

error:
	mutex_exit(&q->ctq_lock);
	return (result);
}
2662
2663/*
2664 * cte_set_reliable
2665 *
2666 * Requests that events be reliably delivered to an event endpoint.
2667 * Unread informative and acknowledged critical events will not be
2668 * removed from the queue until this listener reads or skips them.
2669 * Because a listener could maliciously request reliable delivery and
2670 * then do nothing, this requires that PRIV_CONTRACT_EVENT be in the
2671 * caller's effective set.
2672 */
2673int
2674cte_set_reliable(ct_listener_t *l, const cred_t *cr)
2675{
	ct_equeue_t *q = l->ctl_equeue;
	ct_kevent_t *e;
	int err;

	/* Reliable delivery requires PRIV_CONTRACT_EVENT. */
	err = secpolicy_contract_event(cr);
	if (err != 0)
		return (err);

	mutex_enter(&q->ctq_lock);
	if (!(l->ctl_flags & CTLF_RELIABLE)) {
		l->ctl_flags |= CTLF_RELIABLE;
		q->ctq_nreliable++;
		/*
		 * If we are positioned on an event, take a reliable
		 * reference on it so that trimming will not discard
		 * it out from under us.
		 */
		if ((e = l->ctl_position) != NULL)
			e->cte_nodes[q->ctq_listno].ctm_nreliable++;
	}
	mutex_exit(&q->ctq_lock);

	return (0);
}
2694