xref: /illumos-gate/usr/src/uts/common/fs/portfs/port.c (revision aa59c4cb15a6ac5d4e585dadf7a055b580abf579)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/systm.h>
31 #include <sys/cred.h>
32 #include <sys/modctl.h>
33 #include <sys/vfs.h>
34 #include <sys/vfs_opreg.h>
35 #include <sys/sysmacros.h>
36 #include <sys/cmn_err.h>
37 #include <sys/stat.h>
38 #include <sys/errno.h>
39 #include <sys/kmem.h>
40 #include <sys/file.h>
41 #include <sys/kstat.h>
42 #include <sys/port_impl.h>
43 #include <sys/task.h>
44 #include <sys/project.h>
45 
46 /*
47  * Event Ports can be shared across threads or across processes.
48  * Every thread/process can use an own event port or a group of them
49  * can use a single port. A major request was also to get the ability
50  * to submit user-defined events to a port. The idea of the
51  * user-defined events is to use the event ports for communication between
52  * threads/processes (like message queues). User defined-events are queued
53  * in a port with the same priority as other event types.
54  *
55  * Events are delivered only once. The thread/process which is waiting
56  * for events with the "highest priority" (priority here is related to the
57  * internal strategy to wakeup waiting threads) will retrieve the event,
58  * all other threads/processes will not be notified. There is also
59  * the requirement to have events which should be submitted immediately
60  * to all "waiting" threads. That is the main task of the alert event.
61  * The alert event is submitted by the application to a port. The port
62  * changes from a standard mode to the alert mode. Now all waiting threads
 * will be awakened immediately and they will return with the alert event.
64  * Threads trying to retrieve events from a port in alert mode will
65  * return immediately with the alert event.
66  *
67  *
 * An event port is like a kernel queue, which accepts events submitted from
69  * user level as well as events submitted from kernel sub-systems. Sub-systems
70  * able to submit events to a port are the so-called "event sources".
71  * Current event sources:
72  * PORT_SOURCE_AIO	 : events submitted per transaction completion from
73  *			   POSIX-I/O framework.
74  * PORT_SOURCE_TIMER	 : events submitted when a timer fires
75  *			   (see timer_create(3RT)).
76  * PORT_SOURCE_FD	 : events submitted per file descriptor (see poll(2)).
77  * PORT_SOURCE_ALERT	 : events submitted from user. This is not really a
78  *			   single event, this is actually a port mode
79  *			   (see port_alert(3c)).
80  * PORT_SOURCE_USER	 : events submitted by applications with
81  *			   port_send(3c) or port_sendn(3c).
82  *
83  * There is a user API implemented in the libc library as well as a
84  * kernel API implemented in port_subr.c in genunix.
85  * The available user API functions are:
86  * port_create() : create a port as a file descriptor of portfs file system
87  *		   The standard close(2) function closes a port.
88  * port_associate() : associate a file descriptor with a port to be able to
89  *		      retrieve events from that file descriptor.
90  * port_dissociate(): remove the association of a file descriptor with a port.
91  * port_alert()	 : set/unset a port in alert mode
92  * port_send()	 : send an event of type PORT_SOURCE_USER to a port
93  * port_sendn()	 : send an event of type PORT_SOURCE_USER to a list of ports
94  * port_get()	 : retrieve a single event from a port
95  * port_getn()	 : retrieve a list of events from a port
96  *
97  * The available kernel API functions are:
98  * port_allocate_event(): allocate an event slot/structure of/from a port
99  * port_init_event()    : set event data in the event structure
100  * port_send_event()    : send event to a port
101  * port_free_event()    : deliver allocated slot/structure back to a port
102  * port_associate_ksource(): associate a kernel event source with a port
103  * port_dissociate_ksource(): dissociate a kernel event source from a port
104  *
105  * The libc implementation consists of small functions which pass the
106  * arguments to the kernel using the "portfs" system call. It means, all the
107  * synchronisation work is being done in the kernel. The "portfs" system
108  * call loads the portfs file system into the kernel.
109  *
110  * PORT CREATION
111  * The first function to be used is port_create() which internally creates
112  * a vnode and a portfs node. The portfs node is represented by the port_t
113  * structure, which again includes all the data necessary to control a port.
114  * port_create() returns a file descriptor, which needs to be used in almost
115  * all other event port functions.
116  * The maximum number of ports per system is controlled by the resource
117  * control: project:port-max-ids.
118  *
119  * EVENT GENERATION
120  * The second step is the triggering of events, which could be sent to a port.
121  * Every event source implements an own method to generate events for a port:
122  * PORT_SOURCE_AIO:
123  * 	The sigevent structure of the standard POSIX-IO functions
124  * 	was extended by an additional notification type.
125  * 	Standard notification types:
126  * 	SIGEV_NONE, SIGEV_SIGNAL and SIGEV_THREAD
127  * 	Event ports introduced now SIGEV_PORT.
128  * 	The notification type SIGEV_PORT specifies that a structure
129  * 	of type port_notify_t has to be attached to the sigev_value.
130  * 	The port_notify_t structure contains the event port file
131  * 	descriptor and a user-defined pointer.
132  * 	Internally the AIO implementation will use the kernel API
133  * 	functions to allocate an event port slot per transaction (aiocb)
134  * 	and sent the event to the port as soon as the transaction completes.
135  * 	All the events submitted per transaction are of type
136  * 	PORT_SOURCE_AIO.
137  * PORT_SOURCE_TIMER:
138  * 	The timer_create() function uses the same method as the
139  * 	PORT_SOURCE_AIO event source. It also uses the sigevent structure
140  * 	to deliver the port information.
141  * 	Internally the timer code will allocate a single event slot/struct
142  * 	per timer and it will send the timer event as soon as the timer
143  * 	fires. If the timer-fired event is not delivered to the application
144  * 	before the next period elapsed, then an overrun counter will be
145  * 	incremented. The timer event source uses a callback function to
146  * 	detect the delivery of the event to the application. At that time
147  * 	the timer callback function will update the event overrun counter.
148  * PORT_SOURCE_FD:
149  * 	This event source uses the port_associate() function to allocate
150  * 	an event slot/struct from a port. The application defines in the
151  * 	events argument of port_associate() the type of events which it is
 * 	interested in.
153  * 	The internal pollwakeup() function is used by all the file
 * 	systems -- which support the VOP_POLL() interface -- to notify
155  * 	the upper layer (poll(2), devpoll(7d) and now event ports) about
156  * 	the event triggered (see valid events in poll(2)).
157  * 	The pollwakeup() function forwards the event to the layer registered
158  * 	to receive the current event.
159  * 	The port_dissociate() function can be used to free the allocated
160  * 	event slot from the port. Anyway, file descriptors deliver events
161  * 	only one time and remain deactivated until the application
162  * 	reactivates the association of a file descriptor with port_associate().
163  * 	If an associated file descriptor is closed then the file descriptor
164  * 	will be dissociated automatically from the port.
165  *
166  * PORT_SOURCE_ALERT:
167  * 	This event type is generated when the port was previously set in
168  * 	alert mode using the port_alert() function.
169  * 	A single alert event is delivered to every thread which tries to
170  * 	retrieve events from a port.
171  * PORT_SOURCE_USER:
172  * 	This type of event is generated from user level using the port_send()
173  * 	function to send a user event to a port or the port_sendn() function
174  * 	to send an event to a list of ports.
175  *
176  * EVENT DELIVERY / RETRIEVING EVENTS
177  * Events remain in the port queue until:
178  * - the application uses port_get() or port_getn() to retrieve events,
 * - the event source cancels the event,
180  * - the event port is closed or
181  * - the process exits.
182  * The maximal number of events in a port queue is the maximal number
183  * of event slots/structures which can be allocated by event sources.
184  * The allocation of event slots/structures is controlled by the resource
185  * control: process.port-max-events.
186  * The port_get() function retrieves a single event and the port_getn()
187  * function retrieves a list of events.
188  * Events are classified as shareable and non-shareable events across processes.
189  * Non-shareable events are invisible for the port_get(n)() functions of
190  * processes other than the owner of the event.
191  *    Shareable event types are:
192  *    PORT_SOURCE_USER events
193  * 	This type of event is unconditionally shareable and without
194  * 	limitations. If the parent process sends a user event and closes
195  * 	the port afterwards, the event remains in the port and the child
196  * 	process will still be able to retrieve the user event.
197  *    PORT_SOURCE_ALERT events
198  * 	This type of event is shareable between processes.
199  * 	Limitation:	The alert mode of the port is removed if the owner
200  * 			(process which set the port in alert mode) of the
201  * 			alert event closes the port.
202  *    PORT_SOURCE_FD events
 * 	This type of event is conditionally shareable between processes.
204  * 	After fork(2) all forked file descriptors are shareable between
205  * 	the processes. The child process is allowed to retrieve events
206  * 	from the associated file descriptors and it can also re-associate
207  * 	the fd with the port.
208  * 	Limitations:	The child process is not allowed to dissociate
209  * 			the file descriptor from the port. Only the
210  * 			owner (process) of the association is allowed to
211  * 			dissociate the file descriptor from the port.
212  * 			If the owner of the association closes the port
213  * 			the association will be removed.
214  *    PORT_SOURCE_AIO events
215  * 	This type of event is not shareable between processes.
216  *    PORT_SOURCE_TIMER events
217  * 	This type of event is not shareable between processes.
218  *
219  * FORK BEHAVIOUR
220  * On fork(2) the child process inherits all opened file descriptors from
221  * the parent process. This is also valid for port file descriptors.
222  * Associated file descriptors with a port maintain the association across the
223  * fork(2). It means, the child process gets full access to the port and
224  * it can retrieve events from all common associated file descriptors.
225  * Events of file descriptors created and associated with a port after the
226  * fork(2) are non-shareable and can only be retrieved by the same process.
227  *
228  * If the parent or the child process closes an exported port (using fork(2)
229  * or I_SENDFD) all the file descriptors associated with the port by the
230  * process will be dissociated from the port. Events of dissociated file
231  * descriptors as well as all non-shareable events will be discarded.
232  * The other process can continue working with the port as usual.
233  *
234  * CLOSING A PORT
235  * close(2) has to be used to close a port. See FORK BEHAVIOUR for details.
236  *
237  * PORT EVENT STRUCTURES
238  * The global control structure of the event ports framework is port_control_t.
239  * port_control_t keeps track of the number of created ports in the system.
240  * The cache of the port event structures is also located in port_control_t.
241  *
242  * On port_create() the vnode and the portfs node is also created.
243  * The portfs node is represented by the port_t structure.
244  * The port_t structure manages all port specific tasks:
245  * - management of resource control values
246  * - port VOP_POLL interface
247  * - creation time
248  * - uid and gid of the port
249  *
250  * The port_t structure contains the port_queue_t structure.
251  * The port_queue_t structure contains all the data necessary for the
252  * queue management:
253  * - locking
254  * - condition variables
255  * - event counters
256  * - submitted events	(represented by port_kevent_t structures)
257  * - threads waiting for event delivery (check portget_t structure)
258  * - PORT_SOURCE_FD cache	(managed by the port_fdcache_t structure)
259  * - event source management (managed by the port_source_t structure)
260  * - alert mode management	(check port_alert_t structure)
261  *
262  * EVENT MANAGEMENT
263  * The event port file system creates a kmem_cache for internal allocation of
264  * event port structures.
265  *
266  * 1. Event source association with a port:
267  * The first step to do for event sources is to get associated with a port
268  * using the port_associate_ksource() function or adding an entry to the
269  * port_ksource_tab[]. An event source can get dissociated from a port
270  * using the port_dissociate_ksource() function. An entry in the
271  * port_ksource_tab[] implies that the source will be associated
272  * automatically with every new created port.
273  * The event source can deliver a callback function, which is used by the
274  * port to notify the event source about close(2). The idea is that
275  * in such a case the event source should free all allocated resources
276  * and it must return to the port all allocated slots/structures.
277  * The port_close() function will wait until all allocated event
278  * structures/slots are returned to the port.
279  * The callback function is not necessary when the event source does not
280  * maintain local resources, a second condition is that the event source
281  * can guarantee that allocated event slots will be returned without
282  * delay to the port (it will not block and sleep somewhere).
283  *
284  * 2. Reservation of an event slot / event structure
285  * The event port reliability is based on the reservation of an event "slot"
286  * (allocation of an event structure) by the event source as part of the
287  * application call. If the maximal number of event slots is exhausted then
288  * the event source can return a corresponding error code to the application.
289  *
290  * The port_alloc_event() function has to be used by event sources to
291  * allocate an event slot (reserve an event structure). The port_alloc_event()
 * does not block and it will return a 0 value on success or an error code
293  * if it fails.
294  * An argument of port_alloc_event() is a flag which determines the behavior
295  * of the event after it was delivered to the application:
296  * PORT_ALLOC_DEFAULT	: event slot becomes free after delivery to the
297  *			  application.
298  * PORT_ALLOC_PRIVATE	: event slot remains under the control of the event
299  *			  source. This kind of slots can not be used for
300  *			  event delivery and should only be used internally
301  *			  by the event source.
302  * PORT_KEV_CACHED	: event slot remains under the control of an event
303  *			  port cache. It does not become free after delivery
304  *			  to the application.
305  * PORT_ALLOC_SCACHED	: event slot remains under the control of the event
306  *			  source. The event source takes the control over
307  *			  the slot after the event is delivered to the
308  *			  application.
309  *
310  * 3. Delivery of events to the event port
311  * Earlier allocated event structure/slot has to be used to deliver
312  * event data to the port. Event source has to use the function
313  * port_send_event(). The single argument is a pointer to the previously
314  * reserved event structure/slot.
315  * The portkev_events field of the port_kevent_t structure can be updated/set
316  * in two ways:
317  * 1. using the port_set_event() function, or
318  * 2. updating the portkev_events field out of the callback function:
319  *    The event source can deliver a callback function to the port as an
320  *    argument of port_init_event().
321  *    One of the arguments of the callback function is a pointer to the
322  *    events field, which will be delivered to the application.
323  *    (see Delivery of events to the application).
324  * Event structures/slots can be delivered to the event port only one time,
325  * they remain blocked until the data is delivered to the application and the
326  * slot becomes free or it is delivered back to the event source
327  * (PORT_ALLOC_SCACHED). The activation of the callback function mentioned above
328  * is at the same time the indicator for the event source that the event
329  * structure/slot is free for reuse.
330  *
331  * 4. Delivery of events to the application
332  * The events structures/slots delivered by event sources remain in the
333  * port queue until they are retrieved by the application or the port
334  * is closed (exit(2) also closes all opened file descriptors)..
335  * The application uses port_get() or port_getn() to retrieve events from
336  * a port. port_get() retrieves a single event structure/slot and port_getn()
337  * retrieves a list of event structures/slots.
338  * Both functions are able to poll for events and return immediately or they
339  * can specify a timeout value.
340  * Before the events are delivered to the application they are moved to a
341  * second temporary internal queue. The idea is to avoid lock collisions or
342  * contentions of the global queue lock.
343  * The global queue lock is used every time when an event source delivers
344  * new events to the port.
345  * The port_get() and port_getn() functions
346  * a) retrieve single events from the temporary queue,
347  * b) prepare the data to be passed to the application memory,
348  * c) activate the callback function of the event sources:
349  *    - to get the latest event data,
350  *    - the event source can free all allocated resources associated with the
351  *      current event,
352  *    - the event source can re-use the current event slot/structure
353  *    - the event source can deny the delivery of the event to the application
354  *      (e.g. because of the wrong process).
355  * d) put the event back to the temporary queue if the event delivery was denied
356  * e) repeat a) until d) as long as there are events in the queue and
357  *    there is enough user space available.
358  *
 * The loop described above could hold the global mutex for a very long
 * time; to avoid that, a second mutex was introduced to synchronize
 * concurrent threads accessing the temporary queue.
362  */
363 
364 static int64_t portfs(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t,
365     uintptr_t);
366 
/*
 * System call entry for portfs(): six arguments, 64-bit return value.
 * SE_NOUNLOAD keeps the syscall module resident once loaded.
 */
static struct sysent port_sysent = {
	6,				/* number of arguments */
	SE_ARGC | SE_64RVAL | SE_NOUNLOAD,
	(int (*)())portfs,		/* native entry point */
};
372 
/* loadable-module linkage for the native system call table */
static struct modlsys modlsys = {
	&mod_syscallops, "event ports", &port_sysent
};
376 
#ifdef _SYSCALL32_IMPL

/*
 * 32-bit entry point; widens the 32-bit arguments (with opcode-specific
 * sign/zero extension) before forwarding to portfs().
 */
static int64_t
portfs32(uint32_t arg1, int32_t arg2, uint32_t arg3, uint32_t arg4,
    uint32_t arg5, uint32_t arg6);

/* system call entry for 32-bit processes: six args, 64-bit return value */
static struct sysent port_sysent32 = {
	6,
	SE_ARGC | SE_64RVAL | SE_NOUNLOAD,
	(int (*)())portfs32,
};

/* loadable-module linkage for the 32-bit system call table */
static struct modlsys modlsys32 = {
	&mod_syscallops32,
	"32-bit event ports syscalls",
	&port_sysent32
};
#endif	/* _SYSCALL32_IMPL */
395 
/*
 * Module linkage: the native system call entry and, when _SYSCALL32_IMPL
 * is defined, the 32-bit system call entry as well.
 */
static struct modlinkage modlinkage = {
	MODREV_1,
	&modlsys,
#ifdef _SYSCALL32_IMPL
	&modlsys32,
#endif
	NULL
};
404 
/* kstat counter: number of currently active ports (bumped in port_create()) */
port_kstat_t port_kstat = {
	{ "ports",	KSTAT_DATA_UINT32 }
};
408 
dev_t	portdev;			/* device number of the portfs fs */
struct	vnodeops *port_vnodeops;	/* vnode operations of port vnodes */
struct	vfs port_vfs;			/* dummy vfs hosting all port vnodes */

/* resource controls: process.port-max-events and project.port-max-ids */
extern	rctl_hndl_t rc_process_portev;
extern	rctl_hndl_t rc_project_portids;
/* close callback of the PORT_SOURCE_AIO source (see port_ksource_tab[]) */
extern	void aio_close_port(void *, int, pid_t, int);
416 
417 /*
418  * This table contains a list of event sources which need a static
419  * association with a port (every port).
420  * The last NULL entry in the table is required to detect "end of table".
421  */
struct port_ksource port_ksource_tab[] = {
	{PORT_SOURCE_AIO, aio_close_port, NULL, NULL},
	{0, NULL, NULL, NULL}	/* terminator: pks_source == 0 ends the table */
};
426 
427 /* local functions */
428 static int port_getn(port_t *, port_event_t *, uint_t, uint_t *,
429     port_gettimer_t *);
430 static int port_sendn(int [], int [], uint_t, int, void *, uint_t *);
431 static int port_alert(port_t *, int, int, void *);
432 static int port_dispatch_event(port_t *, int, int, int, uintptr_t, void *);
433 static int port_send(port_t *, int, int, void *);
434 static int port_create(int *);
435 static int port_get_alert(port_alert_t *, port_event_t *);
436 static int port_copy_event(port_event_t *, port_kevent_t *, list_t *);
437 static int *port_errorn(int *, int, int, int);
438 static int port_noshare(void *, int *, pid_t, int, void *);
439 static int port_get_timeout(timespec_t *, timespec_t *, timespec_t **, int *,
440     int);
441 static void port_init(port_t *);
442 static void port_remove_alert(port_queue_t *);
443 static void port_add_ksource_local(port_t *, port_ksource_t *);
444 static void port_check_return_cond(port_queue_t *);
445 static void port_dequeue_thread(port_queue_t *, portget_t *);
446 static portget_t *port_queue_thread(port_queue_t *, uint_t);
447 static void port_kstat_init(void);
448 
449 #ifdef	_SYSCALL32_IMPL
450 static int port_copy_event32(port_event32_t *, port_kevent_t *, list_t *);
451 #endif
452 
/*
 * Module initialization: reserve a device number, build the dummy portfs
 * vfs and its vnode operations, set up the global port control data
 * (mutex, counters, event kmem cache) and the kstats, then install the
 * system call via mod_install().
 */
int
_init(void)
{
	/* portfs needs no real VFS operations; an empty template suffices */
	static const fs_operation_def_t port_vfsops_template[] = {
		NULL, NULL
	};
	extern const	fs_operation_def_t port_vnodeops_template[];
	vfsops_t	*port_vfsops;
	int		error;
	major_t 	major;

	/* reserve a major number so port vnodes get a unique dev_t/fsid */
	if ((major = getudev()) == (major_t)-1)
		return (ENXIO);
	portdev = makedevice(major, 0);

	/* Create a dummy vfs */
	error = vfs_makefsops(port_vfsops_template, &port_vfsops);
	if (error) {
		cmn_err(CE_WARN, "port init: bad vfs ops");
		return (error);
	}
	vfs_setops(&port_vfs, port_vfsops);
	port_vfs.vfs_flag = VFS_RDONLY;
	port_vfs.vfs_dev = portdev;
	vfs_make_fsid(&(port_vfs.vfs_fsid), portdev, 0);

	/* install the vnode operations used by all port vnodes */
	error = vn_make_ops("portfs", port_vnodeops_template, &port_vnodeops);
	if (error) {
		/* undo the vfs ops allocated above before failing */
		vfs_freevfsops(port_vfsops);
		cmn_err(CE_WARN, "port init: bad vnode ops");
		return (error);
	}

	mutex_init(&port_control.pc_mutex, NULL, MUTEX_DEFAULT, NULL);
	port_control.pc_nents = 0;	/* number of active ports */

	/* create kmem_cache for port event structures */
	port_control.pc_cache = kmem_cache_create("port_cache",
	    sizeof (port_kevent_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	port_kstat_init();		/* init port kstats */
	return (mod_install(&modlinkage));
}
496 
497 int
498 _info(struct modinfo *modinfop)
499 {
500 	return (mod_info(&modlinkage, modinfop));
501 }
502 
503 /*
504  * System call wrapper for all port related system calls from 32-bit programs.
505  */
506 #ifdef _SYSCALL32_IMPL
static int64_t
portfs32(uint32_t opcode, int32_t a0, uint32_t a1, uint32_t a2, uint32_t a3,
    uint32_t a4)
{
	int64_t	error;

	switch (opcode & PORT_CODE_MASK) {
	case PORT_GET:
		/*
		 * a2/a3 carry the timeout (tv_sec/tv_nsec in portfs()); the
		 * (int) casts sign-extend them so negative 32-bit values
		 * survive the widening to uintptr_t.
		 */
		error = portfs(PORT_GET, a0, a1, (int)a2, (int)a3, a4);
		break;
	case PORT_SENDN:
		/*
		 * For PORT_SENDN, a0 is a user pointer to a port fd list
		 * (portfs() casts it to (int *)), so zero-extend it rather
		 * than letting the int32_t declaration sign-extend.
		 */
		error = portfs(opcode, (uint32_t)a0, a1, a2, a3, a4);
		break;
	default:
		/* all other opcodes pass a port file descriptor in a0 */
		error = portfs(opcode, a0, a1, a2, a3, a4);
		break;
	}
	return (error);
}
526 #endif	/* _SYSCALL32_IMPL */
527 
528 /*
529  * System entry point for port functions.
530  * a0 is a port file descriptor (except for PORT_SENDN and PORT_CREATE).
531  * The libc uses PORT_SYS_NOPORT in functions which do not deliver a
532  * port file descriptor as first argument.
533  */
static int64_t
portfs(int opcode, uintptr_t a0, uintptr_t a1, uintptr_t a2, uintptr_t a3,
    uintptr_t a4)
{
	rval_t		r;
	port_t		*pp;
	int 		error = 0;
	uint_t		nget;
	file_t		*fp;
	port_gettimer_t	port_timer;

	r.r_vals = 0;
	/*
	 * PORT_SENDN and PORT_CREATE do not take an existing port file
	 * descriptor (PORT_SYS_NOPORT); handle them before the getf() below.
	 */
	if (opcode & PORT_SYS_NOPORT) {
		opcode &= PORT_CODE_MASK;
		if (opcode == PORT_SENDN) {
			error = port_sendn((int *)a0, (int *)a1, (uint_t)a2,
			    (int)a3, (void *)a4, (uint_t *)&r.r_val1);
			/* EIO is returned to the caller via r, not errno */
			if (error && (error != EIO))
				return ((int64_t)set_errno(error));
			return (r.r_vals);
		}

		if (opcode == PORT_CREATE) {
			error = port_create(&r.r_val1);
			if (error)
				return ((int64_t)set_errno(error));
			return (r.r_vals);
		}
	}

	/* opcodes using port as first argument (a0) */

	if ((fp = getf((int)a0)) == NULL)
		return ((uintptr_t)set_errno(EBADF));

	/* the file descriptor must refer to a port vnode */
	if (fp->f_vnode->v_type != VPORT) {
		releasef((int)a0);
		return ((uintptr_t)set_errno(EBADFD));
	}

	pp = VTOEP(fp->f_vnode);

	switch (opcode & PORT_CODE_MASK) {
	case	PORT_GET:
	{
		/* see PORT_GETN description */
		struct	timespec timeout;

		port_timer.pgt_flags = PORTGET_ONE;
		port_timer.pgt_loop = 0;
		port_timer.pgt_rqtp = NULL;
		/* nonzero a4 means a timeout was supplied: a2=sec, a3=nsec */
		if (a4 != NULL) {
			port_timer.pgt_timeout = &timeout;
			timeout.tv_sec = (time_t)a2;
			timeout.tv_nsec = (long)a3;
		} else {
			port_timer.pgt_timeout = NULL;
		}
		/* retry while port_getn() requests another pass (pgt_loop) */
		do {
			nget = 1;
			error = port_getn(pp, (port_event_t *)a1, 1,
			    (uint_t *)&nget, &port_timer);
		} while (nget == 0 && error == 0 && port_timer.pgt_loop);
		break;
	}
	case	PORT_GETN:
	{
		/*
		 * port_getn() can only retrieve own or shareable events from
		 * other processes. The port_getn() function remains in the
		 * kernel until own or shareable events are available or the
		 * timeout elapses.
		 */
		port_timer.pgt_flags = 0;
		port_timer.pgt_loop = 0;
		port_timer.pgt_rqtp = NULL;
		port_timer.pgt_timeout = (struct timespec *)a4;
		do {
			nget = a3;
			error = port_getn(pp, (port_event_t *)a1, (uint_t)a2,
			    (uint_t *)&nget, &port_timer);
		} while (nget == 0 && error == 0 && port_timer.pgt_loop);
		/*
		 * This case returns directly: the event count goes into
		 * r_val1, the status into r_val2, and ETIME is reported
		 * through r rather than being turned into errno.
		 */
		r.r_val1 = nget;
		r.r_val2 = error;
		releasef((int)a0);
		if (error && error != ETIME)
			return ((int64_t)set_errno(error));
		return (r.r_vals);
	}
	case	PORT_ASSOCIATE:
	{
		/* currently only PORT_SOURCE_FD is implemented */
		if ((int)a1 != PORT_SOURCE_FD) {
			error = EINVAL;
			break;
		}
		error = port_associate_fd(pp, (int)a1, (uintptr_t)a2, (int)a3,
			    (void *)a4);
		break;
	}
	case	PORT_SEND:
	{
		/* user-defined events */
		error = port_send(pp, PORT_SOURCE_USER, (int)a1, (void *)a2);
		break;
	}
	case	PORT_DISPATCH:
	{
		/*
		 * library events, blocking
		 * Only events of type PORT_SOURCE_AIO or PORT_SOURCE_MQ
		 * are currently allowed.
		 */
		if ((int)a1 != PORT_SOURCE_AIO && (int)a1 != PORT_SOURCE_MQ) {
			error = EINVAL;
			break;
		}
		error = port_dispatch_event(pp, (int)opcode, (int)a1, (int)a2,
		    (uintptr_t)a3, (void *)a4);
		break;
	}
	case	PORT_DISSOCIATE:
	{
		/* currently only PORT_SOURCE_FD is implemented */
		if ((int)a1 != PORT_SOURCE_FD) {
			error = EINVAL;
			break;
		}
		error = port_dissociate_fd(pp, (uintptr_t)a2);
		break;
	}
	case	PORT_ALERT:
	{
		/* nonzero events set alert mode; zero clears it */
		if ((int)a2)	/* a2 = events */
			error = port_alert(pp, (int)a1, (int)a2, (void *)a3);
		else
			port_remove_alert(&pp->port_queue);
		break;
	}
	default:
		error = EINVAL;
		break;
	}

	/* common exit for all cases that fall through the switch */
	releasef((int)a0);
	if (error)
		return ((int64_t)set_errno(error));
	return (r.r_vals);
}
683 
684 /*
685  * System call to create a port.
686  *
687  * The port_create() function creates a vnode of type VPORT per port.
688  * The port control data is associated with the vnode as vnode private data.
689  * The port_create() function returns an event port file descriptor.
690  */
static int
port_create(int *fdp)
{
	port_t		*pp;
	vnode_t		*vp;
	struct file	*fp;
	proc_t		*p = curproc;

	/* initialize vnode and port private data */
	pp = kmem_zalloc(sizeof (port_t), KM_SLEEP);

	pp->port_vnode = vn_alloc(KM_SLEEP);
	vp = EPTOV(pp);
	vn_setops(vp, port_vnodeops);
	vp->v_type = VPORT;
	vp->v_vfsp = &port_vfs;
	vp->v_data = (caddr_t)pp;

	/* lock order here: pc_mutex before p_lock */
	mutex_enter(&port_control.pc_mutex);
	/*
	 * Retrieve the maximal number of event ports allowed per system from
	 * the resource control: project.port-max-ids.
	 */
	mutex_enter(&p->p_lock);
	if (rctl_test(rc_project_portids, p->p_task->tk_proj->kpj_rctls, p,
	    port_control.pc_nents + 1, RCA_SAFE) & RCT_DENY) {
		/* system-wide port limit reached: undo allocations */
		mutex_exit(&p->p_lock);
		vn_free(vp);
		kmem_free(pp, sizeof (port_t));
		mutex_exit(&port_control.pc_mutex);
		return (EAGAIN);
	}

	/*
	 * Retrieve the maximal number of events allowed per port from
	 * the resource control: process.port-max-events.
	 */
	pp->port_max_events = rctl_enforced_value(rc_process_portev,
	    p->p_rctls, p);
	mutex_exit(&p->p_lock);

	/* allocate a new user file descriptor and a file structure */
	if (falloc(vp, 0, &fp, fdp)) {
		/*
		 * If the file table is full, free allocated resources.
		 */
		vn_free(vp);
		kmem_free(pp, sizeof (port_t));
		mutex_exit(&port_control.pc_mutex);
		return (EMFILE);
	}

	/* falloc() returned with fp->f_tlock held; drop it */
	mutex_exit(&fp->f_tlock);

	/* account the new port in the global and per-process counters */
	pp->port_fd = *fdp;
	port_control.pc_nents++;
	p->p_portcnt++;
	port_kstat.pks_ports.value.ui32++;
	mutex_exit(&port_control.pc_mutex);

	/* initializes port private data */
	port_init(pp);
	/* set user file pointer */
	setf(*fdp, fp);
	return (0);
}
757 
758 /*
759  * port_init() initializes event port specific data
760  */
static void
port_init(port_t *pp)
{
	port_queue_t	*portq;
	port_ksource_t	*pks;

	/* per-port mutexes: one for the port itself, one for its queue */
	mutex_init(&pp->port_mutex, NULL, MUTEX_DEFAULT, NULL);
	portq = &pp->port_queue;
	mutex_init(&portq->portq_mutex, NULL, MUTEX_DEFAULT, NULL);
	pp->port_flags |= PORT_INIT;

	/*
	 * If it is not enough memory available to satisfy a user
	 * request using a single port_getn() call then port_getn()
	 * will reduce the size of the list to PORT_MAX_LIST.
	 */
	pp->port_max_list = port_max_list;

	/* Set timestamp entries required for fstat(2) requests */
	gethrestime(&pp->port_ctime);
	pp->port_uid = crgetuid(curproc->p_cred);
	pp->port_gid = crgetgid(curproc->p_cred);

	/* initialize port queue structs */
	list_create(&portq->portq_list, sizeof (port_kevent_t),
	    offsetof(port_kevent_t, portkev_node));
	list_create(&portq->portq_get_list, sizeof (port_kevent_t),
	    offsetof(port_kevent_t, portkev_node));
	portq->portq_flags = 0;
	/* remember the creating process (owner) of this port */
	pp->port_pid = curproc->p_pid;

	/* Allocate cache skeleton for PORT_SOURCE_FD events */
	portq->portq_pcp = kmem_zalloc(sizeof (port_fdcache_t), KM_SLEEP);
	mutex_init(&portq->portq_pcp->pc_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Allocate cache skeleton for association of event sources.
	 */
	mutex_init(&portq->portq_source_mutex, NULL, MUTEX_DEFAULT, NULL);
	portq->portq_scache = kmem_zalloc(
	    PORT_SCACHE_SIZE * sizeof (port_source_t *), KM_SLEEP);

	/*
	 * pre-associate some kernel sources with this port.
	 * The pre-association is required to create port_source_t
	 * structures for object association.
	 * Some sources can not get associated with a port before the first
	 * object association is requested. Another reason to pre_associate
	 * a particular source with a port is because of performance.
	 */

	for (pks = port_ksource_tab; pks->pks_source != 0; pks++)
		port_add_ksource_local(pp, pks);
815 
816 /*
817  * The port_add_ksource_local() function is being used to associate
818  * event sources with every new port.
819  * The event sources need to be added to port_ksource_tab[].
820  */
821 static void
822 port_add_ksource_local(port_t *pp, port_ksource_t *pks)
823 {
824 	port_source_t	*pse;
825 	port_source_t	**ps;
826 
827 	mutex_enter(&pp->port_queue.portq_source_mutex);
828 	ps = &pp->port_queue.portq_scache[PORT_SHASH(pks->pks_source)];
829 	for (pse = *ps; pse != NULL; pse = pse->portsrc_next) {
830 		if (pse->portsrc_source == pks->pks_source)
831 			break;
832 	}
833 
834 	if (pse == NULL) {
835 		/* associate new source with the port */
836 		pse = kmem_zalloc(sizeof (port_source_t), KM_SLEEP);
837 		pse->portsrc_source = pks->pks_source;
838 		pse->portsrc_close = pks->pks_close;
839 		pse->portsrc_closearg = pks->pks_closearg;
840 		pse->portsrc_cnt = 1;
841 
842 		pks->pks_portsrc = pse;
843 		if (*ps != NULL)
844 			pse->portsrc_next = (*ps)->portsrc_next;
845 		*ps = pse;
846 	}
847 	mutex_exit(&pp->port_queue.portq_source_mutex);
848 }
849 
850 /*
851  * The port_send() function sends an event of type "source" to a
852  * port. This function is non-blocking. An event can be sent to
 * a port as long as the number of events per port does not exceed the
 * maximal allowed number of events. The max. number of events per port is
855  * defined by the resource control process.max-port-events.
856  * This function is used by the port library function port_send()
857  * and port_dispatch(). The port_send(3c) function is part of the
858  * event ports API and submits events of type PORT_SOURCE_USER. The
859  * port_dispatch() function is project private and it is used by library
860  * functions to submit events of other types than PORT_SOURCE_USER
861  * (e.g. PORT_SOURCE_AIO).
862  */
863 static int
864 port_send(port_t *pp, int source, int events, void *user)
865 {
866 	port_kevent_t	*pev;
867 	int		error;
868 
869 	error = port_alloc_event_local(pp, source, PORT_ALLOC_DEFAULT, &pev);
870 	if (error)
871 		return (error);
872 
873 	pev->portkev_object = 0;
874 	pev->portkev_events = events;
875 	pev->portkev_user = user;
876 	pev->portkev_callback = NULL;
877 	pev->portkev_arg = NULL;
878 	pev->portkev_flags = 0;
879 
880 	port_send_event(pev);
881 	return (0);
882 }
883 
884 /*
 * The port_noshare() function returns 0 if the current event was generated
 * by the same process. Otherwise it returns a value other than 0 and the
 * event should not be delivered to the current process.
888  * The port_noshare() function is normally used by the port_dispatch()
889  * function. The port_dispatch() function is project private and can only be
890  * used within the event port project.
891  * Currently the libaio uses the port_dispatch() function to deliver events
892  * of types PORT_SOURCE_AIO.
893  */
894 /* ARGSUSED */
895 static int
896 port_noshare(void *arg, int *events, pid_t pid, int flag, void *evp)
897 {
898 	if (flag == PORT_CALLBACK_DEFAULT && curproc->p_pid != pid)
899 		return (1);
900 	return (0);
901 }
902 
903 /*
904  * The port_dispatch_event() function is project private and it is used by
905  * libraries involved in the project to deliver events to the port.
906  * port_dispatch will sleep and wait for enough resources to satisfy the
907  * request, if necessary.
908  * The library can specify if the delivered event is shareable with other
909  * processes (see PORT_SYS_NOSHARE flag).
910  */
911 static int
912 port_dispatch_event(port_t *pp, int opcode, int source, int events,
913     uintptr_t object, void *user)
914 {
915 	port_kevent_t	*pev;
916 	int		error;
917 
918 	error = port_alloc_event_block(pp, source, PORT_ALLOC_DEFAULT, &pev);
919 	if (error)
920 		return (error);
921 
922 	pev->portkev_object = object;
923 	pev->portkev_events = events;
924 	pev->portkev_user = user;
925 	pev->portkev_arg = NULL;
926 	if (opcode & PORT_SYS_NOSHARE) {
927 		pev->portkev_flags = PORT_KEV_NOSHARE;
928 		pev->portkev_callback = port_noshare;
929 	} else {
930 		pev->portkev_flags = 0;
931 		pev->portkev_callback = NULL;
932 	}
933 
934 	port_send_event(pev);
935 	return (0);
936 }
937 
938 
939 /*
940  * The port_sendn() function is the kernel implementation of the event
941  * port API function port_sendn(3c).
942  * This function is able to send an event to a list of event ports.
943  */
static int
port_sendn(int ports[], int errors[], uint_t nent, int events, void *user,
    uint_t *nget)
{
	port_kevent_t	*pev;
	int		errorcnt = 0;
	int		error = 0;
	int		count;
	int		port;
	int		*plist;
	int		*elist = NULL;	/* lazily allocated by port_errorn() */
	file_t		*fp;
	port_t		*pp;

	/* reject empty or oversized descriptor lists */
	if (nent == 0 || nent > port_max_list)
		return (EINVAL);

	/* copy the list of port file descriptors in from user space */
	plist = kmem_alloc(nent * sizeof (int), KM_SLEEP);
	if (copyin((void *)ports, plist, nent * sizeof (int))) {
		kmem_free(plist, nent * sizeof (int));
		return (EFAULT);
	}

	/*
	 * Scan the list for event port file descriptors and send the
	 * attached user event data embedded in a event of type
	 * PORT_SOURCE_USER to every event port in the list.
	 * If a list entry is not a valid event port then the corresponding
	 * error code will be stored in the errors[] list with the same
	 * list offset as in the ports[] list.
	 */

	for (count = 0; count < nent; count++) {
		port = plist[count];
		if ((fp = getf(port)) == NULL) {
			elist = port_errorn(elist, nent, EBADF, count);
			errorcnt++;
			continue;
		}

		/*
		 * NOTE(review): VTOEP() is applied before the v_type check
		 * below; pp is only dereferenced after the VPORT check, so
		 * this is harmless pointer arithmetic — confirm.
		 */
		pp = VTOEP(fp->f_vnode);
		if (fp->f_vnode->v_type != VPORT) {
			releasef(port);
			elist = port_errorn(elist, nent, EBADFD, count);
			errorcnt++;
			continue;
		}

		/* non-blocking allocation of an event slot on this port */
		error = port_alloc_event_local(pp, PORT_SOURCE_USER,
		    PORT_ALLOC_DEFAULT, &pev);
		if (error) {
			releasef(port);
			elist = port_errorn(elist, nent, error, count);
			errorcnt++;
			continue;
		}

		pev->portkev_object = 0;
		pev->portkev_events = events;
		pev->portkev_user = user;
		pev->portkev_callback = NULL;
		pev->portkev_arg = NULL;
		pev->portkev_flags = 0;

		port_send_event(pev);
		releasef(port);
	}
	/* on any failure, copy the per-descriptor error codes back out */
	if (errorcnt) {
		error = EIO;
		if (copyout(elist, (void *)errors, nent * sizeof (int)))
			error = EFAULT;
		kmem_free(elist, nent * sizeof (int));
	}
	*nget = nent - errorcnt;	/* number of successful deliveries */
	kmem_free(plist, nent * sizeof (int));
	return (error);
}
1021 
1022 static int *
1023 port_errorn(int *elist, int nent, int error, int index)
1024 {
1025 	if (elist == NULL)
1026 		elist = kmem_zalloc(nent * sizeof (int), KM_SLEEP);
1027 	elist[index] = error;
1028 	return (elist);
1029 }
1030 
1031 /*
1032  * port_alert()
 * The port_alert() function is a high priority event and it is always set
1034  * on top of the queue. It is also delivered as single event.
1035  * flags:
1036  *	- SET	:overwrite current alert data
1037  *	- UPDATE:set alert data or return EBUSY if alert mode is already set
1038  *
1039  * - set the ALERT flag
1040  * - wakeup all sleeping threads
1041  */
static int
port_alert(port_t *pp, int flags, int events, void *user)
{
	port_queue_t	*portq;
	portget_t	*pgetp;
	port_alert_t	*pa;

	/* reject requests with both SET and UPDATE semantics */
	if ((flags & PORT_ALERT_INVALID) == PORT_ALERT_INVALID)
		return (EINVAL);

	portq = &pp->port_queue;
	pa = &portq->portq_alert;
	mutex_enter(&portq->portq_mutex);

	/* check alert conditions: UPDATE must not overwrite an active alert */
	if (flags == PORT_ALERT_UPDATE) {
		if (portq->portq_flags & PORTQ_ALERT) {
			mutex_exit(&portq->portq_mutex);
			return (EBUSY);
		}
	}

	/*
	 * Store alert data in the port to be delivered to threads
	 * which are using port_get(n) to retrieve events.
	 */

	portq->portq_flags |= PORTQ_ALERT;
	pa->portal_events = events;		/* alert info */
	pa->portal_pid = curproc->p_pid;	/* process owner */
	pa->portal_object = 0;			/* no object */
	pa->portal_user = user;			/* user alert data */

	/* alert and deliver alert data to waiting threads */
	pgetp = portq->portq_thread;
	if (pgetp == NULL) {
		/* no threads waiting for events */
		mutex_exit(&portq->portq_mutex);
		return (0);
	}

	/*
	 * Set waiting threads in alert mode (PORTGET_ALERT)..
	 * Every thread waiting for events already allocated a portget_t
	 * structure to sleep on.
	 * The port alert arguments are stored in the portget_t structure.
	 * The PORTGET_ALERT flag is set to indicate the thread to return
	 * immediately with the alert event.
	 * The waiter list is circular, so the walk stops when it gets
	 * back to the head entry.
	 */
	do {
		if ((pgetp->portget_state & PORTGET_ALERT) == 0) {
			pa = &pgetp->portget_alert;
			pa->portal_events = events;
			pa->portal_object = 0;
			pa->portal_user = user;
			pgetp->portget_state |= PORTGET_ALERT;
			cv_signal(&pgetp->portget_cv);
		}
	} while ((pgetp = pgetp->portget_next) != portq->portq_thread);
	mutex_exit(&portq->portq_mutex);
	return (0);
}
1104 
1105 /*
1106  * Clear alert state of the port
1107  */
static void
port_remove_alert(port_queue_t *portq)
{
	/* clear the alert flag under the queue lock; alert data may remain */
	mutex_enter(&portq->portq_mutex);
	portq->portq_flags &= ~PORTQ_ALERT;
	mutex_exit(&portq->portq_mutex);
}
1115 
1116 /*
1117  * The port_getn() function is used to retrieve events from a port.
1118  *
1119  * The port_getn() function returns immediately if there are enough events
1120  * available in the port to satisfy the request or if the port is in alert
1121  * mode (see port_alert(3c)).
1122  * The timeout argument of port_getn(3c) -which is embedded in the
1123  * port_gettimer_t structure- specifies if the system call should block or if it
1124  * should return immediately depending on the number of events available.
1125  * This function is internally used by port_getn(3c) as well as by
1126  * port_get(3c).
1127  */
static int
port_getn(port_t *pp, port_event_t *uevp, uint_t max, uint_t *nget,
    port_gettimer_t *pgt)
{
	port_queue_t	*portq;
	port_kevent_t 	*pev;
	port_kevent_t 	*lev;
	int		error = 0;
	uint_t		nmax;
	uint_t		nevents;
	uint_t		eventsz;
	port_event_t	*kevp;
	list_t		*glist;
	uint_t		tnent;
	int		rval;
	int		blocking = -1;	/* -1: timeout not examined yet */
	int		flag;
	timespec_t	rqtime;
	timespec_t	*rqtp = NULL;
	portget_t	*pgetp;
	void		*results;
	model_t		model = get_udatamodel();

	flag = pgt->pgt_flags;

	/* the caller may not request more events than the list can hold */
	if (*nget > max && max > 0)
		return (EINVAL);

	portq = &pp->port_queue;
	mutex_enter(&portq->portq_mutex);
	if (max == 0) {
		/*
		 * Return number of objects with events.
		 * The port_block() call is required to synchronize this
		 * thread with another possible thread, which could be
		 * retrieving events from the port queue.
		 */
		port_block(portq);
		/*
		 * Check if a second thread is currently retrieving events
		 * and it is using the temporary event queue.
		 */
		if (portq->portq_tnent) {
			/* put remaining events back to the port queue */
			port_push_eventq(portq);
		}
		*nget = portq->portq_nent;
		port_unblock(portq);
		mutex_exit(&portq->portq_mutex);
		return (0);
	}

	if (uevp == NULL) {
		mutex_exit(&portq->portq_mutex);
		return (EFAULT);
	}
	if (*nget == 0) {		/* no events required */
		mutex_exit(&portq->portq_mutex);
		return (0);
	}

	/* port is being closed ... */
	if (portq->portq_flags & PORTQ_CLOSE) {
		mutex_exit(&portq->portq_mutex);
		return (EBADFD);
	}

	/* return immediately if port in alert mode */
	if (portq->portq_flags & PORTQ_ALERT) {
		error = port_get_alert(&portq->portq_alert, uevp);
		if (error == 0)
			*nget = 1;
		mutex_exit(&portq->portq_mutex);
		return (error);
	}

	/* this thread now counts as a waiter for close(2) accounting */
	portq->portq_thrcnt++;

	/*
	 * Now check if the completed events satisfy the
	 * "wait" requirements of the current thread:
	 */

	if (pgt->pgt_loop) {
		/*
		 * loop entry of same thread
		 * pgt_loop is set when the current thread returns
		 * prematurely from this function. That could happen
		 * when a port is being shared between processes and
		 * this thread could not find events to return.
		 * It is not allowed to a thread to retrieve non-shareable
		 * events generated in other processes.
		 * PORTQ_WAIT_EVENTS is set when a thread already
		 * checked the current event queue and no new events
		 * are added to the queue.
		 */
		if (((portq->portq_flags & PORTQ_WAIT_EVENTS) == 0) &&
		    (portq->portq_nent >= *nget)) {
			/* some new events arrived ...check them */
			goto portnowait;
		}
		rqtp = pgt->pgt_rqtp;
		pgt->pgt_flags |= PORTGET_WAIT_EVENTS;
	} else {
		/* check if enough events are available ... */
		if (portq->portq_nent >= *nget)
			goto portnowait;
		/*
		 * There are not enough events available to satisfy
		 * the request, check timeout value and wait for
		 * incoming events.
		 */
		error = port_get_timeout(pgt->pgt_timeout, &rqtime, &rqtp,
		    &blocking, flag);
		if (error) {
			port_check_return_cond(portq);
			mutex_exit(&portq->portq_mutex);
			return (error);
		}

		if (blocking == 0) /* don't block, check fired events */
			goto portnowait;

		/* convert the relative timeout to an absolute deadline */
		if (rqtp != NULL) {
			timespec_t	now;
			gethrestime(&now);
			timespecadd(rqtp, &now);
		}
	}

	/* enqueue thread in the list of waiting threads */
	pgetp = port_queue_thread(portq, *nget);


	/* Wait here until return conditions met */
	for (;;) {
		if (pgetp->portget_state & PORTGET_ALERT) {
			/* reap alert event and return */
			error = port_get_alert(&pgetp->portget_alert, uevp);
			if (error)
				*nget = 0;
			else
				*nget = 1;
			port_dequeue_thread(&pp->port_queue, pgetp);
			portq->portq_thrcnt--;
			mutex_exit(&portq->portq_mutex);
			return (error);
		}

		/*
		 * Check if some other thread is already retrieving
		 * events (portq_getn > 0).
		 */

		if ((portq->portq_getn  == 0) &&
		    ((portq)->portq_nent >= *nget) &&
		    (!((pgt)->pgt_flags & PORTGET_WAIT_EVENTS) ||
		    !((portq)->portq_flags & PORTQ_WAIT_EVENTS)))
			break;

		if (portq->portq_flags & PORTQ_CLOSE) {
			error = EBADFD;
			break;
		}

		/* sleep until deadline, signal, or cv_signal wakeup */
		rval = cv_waituntil_sig(&pgetp->portget_cv, &portq->portq_mutex,
		    rqtp);

		if (rval <= 0) {
			/* 0 = interrupted by a signal, <0 = timed out */
			error = (rval == 0) ? EINTR : ETIME;
			break;
		}
	}

	/* take thread out of the wait queue */
	port_dequeue_thread(portq, pgetp);

	/* ETIME without user timeout data (flag == 0) still checks events */
	if (error != 0 && (error == EINTR || error == EBADFD ||
	    (error == ETIME && flag))) {
		/* return without events */
		port_check_return_cond(portq);
		mutex_exit(&portq->portq_mutex);
		return (error);
	}

portnowait:
	/*
	 * Move the port event queue to a temporary event queue.
	 * New incoming events will continue to be posted to the event queue
	 * and they will not be considered by the current thread.
	 * The idea is to avoid lock contentions or an often locking/unlocking
	 * of the port queue mutex. The contention and performance degradation
	 * could happen because:
	 * a) incoming events use the port queue mutex to enqueue new events and
	 * b) before the event can be delivered to the application it is
	 *    necessary to notify the event sources about the event delivery.
	 *    Sometimes the event sources can require a long time to return and
	 *    the queue mutex would block incoming events.
	 * During this time incoming events (port_send_event()) do not need
	 * to awake threads waiting for events. Before the current thread
	 * returns it will check the conditions to awake other waiting threads.
	 */
	portq->portq_getn++;	/* number of threads retrieving events */
	port_block(portq);	/* block other threads here */
	nmax = max < portq->portq_nent ? max : portq->portq_nent;

	if (portq->portq_tnent) {
		/*
		 * Move remaining events from previous thread back to the
		 * port event queue.
		 */
		port_push_eventq(portq);
	}
	/* move port event queue to a temporary queue */
	list_move_tail(&portq->portq_get_list, &portq->portq_list);
	glist = &portq->portq_get_list;	/* use temporary event queue */
	tnent = portq->portq_nent;	/* get current number of events */
	portq->portq_nent = 0;		/* no events in the port event queue */
	portq->portq_flags |= PORTQ_WAIT_EVENTS; /* detect incoming events */
	mutex_exit(&portq->portq_mutex);    /* event queue can be reused now */

	if (model == DATAMODEL_NATIVE) {
		eventsz = sizeof (port_event_t);
		/* try a non-sleeping allocation first, then cap and sleep */
		kevp = kmem_alloc(eventsz * nmax, KM_NOSLEEP);
		if (kevp == NULL) {
			if (nmax > pp->port_max_list)
				nmax = pp->port_max_list;
			kevp = kmem_alloc(eventsz * nmax, KM_SLEEP);
		}
		results = kevp;
		lev = NULL;	/* start with first event in the queue */
		for (nevents = 0; nevents < nmax; ) {
			pev = port_get_kevent(glist, lev);
			if (pev == NULL)	/* no more events available */
				break;
			if (pev->portkev_flags & PORT_KEV_FREE) {
				/* Just discard event */
				list_remove(glist, pev);
				pev->portkev_flags &= ~(PORT_CLEANUP_DONE);
				if (PORT_FREE_EVENT(pev))
					port_free_event_local(pev, 0);
				tnent--;
				continue;
			}

			/* move event data to copyout list */
			if (port_copy_event(&kevp[nevents], pev, glist)) {
				/*
				 * Event can not be delivered to the
				 * current process.
				 */
				if (lev != NULL)
					list_insert_after(glist, lev, pev);
				else
					list_insert_head(glist, pev);
				lev = pev;  /* last checked event */
			} else {
				nevents++;	/* # of events ready */
			}
		}
#ifdef	_SYSCALL32_IMPL
	} else {
		/* 32-bit application on a 64-bit kernel */
		port_event32_t	*kevp32;

		eventsz = sizeof (port_event32_t);
		kevp32 = kmem_alloc(eventsz * nmax, KM_NOSLEEP);
		if (kevp32 == NULL) {
			if (nmax > pp->port_max_list)
				nmax = pp->port_max_list;
			kevp32 = kmem_alloc(eventsz * nmax, KM_SLEEP);
		}
		results = kevp32;
		lev = NULL;	/* start with first event in the queue */
		for (nevents = 0; nevents < nmax; ) {
			pev = port_get_kevent(glist, lev);
			if (pev == NULL)	/* no more events available */
				break;
			if (pev->portkev_flags & PORT_KEV_FREE) {
				/* Just discard event */
				list_remove(glist, pev);
				pev->portkev_flags &= ~(PORT_CLEANUP_DONE);
				if (PORT_FREE_EVENT(pev))
					port_free_event_local(pev, 0);
				tnent--;
				continue;
			}

			/* move event data to copyout list */
			if (port_copy_event32(&kevp32[nevents], pev, glist)) {
				/*
				 * Event can not be delivered to the
				 * current process.
				 */
				if (lev != NULL)
					list_insert_after(glist, lev, pev);
				else
					list_insert_head(glist, pev);
				lev = pev;  /* last checked event */
			} else {
				nevents++;	/* # of events ready */
			}
		}
#endif	/* _SYSCALL32_IMPL */
	}

	/*
	 *  Remember number of remaining events in the temporary event queue.
	 */
	portq->portq_tnent = tnent - nevents;

	/*
	 * Work to do before return :
	 * - push list of remaining events back to the top of the standard
	 *   port queue.
	 * - if this is the last thread calling port_get(n) then wakeup the
	 *   thread waiting on close(2).
	 * - check for a deferred cv_signal from port_send_event() and wakeup
	 *   the sleeping thread.
	 */

	mutex_enter(&portq->portq_mutex);
	port_unblock(portq);
	if (portq->portq_tnent) {
		/*
		 * move remaining events in the temporary event queue back
		 * to the port event queue
		 */
		port_push_eventq(portq);
	}
	portq->portq_getn--;	/* update # of threads retrieving events */
	if (--portq->portq_thrcnt == 0) { /* # of threads waiting ... */
		/* Last thread => check close(2) conditions ... */
		if (portq->portq_flags & PORTQ_CLOSE) {
			cv_signal(&portq->portq_closecv);
			mutex_exit(&portq->portq_mutex);
			kmem_free(results, eventsz * nmax);
			/* do not copyout events */
			*nget = 0;
			return (EBADFD);
		}
	} else if (portq->portq_getn == 0) {
		/*
		 * no other threads retrieving events ...
		 * check wakeup conditions of sleeping threads
		 */
		if ((portq->portq_thread != NULL) &&
		    (portq->portq_nent >= portq->portq_nget))
			cv_signal(&portq->portq_thread->portget_cv);
	}

	/*
	 * Check PORTQ_POLLIN here because the current thread set temporarily
	 * the number of events in the queue to zero.
	 */
	if (portq->portq_flags & PORTQ_POLLIN) {
		portq->portq_flags &= ~PORTQ_POLLIN;
		mutex_exit(&portq->portq_mutex);
		pollwakeup(&pp->port_pollhd, POLLIN);
	} else {
		mutex_exit(&portq->portq_mutex);
	}

	/* now copyout list of user event structures to user space */
	if (nevents) {
		if (copyout(results, uevp, nevents * eventsz))
			error = EFAULT;
	}
	kmem_free(results, eventsz * nmax);

	if (nevents == 0 && error == 0 && pgt->pgt_loop == 0 && blocking != 0) {
		/* no events retrieved: check loop conditions */
		if (blocking == -1) {
			/* no timeout checked */
			error = port_get_timeout(pgt->pgt_timeout,
			    &pgt->pgt_rqtime, &rqtp, &blocking, flag);
			if (error) {
				*nget = nevents;
				return (error);
			}
			if (rqtp != NULL) {
				timespec_t	now;
				gethrestime(&now);
				timespecadd(&pgt->pgt_rqtime, &now);
			}
			pgt->pgt_rqtp = rqtp;
		} else {
			/* timeout already checked -> remember values */
			pgt->pgt_rqtp = rqtp;
			if (rqtp != NULL) {
				pgt->pgt_rqtime = *rqtp;
			}
		}
		if (blocking)
			/* timeout remaining */
			pgt->pgt_loop = 1;
	}

	/* set number of user event structures completed */
	*nget = nevents;
	return (error);
}
1529 
1530 /*
1531  * 1. copy kernel event structure to user event structure.
1532  * 2. PORT_KEV_WIRED event structures will be reused by the "source"
1533  * 3. Remove PORT_KEV_DONEQ flag (event removed from the event queue)
1534  * 4. Other types of event structures can be delivered back to the port cache
1535  *    (port_free_event_local()).
1536  * 5. The event source callback function is the last opportunity for the
1537  *    event source to update events, to free local resources associated with
1538  *    the event or to deny the delivery of the event.
1539  */
static int
port_copy_event(port_event_t *puevp, port_kevent_t *pkevp, list_t *list)
{
	int	free_event = 0;
	int	flags;
	int	error;

	/* copy event payload to the (kernel-side) user event structure */
	puevp->portev_source = pkevp->portkev_source;
	puevp->portev_object = pkevp->portkev_object;
	puevp->portev_user = pkevp->portkev_user;
	puevp->portev_events = pkevp->portkev_events;

	/* remove event from the queue */
	list_remove(list, pkevp);

	/*
	 * Events of type PORT_KEV_WIRED remain allocated by the
	 * event source.
	 */
	flags = pkevp->portkev_flags;	/* saved to restore on callback error */
	if (pkevp->portkev_flags & PORT_KEV_WIRED)
		pkevp->portkev_flags &= ~PORT_KEV_DONEQ;
	else
		free_event = 1;

	/* last chance for the source to update or deny the delivery */
	if (pkevp->portkev_callback) {
		error = (*pkevp->portkev_callback)(pkevp->portkev_arg,
		    &puevp->portev_events, pkevp->portkev_pid,
		    PORT_CALLBACK_DEFAULT, pkevp);

		if (error) {
			/*
			 * Event can not be delivered.
			 * Caller must reinsert the event into the queue.
			 */
			pkevp->portkev_flags = flags;
			return (error);
		}
	}
	if (free_event)
		port_free_event_local(pkevp, 0);
	return (0);
}
1583 
1584 #ifdef	_SYSCALL32_IMPL
1585 /*
1586  * 1. copy kernel event structure to user event structure.
1587  * 2. PORT_KEV_WIRED event structures will be reused by the "source"
1588  * 3. Remove PORT_KEV_DONEQ flag (event removed from the event queue)
1589  * 4. Other types of event structures can be delivered back to the port cache
1590  *    (port_free_event_local()).
1591  * 5. The event source callback function is the last opportunity for the
1592  *    event source to update events, to free local resources associated with
1593  *    the event or to deny the delivery of the event.
1594  */
static int
port_copy_event32(port_event32_t *puevp, port_kevent_t *pkevp, list_t *list)
{
	int	free_event = 0;
	int	error;
	int	flags;

	/* copy event payload, narrowing pointers/objects to 32 bits */
	puevp->portev_source = pkevp->portkev_source;
	puevp->portev_object = (daddr32_t)pkevp->portkev_object;
	puevp->portev_user = (caddr32_t)(uintptr_t)pkevp->portkev_user;
	puevp->portev_events = pkevp->portkev_events;

	/* remove event from the queue */
	list_remove(list, pkevp);

	/*
	 * Events of type PORT_KEV_WIRED remain allocated by the
	 * sub-system (source).
	 */

	flags = pkevp->portkev_flags;	/* saved to restore on callback error */
	if (pkevp->portkev_flags & PORT_KEV_WIRED)
		pkevp->portkev_flags &= ~PORT_KEV_DONEQ;
	else
		free_event = 1;

	/* last chance for the source to update or deny the delivery */
	if (pkevp->portkev_callback != NULL) {
		error = (*pkevp->portkev_callback)(pkevp->portkev_arg,
		    &puevp->portev_events, pkevp->portkev_pid,
		    PORT_CALLBACK_DEFAULT, pkevp);
		if (error) {
			/*
			 * Event can not be delivered.
			 * Caller must reinsert the event into the queue.
			 */
			pkevp->portkev_flags = flags;
			return (error);
		}
	}
	if (free_event)
		port_free_event_local(pkevp, 0);
	return (0);
}
1638 #endif	/* _SYSCALL32_IMPL */
1639 
1640 /*
1641  * copyout alert event.
1642  */
static int
port_get_alert(port_alert_t *pa, port_event_t *uevp)
{
	model_t	model = get_udatamodel();

	/* copyout alert event structures to user space */
	if (model == DATAMODEL_NATIVE) {
		port_event_t	uev;
		uev.portev_source = PORT_SOURCE_ALERT;
		uev.portev_object = pa->portal_object;
		uev.portev_events = pa->portal_events;
		uev.portev_user = pa->portal_user;
		if (copyout(&uev, uevp, sizeof (port_event_t)))
			return (EFAULT);
#ifdef	_SYSCALL32_IMPL
	} else {
		port_event32_t	uev32;
		uev32.portev_source = PORT_SOURCE_ALERT;
		uev32.portev_object = (daddr32_t)pa->portal_object;
		uev32.portev_events = pa->portal_events;
		/*
		 * NOTE(review): portev_user is cast via daddr32_t here but
		 * via caddr32_t in port_copy_event32() — both are 32-bit,
		 * so this looks benign, but verify against the headers.
		 */
		uev32.portev_user = (daddr32_t)(uintptr_t)pa->portal_user;
		if (copyout(&uev32, uevp, sizeof (port_event32_t)))
			return (EFAULT);
#endif	/* _SYSCALL32_IMPL */
	}
	return (0);
}
1670 
1671 /*
1672  * Check return conditions :
1673  * - pending port close(2)
1674  * - threads waiting for events
1675  */
static void
port_check_return_cond(port_queue_t *portq)
{
	ASSERT(MUTEX_HELD(&portq->portq_mutex));
	/* the current thread stops waiting; drop it from the waiter count */
	portq->portq_thrcnt--;
	if (portq->portq_flags & PORTQ_CLOSE) {
		if (portq->portq_thrcnt == 0)
			/* last waiter: wake the thread blocked in close(2) */
			cv_signal(&portq->portq_closecv);
		else
			/* wake another waiter so it can observe the close */
			cv_signal(&portq->portq_thread->portget_cv);
	}
}
1688 
1689 /*
1690  * The port_get_kevent() function returns
1691  * - the event located at the head of the queue if 'last' pointer is NULL
1692  * - the next event after the event pointed by 'last'
1693  * The caller of this function is responsible for the integrity of the queue
1694  * in use:
1695  * - port_getn() is using a temporary queue protected with port_block().
1696  * - port_close_events() is working on the global event queue and protects
1697  *   the queue with portq->portq_mutex.
1698  */
1699 port_kevent_t *
1700 port_get_kevent(list_t *list, port_kevent_t *last)
1701 {
1702 	if (last == NULL)
1703 		return (list_head(list));
1704 	else
1705 		return (list_next(list, last));
1706 }
1707 
1708 /*
1709  * The port_get_timeout() function gets the timeout data from user space
1710  * and converts that info into a corresponding internal representation.
1711  * The kerneldata flag means that the timeout data is already loaded.
1712  */
static int
port_get_timeout(timespec_t *timeout, timespec_t *rqtime, timespec_t **rqtp,
    int *blocking, int kerneldata)
{
	model_t	model = get_udatamodel();

	*rqtp = NULL;
	if (timeout == NULL) {
		/* NULL timeout means wait indefinitely */
		*blocking = 1;
		return (0);
	}

	if (kerneldata) {
		/* timeout data already resides in kernel space */
		*rqtime = *timeout;
	} else {
		if (model == DATAMODEL_NATIVE) {
			if (copyin(timeout, rqtime, sizeof (*rqtime)))
				return (EFAULT);
#ifdef	_SYSCALL32_IMPL
		} else {
			/* 32-bit application: widen the timespec */
			timespec32_t 	wait_time_32;
			if (copyin(timeout, &wait_time_32,
			    sizeof (wait_time_32)))
				return (EFAULT);
			TIMESPEC32_TO_TIMESPEC(rqtime, &wait_time_32);
#endif  /* _SYSCALL32_IMPL */
		}
	}

	/* zero timeout means poll: do not block at all */
	if (rqtime->tv_sec == 0 && rqtime->tv_nsec == 0) {
		*blocking = 0;
		return (0);
	}

	if (rqtime->tv_sec < 0 ||
	    rqtime->tv_nsec < 0 || rqtime->tv_nsec >= NANOSEC)
		return (EINVAL);

	/* valid finite timeout: block until it expires */
	*rqtp = rqtime;
	*blocking = 1;
	return (0);
}
1755 
1756 /*
1757  * port_queue_thread()
1758  * Threads requiring more events than available will be put in a wait queue.
1759  * There is a "thread wait queue" per port.
1760  * Threads requiring less events get a higher priority than others and they
1761  * will be awoken first.
1762  */
/*
 * Allocate a portget_t for the calling thread and insert it into the
 * port's circular doubly-linked wait queue, which is kept ordered by
 * ascending portget_nget (number of events requested), so that threads
 * asking for fewer events sit nearer the head and are woken first.
 * portq->portq_thread points at the head; portq->portq_nget mirrors the
 * head's request count.
 * NOTE(review): presumably called with the port queue locked by the
 * caller — confirm at call sites.  The returned element is released
 * via port_dequeue_thread().
 */
static portget_t *
port_queue_thread(port_queue_t *portq, uint_t nget)
{
	portget_t	*pgetp;
	portget_t	*ttp;
	portget_t	*htp;

	pgetp = kmem_zalloc(sizeof (portget_t), KM_SLEEP);
	pgetp->portget_nget = nget;
	pgetp->portget_pid = curproc->p_pid;
	if (portq->portq_thread == NULL) {
		/* first waiting thread */
		portq->portq_thread = pgetp;
		portq->portq_nget = nget;
		/* single-element circular list: points at itself */
		pgetp->portget_prev = pgetp;
		pgetp->portget_next = pgetp;
		return (pgetp);
	}

	/*
	 * thread waiting for less events will be set on top of the queue.
	 */
	ttp = portq->portq_thread;
	htp = ttp;
	/*
	 * Walk forward until we find the first entry requesting at least
	 * 'nget' events, or we wrap back around to the head (in which
	 * case we insert at the tail).
	 */
	for (;;) {
		if (nget <= ttp->portget_nget)
			break;
		if (htp == ttp->portget_next)
			break;	/* last event */
		ttp = ttp->portget_next;
	}

	/* add thread to the queue */
	pgetp->portget_next = ttp;
	pgetp->portget_prev = ttp->portget_prev;
	ttp->portget_prev->portget_next = pgetp;
	ttp->portget_prev = pgetp;
	/* inserting before the head makes the new element the new head */
	if (portq->portq_thread == ttp)
		portq->portq_thread = pgetp;
	portq->portq_nget = portq->portq_thread->portget_nget;
	return (pgetp);
}
1805 
1806 /*
1807  * Take thread out of the queue.
1808  */
1809 static void
1810 port_dequeue_thread(port_queue_t *portq, portget_t *pgetp)
1811 {
1812 	if (pgetp->portget_next == pgetp) {
1813 		/* last (single) waiting thread */
1814 		portq->portq_thread = NULL;
1815 		portq->portq_nget = 0;
1816 	} else {
1817 		pgetp->portget_prev->portget_next = pgetp->portget_next;
1818 		pgetp->portget_next->portget_prev = pgetp->portget_prev;
1819 		if (portq->portq_thread == pgetp)
1820 			portq->portq_thread = pgetp->portget_next;
1821 		portq->portq_nget = portq->portq_thread->portget_nget;
1822 	}
1823 	kmem_free(pgetp, sizeof (portget_t));
1824 }
1825 
1826 /*
1827  * Set up event port kstats.
1828  */
1829 static void
1830 port_kstat_init()
1831 {
1832 	kstat_t	*ksp;
1833 	uint_t	ndata;
1834 
1835 	ndata = sizeof (port_kstat) / sizeof (kstat_named_t);
1836 	ksp = kstat_create("portfs", 0, "Event Ports", "misc",
1837 	    KSTAT_TYPE_NAMED, ndata, KSTAT_FLAG_VIRTUAL);
1838 	if (ksp) {
1839 		ksp->ks_data = &port_kstat;
1840 		kstat_install(ksp);
1841 	}
1842 }
1843