1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27/*
28 * Copyright (c) 2015 Joyent, Inc.  All rights reserved.
29 */
30
31#include <sys/types.h>
32#include <sys/systm.h>
33#include <sys/cred.h>
34#include <sys/modctl.h>
35#include <sys/vfs.h>
36#include <sys/vfs_opreg.h>
37#include <sys/sysmacros.h>
38#include <sys/cmn_err.h>
39#include <sys/stat.h>
40#include <sys/errno.h>
41#include <sys/kmem.h>
42#include <sys/file.h>
43#include <sys/kstat.h>
44#include <sys/port_impl.h>
45#include <sys/task.h>
46#include <sys/project.h>
47
48/*
49 * Event Ports can be shared across threads or across processes.
50 * Every thread/process can use an own event port or a group of them
51 * can use a single port. A major request was also to get the ability
52 * to submit user-defined events to a port. The idea of the
53 * user-defined events is to use the event ports for communication between
54 * threads/processes (like message queues). User defined-events are queued
55 * in a port with the same priority as other event types.
56 *
57 * Events are delivered only once. The thread/process which is waiting
58 * for events with the "highest priority" (priority here is related to the
59 * internal strategy to wakeup waiting threads) will retrieve the event,
60 * all other threads/processes will not be notified. There is also
61 * the requirement to have events which should be submitted immediately
62 * to all "waiting" threads. That is the main task of the alert event.
63 * The alert event is submitted by the application to a port. The port
64 * changes from a standard mode to the alert mode. Now all waiting threads
 * will be awakened immediately and they will return with the alert event.
66 * Threads trying to retrieve events from a port in alert mode will
67 * return immediately with the alert event.
68 *
69 *
 * An event port is like a kernel queue, which accepts events submitted from
71 * user level as well as events submitted from kernel sub-systems. Sub-systems
72 * able to submit events to a port are the so-called "event sources".
73 * Current event sources:
74 * PORT_SOURCE_AIO	 : events submitted per transaction completion from
75 *			   POSIX-I/O framework.
76 * PORT_SOURCE_TIMER	 : events submitted when a timer fires
77 *			   (see timer_create(3RT)).
78 * PORT_SOURCE_FD	 : events submitted per file descriptor (see poll(2)).
79 * PORT_SOURCE_ALERT	 : events submitted from user. This is not really a
80 *			   single event, this is actually a port mode
81 *			   (see port_alert(3c)).
82 * PORT_SOURCE_USER	 : events submitted by applications with
83 *			   port_send(3c) or port_sendn(3c).
84 * PORT_SOURCE_FILE	 : events submitted per file being watched for file
85 *			   change events  (see port_create(3c).
86 *
87 * There is a user API implemented in the libc library as well as a
88 * kernel API implemented in port_subr.c in genunix.
89 * The available user API functions are:
90 * port_create() : create a port as a file descriptor of portfs file system
91 *		   The standard close(2) function closes a port.
92 * port_associate() : associate a file descriptor with a port to be able to
93 *		      retrieve events from that file descriptor.
94 * port_dissociate(): remove the association of a file descriptor with a port.
95 * port_alert()	 : set/unset a port in alert mode
96 * port_send()	 : send an event of type PORT_SOURCE_USER to a port
97 * port_sendn()	 : send an event of type PORT_SOURCE_USER to a list of ports
98 * port_get()	 : retrieve a single event from a port
99 * port_getn()	 : retrieve a list of events from a port
100 *
101 * The available kernel API functions are:
102 * port_allocate_event(): allocate an event slot/structure of/from a port
103 * port_init_event()    : set event data in the event structure
104 * port_send_event()    : send event to a port
105 * port_free_event()    : deliver allocated slot/structure back to a port
106 * port_associate_ksource(): associate a kernel event source with a port
107 * port_dissociate_ksource(): dissociate a kernel event source from a port
108 *
109 * The libc implementation consists of small functions which pass the
110 * arguments to the kernel using the "portfs" system call. It means, all the
111 * synchronisation work is being done in the kernel. The "portfs" system
112 * call loads the portfs file system into the kernel.
113 *
114 * PORT CREATION
115 * The first function to be used is port_create() which internally creates
116 * a vnode and a portfs node. The portfs node is represented by the port_t
117 * structure, which again includes all the data necessary to control a port.
118 * port_create() returns a file descriptor, which needs to be used in almost
119 * all other event port functions.
120 * The maximum number of ports per system is controlled by the resource
121 * control: project:port-max-ids.
122 *
123 * EVENT GENERATION
124 * The second step is the triggering of events, which could be sent to a port.
125 * Every event source implements an own method to generate events for a port:
126 * PORT_SOURCE_AIO:
127 *	The sigevent structure of the standard POSIX-IO functions
128 *	was extended by an additional notification type.
129 *	Standard notification types:
130 *	SIGEV_NONE, SIGEV_SIGNAL and SIGEV_THREAD
131 *	Event ports introduced now SIGEV_PORT.
132 *	The notification type SIGEV_PORT specifies that a structure
133 *	of type port_notify_t has to be attached to the sigev_value.
134 *	The port_notify_t structure contains the event port file
135 *	descriptor and a user-defined pointer.
136 *	Internally the AIO implementation will use the kernel API
137 *	functions to allocate an event port slot per transaction (aiocb)
138 *	and sent the event to the port as soon as the transaction completes.
139 *	All the events submitted per transaction are of type
140 *	PORT_SOURCE_AIO.
141 * PORT_SOURCE_TIMER:
142 *	The timer_create() function uses the same method as the
143 *	PORT_SOURCE_AIO event source. It also uses the sigevent structure
144 *	to deliver the port information.
145 *	Internally the timer code will allocate a single event slot/struct
146 *	per timer and it will send the timer event as soon as the timer
147 *	fires. If the timer-fired event is not delivered to the application
148 *	before the next period elapsed, then an overrun counter will be
149 *	incremented. The timer event source uses a callback function to
150 *	detect the delivery of the event to the application. At that time
151 *	the timer callback function will update the event overrun counter.
152 * PORT_SOURCE_FD:
153 *	This event source uses the port_associate() function to allocate
154 *	an event slot/struct from a port. The application defines in the
155 *	events argument of port_associate() the type of events which it is
 *	interested in.
157 *	The internal pollwakeup() function is used by all the file
 * systems -- which support the VOP_POLL() interface -- to notify
159 *	the upper layer (poll(2), devpoll(7d) and now event ports) about
160 *	the event triggered (see valid events in poll(2)).
161 *	The pollwakeup() function forwards the event to the layer registered
162 *	to receive the current event.
163 *	The port_dissociate() function can be used to free the allocated
164 *	event slot from the port. Anyway, file descriptors deliver events
165 *	only one time and remain deactivated until the application
166 *	reactivates the association of a file descriptor with port_associate().
167 *	If an associated file descriptor is closed then the file descriptor
168 *	will be dissociated automatically from the port.
169 *
170 * PORT_SOURCE_ALERT:
171 *	This event type is generated when the port was previously set in
172 *	alert mode using the port_alert() function.
173 *	A single alert event is delivered to every thread which tries to
174 *	retrieve events from a port.
175 * PORT_SOURCE_USER:
176 *	This type of event is generated from user level using the port_send()
177 *	function to send a user event to a port or the port_sendn() function
178 *	to send an event to a list of ports.
179 * PORT_SOURCE_FILE:
180 *	This event source uses the port_associate() interface to register
181 *	a file to be monitored for changes. The file name that needs to be
182 *	monitored is specified in the file_obj_t structure, a pointer to which
183 *	is passed as an argument. The event types to be monitored are specified
184 *	in the events argument.
 *	A file events monitor is represented internally per port per object
 *	address (the file_obj_t pointer), which means there can be multiple
187 *	watches registered on the same file using different file_obj_t
188 *	structure pointer. With the help of the	FEM(File Event Monitoring)
189 *	hooks, the file's vnode ops are intercepted and relevant events
190 *	delivered. The port_dissociate() function is used to de-register a
191 *	file events monitor on a file. When the specified file is
192 *	removed/renamed, the file events watch/monitor is automatically
193 *	removed.
194 *
195 * EVENT DELIVERY / RETRIEVING EVENTS
196 * Events remain in the port queue until:
197 * - the application uses port_get() or port_getn() to retrieve events,
198 * - the event source cancel the event,
199 * - the event port is closed or
200 * - the process exits.
201 * The maximal number of events in a port queue is the maximal number
202 * of event slots/structures which can be allocated by event sources.
203 * The allocation of event slots/structures is controlled by the resource
204 * control: process.port-max-events.
205 * The port_get() function retrieves a single event and the port_getn()
206 * function retrieves a list of events.
207 * Events are classified as shareable and non-shareable events across processes.
208 * Non-shareable events are invisible for the port_get(n)() functions of
209 * processes other than the owner of the event.
210 *    Shareable event types are:
211 *    PORT_SOURCE_USER events
212 *	This type of event is unconditionally shareable and without
213 *	limitations. If the parent process sends a user event and closes
214 *	the port afterwards, the event remains in the port and the child
215 *	process will still be able to retrieve the user event.
216 *    PORT_SOURCE_ALERT events
217 *	This type of event is shareable between processes.
218 *	Limitation:	The alert mode of the port is removed if the owner
219 *			(process which set the port in alert mode) of the
220 *			alert event closes the port.
221 *    PORT_SOURCE_FD events
222 *	This type of event is conditional shareable between processes.
223 *	After fork(2) all forked file descriptors are shareable between
224 *	the processes. The child process is allowed to retrieve events
225 *	from the associated file descriptors and it can also re-associate
226 *	the fd with the port.
227 *	Limitations:	The child process is not allowed to dissociate
228 *			the file descriptor from the port. Only the
229 *			owner (process) of the association is allowed to
230 *			dissociate the file descriptor from the port.
231 *			If the owner of the association closes the port
232 *			the association will be removed.
233 *    PORT_SOURCE_AIO events
234 *	This type of event is not shareable between processes.
235 *    PORT_SOURCE_TIMER events
236 *	This type of event is not shareable between processes.
237 *    PORT_SOURCE_FILE events
238 *	This type of event is not shareable between processes.
239 *
240 * FORK BEHAVIOUR
241 * On fork(2) the child process inherits all opened file descriptors from
242 * the parent process. This is also valid for port file descriptors.
243 * Associated file descriptors with a port maintain the association across the
244 * fork(2). It means, the child process gets full access to the port and
245 * it can retrieve events from all common associated file descriptors.
246 * Events of file descriptors created and associated with a port after the
247 * fork(2) are non-shareable and can only be retrieved by the same process.
248 *
249 * If the parent or the child process closes an exported port (using fork(2)
250 * or I_SENDFD) all the file descriptors associated with the port by the
251 * process will be dissociated from the port. Events of dissociated file
252 * descriptors as well as all non-shareable events will be discarded.
253 * The other process can continue working with the port as usual.
254 *
255 * CLOSING A PORT
256 * close(2) has to be used to close a port. See FORK BEHAVIOUR for details.
257 *
258 * PORT EVENT STRUCTURES
259 * The global control structure of the event ports framework is port_control_t.
260 * port_control_t keeps track of the number of created ports in the system.
261 * The cache of the port event structures is also located in port_control_t.
262 *
263 * On port_create() the vnode and the portfs node is also created.
264 * The portfs node is represented by the port_t structure.
265 * The port_t structure manages all port specific tasks:
266 * - management of resource control values
267 * - port VOP_POLL interface
268 * - creation time
269 * - uid and gid of the port
270 *
271 * The port_t structure contains the port_queue_t structure.
272 * The port_queue_t structure contains all the data necessary for the
273 * queue management:
274 * - locking
275 * - condition variables
276 * - event counters
277 * - submitted events	(represented by port_kevent_t structures)
278 * - threads waiting for event delivery (check portget_t structure)
279 * - PORT_SOURCE_FD cache	(managed by the port_fdcache_t structure)
280 * - event source management (managed by the port_source_t structure)
281 * - alert mode management	(check port_alert_t structure)
282 *
283 * EVENT MANAGEMENT
284 * The event port file system creates a kmem_cache for internal allocation of
285 * event port structures.
286 *
287 * 1. Event source association with a port:
288 * The first step to do for event sources is to get associated with a port
289 * using the port_associate_ksource() function or adding an entry to the
290 * port_ksource_tab[]. An event source can get dissociated from a port
291 * using the port_dissociate_ksource() function. An entry in the
292 * port_ksource_tab[] implies that the source will be associated
293 * automatically with every new created port.
294 * The event source can deliver a callback function, which is used by the
295 * port to notify the event source about close(2). The idea is that
296 * in such a case the event source should free all allocated resources
297 * and it must return to the port all allocated slots/structures.
298 * The port_close() function will wait until all allocated event
299 * structures/slots are returned to the port.
300 * The callback function is not necessary when the event source does not
301 * maintain local resources, a second condition is that the event source
302 * can guarantee that allocated event slots will be returned without
303 * delay to the port (it will not block and sleep somewhere).
304 *
305 * 2. Reservation of an event slot / event structure
306 * The event port reliability is based on the reservation of an event "slot"
307 * (allocation of an event structure) by the event source as part of the
308 * application call. If the maximal number of event slots is exhausted then
309 * the event source can return a corresponding error code to the application.
310 *
311 * The port_alloc_event() function has to be used by event sources to
312 * allocate an event slot (reserve an event structure). The port_alloc_event()
 * does not block and it will return a 0 value on success or an error code
314 * if it fails.
315 * An argument of port_alloc_event() is a flag which determines the behavior
316 * of the event after it was delivered to the application:
317 * PORT_ALLOC_DEFAULT	: event slot becomes free after delivery to the
318 *			  application.
319 * PORT_ALLOC_PRIVATE	: event slot remains under the control of the event
320 *			  source. This kind of slots can not be used for
321 *			  event delivery and should only be used internally
322 *			  by the event source.
323 * PORT_KEV_CACHED	: event slot remains under the control of an event
324 *			  port cache. It does not become free after delivery
325 *			  to the application.
326 * PORT_ALLOC_SCACHED	: event slot remains under the control of the event
327 *			  source. The event source takes the control over
328 *			  the slot after the event is delivered to the
329 *			  application.
330 *
331 * 3. Delivery of events to the event port
332 * Earlier allocated event structure/slot has to be used to deliver
333 * event data to the port. Event source has to use the function
334 * port_send_event(). The single argument is a pointer to the previously
335 * reserved event structure/slot.
336 * The portkev_events field of the port_kevent_t structure can be updated/set
337 * in two ways:
338 * 1. using the port_set_event() function, or
339 * 2. updating the portkev_events field out of the callback function:
340 *    The event source can deliver a callback function to the port as an
341 *    argument of port_init_event().
342 *    One of the arguments of the callback function is a pointer to the
343 *    events field, which will be delivered to the application.
344 *    (see Delivery of events to the application).
345 * Event structures/slots can be delivered to the event port only one time,
346 * they remain blocked until the data is delivered to the application and the
347 * slot becomes free or it is delivered back to the event source
348 * (PORT_ALLOC_SCACHED). The activation of the callback function mentioned above
349 * is at the same time the indicator for the event source that the event
350 * structure/slot is free for reuse.
351 *
352 * 4. Delivery of events to the application
353 * The events structures/slots delivered by event sources remain in the
354 * port queue until they are retrieved by the application or the port
 * is closed (exit(2) also closes all opened file descriptors).
356 * The application uses port_get() or port_getn() to retrieve events from
357 * a port. port_get() retrieves a single event structure/slot and port_getn()
358 * retrieves a list of event structures/slots.
359 * Both functions are able to poll for events and return immediately or they
360 * can specify a timeout value.
361 * Before the events are delivered to the application they are moved to a
362 * second temporary internal queue. The idea is to avoid lock collisions or
363 * contentions of the global queue lock.
364 * The global queue lock is used every time when an event source delivers
365 * new events to the port.
366 * The port_get() and port_getn() functions
367 * a) retrieve single events from the temporary queue,
368 * b) prepare the data to be passed to the application memory,
369 * c) activate the callback function of the event sources:
370 *    - to get the latest event data,
371 *    - the event source can free all allocated resources associated with the
372 *      current event,
373 *    - the event source can re-use the current event slot/structure
374 *    - the event source can deny the delivery of the event to the application
375 *      (e.g. because of the wrong process).
376 * d) put the event back to the temporary queue if the event delivery was denied
377 * e) repeat a) until d) as long as there are events in the queue and
378 *    there is enough user space available.
379 *
 * The loop described above could hold the global mutex for a very long time;
 * to avoid that, a second mutex was introduced to synchronize concurrent
 * threads accessing the temporary queue.
383 */
384
/*
 * Forward declaration of the native system call entry point; every
 * event port function from user level is multiplexed through it.
 */
static int64_t portfs(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t,
    uintptr_t);

/*
 * sysent entry: 6 arguments, 64-bit return value; SE_NOUNLOAD keeps the
 * module resident once loaded.
 */
static struct sysent port_sysent = {
	6,
	SE_ARGC | SE_64RVAL | SE_NOUNLOAD,
	(int (*)())(uintptr_t)portfs,
};

/* loadable system-call module linkage for the native entry point */
static struct modlsys modlsys = {
	&mod_syscallops, "event ports", &port_sysent
};
397
#ifdef _SYSCALL32_IMPL

/*
 * Entry point for port system calls issued by 32-bit processes; it
 * widens the 32-bit arguments with the proper signedness and forwards
 * them to the native portfs() (see portfs32() below).
 */
static int64_t
portfs32(uint32_t arg1, int32_t arg2, uint32_t arg3, uint32_t arg4,
    uint32_t arg5, uint32_t arg6);

/* sysent entry for the 32-bit flavor of the system call */
static struct sysent port_sysent32 = {
	6,
	SE_ARGC | SE_64RVAL | SE_NOUNLOAD,
	(int (*)())(uintptr_t)portfs32,
};

/* loadable system-call module linkage for the 32-bit entry point */
static struct modlsys modlsys32 = {
	&mod_syscallops32,
	"32-bit event ports syscalls",
	&port_sysent32
};
#endif	/* _SYSCALL32_IMPL */
416
/* module linkage: native syscall plus (optionally) the 32-bit wrapper */
static struct modlinkage modlinkage = {
	MODREV_1,
	&modlsys,
#ifdef _SYSCALL32_IMPL
	&modlsys32,
#endif
	NULL
};

/* kstat counter: number of currently existing ports in the system */
port_kstat_t port_kstat = {
	{ "ports",	KSTAT_DATA_UINT32 }
};

dev_t	portdev;		/* dev_t of the dummy portfs file system */
struct	vnodeops *port_vnodeops;	/* vnode ops used by port vnodes */
struct	vfs port_vfs;		/* dummy vfs backing all port vnodes */

/* resource controls and AIO close callback, defined elsewhere in genunix */
extern	rctl_hndl_t rc_process_portev;
extern	rctl_hndl_t rc_project_portids;
extern	void aio_close_port(void *, int, pid_t, int);

/*
 * This table contains a list of event sources which need a static
 * association with a port (every port).
 * The last NULL entry in the table is required to detect "end of table".
 */
struct port_ksource port_ksource_tab[] = {
	{PORT_SOURCE_AIO, aio_close_port, NULL, NULL},
	{0, NULL, NULL, NULL}
};
447
/* local functions */

/* event retrieval and delivery helpers */
static int port_getn(port_t *, port_event_t *, uint_t, uint_t *,
    port_gettimer_t *);
static int port_sendn(int [], int [], uint_t, int, void *, uint_t *);
static int port_alert(port_t *, int, int, void *);
static int port_dispatch_event(port_t *, int, int, int, uintptr_t, void *);
static int port_send(port_t *, int, int, void *);
static int port_create(int *);
static int port_get_alert(port_alert_t *, port_event_t *);
static int port_copy_event(port_event_t *, port_kevent_t *, list_t *);
static int *port_errorn(int *, int, int, int);
static int port_noshare(void *, int *, pid_t, int, void *);
static int port_get_timeout(timespec_t *, timespec_t *, timespec_t **, int *,
    int);

/* port setup/teardown and waiter queue management */
static void port_init(port_t *);
static void port_remove_alert(port_queue_t *);
static void port_add_ksource_local(port_t *, port_ksource_t *);
static void port_check_return_cond(port_queue_t *);
static void port_dequeue_thread(port_queue_t *, portget_t *);
static portget_t *port_queue_thread(port_queue_t *, uint_t);
static void port_kstat_init(void);

#ifdef	_SYSCALL32_IMPL
static int port_copy_event32(port_event32_t *, port_kevent_t *, list_t *);
#endif
473
/*
 * Module load entry point: build the dummy "portfs" file system that
 * backs port vnodes, set up the global port control state and register
 * the portfs system call via mod_install().
 */
int
_init(void)
{
	static const fs_operation_def_t port_vfsops_template[] = {
		NULL, NULL
	};
	extern const	fs_operation_def_t port_vnodeops_template[];
	vfsops_t	*port_vfsops;
	int		error;
	major_t		major;

	/* reserve an unused major number; port vnodes need a unique dev_t */
	if ((major = getudev()) == (major_t)-1)
		return (ENXIO);
	portdev = makedevice(major, 0);

	/* Create a dummy vfs */
	error = vfs_makefsops(port_vfsops_template, &port_vfsops);
	if (error) {
		cmn_err(CE_WARN, "port init: bad vfs ops");
		return (error);
	}
	vfs_setops(&port_vfs, port_vfsops);
	port_vfs.vfs_flag = VFS_RDONLY;
	port_vfs.vfs_dev = portdev;
	vfs_make_fsid(&(port_vfs.vfs_fsid), portdev, 0);

	/* install port vnode ops (template declared extern above) */
	error = vn_make_ops("portfs", port_vnodeops_template, &port_vnodeops);
	if (error) {
		vfs_freevfsops(port_vfsops);
		cmn_err(CE_WARN, "port init: bad vnode ops");
		return (error);
	}

	mutex_init(&port_control.pc_mutex, NULL, MUTEX_DEFAULT, NULL);
	port_control.pc_nents = 0;	/* number of active ports */

	/* create kmem_cache for port event structures */
	port_control.pc_cache = kmem_cache_create("port_cache",
	    sizeof (port_kevent_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	port_kstat_init();		/* init port kstats */
	/*
	 * NOTE(review): if mod_install() fails, the vfsops/vnodeops, the
	 * pc_mutex and the kmem cache created above are not torn down.
	 * SE_NOUNLOAD suggests the module is never expected to go away;
	 * confirm a failed install cannot leak here.
	 */
	return (mod_install(&modlinkage));
}
517
/*
 * Module information entry point; reports this module's linkage.
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
523
524/*
525 * System call wrapper for all port related system calls from 32-bit programs.
526 */
527#ifdef _SYSCALL32_IMPL
528static int64_t
529portfs32(uint32_t opcode, int32_t a0, uint32_t a1, uint32_t a2, uint32_t a3,
530    uint32_t a4)
531{
532	int64_t	error;
533
534	switch (opcode & PORT_CODE_MASK) {
535	case PORT_GET:
536		error = portfs(PORT_GET, a0, a1, (int)a2, (int)a3, a4);
537		break;
538	case PORT_SENDN:
539		error = portfs(opcode, (uint32_t)a0, a1, a2, a3, a4);
540		break;
541	default:
542		error = portfs(opcode, a0, a1, a2, a3, a4);
543		break;
544	}
545	return (error);
546}
547#endif	/* _SYSCALL32_IMPL */
548
549/*
550 * System entry point for port functions.
551 * a0 is a port file descriptor (except for PORT_SENDN and PORT_CREATE).
552 * The libc uses PORT_SYS_NOPORT in functions which do not deliver a
553 * port file descriptor as first argument.
554 */
static int64_t
portfs(int opcode, uintptr_t a0, uintptr_t a1, uintptr_t a2, uintptr_t a3,
    uintptr_t a4)
{
	rval_t		r;	/* combined 64-bit return (r_val1/r_val2) */
	port_t		*pp;	/* port private data of the port vnode */
	int		error = 0;
	uint_t		nget;	/* in/out event count for port_getn() */
	file_t		*fp;
	port_gettimer_t	port_timer;

	r.r_vals = 0;
	/*
	 * Opcodes flagged with PORT_SYS_NOPORT do not carry a port file
	 * descriptor in a0 and are handled before the getf() below.
	 */
	if (opcode & PORT_SYS_NOPORT) {
		opcode &= PORT_CODE_MASK;
		if (opcode == PORT_SENDN) {
			error = port_sendn((int *)a0, (int *)a1, (uint_t)a2,
			    (int)a3, (void *)a4, (uint_t *)&r.r_val1);
			/*
			 * EIO is not turned into errno; presumably it marks
			 * partial success with per-port status reported via
			 * the a1 list -- confirm against port_sendn().
			 */
			if (error && (error != EIO))
				return ((int64_t)set_errno(error));
			return (r.r_vals);
		}

		if (opcode == PORT_CREATE) {
			/* new port fd is returned to the user in r_val1 */
			error = port_create(&r.r_val1);
			if (error)
				return ((int64_t)set_errno(error));
			return (r.r_vals);
		}
	}

	/* opcodes using port as first argument (a0) */

	if ((fp = getf((int)a0)) == NULL)
		return ((uintptr_t)set_errno(EBADF));

	/* the descriptor must refer to a port vnode */
	if (fp->f_vnode->v_type != VPORT) {
		releasef((int)a0);
		return ((uintptr_t)set_errno(EBADFD));
	}

	pp = VTOEP(fp->f_vnode);

	switch (opcode & PORT_CODE_MASK) {
	case	PORT_GET:
	{
		/* see PORT_GETN description */
		struct	timespec timeout;

		/* retrieve exactly one event */
		port_timer.pgt_flags = PORTGET_ONE;
		port_timer.pgt_loop = 0;
		port_timer.pgt_rqtp = NULL;
		/*
		 * a4 != 0 indicates that a timeout was supplied; its
		 * seconds (a2) and nanoseconds (a3) are copied to a
		 * local timespec. NULL timeout means "wait forever".
		 */
		if (a4 != 0) {
			port_timer.pgt_timeout = &timeout;
			timeout.tv_sec = (time_t)a2;
			timeout.tv_nsec = (long)a3;
		} else {
			port_timer.pgt_timeout = NULL;
		}
		/* retry while port_getn() requests another pass (pgt_loop) */
		do {
			nget = 1;
			error = port_getn(pp, (port_event_t *)a1, 1,
			    (uint_t *)&nget, &port_timer);
		} while (nget == 0 && error == 0 && port_timer.pgt_loop);
		break;
	}
	case	PORT_GETN:
	{
		/*
		 * port_getn() can only retrieve own or shareable events from
		 * other processes. The port_getn() function remains in the
		 * kernel until own or shareable events are available or the
		 * timeout elapses.
		 */
		port_timer.pgt_flags = 0;
		port_timer.pgt_loop = 0;
		port_timer.pgt_rqtp = NULL;
		port_timer.pgt_timeout = (struct timespec *)a4;
		do {
			nget = a3;	/* a3 = minimum number of events */
			error = port_getn(pp, (port_event_t *)a1, (uint_t)a2,
			    (uint_t *)&nget, &port_timer);
		} while (nget == 0 && error == 0 && port_timer.pgt_loop);
		/* r_val1 = events retrieved, r_val2 = status for libc */
		r.r_val1 = nget;
		r.r_val2 = error;
		releasef((int)a0);
		/* ETIME is delivered via r_val2 together with the count */
		if (error && error != ETIME)
			return ((int64_t)set_errno(error));
		return (r.r_vals);
	}
	case	PORT_ASSOCIATE:
	{
		/* a1 = source, a2 = object, a3 = events, a4 = user cookie */
		switch ((int)a1) {
		case PORT_SOURCE_FD:
			error = port_associate_fd(pp, (int)a1, (uintptr_t)a2,
			    (int)a3, (void *)a4);
			break;
		case PORT_SOURCE_FILE:
			error = port_associate_fop(pp, (int)a1, (uintptr_t)a2,
			    (int)a3, (void *)a4);
			break;
		default:
			error = EINVAL;
			break;
		}
		break;
	}
	case	PORT_SEND:
	{
		/* user-defined events */
		error = port_send(pp, PORT_SOURCE_USER, (int)a1, (void *)a2);
		break;
	}
	case	PORT_DISPATCH:
	{
		/*
		 * library events, blocking
		 * Only events of type PORT_SOURCE_AIO or PORT_SOURCE_MQ
		 * are currently allowed.
		 */
		if ((int)a1 != PORT_SOURCE_AIO && (int)a1 != PORT_SOURCE_MQ) {
			error = EINVAL;
			break;
		}
		error = port_dispatch_event(pp, (int)opcode, (int)a1, (int)a2,
		    (uintptr_t)a3, (void *)a4);
		break;
	}
	case	PORT_DISSOCIATE:
	{
		/* remove the association of an object with this port */
		switch ((int)a1) {
		case PORT_SOURCE_FD:
			error = port_dissociate_fd(pp, (uintptr_t)a2);
			break;
		case PORT_SOURCE_FILE:
			error = port_dissociate_fop(pp, (uintptr_t)a2);
			break;
		default:
			error = EINVAL;
			break;
		}
		break;
	}
	case	PORT_ALERT:
	{
		if ((int)a2)	/* a2 = events */
			error = port_alert(pp, (int)a1, (int)a2, (void *)a3);
		else
			/* zero events means "clear alert mode" */
			port_remove_alert(&pp->port_queue);
		break;
	}
	default:
		error = EINVAL;
		break;
	}

	/* common exit path: drop the fd hold taken by getf() above */
	releasef((int)a0);
	if (error)
		return ((int64_t)set_errno(error));
	return (r.r_vals);
}
715
716/*
717 * System call to create a port.
718 *
719 * The port_create() function creates a vnode of type VPORT per port.
720 * The port control data is associated with the vnode as vnode private data.
721 * The port_create() function returns an event port file descriptor.
722 */
static int
port_create(int *fdp)
{
	port_t		*pp;
	vnode_t		*vp;
	struct file	*fp;
	proc_t		*p = curproc;

	/* initialize vnode and port private data */
	pp = kmem_zalloc(sizeof (port_t), KM_SLEEP);

	pp->port_vnode = vn_alloc(KM_SLEEP);
	vp = EPTOV(pp);
	vn_setops(vp, port_vnodeops);
	vp->v_type = VPORT;
	vp->v_vfsp = &port_vfs;
	vp->v_data = (caddr_t)pp;

	/*
	 * pc_mutex stays held across the resource-control test and the
	 * pc_nents increment below, so concurrent creators cannot race
	 * past the project.port-max-ids limit.
	 */
	mutex_enter(&port_control.pc_mutex);
	/*
	 * Retrieve the maximal number of event ports allowed per system from
	 * the resource control: project.port-max-ids.
	 */
	mutex_enter(&p->p_lock);
	if (rctl_test(rc_project_portids, p->p_task->tk_proj->kpj_rctls, p,
	    port_control.pc_nents + 1, RCA_SAFE) & RCT_DENY) {
		/* over the limit: unwind everything allocated so far */
		mutex_exit(&p->p_lock);
		vn_free(vp);
		kmem_free(pp, sizeof (port_t));
		mutex_exit(&port_control.pc_mutex);
		return (EAGAIN);
	}

	/*
	 * Retrieve the maximal number of events allowed per port from
	 * the resource control: process.port-max-events.
	 */
	pp->port_max_events = rctl_enforced_value(rc_process_portev,
	    p->p_rctls, p);
	mutex_exit(&p->p_lock);

	/* allocate a new user file descriptor and a file structure */
	if (falloc(vp, 0, &fp, fdp)) {
		/*
		 * If the file table is full, free allocated resources.
		 */
		vn_free(vp);
		kmem_free(pp, sizeof (port_t));
		mutex_exit(&port_control.pc_mutex);
		return (EMFILE);
	}

	/* release the file_t lock -- presumably taken by falloc() */
	mutex_exit(&fp->f_tlock);

	pp->port_fd = *fdp;
	port_control.pc_nents++;
	p->p_portcnt++;		/* per-process port count */
	port_kstat.pks_ports.value.ui32++;	/* system-wide kstat */
	mutex_exit(&port_control.pc_mutex);

	/* initializes port private data */
	port_init(pp);
	/* set user file pointer */
	setf(*fdp, fp);
	return (0);
}
789
790/*
791 * port_init() initializes event port specific data
792 */
static void
port_init(port_t *pp)
{
	port_queue_t	*portq;
	port_ksource_t	*pks;

	/* per-port lock and the event queue lock */
	mutex_init(&pp->port_mutex, NULL, MUTEX_DEFAULT, NULL);
	portq = &pp->port_queue;
	mutex_init(&portq->portq_mutex, NULL, MUTEX_DEFAULT, NULL);
	pp->port_flags |= PORT_INIT;

	/*
	 * If there is not enough memory available to satisfy a user
	 * request using a single port_getn() call then port_getn()
	 * will reduce the size of the list to PORT_MAX_LIST.
	 */
	pp->port_max_list = port_max_list;

	/* Set timestamp entries required for fstat(2) requests */
	gethrestime(&pp->port_ctime);
	pp->port_uid = crgetuid(curproc->p_cred);
	pp->port_gid = crgetgid(curproc->p_cred);

	/* initialize port queue structs (pending and retrieval lists) */
	list_create(&portq->portq_list, sizeof (port_kevent_t),
	    offsetof(port_kevent_t, portkev_node));
	list_create(&portq->portq_get_list, sizeof (port_kevent_t),
	    offsetof(port_kevent_t, portkev_node));
	portq->portq_flags = 0;
	pp->port_pid = curproc->p_pid;	/* creator becomes the port owner */

	/* Allocate cache skeleton for PORT_SOURCE_FD events */
	portq->portq_pcp = kmem_zalloc(sizeof (port_fdcache_t), KM_SLEEP);
	mutex_init(&portq->portq_pcp->pc_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Allocate cache skeleton for association of event sources.
	 */
	mutex_init(&portq->portq_source_mutex, NULL, MUTEX_DEFAULT, NULL);
	portq->portq_scache = kmem_zalloc(
	    PORT_SCACHE_SIZE * sizeof (port_source_t *), KM_SLEEP);

	/*
	 * pre-associate some kernel sources with this port.
	 * The pre-association is required to create port_source_t
	 * structures for object association.
	 * Some sources can not get associated with a port before the first
	 * object association is requested. Another reason to pre_associate
	 * a particular source with a port is because of performance.
	 */

	/* walk port_ksource_tab[] until the {0, ...} sentinel entry */
	for (pks = port_ksource_tab; pks->pks_source != 0; pks++)
		port_add_ksource_local(pp, pks);
}
847
848/*
849 * The port_add_ksource_local() function is being used to associate
850 * event sources with every new port.
851 * The event sources need to be added to port_ksource_tab[].
852 */
853static void
854port_add_ksource_local(port_t *pp, port_ksource_t *pks)
855{
856	port_source_t	*pse;
857	port_source_t	**ps;
858
859	mutex_enter(&pp->port_queue.portq_source_mutex);
860	ps = &pp->port_queue.portq_scache[PORT_SHASH(pks->pks_source)];
861	for (pse = *ps; pse != NULL; pse = pse->portsrc_next) {
862		if (pse->portsrc_source == pks->pks_source)
863			break;
864	}
865
866	if (pse == NULL) {
867		/* associate new source with the port */
868		pse = kmem_zalloc(sizeof (port_source_t), KM_SLEEP);
869		pse->portsrc_source = pks->pks_source;
870		pse->portsrc_close = pks->pks_close;
871		pse->portsrc_closearg = pks->pks_closearg;
872		pse->portsrc_cnt = 1;
873
874		pks->pks_portsrc = pse;
875		if (*ps != NULL)
876			pse->portsrc_next = (*ps)->portsrc_next;
877		*ps = pse;
878	}
879	mutex_exit(&pp->port_queue.portq_source_mutex);
880}
881
882/*
883 * The port_send() function sends an event of type "source" to a
884 * port. This function is non-blocking. An event can be sent to
885 * a port as long as the number of events per port does not achieve the
886 * maximal allowed number of events. The max. number of events per port is
887 * defined by the resource control process.max-port-events.
888 * This function is used by the port library function port_send()
889 * and port_dispatch(). The port_send(3c) function is part of the
890 * event ports API and submits events of type PORT_SOURCE_USER. The
891 * port_dispatch() function is project private and it is used by library
892 * functions to submit events of other types than PORT_SOURCE_USER
893 * (e.g. PORT_SOURCE_AIO).
894 */
895static int
896port_send(port_t *pp, int source, int events, void *user)
897{
898	port_kevent_t	*pev;
899	int		error;
900
901	error = port_alloc_event_local(pp, source, PORT_ALLOC_DEFAULT, &pev);
902	if (error)
903		return (error);
904
905	pev->portkev_object = 0;
906	pev->portkev_events = events;
907	pev->portkev_user = user;
908	pev->portkev_callback = NULL;
909	pev->portkev_arg = NULL;
910	pev->portkev_flags = 0;
911
912	port_send_event(pev);
913	return (0);
914}
915
916/*
917 * The port_noshare() function returns 0 if the current event was generated
 * by the same process. Otherwise it returns a value other than 0 and the
 * event should not be delivered to the current process.
920 * The port_noshare() function is normally used by the port_dispatch()
921 * function. The port_dispatch() function is project private and can only be
922 * used within the event port project.
923 * Currently the libaio uses the port_dispatch() function to deliver events
924 * of types PORT_SOURCE_AIO.
925 */
926/* ARGSUSED */
927static int
928port_noshare(void *arg, int *events, pid_t pid, int flag, void *evp)
929{
930	if (flag == PORT_CALLBACK_DEFAULT && curproc->p_pid != pid)
931		return (1);
932	return (0);
933}
934
935/*
936 * The port_dispatch_event() function is project private and it is used by
937 * libraries involved in the project to deliver events to the port.
938 * port_dispatch will sleep and wait for enough resources to satisfy the
939 * request, if necessary.
940 * The library can specify if the delivered event is shareable with other
941 * processes (see PORT_SYS_NOSHARE flag).
942 */
943static int
944port_dispatch_event(port_t *pp, int opcode, int source, int events,
945    uintptr_t object, void *user)
946{
947	port_kevent_t	*pev;
948	int		error;
949
950	error = port_alloc_event_block(pp, source, PORT_ALLOC_DEFAULT, &pev);
951	if (error)
952		return (error);
953
954	pev->portkev_object = object;
955	pev->portkev_events = events;
956	pev->portkev_user = user;
957	pev->portkev_arg = NULL;
958	if (opcode & PORT_SYS_NOSHARE) {
959		pev->portkev_flags = PORT_KEV_NOSHARE;
960		pev->portkev_callback = port_noshare;
961	} else {
962		pev->portkev_flags = 0;
963		pev->portkev_callback = NULL;
964	}
965
966	port_send_event(pev);
967	return (0);
968}
969
970
971/*
972 * The port_sendn() function is the kernel implementation of the event
973 * port API function port_sendn(3c).
974 * This function is able to send an event to a list of event ports.
975 */
static int
port_sendn(int ports[], int errors[], uint_t nent, int events, void *user,
    uint_t *nget)
{
	port_kevent_t	*pev;
	int		errorcnt = 0;
	int		error = 0;
	int		count;
	int		port;
	int		*plist;
	int		*elist = NULL;	/* allocated lazily on first error */
	file_t		*fp;
	port_t		*pp;

	/* nent is bounded by the same limit port_getn() uses */
	if (nent == 0 || nent > port_max_list)
		return (EINVAL);

	/* copy in the list of port file descriptors */
	plist = kmem_alloc(nent * sizeof (int), KM_SLEEP);
	if (copyin((void *)ports, plist, nent * sizeof (int))) {
		kmem_free(plist, nent * sizeof (int));
		return (EFAULT);
	}

	/*
	 * Scan the list for event port file descriptors and send the
	 * attached user event data embedded in a event of type
	 * PORT_SOURCE_USER to every event port in the list.
	 * If a list entry is not a valid event port then the corresponding
	 * error code will be stored in the errors[] list with the same
	 * list offset as in the ports[] list.
	 */

	for (count = 0; count < nent; count++) {
		port = plist[count];
		/* getf() holds the file entry until the matching releasef() */
		if ((fp = getf(port)) == NULL) {
			elist = port_errorn(elist, nent, EBADF, count);
			errorcnt++;
			continue;
		}

		pp = VTOEP(fp->f_vnode);
		/* fd must refer to an event port vnode */
		if (fp->f_vnode->v_type != VPORT) {
			releasef(port);
			elist = port_errorn(elist, nent, EBADFD, count);
			errorcnt++;
			continue;
		}

		error = port_alloc_event_local(pp, PORT_SOURCE_USER,
		    PORT_ALLOC_DEFAULT, &pev);
		if (error) {
			releasef(port);
			elist = port_errorn(elist, nent, error, count);
			errorcnt++;
			continue;
		}

		/* user events carry no object and no source callback */
		pev->portkev_object = 0;
		pev->portkev_events = events;
		pev->portkev_user = user;
		pev->portkev_callback = NULL;
		pev->portkev_arg = NULL;
		pev->portkev_flags = 0;

		port_send_event(pev);
		releasef(port);
	}
	/* errorcnt != 0 implies elist was allocated by port_errorn() */
	if (errorcnt) {
		error = EIO;
		if (copyout(elist, (void *)errors, nent * sizeof (int)))
			error = EFAULT;
		kmem_free(elist, nent * sizeof (int));
	}
	/* report the number of ports which accepted the event */
	*nget = nent - errorcnt;
	kmem_free(plist, nent * sizeof (int));
	return (error);
}
1053
1054static int *
1055port_errorn(int *elist, int nent, int error, int index)
1056{
1057	if (elist == NULL)
1058		elist = kmem_zalloc(nent * sizeof (int), KM_SLEEP);
1059	elist[index] = error;
1060	return (elist);
1061}
1062
1063/*
1064 * port_alert()
 * The port_alert() function is a high priority event and it is always set
1066 * on top of the queue. It is also delivered as single event.
1067 * flags:
1068 *	- SET	:overwrite current alert data
1069 *	- UPDATE:set alert data or return EBUSY if alert mode is already set
1070 *
1071 * - set the ALERT flag
1072 * - wakeup all sleeping threads
1073 */
static int
port_alert(port_t *pp, int flags, int events, void *user)
{
	port_queue_t	*portq;
	portget_t	*pgetp;
	port_alert_t	*pa;

	/* reject mutually exclusive / unknown flag combinations */
	if ((flags & PORT_ALERT_INVALID) == PORT_ALERT_INVALID)
		return (EINVAL);

	portq = &pp->port_queue;
	pa = &portq->portq_alert;
	mutex_enter(&portq->portq_mutex);

	/* check alert conditions: UPDATE must not overwrite an active alert */
	if (flags == PORT_ALERT_UPDATE) {
		if (portq->portq_flags & PORTQ_ALERT) {
			mutex_exit(&portq->portq_mutex);
			return (EBUSY);
		}
	}

	/*
	 * Store alert data in the port to be delivered to threads
	 * which are using port_get(n) to retrieve events.
	 */

	portq->portq_flags |= PORTQ_ALERT;
	pa->portal_events = events;		/* alert info */
	pa->portal_pid = curproc->p_pid;	/* process owner */
	pa->portal_object = 0;			/* no object */
	pa->portal_user = user;			/* user alert data */

	/* alert and deliver alert data to waiting threads */
	pgetp = portq->portq_thread;
	if (pgetp == NULL) {
		/* no threads waiting for events */
		mutex_exit(&portq->portq_mutex);
		return (0);
	}

	/*
	 * Set waiting threads in alert mode (PORTGET_ALERT).
	 * Every thread waiting for events already allocated a portget_t
	 * structure to sleep on.
	 * The port alert arguments are stored in the portget_t structure.
	 * The PORTGET_ALERT flag is set to indicate the thread to return
	 * immediately with the alert event.
	 * portq_thread is a circular doubly-linked list; each waiter is
	 * visited exactly once.
	 */
	do {
		if ((pgetp->portget_state & PORTGET_ALERT) == 0) {
			pa = &pgetp->portget_alert;
			pa->portal_events = events;
			pa->portal_object = 0;
			pa->portal_user = user;
			pgetp->portget_state |= PORTGET_ALERT;
			cv_signal(&pgetp->portget_cv);
		}
	} while ((pgetp = pgetp->portget_next) != portq->portq_thread);
	mutex_exit(&portq->portq_mutex);
	return (0);
}
1136
1137/*
1138 * Clear alert state of the port
1139 */
static void
port_remove_alert(port_queue_t *portq)
{
	/* only the flag is cleared; stale alert data is simply ignored */
	mutex_enter(&portq->portq_mutex);
	portq->portq_flags &= ~PORTQ_ALERT;
	mutex_exit(&portq->portq_mutex);
}
1147
1148/*
1149 * The port_getn() function is used to retrieve events from a port.
1150 *
1151 * The port_getn() function returns immediately if there are enough events
1152 * available in the port to satisfy the request or if the port is in alert
1153 * mode (see port_alert(3c)).
1154 * The timeout argument of port_getn(3c) -which is embedded in the
1155 * port_gettimer_t structure- specifies if the system call should block or if it
1156 * should return immediately depending on the number of events available.
1157 * This function is internally used by port_getn(3c) as well as by
1158 * port_get(3c).
1159 */
static int
port_getn(port_t *pp, port_event_t *uevp, uint_t max, uint_t *nget,
    port_gettimer_t *pgt)
{
	port_queue_t	*portq;
	port_kevent_t	*pev;
	port_kevent_t	*lev;
	int		error = 0;
	uint_t		nmax;
	uint_t		nevents;
	uint_t		eventsz;
	port_event_t	*kevp;
	list_t		*glist;
	uint_t		tnent;
	int		rval;
	int		blocking = -1;	/* -1: timeout not yet evaluated */
	int		timecheck;
	int		flag;
	timespec_t	rqtime;
	timespec_t	*rqtp = NULL;
	portget_t	*pgetp;
	void		*results;
	model_t		model = get_udatamodel();

	flag = pgt->pgt_flags;

	/* asking for more events than the user buffer can hold is invalid */
	if (*nget > max && max > 0)
		return (EINVAL);

	portq = &pp->port_queue;
	mutex_enter(&portq->portq_mutex);
	if (max == 0) {
		/*
		 * Return number of objects with events.
		 * The port_block() call is required to synchronize this
		 * thread with another possible thread, which could be
		 * retrieving events from the port queue.
		 */
		port_block(portq);
		/*
		 * Check if a second thread is currently retrieving events
		 * and it is using the temporary event queue.
		 */
		if (portq->portq_tnent) {
			/* put remaining events back to the port queue */
			port_push_eventq(portq);
		}
		*nget = portq->portq_nent;
		port_unblock(portq);
		mutex_exit(&portq->portq_mutex);
		return (0);
	}

	if (uevp == NULL) {
		mutex_exit(&portq->portq_mutex);
		return (EFAULT);
	}
	if (*nget == 0) {		/* no events required */
		mutex_exit(&portq->portq_mutex);
		return (0);
	}

	/* port is being closed ... */
	if (portq->portq_flags & PORTQ_CLOSE) {
		mutex_exit(&portq->portq_mutex);
		return (EBADFD);
	}

	/* return immediately if port in alert mode */
	if (portq->portq_flags & PORTQ_ALERT) {
		error = port_get_alert(&portq->portq_alert, uevp);
		if (error == 0)
			*nget = 1;
		mutex_exit(&portq->portq_mutex);
		return (error);
	}

	/* from here on this thread counts as a waiter (see close handling) */
	portq->portq_thrcnt++;

	/*
	 * Now check if the completed events satisfy the
	 * "wait" requirements of the current thread:
	 */

	if (pgt->pgt_loop) {
		/*
		 * loop entry of same thread
		 * pgt_loop is set when the current thread returns
		 * prematurely from this function. That could happen
		 * when a port is being shared between processes and
		 * this thread could not find events to return.
		 * It is not allowed to a thread to retrieve non-shareable
		 * events generated in other processes.
		 * PORTQ_WAIT_EVENTS is set when a thread already
		 * checked the current event queue and no new events
		 * are added to the queue.
		 */
		if (((portq->portq_flags & PORTQ_WAIT_EVENTS) == 0) &&
		    (portq->portq_nent >= *nget)) {
			/* some new events arrived ...check them */
			goto portnowait;
		}
		rqtp = pgt->pgt_rqtp;
		timecheck = pgt->pgt_timecheck;
		pgt->pgt_flags |= PORTGET_WAIT_EVENTS;
	} else {
		/* check if enough events are available ... */
		if (portq->portq_nent >= *nget)
			goto portnowait;
		/*
		 * There are not enough events available to satisfy
		 * the request, check timeout value and wait for
		 * incoming events.
		 */
		error = port_get_timeout(pgt->pgt_timeout, &rqtime, &rqtp,
		    &blocking, flag);
		if (error) {
			port_check_return_cond(portq);
			mutex_exit(&portq->portq_mutex);
			return (error);
		}

		if (blocking == 0) /* don't block, check fired events */
			goto portnowait;

		if (rqtp != NULL) {
			timespec_t	now;
			timecheck = timechanged;
			gethrestime(&now);
			/* convert the relative timeout to an absolute time */
			timespecadd(rqtp, &now);
		}
	}

	/* enqueue thread in the list of waiting threads */
	pgetp = port_queue_thread(portq, *nget);


	/* Wait here until return conditions met */
	for (;;) {
		if (pgetp->portget_state & PORTGET_ALERT) {
			/* reap alert event and return */
			error = port_get_alert(&pgetp->portget_alert, uevp);
			if (error)
				*nget = 0;
			else
				*nget = 1;
			port_dequeue_thread(&pp->port_queue, pgetp);
			portq->portq_thrcnt--;
			mutex_exit(&portq->portq_mutex);
			return (error);
		}

		/*
		 * Check if some other thread is already retrieving
		 * events (portq_getn > 0).
		 */

		if ((portq->portq_getn  == 0) &&
		    ((portq)->portq_nent >= *nget) &&
		    (!((pgt)->pgt_flags & PORTGET_WAIT_EVENTS) ||
		    !((portq)->portq_flags & PORTQ_WAIT_EVENTS)))
			break;

		if (portq->portq_flags & PORTQ_CLOSE) {
			error = EBADFD;
			break;
		}

		/* rval: <0 timeout, 0 signal/interrupt, >0 wakeup */
		rval = cv_waituntil_sig(&pgetp->portget_cv, &portq->portq_mutex,
		    rqtp, timecheck);

		if (rval <= 0) {
			error = (rval == 0) ? EINTR : ETIME;
			break;
		}
	}

	/* take thread out of the wait queue */
	port_dequeue_thread(portq, pgetp);

	if (error != 0 && (error == EINTR || error == EBADFD ||
	    (error == ETIME && flag))) {
		/* return without events */
		port_check_return_cond(portq);
		mutex_exit(&portq->portq_mutex);
		return (error);
	}

portnowait:
	/*
	 * Move port event queue to a temporary event queue.
	 * New incoming events will continue to be posted to the event queue
	 * and they will not be considered by the current thread.
	 * The idea is to avoid lock contentions or an often locking/unlocking
	 * of the port queue mutex. The contention and performance degradation
	 * could happen because:
	 * a) incoming events use the port queue mutex to enqueue new events and
	 * b) before the event can be delivered to the application it is
	 *    necessary to notify the event sources about the event delivery.
	 *    Sometimes the event sources can require a long time to return and
	 *    the queue mutex would block incoming events.
	 * During this time incoming events (port_send_event()) do not need
	 * to awake threads waiting for events. Before the current thread
	 * returns it will check the conditions to awake other waiting threads.
	 */
	portq->portq_getn++;	/* number of threads retrieving events */
	port_block(portq);	/* block other threads here */
	nmax = max < portq->portq_nent ? max : portq->portq_nent;

	if (portq->portq_tnent) {
		/*
		 * Move remaining events from previous thread back to the
		 * port event queue.
		 */
		port_push_eventq(portq);
	}
	/* move port event queue to a temporary queue */
	list_move_tail(&portq->portq_get_list, &portq->portq_list);
	glist = &portq->portq_get_list;	/* use temporary event queue */
	tnent = portq->portq_nent;	/* get current number of events */
	portq->portq_nent = 0;		/* no events in the port event queue */
	portq->portq_flags |= PORTQ_WAIT_EVENTS; /* detect incoming events */
	mutex_exit(&portq->portq_mutex);    /* event queue can be reused now */

	/*
	 * NOTE(review): results/eventsz are only assigned on the paths
	 * compiled in below; 64-bit kernels build with _SYSCALL32_IMPL,
	 * so one branch is always taken — confirm for other configs.
	 */
	if (model == DATAMODEL_NATIVE) {
		eventsz = sizeof (port_event_t);

		if (nmax == 0) {
			kevp = NULL;
		} else {
			/* prefer a non-sleeping allocation of the full size */
			kevp = kmem_alloc(eventsz * nmax, KM_NOSLEEP);
			if (kevp == NULL) {
				if (nmax > pp->port_max_list)
					nmax = pp->port_max_list;
				kevp = kmem_alloc(eventsz * nmax, KM_SLEEP);
			}
		}

		results = kevp;
		lev = NULL;	/* start with first event in the queue */
		for (nevents = 0; nevents < nmax; ) {
			pev = port_get_kevent(glist, lev);
			if (pev == NULL)	/* no more events available */
				break;
			if (pev->portkev_flags & PORT_KEV_FREE) {
				/* Just discard event */
				list_remove(glist, pev);
				pev->portkev_flags &= ~(PORT_CLEANUP_DONE);
				if (PORT_FREE_EVENT(pev))
					port_free_event_local(pev, 0);
				tnent--;
				continue;
			}

			/* move event data to copyout list */
			if (port_copy_event(&kevp[nevents], pev, glist)) {
				/*
				 * Event can not be delivered to the
				 * current process.
				 */
				if (lev != NULL)
					list_insert_after(glist, lev, pev);
				else
					list_insert_head(glist, pev);
				lev = pev;  /* last checked event */
			} else {
				nevents++;	/* # of events ready */
			}
		}
#ifdef	_SYSCALL32_IMPL
	} else {
		port_event32_t	*kevp32;

		eventsz = sizeof (port_event32_t);

		if (nmax == 0) {
			kevp32 = NULL;
		} else {
			kevp32 = kmem_alloc(eventsz * nmax, KM_NOSLEEP);
			if (kevp32 == NULL) {
				if (nmax > pp->port_max_list)
					nmax = pp->port_max_list;
				kevp32 = kmem_alloc(eventsz * nmax, KM_SLEEP);
			}
		}

		results = kevp32;
		lev = NULL;	/* start with first event in the queue */
		for (nevents = 0; nevents < nmax; ) {
			pev = port_get_kevent(glist, lev);
			if (pev == NULL)	/* no more events available */
				break;
			if (pev->portkev_flags & PORT_KEV_FREE) {
				/* Just discard event */
				list_remove(glist, pev);
				pev->portkev_flags &= ~(PORT_CLEANUP_DONE);
				if (PORT_FREE_EVENT(pev))
					port_free_event_local(pev, 0);
				tnent--;
				continue;
			}

			/* move event data to copyout list */
			if (port_copy_event32(&kevp32[nevents], pev, glist)) {
				/*
				 * Event can not be delivered to the
				 * current process.
				 */
				if (lev != NULL)
					list_insert_after(glist, lev, pev);
				else
					list_insert_head(glist, pev);
				lev = pev;  /* last checked event */
			} else {
				nevents++;	/* # of events ready */
			}
		}
#endif	/* _SYSCALL32_IMPL */
	}

	/*
	 *  Remember number of remaining events in the temporary event queue.
	 */
	portq->portq_tnent = tnent - nevents;

	/*
	 * Work to do before return :
	 * - push list of remaining events back to the top of the standard
	 *   port queue.
	 * - if this is the last thread calling port_get(n) then wakeup the
	 *   thread waiting on close(2).
	 * - check for a deferred cv_signal from port_send_event() and wakeup
	 *   the sleeping thread.
	 */

	mutex_enter(&portq->portq_mutex);
	port_unblock(portq);
	if (portq->portq_tnent) {
		/*
		 * move remaining events in the temporary event queue back
		 * to the port event queue
		 */
		port_push_eventq(portq);
	}
	portq->portq_getn--;	/* update # of threads retrieving events */
	if (--portq->portq_thrcnt == 0) { /* # of threads waiting ... */
		/* Last thread => check close(2) conditions ... */
		if (portq->portq_flags & PORTQ_CLOSE) {
			cv_signal(&portq->portq_closecv);
			mutex_exit(&portq->portq_mutex);
			kmem_free(results, eventsz * nmax);
			/* do not copyout events */
			*nget = 0;
			return (EBADFD);
		}
	} else if (portq->portq_getn == 0) {
		/*
		 * no other threads retrieving events ...
		 * check wakeup conditions of sleeping threads
		 */
		if ((portq->portq_thread != NULL) &&
		    (portq->portq_nent >= portq->portq_nget))
			cv_signal(&portq->portq_thread->portget_cv);
	}

	/*
	 * Check PORTQ_POLLIN here because the current thread set temporarily
	 * the number of events in the queue to zero.
	 */
	if (portq->portq_flags & PORTQ_POLLIN) {
		portq->portq_flags &= ~PORTQ_POLLIN;
		mutex_exit(&portq->portq_mutex);
		pollwakeup(&pp->port_pollhd, POLLIN);
	} else {
		mutex_exit(&portq->portq_mutex);
	}

	/* now copyout list of user event structures to user space */
	if (nevents) {
		if (copyout(results, uevp, nevents * eventsz))
			error = EFAULT;
	}
	kmem_free(results, eventsz * nmax);

	if (nevents == 0 && error == 0 && pgt->pgt_loop == 0 && blocking != 0) {
		/* no events retrieved: check loop conditions */
		if (blocking == -1) {
			/* no timeout checked */
			error = port_get_timeout(pgt->pgt_timeout,
			    &pgt->pgt_rqtime, &rqtp, &blocking, flag);
			if (error) {
				*nget = nevents;
				return (error);
			}
			if (rqtp != NULL) {
				timespec_t	now;
				pgt->pgt_timecheck = timechanged;
				gethrestime(&now);
				/* store absolute deadline for the next loop */
				timespecadd(&pgt->pgt_rqtime, &now);
			}
			pgt->pgt_rqtp = rqtp;
		} else {
			/* timeout already checked -> remember values */
			pgt->pgt_rqtp = rqtp;
			if (rqtp != NULL) {
				pgt->pgt_timecheck = timecheck;
				pgt->pgt_rqtime = *rqtp;
			}
		}
		if (blocking)
			/* timeout remaining */
			pgt->pgt_loop = 1;
	}

	/* set number of user event structures completed */
	*nget = nevents;
	return (error);
}
1578
1579/*
1580 * 1. copy kernel event structure to user event structure.
1581 * 2. PORT_KEV_WIRED event structures will be reused by the "source"
1582 * 3. Remove PORT_KEV_DONEQ flag (event removed from the event queue)
1583 * 4. Other types of event structures can be delivered back to the port cache
1584 *    (port_free_event_local()).
1585 * 5. The event source callback function is the last opportunity for the
1586 *    event source to update events, to free local resources associated with
1587 *    the event or to deny the delivery of the event.
1588 */
1589static int
1590port_copy_event(port_event_t *puevp, port_kevent_t *pkevp, list_t *list)
1591{
1592	int	free_event = 0;
1593	int	flags;
1594	int	error;
1595
1596	puevp->portev_source = pkevp->portkev_source;
1597	puevp->portev_object = pkevp->portkev_object;
1598	puevp->portev_user = pkevp->portkev_user;
1599	puevp->portev_events = pkevp->portkev_events;
1600
1601	/* remove event from the queue */
1602	list_remove(list, pkevp);
1603
1604	/*
1605	 * Events of type PORT_KEV_WIRED remain allocated by the
1606	 * event source.
1607	 */
1608	flags = pkevp->portkev_flags;
1609	if (pkevp->portkev_flags & PORT_KEV_WIRED)
1610		pkevp->portkev_flags &= ~PORT_KEV_DONEQ;
1611	else
1612		free_event = 1;
1613
1614	if (pkevp->portkev_callback) {
1615		error = (*pkevp->portkev_callback)(pkevp->portkev_arg,
1616		    &puevp->portev_events, pkevp->portkev_pid,
1617		    PORT_CALLBACK_DEFAULT, pkevp);
1618
1619		if (error) {
1620			/*
1621			 * Event can not be delivered.
1622			 * Caller must reinsert the event into the queue.
1623			 */
1624			pkevp->portkev_flags = flags;
1625			return (error);
1626		}
1627	}
1628	if (free_event)
1629		port_free_event_local(pkevp, 0);
1630	return (0);
1631}
1632
1633#ifdef	_SYSCALL32_IMPL
1634/*
1635 * 1. copy kernel event structure to user event structure.
1636 * 2. PORT_KEV_WIRED event structures will be reused by the "source"
1637 * 3. Remove PORT_KEV_DONEQ flag (event removed from the event queue)
1638 * 4. Other types of event structures can be delivered back to the port cache
1639 *    (port_free_event_local()).
1640 * 5. The event source callback function is the last opportunity for the
1641 *    event source to update events, to free local resources associated with
1642 *    the event or to deny the delivery of the event.
1643 */
1644static int
1645port_copy_event32(port_event32_t *puevp, port_kevent_t *pkevp, list_t *list)
1646{
1647	int	free_event = 0;
1648	int	error;
1649	int	flags;
1650
1651	puevp->portev_source = pkevp->portkev_source;
1652	puevp->portev_object = (daddr32_t)pkevp->portkev_object;
1653	puevp->portev_user = (caddr32_t)(uintptr_t)pkevp->portkev_user;
1654	puevp->portev_events = pkevp->portkev_events;
1655
1656	/* remove event from the queue */
1657	list_remove(list, pkevp);
1658
1659	/*
1660	 * Events if type PORT_KEV_WIRED remain allocated by the
1661	 * sub-system (source).
1662	 */
1663
1664	flags = pkevp->portkev_flags;
1665	if (pkevp->portkev_flags & PORT_KEV_WIRED)
1666		pkevp->portkev_flags &= ~PORT_KEV_DONEQ;
1667	else
1668		free_event = 1;
1669
1670	if (pkevp->portkev_callback != NULL) {
1671		error = (*pkevp->portkev_callback)(pkevp->portkev_arg,
1672		    &puevp->portev_events, pkevp->portkev_pid,
1673		    PORT_CALLBACK_DEFAULT, pkevp);
1674		if (error) {
1675			/*
1676			 * Event can not be delivered.
1677			 * Caller must reinsert the event into the queue.
1678			 */
1679			pkevp->portkev_flags = flags;
1680			return (error);
1681		}
1682	}
1683	if (free_event)
1684		port_free_event_local(pkevp, 0);
1685	return (0);
1686}
1687#endif	/* _SYSCALL32_IMPL */
1688
1689/*
1690 * copyout alert event.
1691 */
1692static int
1693port_get_alert(port_alert_t *pa, port_event_t *uevp)
1694{
1695	model_t	model = get_udatamodel();
1696
1697	/* copyout alert event structures to user space */
1698	if (model == DATAMODEL_NATIVE) {
1699		port_event_t	uev;
1700		uev.portev_source = PORT_SOURCE_ALERT;
1701		uev.portev_object = pa->portal_object;
1702		uev.portev_events = pa->portal_events;
1703		uev.portev_user = pa->portal_user;
1704		if (copyout(&uev, uevp, sizeof (port_event_t)))
1705			return (EFAULT);
1706#ifdef	_SYSCALL32_IMPL
1707	} else {
1708		port_event32_t	uev32;
1709		uev32.portev_source = PORT_SOURCE_ALERT;
1710		uev32.portev_object = (daddr32_t)pa->portal_object;
1711		uev32.portev_events = pa->portal_events;
1712		uev32.portev_user = (daddr32_t)(uintptr_t)pa->portal_user;
1713		if (copyout(&uev32, uevp, sizeof (port_event32_t)))
1714			return (EFAULT);
1715#endif	/* _SYSCALL32_IMPL */
1716	}
1717	return (0);
1718}
1719
1720/*
1721 * Check return conditions :
1722 * - pending port close(2)
1723 * - threads waiting for events
1724 */
1725static void
1726port_check_return_cond(port_queue_t *portq)
1727{
1728	ASSERT(MUTEX_HELD(&portq->portq_mutex));
1729	portq->portq_thrcnt--;
1730	if (portq->portq_flags & PORTQ_CLOSE) {
1731		if (portq->portq_thrcnt == 0)
1732			cv_signal(&portq->portq_closecv);
1733		else
1734			cv_signal(&portq->portq_thread->portget_cv);
1735	}
1736}
1737
1738/*
1739 * The port_get_kevent() function returns
1740 * - the event located at the head of the queue if 'last' pointer is NULL
1741 * - the next event after the event pointed by 'last'
1742 * The caller of this function is responsible for the integrity of the queue
1743 * in use:
1744 * - port_getn() is using a temporary queue protected with port_block().
1745 * - port_close_events() is working on the global event queue and protects
1746 *   the queue with portq->portq_mutex.
1747 */
1748port_kevent_t *
1749port_get_kevent(list_t *list, port_kevent_t *last)
1750{
1751	if (last == NULL)
1752		return (list_head(list));
1753	else
1754		return (list_next(list, last));
1755}
1756
1757/*
1758 * The port_get_timeout() function gets the timeout data from user space
1759 * and converts that info into a corresponding internal representation.
1760 * The kerneldata flag means that the timeout data is already loaded.
1761 */
static int
port_get_timeout(timespec_t *timeout, timespec_t *rqtime, timespec_t **rqtp,
    int *blocking, int kerneldata)
{
	model_t	model = get_udatamodel();

	*rqtp = NULL;
	/* NULL timeout pointer means block indefinitely */
	if (timeout == NULL) {
		*blocking = 1;
		return (0);
	}

	if (kerneldata) {
		/* timeout data already lives in kernel space */
		*rqtime = *timeout;
	} else {
		/* copy the timespec in according to the caller's data model */
		if (model == DATAMODEL_NATIVE) {
			if (copyin(timeout, rqtime, sizeof (*rqtime)))
				return (EFAULT);
#ifdef	_SYSCALL32_IMPL
		} else {
			timespec32_t	wait_time_32;
			if (copyin(timeout, &wait_time_32,
			    sizeof (wait_time_32)))
				return (EFAULT);
			TIMESPEC32_TO_TIMESPEC(rqtime, &wait_time_32);
#endif  /* _SYSCALL32_IMPL */
		}
	}

	/* a zero timeout means poll: do not block at all */
	if (rqtime->tv_sec == 0 && rqtime->tv_nsec == 0) {
		*blocking = 0;
		return (0);
	}

	/* reject negative or out-of-range values */
	if (rqtime->tv_sec < 0 ||
	    rqtime->tv_nsec < 0 || rqtime->tv_nsec >= NANOSEC)
		return (EINVAL);

	/* valid relative timeout: caller converts it to an absolute time */
	*rqtp = rqtime;
	*blocking = 1;
	return (0);
}
1804
1805/*
1806 * port_queue_thread()
1807 * Threads requiring more events than available will be put in a wait queue.
1808 * There is a "thread wait queue" per port.
1809 * Threads requiring less events get a higher priority than others and they
1810 * will be awoken first.
1811 */
1812static portget_t *
1813port_queue_thread(port_queue_t *portq, uint_t nget)
1814{
1815	portget_t	*pgetp;
1816	portget_t	*ttp;
1817	portget_t	*htp;
1818
1819	pgetp = kmem_zalloc(sizeof (portget_t), KM_SLEEP);
1820	pgetp->portget_nget = nget;
1821	pgetp->portget_pid = curproc->p_pid;
1822	if (portq->portq_thread == NULL) {
1823		/* first waiting thread */
1824		portq->portq_thread = pgetp;
1825		portq->portq_nget = nget;
1826		pgetp->portget_prev = pgetp;
1827		pgetp->portget_next = pgetp;
1828		return (pgetp);
1829	}
1830
1831	/*
1832	 * thread waiting for less events will be set on top of the queue.
1833	 */
1834	ttp = portq->portq_thread;
1835	htp = ttp;
1836	for (;;) {
1837		if (nget <= ttp->portget_nget)
1838			break;
1839		if (htp == ttp->portget_next)
1840			break;	/* last event */
1841		ttp = ttp->portget_next;
1842	}
1843
1844	/* add thread to the queue */
1845	pgetp->portget_next = ttp;
1846	pgetp->portget_prev = ttp->portget_prev;
1847	ttp->portget_prev->portget_next = pgetp;
1848	ttp->portget_prev = pgetp;
1849	if (portq->portq_thread == ttp)
1850		portq->portq_thread = pgetp;
1851	portq->portq_nget = portq->portq_thread->portget_nget;
1852	return (pgetp);
1853}
1854
1855/*
1856 * Take thread out of the queue.
1857 */
1858static void
1859port_dequeue_thread(port_queue_t *portq, portget_t *pgetp)
1860{
1861	if (pgetp->portget_next == pgetp) {
1862		/* last (single) waiting thread */
1863		portq->portq_thread = NULL;
1864		portq->portq_nget = 0;
1865	} else {
1866		pgetp->portget_prev->portget_next = pgetp->portget_next;
1867		pgetp->portget_next->portget_prev = pgetp->portget_prev;
1868		if (portq->portq_thread == pgetp)
1869			portq->portq_thread = pgetp->portget_next;
1870		portq->portq_nget = portq->portq_thread->portget_nget;
1871	}
1872	kmem_free(pgetp, sizeof (portget_t));
1873}
1874
1875/*
1876 * Set up event port kstats.
1877 */
1878static void
1879port_kstat_init()
1880{
1881	kstat_t	*ksp;
1882	uint_t	ndata;
1883
1884	ndata = sizeof (port_kstat) / sizeof (kstat_named_t);
1885	ksp = kstat_create("portfs", 0, "Event Ports", "misc",
1886	    KSTAT_TYPE_NAMED, ndata, KSTAT_FLAG_VIRTUAL);
1887	if (ksp) {
1888		ksp->ks_data = &port_kstat;
1889		kstat_install(ksp);
1890	}
1891}
1892