xref: /illumos-gate/usr/src/cmd/bhyve/mevent.c (revision 251becc8)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  * $FreeBSD$
29  */
30 
31 /*
32  * Copyright 2018 Joyent, Inc.
33  * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
34  */
35 
36 /*
37  * Micro event library for FreeBSD, designed for a single i/o thread
38  * using kqueue, and having events be persistent by default.
39  */
40 
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
43 
#include <assert.h>
#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdbool.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sysexits.h>
#include <unistd.h>

#include <sys/types.h>
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#ifdef __FreeBSD__
#include <sys/event.h>
#else
#include <port.h>
#include <sys/poll.h>
#include <sys/siginfo.h>
#include <sys/queue.h>
#include <sys/debug.h>
#include <libproc.h>
#endif
#include <sys/time.h>

#include <pthread.h>
#include <pthread_np.h>

#include "mevent.h"
77 
/*
 * Size of the kevent change/event arrays and of the scratch buffer used to
 * drain the notification pipe.
 */
#define	MEVENT_MAX	64

#ifndef __FreeBSD__
/*
 * Stand-ins for the kqueue state flags on the non-FreeBSD build.  EV_ADD and
 * EV_ENABLE intentionally share the same bit.
 */
#define	EV_ENABLE	0x01
#define	EV_ADD		EV_ENABLE
#define	EV_DISABLE	0x02
#define	EV_DELETE	0x04

/* Timer period used when file monitoring falls back to polling. */
static int mevent_file_poll_interval_ms = 5000;
#endif

/* Guards one-time setup performed by mevent_init(). */
static pthread_once_t mevent_once = PTHREAD_ONCE_INIT;
/* Protects the event lists and per-event state. */
static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER;

/* Thread running mevent_dispatch(). */
static pthread_t mevent_tid;
/* Pipe used by other threads to wake the dispatch thread. */
static int mevent_pipefd[2];
/* Descriptor returned by kqueue() or port_create(). */
static int mfd;
#ifdef __FreeBSD__
/* Next identifier handed out to a timer event. */
static int mevent_timid = 43;
#endif
97 
98 struct mevent {
99 	void	(*me_func)(int, enum ev_type, void *);
100 #define me_msecs me_fd
101 	int	me_fd;
102 #ifdef __FreeBSD__
103 	int	me_timid;
104 #else
105 	timer_t me_timid;
106 #endif
107 	enum ev_type me_type;
108 	void    *me_param;
109 	int	me_cq;
110 	int	me_state; /* Desired kevent flags. */
111 	int	me_closefd;
112 	int	me_fflags;
113 #ifndef __FreeBSD__
114 	port_notify_t	me_notify;
115 	struct sigevent	me_sigev;
116 	boolean_t	me_auto_requeue;
117 	struct file_obj	me_fobj;
118 	char		*me_fname;
119 	struct {
120 		int	mp_fd;
121 		off_t	mp_size;
122 		void	(*mp_func)(int, enum ev_type, void *);
123 		void    *mp_param;
124 	} me_poll;
125 #endif
126 	LIST_ENTRY(mevent) me_list;
127 };
128 
129 static LIST_HEAD(listhead, mevent) global_head, change_head;
130 
131 static void
132 mevent_qlock(void)
133 {
134 	pthread_mutex_lock(&mevent_lmutex);
135 }
136 
137 static void
138 mevent_qunlock(void)
139 {
140 	pthread_mutex_unlock(&mevent_lmutex);
141 }
142 
143 static void
144 mevent_pipe_read(int fd, enum ev_type type, void *param)
145 {
146 	char buf[MEVENT_MAX];
147 	int status;
148 
149 	/*
150 	 * Drain the pipe read side. The fd is non-blocking so this is
151 	 * safe to do.
152 	 */
153 	do {
154 		status = read(fd, buf, sizeof(buf));
155 	} while (status == MEVENT_MAX);
156 }
157 
158 static void
159 mevent_notify(void)
160 {
161 	char c = '\0';
162 
163 	/*
164 	 * If calling from outside the i/o thread, write a byte on the
165 	 * pipe to force the i/o thread to exit the blocking kevent call.
166 	 */
167 	if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) {
168 		write(mevent_pipefd[1], &c, 1);
169 	}
170 }
171 
172 static void
173 mevent_init(void)
174 {
175 #ifndef WITHOUT_CAPSICUM
176 	cap_rights_t rights;
177 #endif
178 
179 #ifdef __FreeBSD__
180 	mfd = kqueue();
181 #else
182 	mfd = port_create();
183 #endif
184 	assert(mfd > 0);
185 
186 #ifndef WITHOUT_CAPSICUM
187 	cap_rights_init(&rights, CAP_KQUEUE);
188 	if (caph_rights_limit(mfd, &rights) == -1)
189 		errx(EX_OSERR, "Unable to apply rights for sandbox");
190 #endif
191 
192 	LIST_INIT(&change_head);
193 	LIST_INIT(&global_head);
194 }
195 
196 
197 #ifdef __FreeBSD__
198 static int
199 mevent_kq_filter(struct mevent *mevp)
200 {
201 	int retval;
202 
203 	retval = 0;
204 
205 	if (mevp->me_type == EVF_READ)
206 		retval = EVFILT_READ;
207 
208 	if (mevp->me_type == EVF_WRITE)
209 		retval = EVFILT_WRITE;
210 
211 	if (mevp->me_type == EVF_TIMER)
212 		retval = EVFILT_TIMER;
213 
214 	if (mevp->me_type == EVF_SIGNAL)
215 		retval = EVFILT_SIGNAL;
216 
217 	if (mevp->me_type == EVF_VNODE)
218 		retval = EVFILT_VNODE;
219 
220 	return (retval);
221 }
222 
223 static int
224 mevent_kq_flags(struct mevent *mevp)
225 {
226 	int retval;
227 
228 	retval = mevp->me_state;
229 
230 	if (mevp->me_type == EVF_VNODE)
231 		retval |= EV_CLEAR;
232 
233 	return (retval);
234 }
235 
236 static int
237 mevent_kq_fflags(struct mevent *mevp)
238 {
239 	int retval;
240 
241 	retval = 0;
242 
243 	switch (mevp->me_type) {
244 	case EVF_VNODE:
245 		if ((mevp->me_fflags & EVFF_ATTRIB) != 0)
246 			retval |= NOTE_ATTRIB;
247 		break;
248 	case EVF_READ:
249 	case EVF_WRITE:
250 	case EVF_TIMER:
251 	case EVF_SIGNAL:
252 		break;
253 	}
254 
255 	return (retval);
256 }
257 
258 static void
259 mevent_populate(struct mevent *mevp, struct kevent *kev)
260 {
261 	if (mevp->me_type == EVF_TIMER) {
262 		kev->ident = mevp->me_timid;
263 		kev->data = mevp->me_msecs;
264 	} else {
265 		kev->ident = mevp->me_fd;
266 		kev->data = 0;
267 	}
268 	kev->filter = mevent_kq_filter(mevp);
269 	kev->flags = mevent_kq_flags(mevp);
270 	kev->fflags = mevent_kq_fflags(mevp);
271 	kev->udata = mevp;
272 }
273 
274 static int
275 mevent_build(struct kevent *kev)
276 {
277 	struct mevent *mevp, *tmpp;
278 	int i;
279 
280 	i = 0;
281 
282 	mevent_qlock();
283 
284 	LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) {
285 		if (mevp->me_closefd) {
286 			/*
287 			 * A close of the file descriptor will remove the
288 			 * event
289 			 */
290 			close(mevp->me_fd);
291 		} else {
292 			assert((mevp->me_state & EV_ADD) == 0);
293 			mevent_populate(mevp, &kev[i]);
294 			i++;
295 		}
296 
297 		mevp->me_cq = 0;
298 		LIST_REMOVE(mevp, me_list);
299 
300 		if (mevp->me_state & EV_DELETE) {
301 			free(mevp);
302 		} else {
303 			LIST_INSERT_HEAD(&global_head, mevp, me_list);
304 		}
305 
306 		assert(i < MEVENT_MAX);
307 	}
308 
309 	mevent_qunlock();
310 
311 	return (i);
312 }
313 
314 static void
315 mevent_handle(struct kevent *kev, int numev)
316 {
317 	struct mevent *mevp;
318 	int i;
319 
320 	for (i = 0; i < numev; i++) {
321 		mevp = kev[i].udata;
322 
323 		/* XXX check for EV_ERROR ? */
324 
325 		(*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param);
326 	}
327 }
328 
329 #else /* __FreeBSD__ */
330 
331 static boolean_t
332 mevent_clarify_state(struct mevent *mevp)
333 {
334 	const int state = mevp->me_state;
335 
336 	if ((state & EV_DELETE) != 0) {
337 		/* All other intents are overriden by delete. */
338 		mevp->me_state = EV_DELETE;
339 		return (B_TRUE);
340 	}
341 
342 	/*
343 	 * Without a distinction between EV_ADD and EV_ENABLE in our emulation,
344 	 * handling the add-disabled case means eliding the portfs operation
345 	 * when both flags are present.
346 	 *
347 	 * This is not a concern for subsequent enable/disable operations, as
348 	 * mevent_update() toggles the flags properly so they are not left in
349 	 * conflict.
350 	 */
351 	if (state == (EV_ENABLE|EV_DISABLE)) {
352 		mevp->me_state = EV_DISABLE;
353 		return (B_FALSE);
354 	}
355 
356 	return (B_TRUE);
357 }
358 
359 static char *
360 mevent_fdpath(int fd)
361 {
362 	prfdinfo_t *fdinfo;
363 	char *path;
364 	size_t len;
365 
366 	fdinfo = proc_get_fdinfo(getpid(), fd);
367 	if (fdinfo == NULL) {
368 		(void) fprintf(stderr, "%s: proc_get_fdinfo(%d) failed: %s\n",
369 		    __func__, fd, strerror(errno));
370 		path = NULL;
371 	} else {
372 		path = (char *)proc_fdinfo_misc(fdinfo, PR_PATHNAME, &len);
373 	}
374 
375 	if (path == NULL) {
376 		(void) fprintf(stderr, "%s: Fall back to /proc/self/fd/%d\n",
377 		    __func__, fd);
378 		(void) asprintf(&path, "/proc/self/fd/%d", fd);
379 	} else {
380 		path = strdup(path);
381 	}
382 
383 	proc_fdinfo_free(fdinfo);
384 
385 	if (path == NULL) {
386 		(void) fprintf(stderr,
387 		    "%s: Error building path for fd %d: %s\n", __func__,
388 		    fd, strerror(errno));
389 	}
390 
391 	return (path);
392 }
393 
394 static void
395 mevent_poll_file_attrib(int fd, enum ev_type type, void *param)
396 {
397 	struct mevent *mevp = param;
398 	struct stat st;
399 
400 	if (fstat(mevp->me_poll.mp_fd, &st) != 0) {
401 		(void) fprintf(stderr, "%s: fstat(%d) \"%s\" failed: %s\n",
402 		    __func__, fd, mevp->me_fname, strerror(errno));
403 		return;
404 	}
405 
406 	if (mevp->me_poll.mp_size != st.st_size ||
407 	    mevp->me_fobj.fo_ctime.tv_sec != st.st_ctim.tv_sec ||
408 	    mevp->me_fobj.fo_ctime.tv_nsec != st.st_ctim.tv_nsec) {
409 		mevp->me_poll.mp_size = st.st_size;
410 		mevp->me_fobj.fo_atime = st.st_atim;
411 		mevp->me_fobj.fo_mtime = st.st_mtim;
412 		mevp->me_fobj.fo_ctime = st.st_ctim;
413 
414 		(*mevp->me_poll.mp_func)(mevp->me_poll.mp_fd, EVF_VNODE,
415 		    mevp->me_poll.mp_param);
416 	}
417 }
418 
419 static void
420 mevent_update_one_readwrite(struct mevent *mevp)
421 {
422 	int portfd = mevp->me_notify.portnfy_port;
423 
424 	mevp->me_auto_requeue = B_FALSE;
425 
426 	switch (mevp->me_state) {
427 	case EV_ENABLE:
428 	{
429 		const int events = (mevp->me_type == EVF_READ) ?
430 		    POLLIN : POLLOUT;
431 
432 		if (port_associate(portfd, PORT_SOURCE_FD, mevp->me_fd,
433 		    events, mevp) != 0) {
434 			(void) fprintf(stderr,
435 			    "port_associate fd %d %p failed: %s\n",
436 			    mevp->me_fd, mevp, strerror(errno));
437 		}
438 		return;
439 	}
440 	case EV_DISABLE:
441 	case EV_DELETE:
442 		/*
443 		 * A disable that comes in while an event is being
444 		 * handled will result in an ENOENT.
445 		 */
446 		if (port_dissociate(portfd, PORT_SOURCE_FD,
447 		    mevp->me_fd) != 0 && errno != ENOENT) {
448 			(void) fprintf(stderr, "port_dissociate "
449 			    "portfd %d fd %d mevp %p failed: %s\n",
450 			    portfd, mevp->me_fd, mevp, strerror(errno));
451 		}
452 		return;
453 	default:
454 		(void) fprintf(stderr, "%s: unhandled state %d\n", __func__,
455 		    mevp->me_state);
456 		abort();
457 	}
458 }
459 
460 static void
461 mevent_update_one_timer(struct mevent *mevp)
462 {
463 	mevp->me_auto_requeue = B_TRUE;
464 
465 	switch (mevp->me_state) {
466 	case EV_ENABLE:
467 	{
468 		struct itimerspec it = { 0 };
469 
470 		mevp->me_sigev.sigev_notify = SIGEV_PORT;
471 		mevp->me_sigev.sigev_value.sival_ptr = &mevp->me_notify;
472 
473 		if (timer_create(CLOCK_REALTIME, &mevp->me_sigev,
474 		    &mevp->me_timid) != 0) {
475 			(void) fprintf(stderr, "timer_create failed: %s",
476 			    strerror(errno));
477 			return;
478 		}
479 
480 		/* The first timeout */
481 		it.it_value.tv_sec = mevp->me_msecs / MILLISEC;
482 		it.it_value.tv_nsec =
483 			MSEC2NSEC(mevp->me_msecs % MILLISEC);
484 		/* Repeat at the same interval */
485 		it.it_interval = it.it_value;
486 
487 		if (timer_settime(mevp->me_timid, 0, &it, NULL) != 0) {
488 			(void) fprintf(stderr, "timer_settime failed: %s",
489 			    strerror(errno));
490 		}
491 		return;
492 	}
493 	case EV_DISABLE:
494 	case EV_DELETE:
495 		if (timer_delete(mevp->me_timid) != 0) {
496 			(void) fprintf(stderr, "timer_delete failed: %s",
497 			    strerror(errno));
498 		}
499 		mevp->me_timid = -1;
500 		return;
501 	default:
502 		(void) fprintf(stderr, "%s: unhandled state %d\n", __func__,
503 		    mevp->me_state);
504 		abort();
505 	}
506 }
507 
508 static void
509 mevent_update_one_vnode(struct mevent *mevp)
510 {
511 	int portfd = mevp->me_notify.portnfy_port;
512 
513 	mevp->me_auto_requeue = B_FALSE;
514 
515 	switch (mevp->me_state) {
516 	case EV_ENABLE:
517 	{
518 		int events = 0;
519 
520 		if ((mevp->me_fflags & EVFF_ATTRIB) != 0)
521 			events |= FILE_ATTRIB;
522 
523 		assert(events != 0);
524 
525 		if (mevp->me_fname == NULL) {
526 			mevp->me_fname = mevent_fdpath(mevp->me_fd);
527 			if (mevp->me_fname == NULL)
528 				return;
529 		}
530 
531 		bzero(&mevp->me_fobj, sizeof (mevp->me_fobj));
532 		mevp->me_fobj.fo_name = mevp->me_fname;
533 
534 		if (port_associate(portfd, PORT_SOURCE_FILE,
535 		    (uintptr_t)&mevp->me_fobj, events, mevp) != 0) {
536 			/*
537 			 * If this file does not support event ports
538 			 * (e.g. ZVOLs do not yet have support)
539 			 * then convert this to a timer event and poll for
540 			 * file attribute changes.
541 			 */
542 			struct stat st;
543 
544 			if (errno != ENOTSUP) {
545 				(void) fprintf(stderr,
546 				    "port_associate fd %d (%s) %p failed: %s"
547 				    ", polling instead\n",
548 				    mevp->me_fd, mevp->me_fname, mevp,
549 				    strerror(errno));
550 			}
551 
552 			if (fstat(mevp->me_fd, &st) != 0) {
553 				(void) fprintf(stderr,
554 				    "fstat(%d) \"%s\" failed: %s\n",
555 				    mevp->me_fd, mevp->me_fname,
556 				    strerror(errno));
557 				return;
558 			}
559 
560 			mevp->me_fobj.fo_atime = st.st_atim;
561 			mevp->me_fobj.fo_mtime = st.st_mtim;
562 			mevp->me_fobj.fo_ctime = st.st_ctim;
563 
564 			mevp->me_poll.mp_fd = mevp->me_fd;
565 			mevp->me_poll.mp_size = st.st_size;
566 
567 			mevp->me_poll.mp_func = mevp->me_func;
568 			mevp->me_poll.mp_param = mevp->me_param;
569 			mevp->me_func = mevent_poll_file_attrib;
570 			mevp->me_param = mevp;
571 
572 			mevp->me_type = EVF_TIMER;
573 			mevp->me_timid = -1;
574 			mevp->me_msecs = mevent_file_poll_interval_ms;
575 			mevent_update_one_timer(mevp);
576 		}
577 		return;
578 	}
579 	case EV_DISABLE:
580 	case EV_DELETE:
581 		/*
582 		 * A disable that comes in while an event is being
583 		 * handled will result in an ENOENT.
584 		 */
585 		if (port_dissociate(portfd, PORT_SOURCE_FILE,
586 		    (uintptr_t)&mevp->me_fobj) != 0 &&
587 		    errno != ENOENT) {
588 			(void) fprintf(stderr, "port_dissociate "
589 			    "portfd %d fd %d mevp %p failed: %s\n",
590 			    portfd, mevp->me_fd, mevp, strerror(errno));
591 		}
592 		free(mevp->me_fname);
593 		mevp->me_fname = NULL;
594 		return;
595 	default:
596 		(void) fprintf(stderr, "%s: unhandled state %d\n", __func__,
597 		    mevp->me_state);
598 		abort();
599 	}
600 }
601 
602 static void
603 mevent_update_one(struct mevent *mevp)
604 {
605 	switch (mevp->me_type) {
606 	case EVF_READ:
607 	case EVF_WRITE:
608 		mevent_update_one_readwrite(mevp);
609 		break;
610 	case EVF_TIMER:
611 		mevent_update_one_timer(mevp);
612 		break;
613 	case EVF_VNODE:
614 		mevent_update_one_vnode(mevp);
615 		break;
616 	case EVF_SIGNAL: /* EVF_SIGNAL not yet implemented. */
617 	default:
618 		(void) fprintf(stderr, "%s: unhandled event type %d\n",
619 		    __func__, mevp->me_type);
620 		abort();
621 	}
622 }
623 
624 static void
625 mevent_populate(struct mevent *mevp)
626 {
627 	mevp->me_notify.portnfy_port = mfd;
628 	mevp->me_notify.portnfy_user = mevp;
629 }
630 
631 static void
632 mevent_update_pending()
633 {
634 	struct mevent *mevp, *tmpp;
635 
636 	mevent_qlock();
637 
638 	LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) {
639 		mevent_populate(mevp);
640 		if (mevp->me_closefd) {
641 			/*
642 			 * A close of the file descriptor will remove the
643 			 * event
644 			 */
645 			(void) close(mevp->me_fd);
646 			mevp->me_fd = -1;
647 		} else {
648 			if (mevent_clarify_state(mevp)) {
649 				mevent_update_one(mevp);
650 			}
651 		}
652 
653 		mevp->me_cq = 0;
654 		LIST_REMOVE(mevp, me_list);
655 
656 		if (mevp->me_state & EV_DELETE) {
657 			free(mevp->me_fname);
658 			free(mevp);
659 		} else {
660 			LIST_INSERT_HEAD(&global_head, mevp, me_list);
661 		}
662 	}
663 
664 	mevent_qunlock();
665 }
666 
667 static void
668 mevent_handle_pe(port_event_t *pe)
669 {
670 	struct mevent *mevp = pe->portev_user;
671 
672 	mevent_qunlock();
673 
674 	(*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param);
675 
676 	mevent_qlock();
677 	if (!mevp->me_cq && !mevp->me_auto_requeue) {
678 		mevent_update_one(mevp);
679 	}
680 	mevent_qunlock();
681 }
682 #endif
683 
684 static struct mevent *
685 mevent_add_state(int tfd, enum ev_type type,
686 	   void (*func)(int, enum ev_type, void *), void *param,
687 	   int state, int fflags)
688 {
689 #ifdef __FreeBSD__
690 	struct kevent kev;
691 #endif
692 	struct mevent *lp, *mevp;
693 #ifdef __FreeBSD__
694 	int ret;
695 #endif
696 
697 	if (tfd < 0 || func == NULL) {
698 		return (NULL);
699 	}
700 
701 	mevp = NULL;
702 
703 	pthread_once(&mevent_once, mevent_init);
704 
705 	mevent_qlock();
706 
707 	/*
708 	 * Verify that the fd/type tuple is not present in any list
709 	 */
710 	LIST_FOREACH(lp, &global_head, me_list) {
711 		if (type != EVF_TIMER && lp->me_fd == tfd &&
712 		    lp->me_type == type) {
713 			goto exit;
714 		}
715 	}
716 
717 	LIST_FOREACH(lp, &change_head, me_list) {
718 		if (type != EVF_TIMER && lp->me_fd == tfd &&
719 		    lp->me_type == type) {
720 			goto exit;
721 		}
722 	}
723 
724 	/*
725 	 * Allocate an entry and populate it.
726 	 */
727 	mevp = calloc(1, sizeof(struct mevent));
728 	if (mevp == NULL) {
729 		goto exit;
730 	}
731 
732 	if (type == EVF_TIMER) {
733 		mevp->me_msecs = tfd;
734 #ifdef __FreeBSD__
735 		mevp->me_timid = mevent_timid++;
736 #else
737 		mevp->me_timid = -1;
738 #endif
739 	} else
740 		mevp->me_fd = tfd;
741 	mevp->me_type = type;
742 	mevp->me_func = func;
743 	mevp->me_param = param;
744 	mevp->me_state = state;
745 	mevp->me_fflags = fflags;
746 
747 	/*
748 	 * Try to add the event.  If this fails, report the failure to
749 	 * the caller.
750 	 */
751 #ifdef __FreeBSD__
752 	mevent_populate(mevp, &kev);
753 	ret = kevent(mfd, &kev, 1, NULL, 0, NULL);
754 	if (ret == -1) {
755 		free(mevp);
756 		mevp = NULL;
757 		goto exit;
758 	}
759 	mevp->me_state &= ~EV_ADD;
760 #else
761 	mevent_populate(mevp);
762 	if (mevent_clarify_state(mevp))
763 		mevent_update_one(mevp);
764 #endif
765 
766 	LIST_INSERT_HEAD(&global_head, mevp, me_list);
767 
768 exit:
769 	mevent_qunlock();
770 
771 	return (mevp);
772 }
773 
774 struct mevent *
775 mevent_add(int tfd, enum ev_type type,
776 	   void (*func)(int, enum ev_type, void *), void *param)
777 {
778 
779 	return (mevent_add_state(tfd, type, func, param, EV_ADD, 0));
780 }
781 
782 struct mevent *
783 mevent_add_flags(int tfd, enum ev_type type, int fflags,
784 		 void (*func)(int, enum ev_type, void *), void *param)
785 {
786 
787 	return (mevent_add_state(tfd, type, func, param, EV_ADD, fflags));
788 }
789 
790 struct mevent *
791 mevent_add_disabled(int tfd, enum ev_type type,
792 		    void (*func)(int, enum ev_type, void *), void *param)
793 {
794 
795 	return (mevent_add_state(tfd, type, func, param, EV_ADD | EV_DISABLE, 0));
796 }
797 
798 static int
799 mevent_update(struct mevent *evp, bool enable)
800 {
801 	int newstate;
802 
803 	mevent_qlock();
804 
805 	/*
806 	 * It's not possible to enable/disable a deleted event
807 	 */
808 	assert((evp->me_state & EV_DELETE) == 0);
809 
810 	newstate = evp->me_state;
811 	if (enable) {
812 		newstate |= EV_ENABLE;
813 		newstate &= ~EV_DISABLE;
814 	} else {
815 		newstate |= EV_DISABLE;
816 		newstate &= ~EV_ENABLE;
817 	}
818 
819 	/*
820 	 * No update needed if state isn't changing
821 	 */
822 	if (evp->me_state != newstate) {
823 		evp->me_state = newstate;
824 
825 		/*
826 		 * Place the entry onto the changed list if not
827 		 * already there.
828 		 */
829 		if (evp->me_cq == 0) {
830 			evp->me_cq = 1;
831 			LIST_REMOVE(evp, me_list);
832 			LIST_INSERT_HEAD(&change_head, evp, me_list);
833 			mevent_notify();
834 		}
835 	}
836 
837 	mevent_qunlock();
838 
839 	return (0);
840 }
841 
/* Request that an event be enabled.  Always returns 0. */
int
mevent_enable(struct mevent *evp)
{
	const bool enable = true;

	return (mevent_update(evp, enable));
}
848 
/* Request that an event be disabled.  Always returns 0. */
int
mevent_disable(struct mevent *evp)
{
	const bool enable = false;

	return (mevent_update(evp, enable));
}
855 
856 static int
857 mevent_delete_event(struct mevent *evp, int closefd)
858 {
859 	mevent_qlock();
860 
861 	/*
862          * Place the entry onto the changed list if not already there, and
863 	 * mark as to be deleted.
864          */
865         if (evp->me_cq == 0) {
866 		evp->me_cq = 1;
867 		LIST_REMOVE(evp, me_list);
868 		LIST_INSERT_HEAD(&change_head, evp, me_list);
869 		mevent_notify();
870         }
871 	evp->me_state = EV_DELETE;
872 
873 	if (closefd)
874 		evp->me_closefd = 1;
875 
876 	mevent_qunlock();
877 
878 	return (0);
879 }
880 
/*
 * Mark an event for deletion, leaving its descriptor open.  Always
 * returns 0.
 */
int
mevent_delete(struct mevent *evp)
{
	const int closefd = 0;

	return (mevent_delete_event(evp, closefd));
}
887 
/*
 * Mark an event for deletion and have its descriptor closed as well.
 * Always returns 0.
 */
int
mevent_delete_close(struct mevent *evp)
{
	const int closefd = 1;

	return (mevent_delete_event(evp, closefd));
}
894 
895 static void
896 mevent_set_name(void)
897 {
898 
899 	pthread_set_name_np(mevent_tid, "mevent");
900 }
901 
902 void
903 mevent_dispatch(void)
904 {
905 #ifdef __FreeBSD__
906 	struct kevent changelist[MEVENT_MAX];
907 	struct kevent eventlist[MEVENT_MAX];
908 	struct mevent *pipev;
909 	int numev;
910 #else
911 	struct mevent *pipev;
912 #endif
913 	int ret;
914 #ifndef WITHOUT_CAPSICUM
915 	cap_rights_t rights;
916 #endif
917 
918 	mevent_tid = pthread_self();
919 	mevent_set_name();
920 
921 	pthread_once(&mevent_once, mevent_init);
922 
923 	/*
924 	 * Open the pipe that will be used for other threads to force
925 	 * the blocking kqueue call to exit by writing to it. Set the
926 	 * descriptor to non-blocking.
927 	 */
928 	ret = pipe(mevent_pipefd);
929 	if (ret < 0) {
930 		perror("pipe");
931 		exit(0);
932 	}
933 
934 #ifndef WITHOUT_CAPSICUM
935 	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
936 	if (caph_rights_limit(mevent_pipefd[0], &rights) == -1)
937 		errx(EX_OSERR, "Unable to apply rights for sandbox");
938 	if (caph_rights_limit(mevent_pipefd[1], &rights) == -1)
939 		errx(EX_OSERR, "Unable to apply rights for sandbox");
940 #endif
941 
942 	/*
943 	 * Add internal event handler for the pipe write fd
944 	 */
945 	pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL);
946 	assert(pipev != NULL);
947 
948 	for (;;) {
949 #ifdef __FreeBSD__
950 		/*
951 		 * Build changelist if required.
952 		 * XXX the changelist can be put into the blocking call
953 		 * to eliminate the extra syscall. Currently better for
954 		 * debug.
955 		 */
956 		numev = mevent_build(changelist);
957 		if (numev) {
958 			ret = kevent(mfd, changelist, numev, NULL, 0, NULL);
959 			if (ret == -1) {
960 				perror("Error return from kevent change");
961 			}
962 		}
963 
964 		/*
965 		 * Block awaiting events
966 		 */
967 		ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL);
968 		if (ret == -1 && errno != EINTR) {
969 			perror("Error return from kevent monitor");
970 		}
971 
972 		/*
973 		 * Handle reported events
974 		 */
975 		mevent_handle(eventlist, ret);
976 
977 #else /* __FreeBSD__ */
978 		port_event_t pev;
979 
980 		/* Handle any pending updates */
981 		mevent_update_pending();
982 
983 		/* Block awaiting events */
984 		ret = port_get(mfd, &pev, NULL);
985 		if (ret != 0) {
986 			if (errno != EINTR)
987 				perror("Error return from port_get");
988 			continue;
989 		}
990 
991 		/* Handle reported event */
992 		mevent_handle_pe(&pev);
993 #endif /* __FreeBSD__ */
994 	}
995 }
996