xref: /illumos-gate/usr/src/uts/common/io/devpoll.c (revision 086d96878f5f62a25a6d90e5b03a1ef9ba352231)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright (c) 2012 by Delphix. All rights reserved.
28  * Copyright 2018 Joyent, Inc.
29  */
30 
31 #include <sys/types.h>
32 #include <sys/devops.h>
33 #include <sys/conf.h>
34 #include <sys/modctl.h>
35 #include <sys/sunddi.h>
36 #include <sys/stat.h>
37 #include <sys/poll_impl.h>
38 #include <sys/errno.h>
39 #include <sys/kmem.h>
40 #include <sys/mkdev.h>
41 #include <sys/debug.h>
42 #include <sys/file.h>
43 #include <sys/sysmacros.h>
44 #include <sys/systm.h>
45 #include <sys/bitmap.h>
46 #include <sys/devpoll.h>
47 #include <sys/rctl.h>
48 #include <sys/resource.h>
49 #include <sys/schedctl.h>
50 #include <sys/epoll.h>
51 
52 #define	RESERVED	1
53 
54 /* local data struct */
55 static	dp_entry_t	**devpolltbl;	/* dev poll entries */
56 static	size_t		dptblsize;
57 
58 static	kmutex_t	devpoll_lock;	/* lock protecting dev tbl */
59 int			devpoll_init;	/* is /dev/poll initialized already */
60 
61 /* device local functions */
62 
63 static int dpopen(dev_t *devp, int flag, int otyp, cred_t *credp);
64 static int dpwrite(dev_t dev, struct uio *uiop, cred_t *credp);
65 static int dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
66     int *rvalp);
67 static int dppoll(dev_t dev, short events, int anyyet, short *reventsp,
68     struct pollhead **phpp);
69 static int dpclose(dev_t dev, int flag, int otyp, cred_t *credp);
70 static dev_info_t *dpdevi;
71 
72 
73 static struct cb_ops    dp_cb_ops = {
74 	dpopen,			/* open */
75 	dpclose,		/* close */
76 	nodev,			/* strategy */
77 	nodev,			/* print */
78 	nodev,			/* dump */
79 	nodev,			/* read */
80 	dpwrite,		/* write */
81 	dpioctl,		/* ioctl */
82 	nodev,			/* devmap */
83 	nodev,			/* mmap */
84 	nodev,			/* segmap */
85 	dppoll,			/* poll */
86 	ddi_prop_op,		/* prop_op */
87 	(struct streamtab *)0,	/* streamtab */
88 	D_MP,			/* flags */
89 	CB_REV,			/* cb_ops revision */
90 	nodev,			/* aread */
91 	nodev			/* awrite */
92 };
93 
94 static int dpattach(dev_info_t *, ddi_attach_cmd_t);
95 static int dpdetach(dev_info_t *, ddi_detach_cmd_t);
96 static int dpinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
97 
98 static struct dev_ops dp_ops = {
99 	DEVO_REV,		/* devo_rev */
100 	0,			/* refcnt */
101 	dpinfo,			/* info */
102 	nulldev,		/* identify */
103 	nulldev,		/* probe */
104 	dpattach,		/* attach */
105 	dpdetach,		/* detach */
106 	nodev,			/* reset */
107 	&dp_cb_ops,		/* driver operations */
108 	(struct bus_ops *)NULL, /* bus operations */
109 	nulldev,		/* power */
110 	ddi_quiesce_not_needed,		/* quiesce */
111 };
112 
113 
114 static struct modldrv modldrv = {
115 	&mod_driverops,		/* type of module - a driver */
116 	"/dev/poll driver",
117 	&dp_ops,
118 };
119 
120 static struct modlinkage modlinkage = {
121 	MODREV_1,
122 	(void *)&modldrv,
123 	NULL
124 };
125 
126 static void pcachelink_assoc(pollcache_t *, pollcache_t *);
127 static void pcachelink_mark_stale(pollcache_t *);
128 static void pcachelink_purge_stale(pollcache_t *);
129 static void pcachelink_purge_all(pollcache_t *);
130 
131 
132 /*
133  * Locking Design
134  *
135  * The /dev/poll driver shares most of its code with poll sys call whose
136  * code is in common/syscall/poll.c. In poll(2) design, the pollcache
137  * structure is per lwp. An implicit assumption is made there that some
138  * portion of pollcache will never be touched by other lwps. E.g., in
139  * poll(2) design, no lwp will ever need to grow bitmap of other lwp.
140  * This assumption is not true for /dev/poll; hence the need for extra
141  * locking.
142  *
143  * To allow more parallelism, each /dev/poll file descriptor (indexed by
144  * minor number) has its own lock. Since read (dpioctl) is a much more
145  * frequent operation than write, we want to allow multiple reads on same
146  * /dev/poll fd. However, we prevent writes from being starved by giving
 * priority to write operations. Theoretically writes can starve reads as
 * well, but in a practical sense this is not important because (1) writes
 * happen less often than reads, and (2) write operations define the
 * content of the poll fd's cached set. If writes happen so often that they
 * can starve reads, that means the cached set is very unstable. It may
152  * not make sense to read an unstable cache set anyway. Therefore, the
153  * writers starving readers case is not handled in this design.
154  */
155 
156 int
157 _init()
158 {
159 	int	error;
160 
161 	dptblsize = DEVPOLLSIZE;
162 	devpolltbl = kmem_zalloc(sizeof (caddr_t) * dptblsize, KM_SLEEP);
163 	mutex_init(&devpoll_lock, NULL, MUTEX_DEFAULT, NULL);
164 	devpoll_init = 1;
165 	if ((error = mod_install(&modlinkage)) != 0) {
166 		kmem_free(devpolltbl, sizeof (caddr_t) * dptblsize);
167 		devpoll_init = 0;
168 	}
169 	return (error);
170 }
171 
172 int
173 _fini()
174 {
175 	int error;
176 
177 	if ((error = mod_remove(&modlinkage)) != 0) {
178 		return (error);
179 	}
180 	mutex_destroy(&devpoll_lock);
181 	kmem_free(devpolltbl, sizeof (caddr_t) * dptblsize);
182 	return (0);
183 }
184 
185 int
186 _info(struct modinfo *modinfop)
187 {
188 	return (mod_info(&modlinkage, modinfop));
189 }
190 
191 /*ARGSUSED*/
192 static int
193 dpattach(dev_info_t *devi, ddi_attach_cmd_t cmd)
194 {
195 	if (ddi_create_minor_node(devi, "poll", S_IFCHR, 0, DDI_PSEUDO, 0)
196 	    == DDI_FAILURE) {
197 		ddi_remove_minor_node(devi, NULL);
198 		return (DDI_FAILURE);
199 	}
200 	dpdevi = devi;
201 	return (DDI_SUCCESS);
202 }
203 
204 static int
205 dpdetach(dev_info_t *devi, ddi_detach_cmd_t cmd)
206 {
207 	if (cmd != DDI_DETACH)
208 		return (DDI_FAILURE);
209 
210 	ddi_remove_minor_node(devi, NULL);
211 	return (DDI_SUCCESS);
212 }
213 
214 /* ARGSUSED */
215 static int
216 dpinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
217 {
218 	int error;
219 
220 	switch (infocmd) {
221 	case DDI_INFO_DEVT2DEVINFO:
222 		*result = (void *)dpdevi;
223 		error = DDI_SUCCESS;
224 		break;
225 	case DDI_INFO_DEVT2INSTANCE:
226 		*result = (void *)0;
227 		error = DDI_SUCCESS;
228 		break;
229 	default:
230 		error = DDI_FAILURE;
231 	}
232 	return (error);
233 }
234 
235 /*
236  * dp_pcache_poll has similar logic to pcache_poll() in poll.c. The major
237  * differences are: (1) /dev/poll requires scanning the bitmap starting at
238  * where it was stopped last time, instead of always starting from 0,
239  * (2) since user may not have cleaned up the cached fds when they are
240  * closed, some polldats in cache may refer to closed or reused fds. We
241  * need to check for those cases.
242  *
243  * NOTE: Upon closing an fd, automatic poll cache cleanup is done for
244  *	 poll(2) caches but NOT for /dev/poll caches. So expect some
245  *	 stale entries!
246  */
247 static int
248 dp_pcache_poll(dp_entry_t *dpep, void *dpbuf, pollcache_t *pcp, nfds_t nfds,
249     int *fdcntp)
250 {
251 	int		start, ostart, end, fdcnt, error = 0;
252 	boolean_t	done, no_wrap;
253 	pollfd_t	*pfdp;
254 	epoll_event_t	*epoll;
255 	const short	mask = POLLRDHUP | POLLWRBAND;
256 	const boolean_t	is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0;
257 
258 	ASSERT(MUTEX_HELD(&pcp->pc_lock));
259 	if (pcp->pc_bitmap == NULL) {
260 		/* No Need to search because no poll fd has been cached. */
261 		return (0);
262 	}
263 
264 	if (is_epoll) {
265 		pfdp = NULL;
266 		epoll = (epoll_event_t *)dpbuf;
267 	} else {
268 		pfdp = (pollfd_t *)dpbuf;
269 		epoll = NULL;
270 	}
271 retry:
272 	start = ostart = pcp->pc_mapstart;
273 	end = pcp->pc_mapend;
274 
275 	if (start == 0) {
276 		/*
277 		 * started from every begining, no need to wrap around.
278 		 */
279 		no_wrap = B_TRUE;
280 	} else {
281 		no_wrap = B_FALSE;
282 	}
283 	done = B_FALSE;
284 	fdcnt = 0;
285 	while ((fdcnt < nfds) && !done) {
286 		pollhead_t *php = NULL;
287 		short revent = 0;
288 		uf_entry_gen_t gen;
289 		int fd;
290 
291 		/*
292 		 * Examine the bit map in a circular fashion
293 		 * to avoid starvation. Always resume from
294 		 * last stop. Scan till end of the map. Then
295 		 * wrap around.
296 		 */
297 		fd = bt_getlowbit(pcp->pc_bitmap, start, end);
298 		ASSERT(fd <= end);
299 		if (fd >= 0) {
300 			file_t *fp;
301 			polldat_t *pdp;
302 
303 			if (fd == end) {
304 				if (no_wrap) {
305 					done = B_TRUE;
306 				} else {
307 					start = 0;
308 					end = ostart - 1;
309 					no_wrap = B_TRUE;
310 				}
311 			} else {
312 				start = fd + 1;
313 			}
314 			pdp = pcache_lookup_fd(pcp, fd);
315 repoll:
316 			ASSERT(pdp != NULL);
317 			ASSERT(pdp->pd_fd == fd);
318 			if (pdp->pd_fp == NULL) {
319 				/*
320 				 * The fd is POLLREMOVed. This fd is
321 				 * logically no longer cached. So move
322 				 * on to the next one.
323 				 */
324 				continue;
325 			}
326 			if ((fp = getf_gen(fd, &gen)) == NULL) {
327 				if (is_epoll) {
328 					/*
329 					 * In the epoll compatibility case, we
330 					 * actually perform the implicit
331 					 * removal to remain closer to the
332 					 * epoll semantics.
333 					 */
334 					pdp->pd_fp = NULL;
335 					pdp->pd_events = 0;
336 
337 					if (pdp->pd_php != NULL) {
338 						pollhead_delete(pdp->pd_php,
339 						    pdp);
340 						pdp->pd_php = NULL;
341 					}
342 
343 					BT_CLEAR(pcp->pc_bitmap, fd);
344 				} else if (pfdp != NULL) {
345 					/*
346 					 * The fd has been closed, but user has
347 					 * not done a POLLREMOVE on this fd
348 					 * yet. Instead of cleaning it here
349 					 * implicitly, we return POLLNVAL. This
350 					 * is consistent with poll(2) polling a
351 					 * closed fd. Hope this will remind
352 					 * user to do a POLLREMOVE.
353 					 */
354 					pfdp[fdcnt].fd = fd;
355 					pfdp[fdcnt].revents = POLLNVAL;
356 					fdcnt++;
357 				}
358 				continue;
359 			}
360 
361 			/*
362 			 * Detect a change to the resource underlying a cached
363 			 * file descriptor.  While the fd generation comparison
364 			 * will catch nearly all cases, the file_t comparison
365 			 * is maintained as a failsafe as well.
366 			 */
367 			if (gen != pdp->pd_gen || fp != pdp->pd_fp) {
368 				/*
369 				 * The user is polling on a cached fd which was
370 				 * closed and then reused.  Unfortunately there
371 				 * is no good way to communicate this fact to
372 				 * the consumer.
373 				 *
374 				 * When this situation has been detected, it's
375 				 * likely that any existing pollhead is
376 				 * ill-suited to perform proper wake-ups.
377 				 *
378 				 * Clean up the old entry under the expectation
379 				 * that a valid one will be provided as part of
380 				 * the later VOP_POLL.
381 				 */
382 				if (pdp->pd_php != NULL) {
383 					pollhead_delete(pdp->pd_php, pdp);
384 					pdp->pd_php = NULL;
385 				}
386 
387 				/*
388 				 * Since epoll is expected to act on the
389 				 * underlying 'struct file' (in Linux terms,
390 				 * our vnode_t would be a closer analog) rather
391 				 * than the fd itself, an implicit remove
392 				 * is necessary under these circumstances to
393 				 * suppress any results (or errors) from the
394 				 * new resource occupying the fd.
395 				 */
396 				if (is_epoll) {
397 					pdp->pd_fp = NULL;
398 					pdp->pd_events = 0;
399 					BT_CLEAR(pcp->pc_bitmap, fd);
400 					releasef(fd);
401 					continue;
402 				} else {
403 					/*
404 					 * Regular /dev/poll is unbothered
405 					 * about the fd reassignment.
406 					 */
407 					pdp->pd_fp = fp;
408 					pdp->pd_gen = gen;
409 				}
410 			}
411 			/*
412 			 * XXX - pollrelock() logic needs to know which
413 			 * which pollcache lock to grab. It'd be a
414 			 * cleaner solution if we could pass pcp as
415 			 * an arguement in VOP_POLL interface instead
416 			 * of implicitly passing it using thread_t
417 			 * struct. On the other hand, changing VOP_POLL
418 			 * interface will require all driver/file system
419 			 * poll routine to change. May want to revisit
420 			 * the tradeoff later.
421 			 */
422 			curthread->t_pollcache = pcp;
423 			error = VOP_POLL(fp->f_vnode, pdp->pd_events, 0,
424 			    &revent, &php, NULL);
425 
426 			/*
427 			 * Recheck edge-triggered descriptors which lack a
428 			 * pollhead.  While this check is performed when an fd
429 			 * is added to the pollcache in dpwrite(), subsequent
430 			 * descriptor manipulation could cause a different
431 			 * resource to be present now.
432 			 */
433 			if ((pdp->pd_events & POLLET) && error == 0 &&
434 			    pdp->pd_php == NULL && php == NULL && revent != 0) {
435 				short levent = 0;
436 
437 				/*
438 				 * The same POLLET-only VOP_POLL is used in an
439 				 * attempt to coax a pollhead from older
440 				 * driver logic.
441 				 */
442 				error = VOP_POLL(fp->f_vnode, POLLET,
443 				    0, &levent, &php, NULL);
444 			}
445 
446 			curthread->t_pollcache = NULL;
447 			releasef(fd);
448 			if (error != 0) {
449 				break;
450 			}
451 
452 			/*
453 			 * layered devices (e.g. console driver)
454 			 * may change the vnode and thus the pollhead
455 			 * pointer out from underneath us.
456 			 */
457 			if (php != NULL && pdp->pd_php != NULL &&
458 			    php != pdp->pd_php) {
459 				pollhead_delete(pdp->pd_php, pdp);
460 				pdp->pd_php = php;
461 				pollhead_insert(php, pdp);
462 				/*
463 				 * The bit should still be set.
464 				 */
465 				ASSERT(BT_TEST(pcp->pc_bitmap, fd));
466 				goto retry;
467 			}
468 
469 			if (revent != 0) {
470 				if (pfdp != NULL) {
471 					pfdp[fdcnt].fd = fd;
472 					pfdp[fdcnt].events = pdp->pd_events;
473 					pfdp[fdcnt].revents = revent;
474 				} else if (epoll != NULL) {
475 					epoll_event_t *ep = &epoll[fdcnt];
476 
477 					ASSERT(epoll != NULL);
478 					ep->data.u64 = pdp->pd_epolldata;
479 
480 					/*
481 					 * Since POLLNVAL is a legal event for
482 					 * VOP_POLL handlers to emit, it must
483 					 * be translated epoll-legal.
484 					 */
485 					if (revent & POLLNVAL) {
486 						revent &= ~POLLNVAL;
487 						revent |= POLLERR;
488 					}
489 
490 					/*
491 					 * If any of the event bits are set for
492 					 * which poll and epoll representations
493 					 * differ, swizzle in the native epoll
494 					 * values.
495 					 */
496 					if (revent & mask) {
497 						ep->events = (revent & ~mask) |
498 						    ((revent & POLLRDHUP) ?
499 						    EPOLLRDHUP : 0) |
500 						    ((revent & POLLWRBAND) ?
501 						    EPOLLWRBAND : 0);
502 					} else {
503 						ep->events = revent;
504 					}
505 
506 					/*
507 					 * We define POLLWRNORM to be POLLOUT,
508 					 * but epoll has separate definitions
509 					 * for them; if POLLOUT is set and the
510 					 * user has asked for EPOLLWRNORM, set
511 					 * that as well.
512 					 */
513 					if ((revent & POLLOUT) &&
514 					    (pdp->pd_events & EPOLLWRNORM)) {
515 						ep->events |= EPOLLWRNORM;
516 					}
517 				} else {
518 					pollstate_t *ps =
519 					    curthread->t_pollstate;
520 					/*
521 					 * The devpoll handle itself is being
522 					 * polled.  Notify the caller of any
523 					 * readable event(s), leaving as much
524 					 * state as possible untouched.
525 					 */
526 					VERIFY(fdcnt == 0);
527 					VERIFY(ps != NULL);
528 
529 					/*
530 					 * If a call to pollunlock() fails
531 					 * during VOP_POLL, skip over the fd
532 					 * and continue polling.
533 					 *
534 					 * Otherwise, report that there is an
535 					 * event pending.
536 					 */
537 					if ((ps->ps_flags & POLLSTATE_ULFAIL)
538 					    != 0) {
539 						ps->ps_flags &=
540 						    ~POLLSTATE_ULFAIL;
541 						continue;
542 					} else {
543 						fdcnt++;
544 						break;
545 					}
546 				}
547 
548 				/* Handle special polling modes. */
549 				if (pdp->pd_events & POLLONESHOT) {
550 					/*
551 					 * If POLLONESHOT is set, perform the
552 					 * implicit POLLREMOVE.
553 					 */
554 					pdp->pd_fp = NULL;
555 					pdp->pd_events = 0;
556 
557 					if (pdp->pd_php != NULL) {
558 						pollhead_delete(pdp->pd_php,
559 						    pdp);
560 						pdp->pd_php = NULL;
561 					}
562 
563 					BT_CLEAR(pcp->pc_bitmap, fd);
564 				} else if (pdp->pd_events & POLLET) {
565 					/*
566 					 * Wire up the pollhead which should
567 					 * have been provided.  Edge-triggered
568 					 * polling cannot function properly
569 					 * with drivers which do not emit one.
570 					 */
571 					if (php != NULL &&
572 					    pdp->pd_php == NULL) {
573 						pollhead_insert(php, pdp);
574 						pdp->pd_php = php;
575 					}
576 
577 					/*
578 					 * If the driver has emitted a pollhead,
579 					 * clear the bit in the bitmap which
580 					 * effectively latches the edge on a
581 					 * pollwakeup() from the driver.
582 					 */
583 					if (pdp->pd_php != NULL) {
584 						BT_CLEAR(pcp->pc_bitmap, fd);
585 					}
586 				}
587 
588 				fdcnt++;
589 			} else if (php != NULL) {
590 				/*
591 				 * We clear a bit or cache a poll fd if
592 				 * the driver returns a poll head ptr,
593 				 * which is expected in the case of 0
594 				 * revents. Some buggy driver may return
595 				 * NULL php pointer with 0 revents. In
596 				 * this case, we just treat the driver as
597 				 * "noncachable" and not clearing the bit
598 				 * in bitmap.
599 				 */
600 				if ((pdp->pd_php != NULL) &&
601 				    ((pcp->pc_flag & PC_POLLWAKE) == 0)) {
602 					BT_CLEAR(pcp->pc_bitmap, fd);
603 				}
604 				if (pdp->pd_php == NULL) {
605 					pollhead_insert(php, pdp);
606 					pdp->pd_php = php;
607 					/*
608 					 * An event of interest may have
609 					 * arrived between the VOP_POLL() and
610 					 * the pollhead_insert(); check again.
611 					 */
612 					goto repoll;
613 				}
614 			}
615 		} else {
616 			/*
617 			 * No bit set in the range. Check for wrap around.
618 			 */
619 			if (!no_wrap) {
620 				start = 0;
621 				end = ostart - 1;
622 				no_wrap = B_TRUE;
623 			} else {
624 				done = B_TRUE;
625 			}
626 		}
627 	}
628 
629 	if (!done) {
630 		pcp->pc_mapstart = start;
631 	}
632 	ASSERT(*fdcntp == 0);
633 	*fdcntp = fdcnt;
634 	return (error);
635 }
636 
637 /*ARGSUSED*/
638 static int
639 dpopen(dev_t *devp, int flag, int otyp, cred_t *credp)
640 {
641 	minor_t		minordev;
642 	dp_entry_t	*dpep;
643 	pollcache_t	*pcp;
644 
645 	ASSERT(devpoll_init);
646 	ASSERT(dptblsize <= MAXMIN);
647 	mutex_enter(&devpoll_lock);
648 	for (minordev = 0; minordev < dptblsize; minordev++) {
649 		if (devpolltbl[minordev] == NULL) {
650 			devpolltbl[minordev] = (dp_entry_t *)RESERVED;
651 			break;
652 		}
653 	}
654 	if (minordev == dptblsize) {
655 		dp_entry_t	**newtbl;
656 		size_t		oldsize;
657 
658 		/*
659 		 * Used up every entry in the existing devpoll table.
660 		 * Grow the table by DEVPOLLSIZE.
661 		 */
662 		if ((oldsize = dptblsize) >= MAXMIN) {
663 			mutex_exit(&devpoll_lock);
664 			return (ENXIO);
665 		}
666 		dptblsize += DEVPOLLSIZE;
667 		if (dptblsize > MAXMIN) {
668 			dptblsize = MAXMIN;
669 		}
670 		newtbl = kmem_zalloc(sizeof (caddr_t) * dptblsize, KM_SLEEP);
671 		bcopy(devpolltbl, newtbl, sizeof (caddr_t) * oldsize);
672 		kmem_free(devpolltbl, sizeof (caddr_t) * oldsize);
673 		devpolltbl = newtbl;
674 		devpolltbl[minordev] = (dp_entry_t *)RESERVED;
675 	}
676 	mutex_exit(&devpoll_lock);
677 
678 	dpep = kmem_zalloc(sizeof (dp_entry_t), KM_SLEEP);
679 	/*
680 	 * allocate a pollcache skeleton here. Delay allocating bitmap
681 	 * structures until dpwrite() time, since we don't know the
682 	 * optimal size yet.  We also delay setting the pid until either
683 	 * dpwrite() or attempt to poll on the instance, allowing parents
684 	 * to create instances of /dev/poll for their children.  (In the
685 	 * epoll compatibility case, this check isn't performed to maintain
686 	 * semantic compatibility.)
687 	 */
688 	pcp = pcache_alloc();
689 	dpep->dpe_pcache = pcp;
690 	pcp->pc_pid = -1;
691 	*devp = makedevice(getmajor(*devp), minordev);  /* clone the driver */
692 	mutex_enter(&devpoll_lock);
693 	ASSERT(minordev < dptblsize);
694 	ASSERT(devpolltbl[minordev] == (dp_entry_t *)RESERVED);
695 	devpolltbl[minordev] = dpep;
696 	mutex_exit(&devpoll_lock);
697 	return (0);
698 }
699 
700 /*
701  * Write to dev/poll add/remove fd's to/from a cached poll fd set,
702  * or change poll events for a watched fd.
703  */
704 /*ARGSUSED*/
705 static int
706 dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
707 {
708 	minor_t		minor;
709 	dp_entry_t	*dpep;
710 	pollcache_t	*pcp;
711 	pollfd_t	*pollfdp, *pfdp;
712 	dvpoll_epollfd_t *epfdp;
713 	uintptr_t	limit;
714 	int		error;
715 	uint_t		size;
716 	size_t		copysize, uiosize;
717 	nfds_t		pollfdnum;
718 	boolean_t	is_epoll, fds_added = B_FALSE;
719 
720 	minor = getminor(dev);
721 
722 	mutex_enter(&devpoll_lock);
723 	ASSERT(minor < dptblsize);
724 	dpep = devpolltbl[minor];
725 	ASSERT(dpep != NULL);
726 	mutex_exit(&devpoll_lock);
727 
728 	mutex_enter(&dpep->dpe_lock);
729 	pcp = dpep->dpe_pcache;
730 	is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0;
731 	size = (is_epoll) ? sizeof (dvpoll_epollfd_t) : sizeof (pollfd_t);
732 	mutex_exit(&dpep->dpe_lock);
733 
734 	if (!is_epoll && curproc->p_pid != pcp->pc_pid) {
735 		if (pcp->pc_pid != -1) {
736 			return (EACCES);
737 		}
738 
739 		pcp->pc_pid = curproc->p_pid;
740 	}
741 
742 	if (uiop->uio_resid < 0) {
743 		/* No one else is this careful, but maybe they should be. */
744 		return (EINVAL);
745 	}
746 
747 	uiosize = (size_t)uiop->uio_resid;
748 	pollfdnum = uiosize / size;
749 
750 	/*
751 	 * For epoll-enabled handles, restrict the allowed write size to 2.
752 	 * This corresponds to an epoll_ctl(3C) performing an EPOLL_CTL_MOD
753 	 * operation which is expanded into two operations (DEL and ADD).
754 	 *
755 	 * All other operations performed through epoll_ctl(3C) will consist of
756 	 * a single entry.
757 	 */
758 	if (is_epoll && pollfdnum > 2) {
759 		return (EINVAL);
760 	}
761 
762 	/*
763 	 * We want to make sure that pollfdnum isn't large enough to DoS us,
764 	 * but we also don't want to grab p_lock unnecessarily -- so we
765 	 * perform the full check against our resource limits if and only if
766 	 * pollfdnum is larger than the known-to-be-sane value of UINT8_MAX.
767 	 */
768 	if (pollfdnum > UINT8_MAX) {
769 		mutex_enter(&curproc->p_lock);
770 		if (pollfdnum >
771 		    (uint_t)rctl_enforced_value(rctlproc_legacy[RLIMIT_NOFILE],
772 		    curproc->p_rctls, curproc)) {
773 			(void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
774 			    curproc->p_rctls, curproc, RCA_SAFE);
775 			mutex_exit(&curproc->p_lock);
776 			return (EINVAL);
777 		}
778 		mutex_exit(&curproc->p_lock);
779 	}
780 
781 	/*
782 	 * Copy in the pollfd array.  Walk through the array and add
783 	 * each polled fd to the cached set.
784 	 */
785 	pollfdp = kmem_alloc(uiosize, KM_SLEEP);
786 	limit = (uintptr_t)pollfdp + (pollfdnum * size);
787 
788 	/*
789 	 * Although /dev/poll uses the write(2) interface to cache fds, it's
790 	 * not supposed to function as a seekable device. To prevent offset
791 	 * from growing and eventually exceed the maximum, reset the offset
792 	 * here for every call.
793 	 */
794 	uiop->uio_loffset = 0;
795 
796 	/*
797 	 * Use uiocopy instead of uiomove when populating pollfdp, keeping
798 	 * uio_resid untouched for now.  Write syscalls will translate EINTR
799 	 * into a success if they detect "successfully transfered" data via an
800 	 * updated uio_resid.  Falsely suppressing such errors is disastrous.
801 	 */
802 	if ((error = uiocopy((caddr_t)pollfdp, uiosize, UIO_WRITE, uiop,
803 	    &copysize)) != 0) {
804 		kmem_free(pollfdp, uiosize);
805 		return (error);
806 	}
807 
808 	/*
809 	 * We are about to enter the core portion of dpwrite(). Make sure this
810 	 * write has exclusive access in this portion of the code, i.e., no
811 	 * other writers in this code.
812 	 *
813 	 * Waiting for all readers to drop their references to the dpe is
814 	 * unecessary since the pollcache itself is protected by pc_lock.
815 	 */
816 	mutex_enter(&dpep->dpe_lock);
817 	dpep->dpe_writerwait++;
818 	while ((dpep->dpe_flag & DP_WRITER_PRESENT) != 0) {
819 		ASSERT(dpep->dpe_refcnt != 0);
820 
821 		/*
822 		 * The epoll API does not allow EINTR as a result when making
823 		 * modifications to the set of polled fds.  Given that write
824 		 * activity is relatively quick and the size of accepted writes
825 		 * is limited above to two entries, a signal-ignorant wait is
826 		 * used here to avoid the EINTR.
827 		 */
828 		if (is_epoll) {
829 			cv_wait(&dpep->dpe_cv, &dpep->dpe_lock);
830 			continue;
831 		}
832 
833 		/*
834 		 * Non-epoll writers to /dev/poll handles can tolerate EINTR.
835 		 */
836 		if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) {
837 			dpep->dpe_writerwait--;
838 			mutex_exit(&dpep->dpe_lock);
839 			kmem_free(pollfdp, uiosize);
840 			return (EINTR);
841 		}
842 	}
843 	dpep->dpe_writerwait--;
844 	dpep->dpe_flag |= DP_WRITER_PRESENT;
845 	dpep->dpe_refcnt++;
846 
847 	if (!is_epoll && (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0) {
848 		/*
849 		 * The epoll compat mode was enabled while we were waiting to
850 		 * establish write access. It is not safe to continue since
851 		 * state was prepared for non-epoll operation.
852 		 */
853 		error = EBUSY;
854 		goto bypass;
855 	}
856 	mutex_exit(&dpep->dpe_lock);
857 
858 	/*
859 	 * Since the dpwrite() may recursively walk an added /dev/poll handle,
860 	 * pollstate_enter() deadlock and loop detection must be used.
861 	 */
862 	(void) pollstate_create();
863 	VERIFY(pollstate_enter(pcp) == PSE_SUCCESS);
864 
865 	if (pcp->pc_bitmap == NULL) {
866 		pcache_create(pcp, pollfdnum);
867 	}
868 	for (pfdp = pollfdp; (uintptr_t)pfdp < limit;
869 	    pfdp = (pollfd_t *)((uintptr_t)pfdp + size)) {
870 		int fd = pfdp->fd;
871 		polldat_t *pdp;
872 
873 		if ((uint_t)fd >= P_FINFO(curproc)->fi_nfiles) {
874 			/*
875 			 * epoll semantics demand that we return EBADF if our
876 			 * specified fd is invalid.
877 			 */
878 			if (is_epoll) {
879 				error = EBADF;
880 				break;
881 			}
882 
883 			continue;
884 		}
885 
886 		pdp = pcache_lookup_fd(pcp, fd);
887 		if (pfdp->events != POLLREMOVE) {
888 			uf_entry_gen_t gen;
889 			file_t *fp = NULL;
890 			struct pollhead *php = NULL;
891 
892 			/*
893 			 * If we're in epoll compatibility mode, check that the
894 			 * fd is valid before allocating anything for it; epoll
895 			 * semantics demand that we return EBADF if our
896 			 * specified fd is invalid.
897 			 */
898 			if (is_epoll) {
899 				if ((fp = getf_gen(fd, &gen)) == NULL) {
900 					error = EBADF;
901 					break;
902 				}
903 			}
904 			if (pdp == NULL) {
905 				pdp = pcache_alloc_fd(0);
906 				pdp->pd_fd = fd;
907 				pdp->pd_pcache = pcp;
908 				pcache_insert_fd(pcp, pdp, pollfdnum);
909 			}
910 
911 			if (is_epoll) {
912 				/*
913 				 * If the fd is already a member of the epoll
914 				 * set, error emission is needed only when the
915 				 * fd assignment generation matches the one
916 				 * recorded in the polldat_t.  Absence of such
917 				 * a generation match indicates that a new
918 				 * resource has been assigned at that fd.
919 				 *
920 				 * Caveat: It is possible to force a generation
921 				 * update while keeping the same backing
922 				 * resource.  This is possible via dup2, but
923 				 * does not represent real-world use cases,
924 				 * making the lack of error acceptable.
925 				 */
926 				if (pdp->pd_fp != NULL && pdp->pd_gen == gen) {
927 					error = EEXIST;
928 					releasef(fd);
929 					break;
930 				}
931 
932 				/*
933 				 * We have decided that the cached information
934 				 * was stale.  Clear pd_events to assure that
935 				 * we don't mistakenly operate on cached event
936 				 * disposition.
937 				 */
938 				pdp->pd_events = 0;
939 
940 				epfdp = (dvpoll_epollfd_t *)pfdp;
941 				pdp->pd_epolldata = epfdp->dpep_data;
942 
943 			}
944 
945 			ASSERT(pdp->pd_fd == fd);
946 			ASSERT(pdp->pd_pcache == pcp);
947 			if (fd >= pcp->pc_mapsize) {
948 				mutex_exit(&pcp->pc_lock);
949 				pcache_grow_map(pcp, fd);
950 				mutex_enter(&pcp->pc_lock);
951 			}
952 			if (fd > pcp->pc_mapend) {
953 				pcp->pc_mapend = fd;
954 			}
955 
956 			if (!is_epoll) {
957 				ASSERT(fp == NULL);
958 
959 				if ((fp = getf_gen(fd, &gen)) == NULL) {
960 					/*
961 					 * The fd is not valid. Since we can't
962 					 * pass this error back in the write()
963 					 * call, set the bit in bitmap to force
964 					 * DP_POLL ioctl to examine it.
965 					 */
966 					BT_SET(pcp->pc_bitmap, fd);
967 					pdp->pd_events |= pfdp->events;
968 					continue;
969 				}
970 				/*
971 				 * Don't do VOP_POLL for an already cached fd
972 				 * with same poll events.
973 				 */
974 				if ((pdp->pd_events == pfdp->events) &&
975 				    (pdp->pd_fp == fp)) {
976 					/*
977 					 * the events are already cached
978 					 */
979 					releasef(fd);
980 					continue;
981 				}
982 			}
983 
984 
985 			/*
986 			 * do VOP_POLL and cache this poll fd.
987 			 */
988 			/*
989 			 * XXX - pollrelock() logic needs to know which
990 			 * which pollcache lock to grab. It'd be a
991 			 * cleaner solution if we could pass pcp as
992 			 * an arguement in VOP_POLL interface instead
993 			 * of implicitly passing it using thread_t
994 			 * struct. On the other hand, changing VOP_POLL
995 			 * interface will require all driver/file system
996 			 * poll routine to change. May want to revisit
997 			 * the tradeoff later.
998 			 */
999 			curthread->t_pollcache = pcp;
1000 			error = VOP_POLL(fp->f_vnode, pfdp->events, 0,
1001 			    &pfdp->revents, &php, NULL);
1002 
1003 			/*
1004 			 * Edge-triggered polling requires a pollhead in order
1005 			 * to initiate wake-ups properly.  Drivers which are
1006 			 * savvy to POLLET presence, which should include
1007 			 * everything in-gate, will always emit one, regardless
1008 			 * of revent status.  Older drivers which only emit a
1009 			 * pollhead if 'revents == 0' are given a second chance
1010 			 * here via a second VOP_POLL, with only POLLET set in
1011 			 * the events of interest.  These circumstances should
1012 			 * induce any cacheable drivers to emit a pollhead for
1013 			 * wake-ups.
1014 			 *
1015 			 * Drivers which never emit a pollhead will simply
1016 			 * disobey the exectation of edge-triggered behavior.
1017 			 * This includes recursive epoll which, even on Linux,
1018 			 * yields its events in a level-triggered fashion only.
1019 			 */
1020 			if ((pdp->pd_events & POLLET) && error == 0 &&
1021 			    php == NULL) {
1022 				short levent = 0;
1023 
1024 				error = VOP_POLL(fp->f_vnode, POLLET, 0,
1025 				    &levent, &php, NULL);
1026 			}
1027 
1028 			curthread->t_pollcache = NULL;
1029 			/*
1030 			 * We always set the bit when this fd is cached;
1031 			 * this forces the first DP_POLL to poll this fd.
1032 			 * Real performance gain comes from subsequent
1033 			 * DP_POLL.  We also attempt a pollhead_insert();
1034 			 * if it's not possible, we'll do it in dpioctl().
1035 			 */
1036 			BT_SET(pcp->pc_bitmap, fd);
1037 			if (error != 0) {
1038 				releasef(fd);
1039 				break;
1040 			}
1041 			pdp->pd_fp = fp;
1042 			pdp->pd_gen = gen;
1043 			pdp->pd_events |= pfdp->events;
1044 			if (php != NULL) {
1045 				if (pdp->pd_php == NULL) {
1046 					pollhead_insert(php, pdp);
1047 					pdp->pd_php = php;
1048 				} else {
1049 					if (pdp->pd_php != php) {
1050 						pollhead_delete(pdp->pd_php,
1051 						    pdp);
1052 						pollhead_insert(php, pdp);
1053 						pdp->pd_php = php;
1054 					}
1055 				}
1056 			}
1057 			fds_added = B_TRUE;
1058 			releasef(fd);
1059 		} else {
1060 			if (pdp == NULL || pdp->pd_fp == NULL) {
1061 				if (is_epoll) {
1062 					/*
1063 					 * As with the add case (above), epoll
1064 					 * semantics demand that we error out
1065 					 * in this case.
1066 					 */
1067 					error = ENOENT;
1068 					break;
1069 				}
1070 
1071 				continue;
1072 			}
1073 			ASSERT(pdp->pd_fd == fd);
1074 			pdp->pd_fp = NULL;
1075 			pdp->pd_events = 0;
1076 			ASSERT(pdp->pd_thread == NULL);
1077 			if (pdp->pd_php != NULL) {
1078 				pollhead_delete(pdp->pd_php, pdp);
1079 				pdp->pd_php = NULL;
1080 			}
1081 			BT_CLEAR(pcp->pc_bitmap, fd);
1082 		}
1083 	}
1084 	/*
1085 	 * Wake any pollcache waiters so they can check the new descriptors.
1086 	 *
	 * Any fds added to a recursive-capable pollcache could themselves be
1088 	 * /dev/poll handles. To ensure that proper event propagation occurs,
1089 	 * parent pollcaches are woken too, so that they can create any needed
1090 	 * pollcache links.
1091 	 */
1092 	if (fds_added) {
1093 		cv_broadcast(&pcp->pc_cv);
1094 		pcache_wake_parents(pcp);
1095 	}
1096 	pollstate_exit(pcp);
1097 	mutex_enter(&dpep->dpe_lock);
1098 bypass:
1099 	dpep->dpe_flag &= ~DP_WRITER_PRESENT;
1100 	dpep->dpe_refcnt--;
1101 	cv_broadcast(&dpep->dpe_cv);
1102 	mutex_exit(&dpep->dpe_lock);
1103 	kmem_free(pollfdp, uiosize);
1104 	if (error == 0) {
1105 		/*
1106 		 * The state of uio_resid is updated only after the pollcache
1107 		 * is successfully modified.
1108 		 */
1109 		uioskip(uiop, copysize);
1110 	}
1111 	return (error);
1112 }
1113 
/*
 * Undo the temporary signal mask installed by DP_PPOLL.
 *
 * Does nothing when ksetp is NULL (no temporary mask was installed).
 * Otherwise, while holding p->p_lock, and only if lwp->lwp_cursig is 0, the
 * mask saved in lwp->lwp_sigoldmask is copied back into t->t_hold and
 * T_TOMASK is cleared from t->t_flag.  When lwp_cursig is non-zero the mask
 * and flag are left untouched (presumably so that signal delivery can restore
 * the mask itself via T_TOMASK -- confirm against the signal code).
 *
 * NOTE: this macro is not self-contained; it expects the locals 'p' (proc),
 * 'lwp' (klwp_t *) and 't' (kthread_t *) to be in scope where it is expanded.
 */
#define	DP_SIGMASK_RESTORE(ksetp) {					\
	if ((ksetp) != NULL) {						\
		mutex_enter(&p->p_lock);				\
		if (lwp->lwp_cursig == 0) {				\
			t->t_hold = lwp->lwp_sigoldmask;		\
			t->t_flag &= ~T_TOMASK;				\
		}							\
		mutex_exit(&p->p_lock);					\
	}								\
}
1124 
1125 /*ARGSUSED*/
1126 static int
1127 dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
1128 {
1129 	minor_t		minor;
1130 	dp_entry_t	*dpep;
1131 	pollcache_t	*pcp;
1132 	hrtime_t	now;
1133 	int		error = 0;
1134 	boolean_t	is_epoll;
1135 	STRUCT_DECL(dvpoll, dvpoll);
1136 
1137 	if (cmd == DP_POLL || cmd == DP_PPOLL) {
1138 		/* do this now, before we sleep on DP_WRITER_PRESENT */
1139 		now = gethrtime();
1140 	}
1141 
1142 	minor = getminor(dev);
1143 	mutex_enter(&devpoll_lock);
1144 	ASSERT(minor < dptblsize);
1145 	dpep = devpolltbl[minor];
1146 	mutex_exit(&devpoll_lock);
1147 	ASSERT(dpep != NULL);
1148 	pcp = dpep->dpe_pcache;
1149 
1150 	mutex_enter(&dpep->dpe_lock);
1151 	is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0;
1152 
1153 	if (cmd == DP_EPOLLCOMPAT) {
1154 		if (dpep->dpe_refcnt != 0) {
1155 			/*
1156 			 * We can't turn on epoll compatibility while there
1157 			 * are outstanding operations.
1158 			 */
1159 			mutex_exit(&dpep->dpe_lock);
1160 			return (EBUSY);
1161 		}
1162 
1163 		/*
1164 		 * epoll compatibility is a one-way street: there's no way
1165 		 * to turn it off for a particular open.
1166 		 */
1167 		dpep->dpe_flag |= DP_ISEPOLLCOMPAT;
1168 
1169 		/* Record the epoll-enabled nature in the pollcache too */
1170 		mutex_enter(&pcp->pc_lock);
1171 		pcp->pc_flag |= PC_EPOLL;
1172 		mutex_exit(&pcp->pc_lock);
1173 
1174 		mutex_exit(&dpep->dpe_lock);
1175 		return (0);
1176 	}
1177 
1178 	if (!is_epoll && curproc->p_pid != pcp->pc_pid) {
1179 		if (pcp->pc_pid != -1) {
1180 			mutex_exit(&dpep->dpe_lock);
1181 			return (EACCES);
1182 		}
1183 
1184 		pcp->pc_pid = curproc->p_pid;
1185 	}
1186 
1187 	/* Wait until all writers have cleared the handle before continuing */
1188 	while ((dpep->dpe_flag & DP_WRITER_PRESENT) != 0 ||
1189 	    (dpep->dpe_writerwait != 0)) {
1190 		if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) {
1191 			mutex_exit(&dpep->dpe_lock);
1192 			return (EINTR);
1193 		}
1194 	}
1195 	dpep->dpe_refcnt++;
1196 	mutex_exit(&dpep->dpe_lock);
1197 
1198 	switch (cmd) {
1199 	case	DP_POLL:
1200 	case	DP_PPOLL:
1201 	{
1202 		pollstate_t	*ps;
1203 		nfds_t		nfds;
1204 		int		fdcnt = 0;
1205 		size_t		size, fdsize, dpsize;
1206 		hrtime_t	deadline = 0;
1207 		k_sigset_t	*ksetp = NULL;
1208 		k_sigset_t	kset;
1209 		sigset_t	set;
1210 		kthread_t	*t = curthread;
1211 		klwp_t		*lwp = ttolwp(t);
1212 		struct proc	*p = ttoproc(curthread);
1213 
1214 		STRUCT_INIT(dvpoll, mode);
1215 
1216 		/*
1217 		 * The dp_setp member is only required/consumed for DP_PPOLL,
1218 		 * which otherwise uses the same structure as DP_POLL.
1219 		 */
1220 		if (cmd == DP_POLL) {
1221 			dpsize = (uintptr_t)STRUCT_FADDR(dvpoll, dp_setp) -
1222 			    (uintptr_t)STRUCT_FADDR(dvpoll, dp_fds);
1223 		} else {
1224 			ASSERT(cmd == DP_PPOLL);
1225 			dpsize = STRUCT_SIZE(dvpoll);
1226 		}
1227 
1228 		if ((mode & FKIOCTL) != 0) {
1229 			/* Kernel-internal ioctl call */
1230 			bcopy((caddr_t)arg, STRUCT_BUF(dvpoll), dpsize);
1231 			error = 0;
1232 		} else {
1233 			error = copyin((caddr_t)arg, STRUCT_BUF(dvpoll),
1234 			    dpsize);
1235 		}
1236 
1237 		if (error) {
1238 			DP_REFRELE(dpep);
1239 			return (EFAULT);
1240 		}
1241 
1242 		deadline = STRUCT_FGET(dvpoll, dp_timeout);
1243 		if (deadline > 0) {
1244 			/*
1245 			 * Convert the deadline from relative milliseconds
1246 			 * to absolute nanoseconds.  They must wait for at
1247 			 * least a tick.
1248 			 */
1249 			deadline = MSEC2NSEC(deadline);
1250 			deadline = MAX(deadline, nsec_per_tick);
1251 			deadline += now;
1252 		}
1253 
1254 		if (cmd == DP_PPOLL) {
1255 			void *setp = STRUCT_FGETP(dvpoll, dp_setp);
1256 
1257 			if (setp != NULL) {
1258 				if ((mode & FKIOCTL) != 0) {
1259 					/* Use the signal set directly */
1260 					ksetp = (k_sigset_t *)setp;
1261 				} else {
1262 					if (copyin(setp, &set, sizeof (set))) {
1263 						DP_REFRELE(dpep);
1264 						return (EFAULT);
1265 					}
1266 					sigutok(&set, &kset);
1267 					ksetp = &kset;
1268 				}
1269 
1270 				mutex_enter(&p->p_lock);
1271 				schedctl_finish_sigblock(t);
1272 				lwp->lwp_sigoldmask = t->t_hold;
1273 				t->t_hold = *ksetp;
1274 				t->t_flag |= T_TOMASK;
1275 
1276 				/*
1277 				 * Like ppoll() with a non-NULL sigset, we'll
1278 				 * call cv_reltimedwait_sig() just to check for
1279 				 * signals.  This call will return immediately
1280 				 * with either 0 (signalled) or -1 (no signal).
1281 				 * There are some conditions whereby we can
1282 				 * get 0 from cv_reltimedwait_sig() without
1283 				 * a true signal (e.g., a directed stop), so
1284 				 * we restore our signal mask in the unlikely
1285 				 * event that lwp_cursig is 0.
1286 				 */
1287 				if (!cv_reltimedwait_sig(&t->t_delay_cv,
1288 				    &p->p_lock, 0, TR_CLOCK_TICK)) {
1289 					if (lwp->lwp_cursig == 0) {
1290 						t->t_hold = lwp->lwp_sigoldmask;
1291 						t->t_flag &= ~T_TOMASK;
1292 					}
1293 
1294 					mutex_exit(&p->p_lock);
1295 
1296 					DP_REFRELE(dpep);
1297 					return (EINTR);
1298 				}
1299 
1300 				mutex_exit(&p->p_lock);
1301 			}
1302 		}
1303 
1304 		if ((nfds = STRUCT_FGET(dvpoll, dp_nfds)) == 0) {
1305 			/*
1306 			 * We are just using DP_POLL to sleep, so
1307 			 * we don't any of the devpoll apparatus.
1308 			 * Do not check for signals if we have a zero timeout.
1309 			 */
1310 			DP_REFRELE(dpep);
1311 			if (deadline == 0) {
1312 				DP_SIGMASK_RESTORE(ksetp);
1313 				return (0);
1314 			}
1315 
1316 			mutex_enter(&curthread->t_delay_lock);
1317 			while ((error =
1318 			    cv_timedwait_sig_hrtime(&curthread->t_delay_cv,
1319 			    &curthread->t_delay_lock, deadline)) > 0)
1320 				continue;
1321 			mutex_exit(&curthread->t_delay_lock);
1322 
1323 			DP_SIGMASK_RESTORE(ksetp);
1324 
1325 			return (error == 0 ? EINTR : 0);
1326 		}
1327 
1328 		if (is_epoll) {
1329 			size = nfds * (fdsize = sizeof (epoll_event_t));
1330 		} else {
1331 			size = nfds * (fdsize = sizeof (pollfd_t));
1332 		}
1333 
1334 		/*
1335 		 * XXX It would be nice not to have to alloc each time, but it
1336 		 * requires another per thread structure hook. This can be
1337 		 * implemented later if data suggests that it's necessary.
1338 		 */
1339 		ps = pollstate_create();
1340 
1341 		if (ps->ps_dpbufsize < size) {
1342 			/*
1343 			 * If nfds is larger than twice the current maximum
1344 			 * open file count, we'll silently clamp it.  This
1345 			 * only limits our exposure to allocating an
1346 			 * inordinate amount of kernel memory; it doesn't
1347 			 * otherwise affect the semantics.  (We have this
1348 			 * check at twice the maximum instead of merely the
1349 			 * maximum because some applications pass an nfds that
1350 			 * is only slightly larger than their limit.)
1351 			 */
1352 			mutex_enter(&p->p_lock);
1353 			if ((nfds >> 1) > p->p_fno_ctl) {
1354 				nfds = p->p_fno_ctl;
1355 				size = nfds * fdsize;
1356 			}
1357 			mutex_exit(&p->p_lock);
1358 
1359 			if (ps->ps_dpbufsize < size) {
1360 				kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize);
1361 				ps->ps_dpbuf = kmem_zalloc(size, KM_SLEEP);
1362 				ps->ps_dpbufsize = size;
1363 			}
1364 		}
1365 
1366 		VERIFY(pollstate_enter(pcp) == PSE_SUCCESS);
1367 		for (;;) {
1368 			pcp->pc_flag &= ~PC_POLLWAKE;
1369 
1370 			/*
1371 			 * Mark all child pcachelinks as stale.
1372 			 * Those which are still part of the tree will be
1373 			 * marked as valid during the poll.
1374 			 */
1375 			pcachelink_mark_stale(pcp);
1376 
1377 			error = dp_pcache_poll(dpep, ps->ps_dpbuf,
1378 			    pcp, nfds, &fdcnt);
1379 			if (fdcnt > 0 || error != 0)
1380 				break;
1381 
1382 			/* Purge still-stale child pcachelinks */
1383 			pcachelink_purge_stale(pcp);
1384 
1385 			/*
1386 			 * A pollwake has happened since we polled cache.
1387 			 */
1388 			if (pcp->pc_flag & PC_POLLWAKE)
1389 				continue;
1390 
1391 			/*
1392 			 * Sleep until we are notified, signaled, or timed out.
1393 			 */
1394 			if (deadline == 0) {
1395 				/* immediate timeout; do not check signals */
1396 				break;
1397 			}
1398 
1399 			error = cv_timedwait_sig_hrtime(&pcp->pc_cv,
1400 			    &pcp->pc_lock, deadline);
1401 
1402 			/*
1403 			 * If we were awakened by a signal or timeout then
1404 			 * break the loop, else poll again.
1405 			 */
1406 			if (error <= 0) {
1407 				error = (error == 0) ? EINTR : 0;
1408 				break;
1409 			} else {
1410 				error = 0;
1411 			}
1412 		}
1413 		pollstate_exit(pcp);
1414 
1415 		DP_SIGMASK_RESTORE(ksetp);
1416 
1417 		if (error == 0 && fdcnt > 0) {
1418 			/*
1419 			 * It should be noted that FKIOCTL does not influence
1420 			 * the copyout (vs bcopy) of dp_fds at this time.
1421 			 */
1422 			if (copyout(ps->ps_dpbuf,
1423 			    STRUCT_FGETP(dvpoll, dp_fds), fdcnt * fdsize)) {
1424 				DP_REFRELE(dpep);
1425 				return (EFAULT);
1426 			}
1427 			*rvalp = fdcnt;
1428 		}
1429 		break;
1430 	}
1431 
1432 	case	DP_ISPOLLED:
1433 	{
1434 		pollfd_t	pollfd;
1435 		polldat_t	*pdp;
1436 
1437 		STRUCT_INIT(dvpoll, mode);
1438 		error = copyin((caddr_t)arg, &pollfd, sizeof (pollfd_t));
1439 		if (error) {
1440 			DP_REFRELE(dpep);
1441 			return (EFAULT);
1442 		}
1443 		mutex_enter(&pcp->pc_lock);
1444 		if (pcp->pc_hash == NULL) {
1445 			/*
1446 			 * No Need to search because no poll fd
1447 			 * has been cached.
1448 			 */
1449 			mutex_exit(&pcp->pc_lock);
1450 			DP_REFRELE(dpep);
1451 			return (0);
1452 		}
1453 		if (pollfd.fd < 0) {
1454 			mutex_exit(&pcp->pc_lock);
1455 			break;
1456 		}
1457 		pdp = pcache_lookup_fd(pcp, pollfd.fd);
1458 		if ((pdp != NULL) && (pdp->pd_fd == pollfd.fd) &&
1459 		    (pdp->pd_fp != NULL)) {
1460 			pollfd.revents = pdp->pd_events;
1461 			if (copyout(&pollfd, (caddr_t)arg, sizeof (pollfd_t))) {
1462 				mutex_exit(&pcp->pc_lock);
1463 				DP_REFRELE(dpep);
1464 				return (EFAULT);
1465 			}
1466 			*rvalp = 1;
1467 		}
1468 		mutex_exit(&pcp->pc_lock);
1469 		break;
1470 	}
1471 
1472 	default:
1473 		DP_REFRELE(dpep);
1474 		return (EINVAL);
1475 	}
1476 	DP_REFRELE(dpep);
1477 	return (error);
1478 }
1479 
1480 /*
1481  * Overview of Recursive Polling
1482  *
1483  * It is possible for /dev/poll to poll for events on file descriptors which
1484  * themselves are /dev/poll handles.  Pending events in the child handle are
1485  * represented as readable data via the POLLIN flag.  To limit surface area,
1486  * this recursion is presently allowed on only /dev/poll handles which have
1487  * been placed in epoll mode via the DP_EPOLLCOMPAT ioctl.  Recursion depth is
1488  * limited to 5 in order to be consistent with Linux epoll.
1489  *
1490  * Extending dppoll() for VOP_POLL:
1491  *
1492  * The recursive /dev/poll implementation begins by extending dppoll() to
1493  * report when resources contained in the pollcache have relevant event state.
1494  * At the highest level, it means calling dp_pcache_poll() so it indicates if
1495  * fd events are present without consuming them or altering the pollcache
1496  * bitmap.  This ensures that a subsequent DP_POLL operation on the bitmap will
1497  * yield the initiating event.  Additionally, the VOP_POLL should return in
1498  * such a way that dp_pcache_poll() does not clear the parent bitmap entry
1499  * which corresponds to the child /dev/poll fd.  This means that child
1500  * pollcaches will be checked during every poll which facilitates wake-up
1501  * behavior detailed below.
1502  *
1503  * Pollcache Links and Wake Events:
1504  *
1505  * Recursive /dev/poll avoids complicated pollcache locking constraints during
1506  * pollwakeup events by eschewing the traditional pollhead mechanism in favor
1507  * of a different approach.  For each pollcache at the root of a recursive
1508  * /dev/poll "tree", pcachelink_t structures are established to all child
1509  * /dev/poll pollcaches.  During pollnotify() in a child pollcache, the
1510  * linked list of pcachelink_t entries is walked, where those marked as valid
1511  * incur a cv_broadcast to their parent pollcache.  Most notably, these
1512  * pcachelink_t cv wakeups are performed without acquiring pc_lock on the
1513  * parent pollcache (which would require careful deadlock avoidance).  This
1514  * still allows the woken poll on the parent to discover the pertinent events
 * due to the fact that bitmap entries for the child pollcache are always
1516  * maintained by the dppoll() logic above.
1517  *
1518  * Depth Limiting and Loop Prevention:
1519  *
1520  * As each pollcache is encountered (either via DP_POLL or dppoll()), depth and
1521  * loop constraints are enforced via pollstate_enter().  The pollcache_t
1522  * pointer is compared against any existing entries in ps_pc_stack and is added
1523  * to the end if no match (and therefore loop) is found.  Once poll operations
1524  * for a given pollcache_t are complete, pollstate_exit() clears the pointer
1525  * from the list.  The pollstate_enter() and pollstate_exit() functions are
1526  * responsible for acquiring and releasing pc_lock, respectively.
1527  *
1528  * Deadlock Safety:
1529  *
1530  * Descending through a tree of recursive /dev/poll handles involves the tricky
1531  * business of sequentially entering multiple pollcache locks.  This tree
1532  * topology cannot define a lock acquisition order in such a way that it is
1533  * immune to deadlocks between threads.  The pollstate_enter() and
1534  * pollstate_exit() functions provide an interface for recursive /dev/poll
1535  * operations to safely lock pollcaches while failing gracefully in the face of
1536  * deadlocking topologies. (See pollstate_contend() for more detail about how
1537  * deadlocks are detected and resolved.)
1538  */
1539 
1540 /*ARGSUSED*/
1541 static int
1542 dppoll(dev_t dev, short events, int anyyet, short *reventsp,
1543     struct pollhead **phpp)
1544 {
1545 	minor_t		minor;
1546 	dp_entry_t	*dpep;
1547 	pollcache_t	*pcp;
1548 	int		res, rc = 0;
1549 
1550 	minor = getminor(dev);
1551 	mutex_enter(&devpoll_lock);
1552 	ASSERT(minor < dptblsize);
1553 	dpep = devpolltbl[minor];
1554 	ASSERT(dpep != NULL);
1555 	mutex_exit(&devpoll_lock);
1556 
1557 	mutex_enter(&dpep->dpe_lock);
1558 	if ((dpep->dpe_flag & DP_ISEPOLLCOMPAT) == 0) {
1559 		/* Poll recursion is not yet supported for non-epoll handles */
1560 		*reventsp = POLLERR;
1561 		mutex_exit(&dpep->dpe_lock);
1562 		return (0);
1563 	} else {
1564 		dpep->dpe_refcnt++;
1565 		pcp = dpep->dpe_pcache;
1566 		mutex_exit(&dpep->dpe_lock);
1567 	}
1568 
1569 	res = pollstate_enter(pcp);
1570 	if (res == PSE_SUCCESS) {
1571 		nfds_t		nfds = 1;
1572 		int		fdcnt = 0;
1573 		pollstate_t	*ps = curthread->t_pollstate;
1574 
1575 		/*
1576 		 * Recursive polling will only emit certain events.  Skip a
1577 		 * scan of the pollcache if those events are not of interest.
1578 		 */
1579 		if (events & (POLLIN|POLLRDNORM)) {
1580 			rc = dp_pcache_poll(dpep, NULL, pcp, nfds, &fdcnt);
1581 		} else {
1582 			rc = 0;
1583 			fdcnt = 0;
1584 		}
1585 
1586 		if (rc == 0 && fdcnt > 0) {
1587 			*reventsp = POLLIN|POLLRDNORM;
1588 		} else {
1589 			*reventsp = 0;
1590 		}
1591 		pcachelink_assoc(pcp, ps->ps_pc_stack[0]);
1592 		pollstate_exit(pcp);
1593 	} else {
1594 		switch (res) {
1595 		case PSE_FAIL_DEPTH:
1596 			rc = EINVAL;
1597 			break;
1598 		case PSE_FAIL_LOOP:
1599 		case PSE_FAIL_DEADLOCK:
1600 			rc = ELOOP;
1601 			break;
1602 		default:
1603 			/*
1604 			 * If anything else has gone awry, such as being polled
1605 			 * from an unexpected context, fall back to the
1606 			 * recursion-intolerant response.
1607 			 */
1608 			*reventsp = POLLERR;
1609 			rc = 0;
1610 			break;
1611 		}
1612 	}
1613 
1614 	DP_REFRELE(dpep);
1615 	return (rc);
1616 }
1617 
1618 /*
1619  * devpoll close should do enough clean up before the pollcache is deleted,
1620  * i.e., it should ensure no one still references the pollcache later.
1621  * There is no "permission" check in here. Any process having the last
1622  * reference of this /dev/poll fd can close.
1623  */
1624 /*ARGSUSED*/
1625 static int
1626 dpclose(dev_t dev, int flag, int otyp, cred_t *credp)
1627 {
1628 	minor_t		minor;
1629 	dp_entry_t	*dpep;
1630 	pollcache_t	*pcp;
1631 	int		i;
1632 	polldat_t	**hashtbl;
1633 	polldat_t	*pdp;
1634 
1635 	minor = getminor(dev);
1636 
1637 	mutex_enter(&devpoll_lock);
1638 	dpep = devpolltbl[minor];
1639 	ASSERT(dpep != NULL);
1640 	devpolltbl[minor] = NULL;
1641 	mutex_exit(&devpoll_lock);
1642 	pcp = dpep->dpe_pcache;
1643 	ASSERT(pcp != NULL);
1644 	/*
1645 	 * At this point, no other lwp can access this pollcache via the
1646 	 * /dev/poll fd. This pollcache is going away, so do the clean
1647 	 * up without the pc_lock.
1648 	 */
1649 	hashtbl = pcp->pc_hash;
1650 	for (i = 0; i < pcp->pc_hashsize; i++) {
1651 		for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) {
1652 			if (pdp->pd_php != NULL) {
1653 				pollhead_delete(pdp->pd_php, pdp);
1654 				pdp->pd_php = NULL;
1655 				pdp->pd_fp = NULL;
1656 			}
1657 		}
1658 	}
1659 	/*
1660 	 * pollwakeup() may still interact with this pollcache. Wait until
1661 	 * it is done.
1662 	 */
1663 	mutex_enter(&pcp->pc_no_exit);
1664 	ASSERT(pcp->pc_busy >= 0);
1665 	while (pcp->pc_busy > 0)
1666 		cv_wait(&pcp->pc_busy_cv, &pcp->pc_no_exit);
1667 	mutex_exit(&pcp->pc_no_exit);
1668 
1669 	/* Clean up any pollcache links created via recursive /dev/poll */
1670 	if (pcp->pc_parents != NULL || pcp->pc_children != NULL) {
1671 		/*
1672 		 * Because of the locking rules for pcachelink manipulation,
1673 		 * acquring pc_lock is required for this step.
1674 		 */
1675 		mutex_enter(&pcp->pc_lock);
1676 		pcachelink_purge_all(pcp);
1677 		mutex_exit(&pcp->pc_lock);
1678 	}
1679 
1680 	pcache_destroy(pcp);
1681 	ASSERT(dpep->dpe_refcnt == 0);
1682 	kmem_free(dpep, sizeof (dp_entry_t));
1683 	return (0);
1684 }
1685 
1686 static void
1687 pcachelink_locked_rele(pcachelink_t *pl)
1688 {
1689 	ASSERT(MUTEX_HELD(&pl->pcl_lock));
1690 	VERIFY(pl->pcl_refcnt >= 1);
1691 
1692 	pl->pcl_refcnt--;
1693 	if (pl->pcl_refcnt == 0) {
1694 		VERIFY(pl->pcl_state == PCL_INVALID);
1695 		ASSERT(pl->pcl_parent_pc == NULL);
1696 		ASSERT(pl->pcl_child_pc == NULL);
1697 		ASSERT(pl->pcl_parent_next == NULL);
1698 		ASSERT(pl->pcl_child_next == NULL);
1699 
1700 		pl->pcl_state = PCL_FREE;
1701 		mutex_destroy(&pl->pcl_lock);
1702 		kmem_free(pl, sizeof (pcachelink_t));
1703 	} else {
1704 		mutex_exit(&pl->pcl_lock);
1705 	}
1706 }
1707 
1708 /*
1709  * Associate parent and child pollcaches via a pcachelink_t.  If an existing
1710  * link (stale or valid) between the two is found, it will be reused.  If a
1711  * suitable link is not found for reuse, a new one will be allocated.
1712  */
1713 static void
1714 pcachelink_assoc(pollcache_t *child, pollcache_t *parent)
1715 {
1716 	pcachelink_t	*pl, **plpn;
1717 
1718 	ASSERT(MUTEX_HELD(&child->pc_lock));
1719 	ASSERT(MUTEX_HELD(&parent->pc_lock));
1720 
1721 	/* Search for an existing link we can reuse. */
1722 	plpn = &child->pc_parents;
1723 	for (pl = child->pc_parents; pl != NULL; pl = *plpn) {
1724 		mutex_enter(&pl->pcl_lock);
1725 		if (pl->pcl_state == PCL_INVALID) {
1726 			/* Clean any invalid links while walking the list */
1727 			*plpn = pl->pcl_parent_next;
1728 			pl->pcl_child_pc = NULL;
1729 			pl->pcl_parent_next = NULL;
1730 			pcachelink_locked_rele(pl);
1731 		} else if (pl->pcl_parent_pc == parent) {
1732 			/* Successfully found parent link */
1733 			ASSERT(pl->pcl_state == PCL_VALID ||
1734 			    pl->pcl_state == PCL_STALE);
1735 			pl->pcl_state = PCL_VALID;
1736 			mutex_exit(&pl->pcl_lock);
1737 			return;
1738 		} else {
1739 			plpn = &pl->pcl_parent_next;
1740 			mutex_exit(&pl->pcl_lock);
1741 		}
1742 	}
1743 
1744 	/* No existing link to the parent was found.  Create a fresh one. */
1745 	pl = kmem_zalloc(sizeof (pcachelink_t), KM_SLEEP);
1746 	mutex_init(&pl->pcl_lock,  NULL, MUTEX_DEFAULT, NULL);
1747 
1748 	pl->pcl_parent_pc = parent;
1749 	pl->pcl_child_next = parent->pc_children;
1750 	parent->pc_children = pl;
1751 	pl->pcl_refcnt++;
1752 
1753 	pl->pcl_child_pc = child;
1754 	pl->pcl_parent_next = child->pc_parents;
1755 	child->pc_parents = pl;
1756 	pl->pcl_refcnt++;
1757 
1758 	pl->pcl_state = PCL_VALID;
1759 }
1760 
1761 /*
1762  * Mark all child links in a pollcache as stale.  Any invalid child links found
1763  * during iteration are purged.
1764  */
1765 static void
1766 pcachelink_mark_stale(pollcache_t *pcp)
1767 {
1768 	pcachelink_t	*pl, **plpn;
1769 
1770 	ASSERT(MUTEX_HELD(&pcp->pc_lock));
1771 
1772 	plpn = &pcp->pc_children;
1773 	for (pl = pcp->pc_children; pl != NULL; pl = *plpn) {
1774 		mutex_enter(&pl->pcl_lock);
1775 		if (pl->pcl_state == PCL_INVALID) {
1776 			/*
1777 			 * Remove any invalid links while we are going to the
1778 			 * trouble of walking the list.
1779 			 */
1780 			*plpn = pl->pcl_child_next;
1781 			pl->pcl_parent_pc = NULL;
1782 			pl->pcl_child_next = NULL;
1783 			pcachelink_locked_rele(pl);
1784 		} else {
1785 			pl->pcl_state = PCL_STALE;
1786 			plpn = &pl->pcl_child_next;
1787 			mutex_exit(&pl->pcl_lock);
1788 		}
1789 	}
1790 }
1791 
1792 /*
1793  * Purge all stale (or invalid) child links from a pollcache.
1794  */
1795 static void
1796 pcachelink_purge_stale(pollcache_t *pcp)
1797 {
1798 	pcachelink_t	*pl, **plpn;
1799 
1800 	ASSERT(MUTEX_HELD(&pcp->pc_lock));
1801 
1802 	plpn = &pcp->pc_children;
1803 	for (pl = pcp->pc_children; pl != NULL; pl = *plpn) {
1804 		mutex_enter(&pl->pcl_lock);
1805 		switch (pl->pcl_state) {
1806 		case PCL_STALE:
1807 			pl->pcl_state = PCL_INVALID;
1808 			/* FALLTHROUGH */
1809 		case PCL_INVALID:
1810 			*plpn = pl->pcl_child_next;
1811 			pl->pcl_parent_pc = NULL;
1812 			pl->pcl_child_next = NULL;
1813 			pcachelink_locked_rele(pl);
1814 			break;
1815 		default:
1816 			plpn = &pl->pcl_child_next;
1817 			mutex_exit(&pl->pcl_lock);
1818 		}
1819 	}
1820 }
1821 
1822 /*
1823  * Purge all child and parent links from a pollcache, regardless of status.
1824  */
1825 static void
1826 pcachelink_purge_all(pollcache_t *pcp)
1827 {
1828 	pcachelink_t	*pl, **plpn;
1829 
1830 	ASSERT(MUTEX_HELD(&pcp->pc_lock));
1831 
1832 	plpn = &pcp->pc_parents;
1833 	for (pl = pcp->pc_parents; pl != NULL; pl = *plpn) {
1834 		mutex_enter(&pl->pcl_lock);
1835 		pl->pcl_state = PCL_INVALID;
1836 		*plpn = pl->pcl_parent_next;
1837 		pl->pcl_child_pc = NULL;
1838 		pl->pcl_parent_next = NULL;
1839 		pcachelink_locked_rele(pl);
1840 	}
1841 
1842 	plpn = &pcp->pc_children;
1843 	for (pl = pcp->pc_children; pl != NULL; pl = *plpn) {
1844 		mutex_enter(&pl->pcl_lock);
1845 		pl->pcl_state = PCL_INVALID;
1846 		*plpn = pl->pcl_child_next;
1847 		pl->pcl_parent_pc = NULL;
1848 		pl->pcl_child_next = NULL;
1849 		pcachelink_locked_rele(pl);
1850 	}
1851 
1852 	ASSERT(pcp->pc_parents == NULL);
1853 	ASSERT(pcp->pc_children == NULL);
1854 }
1855