xref: /illumos-gate/usr/src/uts/common/io/devpoll.c (revision 95e434b5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright (c) 2012 by Delphix. All rights reserved.
28  * Copyright 2019 Joyent, Inc.
29  */
30 
31 #include <sys/types.h>
32 #include <sys/devops.h>
33 #include <sys/conf.h>
34 #include <sys/modctl.h>
35 #include <sys/sunddi.h>
36 #include <sys/stat.h>
37 #include <sys/poll_impl.h>
38 #include <sys/errno.h>
39 #include <sys/kmem.h>
40 #include <sys/mkdev.h>
41 #include <sys/debug.h>
42 #include <sys/file.h>
43 #include <sys/sysmacros.h>
44 #include <sys/systm.h>
45 #include <sys/bitmap.h>
46 #include <sys/devpoll.h>
47 #include <sys/rctl.h>
48 #include <sys/resource.h>
49 #include <sys/schedctl.h>
50 #include <sys/epoll.h>
51 
52 #define	RESERVED	1
53 
54 /* local data struct */
55 static	dp_entry_t	**devpolltbl;	/* dev poll entries */
56 static	size_t		dptblsize;
57 
58 static	kmutex_t	devpoll_lock;	/* lock protecting dev tbl */
59 int			devpoll_init;	/* is /dev/poll initialized already */
60 
61 /* device local functions */
62 
63 static int dpopen(dev_t *devp, int flag, int otyp, cred_t *credp);
64 static int dpwrite(dev_t dev, struct uio *uiop, cred_t *credp);
65 static int dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
66     int *rvalp);
67 static int dppoll(dev_t dev, short events, int anyyet, short *reventsp,
68     struct pollhead **phpp);
69 static int dpclose(dev_t dev, int flag, int otyp, cred_t *credp);
70 static dev_info_t *dpdevi;
71 
72 
73 static struct cb_ops    dp_cb_ops = {
74 	dpopen,			/* open */
75 	dpclose,		/* close */
76 	nodev,			/* strategy */
77 	nodev,			/* print */
78 	nodev,			/* dump */
79 	nodev,			/* read */
80 	dpwrite,		/* write */
81 	dpioctl,		/* ioctl */
82 	nodev,			/* devmap */
83 	nodev,			/* mmap */
84 	nodev,			/* segmap */
85 	dppoll,			/* poll */
86 	ddi_prop_op,		/* prop_op */
87 	(struct streamtab *)0,	/* streamtab */
88 	D_MP,			/* flags */
89 	CB_REV,			/* cb_ops revision */
90 	nodev,			/* aread */
91 	nodev			/* awrite */
92 };
93 
94 static int dpattach(dev_info_t *, ddi_attach_cmd_t);
95 static int dpdetach(dev_info_t *, ddi_detach_cmd_t);
96 static int dpinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
97 
98 static struct dev_ops dp_ops = {
99 	DEVO_REV,		/* devo_rev */
100 	0,			/* refcnt */
101 	dpinfo,			/* info */
102 	nulldev,		/* identify */
103 	nulldev,		/* probe */
104 	dpattach,		/* attach */
105 	dpdetach,		/* detach */
106 	nodev,			/* reset */
107 	&dp_cb_ops,		/* driver operations */
108 	(struct bus_ops *)NULL, /* bus operations */
109 	nulldev,		/* power */
110 	ddi_quiesce_not_needed,		/* quiesce */
111 };
112 
113 
114 static struct modldrv modldrv = {
115 	&mod_driverops,		/* type of module - a driver */
116 	"/dev/poll driver",
117 	&dp_ops,
118 };
119 
120 static struct modlinkage modlinkage = {
121 	MODREV_1,
122 	(void *)&modldrv,
123 	NULL
124 };
125 
126 static void pcachelink_assoc(pollcache_t *, pollcache_t *);
127 static void pcachelink_mark_stale(pollcache_t *);
128 static void pcachelink_purge_stale(pollcache_t *);
129 static void pcachelink_purge_all(pollcache_t *);
130 
131 
132 /*
133  * Locking Design
134  *
135  * The /dev/poll driver shares most of its code with poll sys call whose
136  * code is in common/syscall/poll.c. In poll(2) design, the pollcache
137  * structure is per lwp. An implicit assumption is made there that some
138  * portion of pollcache will never be touched by other lwps. E.g., in
139  * poll(2) design, no lwp will ever need to grow bitmap of other lwp.
140  * This assumption is not true for /dev/poll; hence the need for extra
141  * locking.
142  *
143  * To allow more parallelism, each /dev/poll file descriptor (indexed by
144  * minor number) has its own lock. Since read (dpioctl) is a much more
145  * frequent operation than write, we want to allow multiple reads on same
146  * /dev/poll fd. However, we prevent writes from being starved by giving
147  * priority to write operation. Theoretically writes can starve reads as
 * well. But in a practical sense this is not important because (1) writes
 * happen less often than reads, and (2) a write operation defines the
 * content of the polled fd cache set. If writes happen so often that they
 * can starve reads, that means the cached set is very unstable. It may
 * not make sense to read an unstable cache set anyway. Therefore, the
 * writers-starving-readers case is not handled in this design.
154  */
155 
156 int
157 _init()
158 {
159 	int	error;
160 
161 	dptblsize = DEVPOLLSIZE;
162 	devpolltbl = kmem_zalloc(sizeof (caddr_t) * dptblsize, KM_SLEEP);
163 	mutex_init(&devpoll_lock, NULL, MUTEX_DEFAULT, NULL);
164 	devpoll_init = 1;
165 	if ((error = mod_install(&modlinkage)) != 0) {
166 		kmem_free(devpolltbl, sizeof (caddr_t) * dptblsize);
167 		devpoll_init = 0;
168 	}
169 	return (error);
170 }
171 
172 int
173 _fini()
174 {
175 	int error;
176 
177 	if ((error = mod_remove(&modlinkage)) != 0) {
178 		return (error);
179 	}
180 	mutex_destroy(&devpoll_lock);
181 	kmem_free(devpolltbl, sizeof (caddr_t) * dptblsize);
182 	return (0);
183 }
184 
185 int
186 _info(struct modinfo *modinfop)
187 {
188 	return (mod_info(&modlinkage, modinfop));
189 }
190 
191 /*ARGSUSED*/
192 static int
193 dpattach(dev_info_t *devi, ddi_attach_cmd_t cmd)
194 {
195 	if (ddi_create_minor_node(devi, "poll", S_IFCHR, 0, DDI_PSEUDO, 0)
196 	    == DDI_FAILURE) {
197 		ddi_remove_minor_node(devi, NULL);
198 		return (DDI_FAILURE);
199 	}
200 	dpdevi = devi;
201 	return (DDI_SUCCESS);
202 }
203 
204 static int
205 dpdetach(dev_info_t *devi, ddi_detach_cmd_t cmd)
206 {
207 	if (cmd != DDI_DETACH)
208 		return (DDI_FAILURE);
209 
210 	ddi_remove_minor_node(devi, NULL);
211 	return (DDI_SUCCESS);
212 }
213 
214 /* ARGSUSED */
215 static int
216 dpinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
217 {
218 	int error;
219 
220 	switch (infocmd) {
221 	case DDI_INFO_DEVT2DEVINFO:
222 		*result = (void *)dpdevi;
223 		error = DDI_SUCCESS;
224 		break;
225 	case DDI_INFO_DEVT2INSTANCE:
226 		*result = (void *)0;
227 		error = DDI_SUCCESS;
228 		break;
229 	default:
230 		error = DDI_FAILURE;
231 	}
232 	return (error);
233 }
234 
235 /*
236  * dp_pcache_poll has similar logic to pcache_poll() in poll.c. The major
237  * differences are: (1) /dev/poll requires scanning the bitmap starting at
238  * where it was stopped last time, instead of always starting from 0,
239  * (2) since user may not have cleaned up the cached fds when they are
240  * closed, some polldats in cache may refer to closed or reused fds. We
241  * need to check for those cases.
242  *
243  * NOTE: Upon closing an fd, automatic poll cache cleanup is done for
244  *	 poll(2) caches but NOT for /dev/poll caches. So expect some
245  *	 stale entries!
246  */
/*
 * Scan this instance's cached fd bitmap, gathering up to 'nfds' events
 * into 'dpbuf' (an array of pollfd_t, or of epoll_event_t when the
 * handle is in epoll compatibility mode).  *fdcntp receives the number
 * of events reported; the return value is 0 or an errno from VOP_POLL.
 * The caller must hold pcp->pc_lock.
 */
static int
dp_pcache_poll(dp_entry_t *dpep, void *dpbuf, pollcache_t *pcp, nfds_t nfds,
    int *fdcntp)
{
	int		start, ostart, end, fdcnt, error = 0;
	boolean_t	done, no_wrap;
	pollfd_t	*pfdp;
	epoll_event_t	*epoll;
	/* poll events whose epoll encodings differ and need translation */
	const short	mask = POLLRDHUP | POLLWRBAND;
	const boolean_t	is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0;

	ASSERT(MUTEX_HELD(&pcp->pc_lock));
	if (pcp->pc_bitmap == NULL) {
		/* No need to search because no poll fd has been cached. */
		return (0);
	}

	/* The output buffer's element type follows the compatibility mode. */
	if (is_epoll) {
		pfdp = NULL;
		epoll = (epoll_event_t *)dpbuf;
	} else {
		pfdp = (pollfd_t *)dpbuf;
		epoll = NULL;
	}
retry:
	start = ostart = pcp->pc_mapstart;
	end = pcp->pc_mapend;

	if (start == 0) {
		/*
		 * Started from the very beginning; no need to wrap around.
		 */
		no_wrap = B_TRUE;
	} else {
		no_wrap = B_FALSE;
	}
	done = B_FALSE;
	fdcnt = 0;
	while ((fdcnt < nfds) && !done) {
		pollhead_t *php = NULL;
		short revent = 0;
		uf_entry_gen_t gen;
		int fd;

		/*
		 * Examine the bit map in a circular fashion
		 * to avoid starvation. Always resume from
		 * last stop. Scan till end of the map. Then
		 * wrap around.
		 */
		fd = bt_getlowbit(pcp->pc_bitmap, start, end);
		ASSERT(fd <= end);
		if (fd >= 0) {
			file_t *fp;
			polldat_t *pdp;

			if (fd == end) {
				if (no_wrap) {
					done = B_TRUE;
				} else {
					/* Wrap and scan [0, ostart) next. */
					start = 0;
					end = ostart - 1;
					no_wrap = B_TRUE;
				}
			} else {
				start = fd + 1;
			}
			pdp = pcache_lookup_fd(pcp, fd);
repoll:
			ASSERT(pdp != NULL);
			ASSERT(pdp->pd_fd == fd);
			if (pdp->pd_fp == NULL) {
				/*
				 * The fd is POLLREMOVed. This fd is
				 * logically no longer cached. So move
				 * on to the next one.
				 */
				continue;
			}
			if ((fp = getf_gen(fd, &gen)) == NULL) {
				if (is_epoll) {
					/*
					 * In the epoll compatibility case, we
					 * actually perform the implicit
					 * removal to remain closer to the
					 * epoll semantics.
					 */
					pdp->pd_fp = NULL;
					pdp->pd_events = 0;

					if (pdp->pd_php != NULL) {
						pollhead_delete(pdp->pd_php,
						    pdp);
						pdp->pd_php = NULL;
					}

					BT_CLEAR(pcp->pc_bitmap, fd);
				} else if (pfdp != NULL) {
					/*
					 * The fd has been closed, but user has
					 * not done a POLLREMOVE on this fd
					 * yet. Instead of cleaning it here
					 * implicitly, we return POLLNVAL. This
					 * is consistent with poll(2) polling a
					 * closed fd. Hope this will remind
					 * user to do a POLLREMOVE.
					 */
					pfdp[fdcnt].fd = fd;
					pfdp[fdcnt].revents = POLLNVAL;
					fdcnt++;
				}
				continue;
			}

			/*
			 * Detect a change to the resource underlying a cached
			 * file descriptor.  While the fd generation comparison
			 * will catch nearly all cases, the file_t comparison
			 * is maintained as a failsafe as well.
			 */
			if (gen != pdp->pd_gen || fp != pdp->pd_fp) {
				/*
				 * The user is polling on a cached fd which was
				 * closed and then reused.  Unfortunately there
				 * is no good way to communicate this fact to
				 * the consumer.
				 *
				 * When this situation has been detected, it's
				 * likely that any existing pollhead is
				 * ill-suited to perform proper wake-ups.
				 *
				 * Clean up the old entry under the expectation
				 * that a valid one will be provided as part of
				 * the later VOP_POLL.
				 */
				if (pdp->pd_php != NULL) {
					pollhead_delete(pdp->pd_php, pdp);
					pdp->pd_php = NULL;
				}

				/*
				 * Since epoll is expected to act on the
				 * underlying 'struct file' (in Linux terms,
				 * our vnode_t would be a closer analog) rather
				 * than the fd itself, an implicit remove
				 * is necessary under these circumstances to
				 * suppress any results (or errors) from the
				 * new resource occupying the fd.
				 */
				if (is_epoll) {
					pdp->pd_fp = NULL;
					pdp->pd_events = 0;
					BT_CLEAR(pcp->pc_bitmap, fd);
					releasef(fd);
					continue;
				} else {
					/*
					 * Regular /dev/poll is unbothered
					 * about the fd reassignment.
					 */
					pdp->pd_fp = fp;
					pdp->pd_gen = gen;
				}
			}

			/*
			 * Skip entries marked with the sentinel value for
			 * having already fired under oneshot conditions.
			 */
			if (pdp->pd_events == POLLONESHOT) {
				releasef(fd);
				BT_CLEAR(pcp->pc_bitmap, fd);
				continue;
			}

			/*
			 * XXX - pollrelock() logic needs to know which
			 * pollcache lock to grab. It'd be a
			 * cleaner solution if we could pass pcp as
			 * an argument in the VOP_POLL interface instead
			 * of implicitly passing it using the thread_t
			 * struct. On the other hand, changing the VOP_POLL
			 * interface will require all driver/file system
			 * poll routines to change. May want to revisit
			 * the tradeoff later.
			 */
			curthread->t_pollcache = pcp;
			error = VOP_POLL(fp->f_vnode, pdp->pd_events, 0,
			    &revent, &php, NULL);

			/*
			 * Recheck edge-triggered descriptors which lack a
			 * pollhead.  While this check is performed when an fd
			 * is added to the pollcache in dpwrite(), subsequent
			 * descriptor manipulation could cause a different
			 * resource to be present now.
			 */
			if ((pdp->pd_events & POLLET) && error == 0 &&
			    pdp->pd_php == NULL && php == NULL && revent != 0) {
				short levent = 0;

				/*
				 * The same POLLET-only VOP_POLL is used in an
				 * attempt to coax a pollhead from older
				 * driver logic.
				 */
				error = VOP_POLL(fp->f_vnode, POLLET,
				    0, &levent, &php, NULL);
			}

			curthread->t_pollcache = NULL;
			releasef(fd);
			if (error != 0) {
				break;
			}

			/*
			 * layered devices (e.g. console driver)
			 * may change the vnode and thus the pollhead
			 * pointer out from underneath us.
			 */
			if (php != NULL && pdp->pd_php != NULL &&
			    php != pdp->pd_php) {
				pollhead_delete(pdp->pd_php, pdp);
				pdp->pd_php = php;
				pollhead_insert(php, pdp);
				/*
				 * The bit should still be set.
				 */
				ASSERT(BT_TEST(pcp->pc_bitmap, fd));
				goto retry;
			}

			if (revent != 0) {
				if (pfdp != NULL) {
					/* Plain /dev/poll: emit a pollfd. */
					pfdp[fdcnt].fd = fd;
					pfdp[fdcnt].events = pdp->pd_events;
					pfdp[fdcnt].revents = revent;
				} else if (epoll != NULL) {
					epoll_event_t *ep = &epoll[fdcnt];

					ASSERT(epoll != NULL);
					ep->data.u64 = pdp->pd_epolldata;

					/*
					 * Since POLLNVAL is a legal event for
					 * VOP_POLL handlers to emit, it must
					 * be translated to an epoll-legal
					 * value.
					 */
					if (revent & POLLNVAL) {
						revent &= ~POLLNVAL;
						revent |= POLLERR;
					}

					/*
					 * If any of the event bits are set for
					 * which poll and epoll representations
					 * differ, swizzle in the native epoll
					 * values.
					 */
					if (revent & mask) {
						ep->events = (revent & ~mask) |
						    ((revent & POLLRDHUP) ?
						    EPOLLRDHUP : 0) |
						    ((revent & POLLWRBAND) ?
						    EPOLLWRBAND : 0);
					} else {
						ep->events = revent;
					}

					/*
					 * We define POLLWRNORM to be POLLOUT,
					 * but epoll has separate definitions
					 * for them; if POLLOUT is set and the
					 * user has asked for EPOLLWRNORM, set
					 * that as well.
					 */
					if ((revent & POLLOUT) &&
					    (pdp->pd_events & EPOLLWRNORM)) {
						ep->events |= EPOLLWRNORM;
					}
				} else {
					pollstate_t *ps =
					    curthread->t_pollstate;
					/*
					 * The devpoll handle itself is being
					 * polled.  Notify the caller of any
					 * readable event(s), leaving as much
					 * state as possible untouched.
					 */
					VERIFY(fdcnt == 0);
					VERIFY(ps != NULL);

					/*
					 * If a call to pollunlock() fails
					 * during VOP_POLL, skip over the fd
					 * and continue polling.
					 *
					 * Otherwise, report that there is an
					 * event pending.
					 */
					if ((ps->ps_flags & POLLSTATE_ULFAIL)
					    != 0) {
						ps->ps_flags &=
						    ~POLLSTATE_ULFAIL;
						continue;
					} else {
						fdcnt++;
						break;
					}
				}

				/* Handle special polling modes. */
				if (pdp->pd_events & POLLONESHOT) {
					/*
					 * Entries operating under POLLONESHOT
					 * will be marked with a sentinel value
					 * to indicate that they have "fired"
					 * when emitting an event.  This will
					 * disable them from polling until a
					 * later add/modify event rearms them.
					 */
					pdp->pd_events = POLLONESHOT;
					if (pdp->pd_php != NULL) {
						pollhead_delete(pdp->pd_php,
						    pdp);
						pdp->pd_php = NULL;
					}
					BT_CLEAR(pcp->pc_bitmap, fd);
				} else if (pdp->pd_events & POLLET) {
					/*
					 * Wire up the pollhead which should
					 * have been provided.  Edge-triggered
					 * polling cannot function properly
					 * with drivers which do not emit one.
					 */
					if (php != NULL &&
					    pdp->pd_php == NULL) {
						pollhead_insert(php, pdp);
						pdp->pd_php = php;
					}

					/*
					 * If the driver has emitted a pollhead,
					 * clear the bit in the bitmap which
					 * effectively latches the edge on a
					 * pollwakeup() from the driver.
					 */
					if (pdp->pd_php != NULL) {
						BT_CLEAR(pcp->pc_bitmap, fd);
					}
				}

				fdcnt++;
			} else if (php != NULL) {
				/*
				 * We clear a bit or cache a poll fd if
				 * the driver returns a poll head ptr,
				 * which is expected in the case of 0
				 * revents. Some buggy drivers may return
				 * a NULL php pointer with 0 revents. In
				 * this case, we just treat the driver as
				 * "noncachable" and do not clear the bit
				 * in the bitmap.
				 */
				if ((pdp->pd_php != NULL) &&
				    ((pcp->pc_flag & PC_POLLWAKE) == 0)) {
					BT_CLEAR(pcp->pc_bitmap, fd);
				}
				if (pdp->pd_php == NULL) {
					pollhead_insert(php, pdp);
					pdp->pd_php = php;
					/*
					 * An event of interest may have
					 * arrived between the VOP_POLL() and
					 * the pollhead_insert(); check again.
					 */
					goto repoll;
				}
			}
		} else {
			/*
			 * No bit set in the range. Check for wrap around.
			 */
			if (!no_wrap) {
				start = 0;
				end = ostart - 1;
				no_wrap = B_TRUE;
			} else {
				done = B_TRUE;
			}
		}
	}

	if (!done) {
		/* Remember where to resume on the next DP_POLL scan. */
		pcp->pc_mapstart = start;
	}
	ASSERT(*fdcntp == 0);
	*fdcntp = fdcnt;
	return (error);
}
648 
/*
 * Open a new /dev/poll instance: reserve a free slot in the minor table
 * (growing it by DEVPOLLSIZE, up to MAXMIN, if it is full), allocate a
 * dp_entry_t with a skeleton pollcache, and clone the returned dev_t.
 */
/*ARGSUSED*/
static int
dpopen(dev_t *devp, int flag, int otyp, cred_t *credp)
{
	minor_t		minordev;
	dp_entry_t	*dpep;
	pollcache_t	*pcp;

	ASSERT(devpoll_init);
	ASSERT(dptblsize <= MAXMIN);
	mutex_enter(&devpoll_lock);
	/* Claim the first free slot so no other open can take it. */
	for (minordev = 0; minordev < dptblsize; minordev++) {
		if (devpolltbl[minordev] == NULL) {
			devpolltbl[minordev] = (dp_entry_t *)RESERVED;
			break;
		}
	}
	if (minordev == dptblsize) {
		dp_entry_t	**newtbl;
		size_t		oldsize;

		/*
		 * Used up every entry in the existing devpoll table.
		 * Grow the table by DEVPOLLSIZE.
		 */
		if ((oldsize = dptblsize) >= MAXMIN) {
			mutex_exit(&devpoll_lock);
			return (ENXIO);
		}
		dptblsize += DEVPOLLSIZE;
		if (dptblsize > MAXMIN) {
			dptblsize = MAXMIN;
		}
		newtbl = kmem_zalloc(sizeof (caddr_t) * dptblsize, KM_SLEEP);
		bcopy(devpolltbl, newtbl, sizeof (caddr_t) * oldsize);
		kmem_free(devpolltbl, sizeof (caddr_t) * oldsize);
		devpolltbl = newtbl;
		/* minordev == oldsize: the first slot of the new region. */
		devpolltbl[minordev] = (dp_entry_t *)RESERVED;
	}
	mutex_exit(&devpoll_lock);

	dpep = kmem_zalloc(sizeof (dp_entry_t), KM_SLEEP);
	/*
	 * Allocate a pollcache skeleton here. Delay allocating bitmap
	 * structures until dpwrite() time, since we don't know the
	 * optimal size yet.  We also delay setting the pid until either
	 * dpwrite() or an attempt to poll on the instance, allowing parents
	 * to create instances of /dev/poll for their children.  (In the
	 * epoll compatibility case, this check isn't performed to maintain
	 * semantic compatibility.)
	 */
	pcp = pcache_alloc();
	dpep->dpe_pcache = pcp;
	pcp->pc_pid = -1;	/* unowned until the first write/poll */
	*devp = makedevice(getmajor(*devp), minordev);  /* clone the driver */
	mutex_enter(&devpoll_lock);
	ASSERT(minordev < dptblsize);
	ASSERT(devpolltbl[minordev] == (dp_entry_t *)RESERVED);
	devpolltbl[minordev] = dpep;
	mutex_exit(&devpoll_lock);
	return (0);
}
711 
712 /*
713  * Write to dev/poll add/remove fd's to/from a cached poll fd set,
714  * or change poll events for a watched fd.
715  */
716 /*ARGSUSED*/
717 static int
718 dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
719 {
720 	minor_t		minor;
721 	dp_entry_t	*dpep;
722 	pollcache_t	*pcp;
723 	pollfd_t	*pollfdp, *pfdp;
724 	dvpoll_epollfd_t *epfdp;
725 	uintptr_t	limit;
726 	int		error;
727 	uint_t		size;
728 	size_t		copysize, uiosize;
729 	nfds_t		pollfdnum;
730 	boolean_t	is_epoll, fds_added = B_FALSE;
731 
732 	minor = getminor(dev);
733 
734 	mutex_enter(&devpoll_lock);
735 	ASSERT(minor < dptblsize);
736 	dpep = devpolltbl[minor];
737 	ASSERT(dpep != NULL);
738 	mutex_exit(&devpoll_lock);
739 
740 	mutex_enter(&dpep->dpe_lock);
741 	pcp = dpep->dpe_pcache;
742 	is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0;
743 	size = (is_epoll) ? sizeof (dvpoll_epollfd_t) : sizeof (pollfd_t);
744 	mutex_exit(&dpep->dpe_lock);
745 
746 	if (!is_epoll && curproc->p_pid != pcp->pc_pid) {
747 		if (pcp->pc_pid != -1) {
748 			return (EACCES);
749 		}
750 
751 		pcp->pc_pid = curproc->p_pid;
752 	}
753 
754 	if (uiop->uio_resid < 0) {
755 		/* No one else is this careful, but maybe they should be. */
756 		return (EINVAL);
757 	}
758 
759 	uiosize = (size_t)uiop->uio_resid;
760 	pollfdnum = uiosize / size;
761 
762 	/*
763 	 * For epoll-enabled handles, restrict the allowed write size to 2.
764 	 * This corresponds to an epoll_ctl(3C) performing an EPOLL_CTL_MOD
765 	 * operation which is expanded into two operations (DEL and ADD).
766 	 *
767 	 * All other operations performed through epoll_ctl(3C) will consist of
768 	 * a single entry.
769 	 */
770 	if (is_epoll && pollfdnum > 2) {
771 		return (EINVAL);
772 	}
773 
774 	/*
775 	 * We want to make sure that pollfdnum isn't large enough to DoS us,
776 	 * but we also don't want to grab p_lock unnecessarily -- so we
777 	 * perform the full check against our resource limits if and only if
778 	 * pollfdnum is larger than the known-to-be-sane value of UINT8_MAX.
779 	 */
780 	if (pollfdnum > UINT8_MAX) {
781 		mutex_enter(&curproc->p_lock);
782 		if (pollfdnum >
783 		    (uint_t)rctl_enforced_value(rctlproc_legacy[RLIMIT_NOFILE],
784 		    curproc->p_rctls, curproc)) {
785 			(void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
786 			    curproc->p_rctls, curproc, RCA_SAFE);
787 			mutex_exit(&curproc->p_lock);
788 			return (EINVAL);
789 		}
790 		mutex_exit(&curproc->p_lock);
791 	}
792 
793 	/*
794 	 * Copy in the pollfd array.  Walk through the array and add
795 	 * each polled fd to the cached set.
796 	 */
797 	pollfdp = kmem_alloc(uiosize, KM_SLEEP);
798 	limit = (uintptr_t)pollfdp + (pollfdnum * size);
799 
800 	/*
801 	 * Although /dev/poll uses the write(2) interface to cache fds, it's
802 	 * not supposed to function as a seekable device. To prevent offset
803 	 * from growing and eventually exceed the maximum, reset the offset
804 	 * here for every call.
805 	 */
806 	uiop->uio_loffset = 0;
807 
808 	/*
809 	 * Use uiocopy instead of uiomove when populating pollfdp, keeping
810 	 * uio_resid untouched for now.  Write syscalls will translate EINTR
811 	 * into a success if they detect "successfully transfered" data via an
812 	 * updated uio_resid.  Falsely suppressing such errors is disastrous.
813 	 */
814 	if ((error = uiocopy((caddr_t)pollfdp, uiosize, UIO_WRITE, uiop,
815 	    &copysize)) != 0) {
816 		kmem_free(pollfdp, uiosize);
817 		return (error);
818 	}
819 
820 	/*
821 	 * We are about to enter the core portion of dpwrite(). Make sure this
822 	 * write has exclusive access in this portion of the code, i.e., no
823 	 * other writers in this code.
824 	 *
825 	 * Waiting for all readers to drop their references to the dpe is
826 	 * unecessary since the pollcache itself is protected by pc_lock.
827 	 */
828 	mutex_enter(&dpep->dpe_lock);
829 	dpep->dpe_writerwait++;
830 	while ((dpep->dpe_flag & DP_WRITER_PRESENT) != 0) {
831 		ASSERT(dpep->dpe_refcnt != 0);
832 
833 		/*
834 		 * The epoll API does not allow EINTR as a result when making
835 		 * modifications to the set of polled fds.  Given that write
836 		 * activity is relatively quick and the size of accepted writes
837 		 * is limited above to two entries, a signal-ignorant wait is
838 		 * used here to avoid the EINTR.
839 		 */
840 		if (is_epoll) {
841 			cv_wait(&dpep->dpe_cv, &dpep->dpe_lock);
842 			continue;
843 		}
844 
845 		/*
846 		 * Non-epoll writers to /dev/poll handles can tolerate EINTR.
847 		 */
848 		if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) {
849 			dpep->dpe_writerwait--;
850 			mutex_exit(&dpep->dpe_lock);
851 			kmem_free(pollfdp, uiosize);
852 			return (EINTR);
853 		}
854 	}
855 	dpep->dpe_writerwait--;
856 	dpep->dpe_flag |= DP_WRITER_PRESENT;
857 	dpep->dpe_refcnt++;
858 
859 	if (!is_epoll && (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0) {
860 		/*
861 		 * The epoll compat mode was enabled while we were waiting to
862 		 * establish write access. It is not safe to continue since
863 		 * state was prepared for non-epoll operation.
864 		 */
865 		error = EBUSY;
866 		goto bypass;
867 	}
868 	mutex_exit(&dpep->dpe_lock);
869 
870 	/*
871 	 * Since the dpwrite() may recursively walk an added /dev/poll handle,
872 	 * pollstate_enter() deadlock and loop detection must be used.
873 	 */
874 	(void) pollstate_create();
875 	VERIFY(pollstate_enter(pcp) == PSE_SUCCESS);
876 
877 	if (pcp->pc_bitmap == NULL) {
878 		pcache_create(pcp, pollfdnum);
879 	}
880 	for (pfdp = pollfdp; (uintptr_t)pfdp < limit;
881 	    pfdp = (pollfd_t *)((uintptr_t)pfdp + size)) {
882 		int fd = pfdp->fd;
883 		polldat_t *pdp;
884 
885 		if ((uint_t)fd >= P_FINFO(curproc)->fi_nfiles) {
886 			/*
887 			 * epoll semantics demand that we return EBADF if our
888 			 * specified fd is invalid.
889 			 */
890 			if (is_epoll) {
891 				error = EBADF;
892 				break;
893 			}
894 
895 			continue;
896 		}
897 
898 		pdp = pcache_lookup_fd(pcp, fd);
899 		if (pfdp->events != POLLREMOVE) {
900 			uf_entry_gen_t gen;
901 			file_t *fp = NULL;
902 			struct pollhead *php = NULL;
903 
904 			/*
905 			 * If we're in epoll compatibility mode, check that the
906 			 * fd is valid before allocating anything for it; epoll
907 			 * semantics demand that we return EBADF if our
908 			 * specified fd is invalid.
909 			 */
910 			if (is_epoll) {
911 				if ((fp = getf_gen(fd, &gen)) == NULL) {
912 					error = EBADF;
913 					break;
914 				}
915 			}
916 			if (pdp == NULL) {
917 				pdp = pcache_alloc_fd(0);
918 				pdp->pd_fd = fd;
919 				pdp->pd_pcache = pcp;
920 				pcache_insert_fd(pcp, pdp, pollfdnum);
921 			}
922 
923 			if (is_epoll) {
924 				/*
925 				 * If the fd is already a member of the epoll
926 				 * set, error emission is needed only when the
927 				 * fd assignment generation matches the one
928 				 * recorded in the polldat_t.  Absence of such
929 				 * a generation match indicates that a new
930 				 * resource has been assigned at that fd.
931 				 *
932 				 * Caveat: It is possible to force a generation
933 				 * update while keeping the same backing
934 				 * resource.  This is possible via dup2, but
935 				 * does not represent real-world use cases,
936 				 * making the lack of error acceptable.
937 				 */
938 				if (pdp->pd_fp != NULL && pdp->pd_gen == gen) {
939 					error = EEXIST;
940 					releasef(fd);
941 					break;
942 				}
943 
944 				/*
945 				 * We have decided that the cached information
946 				 * was stale.  Reset pd_events to assure that
947 				 * we don't mistakenly operate on cached event
948 				 * disposition.  This configures the implicit
949 				 * subscription to HUP and ERR events which
950 				 * epoll features.
951 				 */
952 				pdp->pd_events = POLLERR|POLLHUP;
953 
954 				epfdp = (dvpoll_epollfd_t *)pfdp;
955 				pdp->pd_epolldata = epfdp->dpep_data;
956 			}
957 
958 			ASSERT(pdp->pd_fd == fd);
959 			ASSERT(pdp->pd_pcache == pcp);
960 			if (fd >= pcp->pc_mapsize) {
961 				mutex_exit(&pcp->pc_lock);
962 				pcache_grow_map(pcp, fd);
963 				mutex_enter(&pcp->pc_lock);
964 			}
965 			if (fd > pcp->pc_mapend) {
966 				pcp->pc_mapend = fd;
967 			}
968 
969 			if (!is_epoll) {
970 				ASSERT(fp == NULL);
971 
972 				if ((fp = getf_gen(fd, &gen)) == NULL) {
973 					/*
974 					 * The fd is not valid. Since we can't
975 					 * pass this error back in the write()
976 					 * call, set the bit in bitmap to force
977 					 * DP_POLL ioctl to examine it.
978 					 */
979 					BT_SET(pcp->pc_bitmap, fd);
980 					pdp->pd_events |= pfdp->events;
981 					continue;
982 				}
983 				/*
984 				 * Don't do VOP_POLL for an already cached fd
985 				 * with same poll events.
986 				 */
987 				if ((pdp->pd_events == pfdp->events) &&
988 				    (pdp->pd_fp == fp)) {
989 					/*
990 					 * the events are already cached
991 					 */
992 					releasef(fd);
993 					continue;
994 				}
995 			}
996 
997 
998 			/*
999 			 * do VOP_POLL and cache this poll fd.
1000 			 */
1001 			/*
1002 			 * XXX - pollrelock() logic needs to know which
1003 			 * which pollcache lock to grab. It'd be a
1004 			 * cleaner solution if we could pass pcp as
1005 			 * an arguement in VOP_POLL interface instead
1006 			 * of implicitly passing it using thread_t
1007 			 * struct. On the other hand, changing VOP_POLL
1008 			 * interface will require all driver/file system
1009 			 * poll routine to change. May want to revisit
1010 			 * the tradeoff later.
1011 			 */
1012 			curthread->t_pollcache = pcp;
1013 			error = VOP_POLL(fp->f_vnode, pfdp->events, 0,
1014 			    &pfdp->revents, &php, NULL);
1015 
1016 			/*
1017 			 * Edge-triggered polling requires a pollhead in order
1018 			 * to initiate wake-ups properly.  Drivers which are
1019 			 * savvy to POLLET presence, which should include
1020 			 * everything in-gate, will always emit one, regardless
1021 			 * of revent status.  Older drivers which only emit a
1022 			 * pollhead if 'revents == 0' are given a second chance
1023 			 * here via a second VOP_POLL, with only POLLET set in
1024 			 * the events of interest.  These circumstances should
1025 			 * induce any cacheable drivers to emit a pollhead for
1026 			 * wake-ups.
1027 			 *
1028 			 * Drivers which never emit a pollhead will simply
1029 			 * disobey the expectation of edge-triggered behavior.
1030 			 * This includes recursive epoll which, even on Linux,
1031 			 * yields its events in a level-triggered fashion only.
1032 			 */
1033 			if ((pfdp->events & POLLET) != 0 && error == 0 &&
1034 			    php == NULL) {
1035 				short levent = 0;
1036 
1037 				error = VOP_POLL(fp->f_vnode, POLLET, 0,
1038 				    &levent, &php, NULL);
1039 			}
1040 
1041 			curthread->t_pollcache = NULL;
1042 			/*
1043 			 * We always set the bit when this fd is cached;
1044 			 * this forces the first DP_POLL to poll this fd.
1045 			 * Real performance gain comes from subsequent
1046 			 * DP_POLL.  We also attempt a pollhead_insert();
1047 			 * if it's not possible, we'll do it in dpioctl().
1048 			 */
1049 			BT_SET(pcp->pc_bitmap, fd);
1050 			if (error != 0) {
1051 				releasef(fd);
1052 				break;
1053 			}
1054 			pdp->pd_fp = fp;
1055 			pdp->pd_gen = gen;
1056 			pdp->pd_events |= pfdp->events;
1057 			if (php != NULL) {
1058 				if (pdp->pd_php == NULL) {
1059 					pollhead_insert(php, pdp);
1060 					pdp->pd_php = php;
1061 				} else {
1062 					if (pdp->pd_php != php) {
1063 						pollhead_delete(pdp->pd_php,
1064 						    pdp);
1065 						pollhead_insert(php, pdp);
1066 						pdp->pd_php = php;
1067 					}
1068 				}
1069 			}
1070 			fds_added = B_TRUE;
1071 			releasef(fd);
1072 		} else {
1073 			if (pdp == NULL || pdp->pd_fp == NULL) {
1074 				if (is_epoll) {
1075 					/*
1076 					 * As with the add case (above), epoll
1077 					 * semantics demand that we error out
1078 					 * in this case.
1079 					 */
1080 					error = ENOENT;
1081 					break;
1082 				}
1083 
1084 				continue;
1085 			}
1086 			ASSERT(pdp->pd_fd == fd);
1087 			pdp->pd_fp = NULL;
1088 			pdp->pd_events = 0;
1089 			ASSERT(pdp->pd_thread == NULL);
1090 			if (pdp->pd_php != NULL) {
1091 				pollhead_delete(pdp->pd_php, pdp);
1092 				pdp->pd_php = NULL;
1093 			}
1094 			BT_CLEAR(pcp->pc_bitmap, fd);
1095 		}
1096 	}
1097 	/*
1098 	 * Wake any pollcache waiters so they can check the new descriptors.
1099 	 *
1100 	 * Any fds added to an recursive-capable pollcache could themselves be
1101 	 * /dev/poll handles. To ensure that proper event propagation occurs,
1102 	 * parent pollcaches are woken too, so that they can create any needed
1103 	 * pollcache links.
1104 	 */
1105 	if (fds_added) {
1106 		cv_broadcast(&pcp->pc_cv);
1107 		pcache_wake_parents(pcp);
1108 	}
1109 	pollstate_exit(pcp);
1110 	mutex_enter(&dpep->dpe_lock);
1111 bypass:
1112 	dpep->dpe_flag &= ~DP_WRITER_PRESENT;
1113 	dpep->dpe_refcnt--;
1114 	cv_broadcast(&dpep->dpe_cv);
1115 	mutex_exit(&dpep->dpe_lock);
1116 	kmem_free(pollfdp, uiosize);
1117 	if (error == 0) {
1118 		/*
1119 		 * The state of uio_resid is updated only after the pollcache
1120 		 * is successfully modified.
1121 		 */
1122 		uioskip(uiop, copysize);
1123 	}
1124 	return (error);
1125 }
1126 
/*
 * Restore the caller's signal mask after DP_PPOLL installed a temporary one
 * (the original was saved in lwp_sigoldmask with T_TOMASK set).  If a signal
 * is already pending (lwp_cursig != 0) the temporary mask is deliberately
 * left in place so the signal is taken under it, matching ppoll() semantics.
 * Relies on 'p', 'lwp', and 't' being in scope at the point of expansion.
 */
#define	DP_SIGMASK_RESTORE(ksetp) {					\
	if (ksetp != NULL) {						\
		mutex_enter(&p->p_lock);				\
		if (lwp->lwp_cursig == 0) {				\
			t->t_hold = lwp->lwp_sigoldmask;		\
			t->t_flag &= ~T_TOMASK;				\
		}							\
		mutex_exit(&p->p_lock);					\
	}								\
}
1137 
/*
 * Ioctl entry point for /dev/poll.  DP_POLL and DP_PPOLL wait for events on
 * the fds cached in this handle's pollcache; DP_EPOLLCOMPAT irreversibly
 * switches the handle into epoll compatibility mode; DP_ISPOLLED reports
 * whether a given fd is currently cached.
 */
/*ARGSUSED*/
static int
dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
{
	minor_t		minor;
	dp_entry_t	*dpep;
	pollcache_t	*pcp;
	hrtime_t	now;
	int		error = 0;
	boolean_t	is_epoll;
	STRUCT_DECL(dvpoll, dvpoll);

	if (cmd == DP_POLL || cmd == DP_PPOLL) {
		/* do this now, before we sleep on DP_WRITER_PRESENT */
		now = gethrtime();
	}

	minor = getminor(dev);
	mutex_enter(&devpoll_lock);
	ASSERT(minor < dptblsize);
	dpep = devpolltbl[minor];
	mutex_exit(&devpoll_lock);
	ASSERT(dpep != NULL);
	pcp = dpep->dpe_pcache;

	mutex_enter(&dpep->dpe_lock);
	is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0;

	if (cmd == DP_EPOLLCOMPAT) {
		if (dpep->dpe_refcnt != 0) {
			/*
			 * We can't turn on epoll compatibility while there
			 * are outstanding operations.
			 */
			mutex_exit(&dpep->dpe_lock);
			return (EBUSY);
		}

		/*
		 * epoll compatibility is a one-way street: there's no way
		 * to turn it off for a particular open.
		 */
		dpep->dpe_flag |= DP_ISEPOLLCOMPAT;

		/* Record the epoll-enabled nature in the pollcache too */
		mutex_enter(&pcp->pc_lock);
		pcp->pc_flag |= PC_EPOLL;
		mutex_exit(&pcp->pc_lock);

		mutex_exit(&dpep->dpe_lock);
		return (0);
	}

	/*
	 * A non-epoll handle is bound to a single process: the first process
	 * to use it claims ownership (pc_pid), and other processes are
	 * subsequently rejected with EACCES.
	 */
	if (!is_epoll && curproc->p_pid != pcp->pc_pid) {
		if (pcp->pc_pid != -1) {
			mutex_exit(&dpep->dpe_lock);
			return (EACCES);
		}

		pcp->pc_pid = curproc->p_pid;
	}

	/* Wait until all writers have cleared the handle before continuing */
	while ((dpep->dpe_flag & DP_WRITER_PRESENT) != 0 ||
	    (dpep->dpe_writerwait != 0)) {
		if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) {
			mutex_exit(&dpep->dpe_lock);
			return (EINTR);
		}
	}
	dpep->dpe_refcnt++;
	mutex_exit(&dpep->dpe_lock);

	switch (cmd) {
	case	DP_POLL:
	case	DP_PPOLL:
	{
		pollstate_t	*ps;
		nfds_t		nfds;
		int		fdcnt = 0;
		size_t		size, fdsize, dpsize;
		hrtime_t	deadline = 0;
		k_sigset_t	*ksetp = NULL;
		k_sigset_t	kset;
		sigset_t	set;
		kthread_t	*t = curthread;
		klwp_t		*lwp = ttolwp(t);
		struct proc	*p = ttoproc(curthread);

		STRUCT_INIT(dvpoll, mode);

		/*
		 * The dp_setp member is only required/consumed for DP_PPOLL,
		 * which otherwise uses the same structure as DP_POLL.
		 */
		if (cmd == DP_POLL) {
			dpsize = (uintptr_t)STRUCT_FADDR(dvpoll, dp_setp) -
			    (uintptr_t)STRUCT_FADDR(dvpoll, dp_fds);
		} else {
			ASSERT(cmd == DP_PPOLL);
			dpsize = STRUCT_SIZE(dvpoll);
		}

		if ((mode & FKIOCTL) != 0) {
			/* Kernel-internal ioctl call */
			bcopy((caddr_t)arg, STRUCT_BUF(dvpoll), dpsize);
			error = 0;
		} else {
			error = copyin((caddr_t)arg, STRUCT_BUF(dvpoll),
			    dpsize);
		}

		if (error) {
			DP_REFRELE(dpep);
			return (EFAULT);
		}

		deadline = STRUCT_FGET(dvpoll, dp_timeout);
		if (deadline > 0) {
			/*
			 * Convert the deadline from relative milliseconds
			 * to absolute nanoseconds.  They must wait for at
			 * least a tick.
			 */
			deadline = MSEC2NSEC(deadline);
			deadline = MAX(deadline, nsec_per_tick);
			deadline += now;
		}

		if (cmd == DP_PPOLL) {
			void *setp = STRUCT_FGETP(dvpoll, dp_setp);

			if (setp != NULL) {
				if ((mode & FKIOCTL) != 0) {
					/* Use the signal set directly */
					ksetp = (k_sigset_t *)setp;
				} else {
					if (copyin(setp, &set, sizeof (set))) {
						DP_REFRELE(dpep);
						return (EFAULT);
					}
					sigutok(&set, &kset);
					ksetp = &kset;
				}

				/* Install the temporary signal mask */
				mutex_enter(&p->p_lock);
				schedctl_finish_sigblock(t);
				lwp->lwp_sigoldmask = t->t_hold;
				t->t_hold = *ksetp;
				t->t_flag |= T_TOMASK;

				/*
				 * Like ppoll() with a non-NULL sigset, we'll
				 * call cv_reltimedwait_sig() just to check for
				 * signals.  This call will return immediately
				 * with either 0 (signalled) or -1 (no signal).
				 * There are some conditions whereby we can
				 * get 0 from cv_reltimedwait_sig() without
				 * a true signal (e.g., a directed stop), so
				 * we restore our signal mask in the unlikely
				 * event that lwp_cursig is 0.
				 */
				if (!cv_reltimedwait_sig(&t->t_delay_cv,
				    &p->p_lock, 0, TR_CLOCK_TICK)) {
					if (lwp->lwp_cursig == 0) {
						t->t_hold = lwp->lwp_sigoldmask;
						t->t_flag &= ~T_TOMASK;
					}

					mutex_exit(&p->p_lock);

					DP_REFRELE(dpep);
					return (EINTR);
				}

				mutex_exit(&p->p_lock);
			}
		}

		if ((nfds = STRUCT_FGET(dvpoll, dp_nfds)) == 0) {
			/*
			 * We are just using DP_POLL to sleep, so
			 * we don't need any of the devpoll apparatus.
			 * Do not check for signals if we have a zero timeout.
			 */
			DP_REFRELE(dpep);
			if (deadline == 0) {
				DP_SIGMASK_RESTORE(ksetp);
				return (0);
			}

			mutex_enter(&curthread->t_delay_lock);
			while ((error =
			    cv_timedwait_sig_hrtime(&curthread->t_delay_cv,
			    &curthread->t_delay_lock, deadline)) > 0)
				continue;
			mutex_exit(&curthread->t_delay_lock);

			DP_SIGMASK_RESTORE(ksetp);

			/* 0 from the cv wait means a signal; < 0 is timeout */
			return (error == 0 ? EINTR : 0);
		}

		if (is_epoll) {
			size = nfds * (fdsize = sizeof (epoll_event_t));
		} else {
			size = nfds * (fdsize = sizeof (pollfd_t));
		}

		/*
		 * XXX It would be nice not to have to alloc each time, but it
		 * requires another per thread structure hook. This can be
		 * implemented later if data suggests that it's necessary.
		 */
		ps = pollstate_create();

		if (ps->ps_dpbufsize < size) {
			/*
			 * If nfds is larger than twice the current maximum
			 * open file count, we'll silently clamp it.  This
			 * only limits our exposure to allocating an
			 * inordinate amount of kernel memory; it doesn't
			 * otherwise affect the semantics.  (We have this
			 * check at twice the maximum instead of merely the
			 * maximum because some applications pass an nfds that
			 * is only slightly larger than their limit.)
			 */
			mutex_enter(&p->p_lock);
			if ((nfds >> 1) > p->p_fno_ctl) {
				nfds = p->p_fno_ctl;
				size = nfds * fdsize;
			}
			mutex_exit(&p->p_lock);

			if (ps->ps_dpbufsize < size) {
				kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize);
				ps->ps_dpbuf = kmem_zalloc(size, KM_SLEEP);
				ps->ps_dpbufsize = size;
			}
		}

		VERIFY(pollstate_enter(pcp) == PSE_SUCCESS);
		for (;;) {
			pcp->pc_flag &= ~PC_POLLWAKE;

			/*
			 * Mark all child pcachelinks as stale.
			 * Those which are still part of the tree will be
			 * marked as valid during the poll.
			 */
			pcachelink_mark_stale(pcp);

			error = dp_pcache_poll(dpep, ps->ps_dpbuf,
			    pcp, nfds, &fdcnt);
			if (fdcnt > 0 || error != 0)
				break;

			/* Purge still-stale child pcachelinks */
			pcachelink_purge_stale(pcp);

			/*
			 * A pollwake has happened since we polled cache.
			 */
			if (pcp->pc_flag & PC_POLLWAKE)
				continue;

			/*
			 * Sleep until we are notified, signaled, or timed out.
			 */
			if (deadline == 0) {
				/* immediate timeout; do not check signals */
				break;
			}

			error = cv_timedwait_sig_hrtime(&pcp->pc_cv,
			    &pcp->pc_lock, deadline);

			/*
			 * If we were awakened by a signal or timeout then
			 * break the loop, else poll again.
			 */
			if (error <= 0) {
				error = (error == 0) ? EINTR : 0;
				break;
			} else {
				error = 0;
			}
		}
		pollstate_exit(pcp);

		DP_SIGMASK_RESTORE(ksetp);

		if (error == 0 && fdcnt > 0) {
			/*
			 * It should be noted that FKIOCTL does not influence
			 * the copyout (vs bcopy) of dp_fds at this time.
			 */
			if (copyout(ps->ps_dpbuf,
			    STRUCT_FGETP(dvpoll, dp_fds), fdcnt * fdsize)) {
				DP_REFRELE(dpep);
				return (EFAULT);
			}
			*rvalp = fdcnt;
		}
		break;
	}

	case	DP_ISPOLLED:
	{
		pollfd_t	pollfd;
		polldat_t	*pdp;

		STRUCT_INIT(dvpoll, mode);
		error = copyin((caddr_t)arg, &pollfd, sizeof (pollfd_t));
		if (error) {
			DP_REFRELE(dpep);
			return (EFAULT);
		}
		mutex_enter(&pcp->pc_lock);
		if (pcp->pc_hash == NULL) {
			/*
			 * No Need to search because no poll fd
			 * has been cached.
			 */
			mutex_exit(&pcp->pc_lock);
			DP_REFRELE(dpep);
			return (0);
		}
		if (pollfd.fd < 0) {
			mutex_exit(&pcp->pc_lock);
			break;
		}
		pdp = pcache_lookup_fd(pcp, pollfd.fd);
		if ((pdp != NULL) && (pdp->pd_fd == pollfd.fd) &&
		    (pdp->pd_fp != NULL)) {
			/* Report the cached events of interest for this fd */
			pollfd.revents = pdp->pd_events;
			if (copyout(&pollfd, (caddr_t)arg, sizeof (pollfd_t))) {
				mutex_exit(&pcp->pc_lock);
				DP_REFRELE(dpep);
				return (EFAULT);
			}
			*rvalp = 1;
		}
		mutex_exit(&pcp->pc_lock);
		break;
	}

	default:
		DP_REFRELE(dpep);
		return (EINVAL);
	}
	DP_REFRELE(dpep);
	return (error);
}
1492 
1493 /*
1494  * Overview of Recursive Polling
1495  *
1496  * It is possible for /dev/poll to poll for events on file descriptors which
1497  * themselves are /dev/poll handles.  Pending events in the child handle are
1498  * represented as readable data via the POLLIN flag.  To limit surface area,
1499  * this recursion is presently allowed on only /dev/poll handles which have
1500  * been placed in epoll mode via the DP_EPOLLCOMPAT ioctl.  Recursion depth is
1501  * limited to 5 in order to be consistent with Linux epoll.
1502  *
1503  * Extending dppoll() for VOP_POLL:
1504  *
1505  * The recursive /dev/poll implementation begins by extending dppoll() to
1506  * report when resources contained in the pollcache have relevant event state.
1507  * At the highest level, it means calling dp_pcache_poll() so it indicates if
1508  * fd events are present without consuming them or altering the pollcache
1509  * bitmap.  This ensures that a subsequent DP_POLL operation on the bitmap will
1510  * yield the initiating event.  Additionally, the VOP_POLL should return in
1511  * such a way that dp_pcache_poll() does not clear the parent bitmap entry
1512  * which corresponds to the child /dev/poll fd.  This means that child
1513  * pollcaches will be checked during every poll which facilitates wake-up
1514  * behavior detailed below.
1515  *
1516  * Pollcache Links and Wake Events:
1517  *
1518  * Recursive /dev/poll avoids complicated pollcache locking constraints during
1519  * pollwakeup events by eschewing the traditional pollhead mechanism in favor
1520  * of a different approach.  For each pollcache at the root of a recursive
1521  * /dev/poll "tree", pcachelink_t structures are established to all child
1522  * /dev/poll pollcaches.  During pollnotify() in a child pollcache, the
1523  * linked list of pcachelink_t entries is walked, where those marked as valid
1524  * incur a cv_broadcast to their parent pollcache.  Most notably, these
1525  * pcachelink_t cv wakeups are performed without acquiring pc_lock on the
1526  * parent pollcache (which would require careful deadlock avoidance).  This
1527  * still allows the woken poll on the parent to discover the pertinent events
 * due to the fact that bitmap entries for the child pollcache are always
1529  * maintained by the dppoll() logic above.
1530  *
1531  * Depth Limiting and Loop Prevention:
1532  *
1533  * As each pollcache is encountered (either via DP_POLL or dppoll()), depth and
1534  * loop constraints are enforced via pollstate_enter().  The pollcache_t
1535  * pointer is compared against any existing entries in ps_pc_stack and is added
1536  * to the end if no match (and therefore loop) is found.  Once poll operations
1537  * for a given pollcache_t are complete, pollstate_exit() clears the pointer
1538  * from the list.  The pollstate_enter() and pollstate_exit() functions are
1539  * responsible for acquiring and releasing pc_lock, respectively.
1540  *
1541  * Deadlock Safety:
1542  *
1543  * Descending through a tree of recursive /dev/poll handles involves the tricky
1544  * business of sequentially entering multiple pollcache locks.  This tree
1545  * topology cannot define a lock acquisition order in such a way that it is
1546  * immune to deadlocks between threads.  The pollstate_enter() and
1547  * pollstate_exit() functions provide an interface for recursive /dev/poll
1548  * operations to safely lock pollcaches while failing gracefully in the face of
1549  * deadlocking topologies. (See pollstate_contend() for more detail about how
1550  * deadlocks are detected and resolved.)
1551  */
1552 
1553 /*ARGSUSED*/
1554 static int
1555 dppoll(dev_t dev, short events, int anyyet, short *reventsp,
1556     struct pollhead **phpp)
1557 {
1558 	minor_t		minor;
1559 	dp_entry_t	*dpep;
1560 	pollcache_t	*pcp;
1561 	int		res, rc = 0;
1562 
1563 	minor = getminor(dev);
1564 	mutex_enter(&devpoll_lock);
1565 	ASSERT(minor < dptblsize);
1566 	dpep = devpolltbl[minor];
1567 	ASSERT(dpep != NULL);
1568 	mutex_exit(&devpoll_lock);
1569 
1570 	mutex_enter(&dpep->dpe_lock);
1571 	if ((dpep->dpe_flag & DP_ISEPOLLCOMPAT) == 0) {
1572 		/* Poll recursion is not yet supported for non-epoll handles */
1573 		*reventsp = POLLERR;
1574 		mutex_exit(&dpep->dpe_lock);
1575 		return (0);
1576 	} else {
1577 		dpep->dpe_refcnt++;
1578 		pcp = dpep->dpe_pcache;
1579 		mutex_exit(&dpep->dpe_lock);
1580 	}
1581 
1582 	res = pollstate_enter(pcp);
1583 	if (res == PSE_SUCCESS) {
1584 		nfds_t		nfds = 1;
1585 		int		fdcnt = 0;
1586 		pollstate_t	*ps = curthread->t_pollstate;
1587 
1588 		/*
1589 		 * Recursive polling will only emit certain events.  Skip a
1590 		 * scan of the pollcache if those events are not of interest.
1591 		 */
1592 		if (events & (POLLIN|POLLRDNORM)) {
1593 			rc = dp_pcache_poll(dpep, NULL, pcp, nfds, &fdcnt);
1594 		} else {
1595 			rc = 0;
1596 			fdcnt = 0;
1597 		}
1598 
1599 		if (rc == 0 && fdcnt > 0) {
1600 			*reventsp = POLLIN|POLLRDNORM;
1601 		} else {
1602 			*reventsp = 0;
1603 		}
1604 		pcachelink_assoc(pcp, ps->ps_pc_stack[0]);
1605 		pollstate_exit(pcp);
1606 	} else {
1607 		switch (res) {
1608 		case PSE_FAIL_DEPTH:
1609 			rc = EINVAL;
1610 			break;
1611 		case PSE_FAIL_LOOP:
1612 		case PSE_FAIL_DEADLOCK:
1613 			rc = ELOOP;
1614 			break;
1615 		default:
1616 			/*
1617 			 * If anything else has gone awry, such as being polled
1618 			 * from an unexpected context, fall back to the
1619 			 * recursion-intolerant response.
1620 			 */
1621 			*reventsp = POLLERR;
1622 			rc = 0;
1623 			break;
1624 		}
1625 	}
1626 
1627 	DP_REFRELE(dpep);
1628 	return (rc);
1629 }
1630 
1631 /*
1632  * devpoll close should do enough clean up before the pollcache is deleted,
1633  * i.e., it should ensure no one still references the pollcache later.
1634  * There is no "permission" check in here. Any process having the last
1635  * reference of this /dev/poll fd can close.
1636  */
/*ARGSUSED*/
static int
dpclose(dev_t dev, int flag, int otyp, cred_t *credp)
{
	minor_t		minor;
	dp_entry_t	*dpep;
	pollcache_t	*pcp;
	int		i;
	polldat_t	**hashtbl;
	polldat_t	*pdp;

	minor = getminor(dev);

	/* Unhook the entry from the minor table so no new users can find it */
	mutex_enter(&devpoll_lock);
	dpep = devpolltbl[minor];
	ASSERT(dpep != NULL);
	devpolltbl[minor] = NULL;
	mutex_exit(&devpoll_lock);
	pcp = dpep->dpe_pcache;
	ASSERT(pcp != NULL);
	/*
	 * At this point, no other lwp can access this pollcache via the
	 * /dev/poll fd. This pollcache is going away, so do the clean
	 * up without the pc_lock.
	 */
	hashtbl = pcp->pc_hash;
	for (i = 0; i < pcp->pc_hashsize; i++) {
		for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) {
			/* Detach every cached fd from its pollhead */
			if (pdp->pd_php != NULL) {
				pollhead_delete(pdp->pd_php, pdp);
				pdp->pd_php = NULL;
				pdp->pd_fp = NULL;
			}
		}
	}
	/*
	 * pollwakeup() may still interact with this pollcache. Wait until
	 * it is done.
	 */
	mutex_enter(&pcp->pc_no_exit);
	ASSERT(pcp->pc_busy >= 0);
	while (pcp->pc_busy > 0)
		cv_wait(&pcp->pc_busy_cv, &pcp->pc_no_exit);
	mutex_exit(&pcp->pc_no_exit);

	/* Clean up any pollcache links created via recursive /dev/poll */
	if (pcp->pc_parents != NULL || pcp->pc_children != NULL) {
		/*
		 * Because of the locking rules for pcachelink manipulation,
		 * acquiring pc_lock is required for this step.
		 */
		mutex_enter(&pcp->pc_lock);
		pcachelink_purge_all(pcp);
		mutex_exit(&pcp->pc_lock);
	}

	pcache_destroy(pcp);
	ASSERT(dpep->dpe_refcnt == 0);
	kmem_free(dpep, sizeof (dp_entry_t));
	return (0);
}
1698 
1699 static void
1700 pcachelink_locked_rele(pcachelink_t *pl)
1701 {
1702 	ASSERT(MUTEX_HELD(&pl->pcl_lock));
1703 	VERIFY(pl->pcl_refcnt >= 1);
1704 
1705 	pl->pcl_refcnt--;
1706 	if (pl->pcl_refcnt == 0) {
1707 		VERIFY(pl->pcl_state == PCL_INVALID);
1708 		ASSERT(pl->pcl_parent_pc == NULL);
1709 		ASSERT(pl->pcl_child_pc == NULL);
1710 		ASSERT(pl->pcl_parent_next == NULL);
1711 		ASSERT(pl->pcl_child_next == NULL);
1712 
1713 		pl->pcl_state = PCL_FREE;
1714 		mutex_destroy(&pl->pcl_lock);
1715 		kmem_free(pl, sizeof (pcachelink_t));
1716 	} else {
1717 		mutex_exit(&pl->pcl_lock);
1718 	}
1719 }
1720 
1721 /*
1722  * Associate parent and child pollcaches via a pcachelink_t.  If an existing
1723  * link (stale or valid) between the two is found, it will be reused.  If a
1724  * suitable link is not found for reuse, a new one will be allocated.
1725  */
1726 static void
1727 pcachelink_assoc(pollcache_t *child, pollcache_t *parent)
1728 {
1729 	pcachelink_t	*pl, **plpn;
1730 
1731 	ASSERT(MUTEX_HELD(&child->pc_lock));
1732 	ASSERT(MUTEX_HELD(&parent->pc_lock));
1733 
1734 	/* Search for an existing link we can reuse. */
1735 	plpn = &child->pc_parents;
1736 	for (pl = child->pc_parents; pl != NULL; pl = *plpn) {
1737 		mutex_enter(&pl->pcl_lock);
1738 		if (pl->pcl_state == PCL_INVALID) {
1739 			/* Clean any invalid links while walking the list */
1740 			*plpn = pl->pcl_parent_next;
1741 			pl->pcl_child_pc = NULL;
1742 			pl->pcl_parent_next = NULL;
1743 			pcachelink_locked_rele(pl);
1744 		} else if (pl->pcl_parent_pc == parent) {
1745 			/* Successfully found parent link */
1746 			ASSERT(pl->pcl_state == PCL_VALID ||
1747 			    pl->pcl_state == PCL_STALE);
1748 			pl->pcl_state = PCL_VALID;
1749 			mutex_exit(&pl->pcl_lock);
1750 			return;
1751 		} else {
1752 			plpn = &pl->pcl_parent_next;
1753 			mutex_exit(&pl->pcl_lock);
1754 		}
1755 	}
1756 
1757 	/* No existing link to the parent was found.  Create a fresh one. */
1758 	pl = kmem_zalloc(sizeof (pcachelink_t), KM_SLEEP);
1759 	mutex_init(&pl->pcl_lock,  NULL, MUTEX_DEFAULT, NULL);
1760 
1761 	pl->pcl_parent_pc = parent;
1762 	pl->pcl_child_next = parent->pc_children;
1763 	parent->pc_children = pl;
1764 	pl->pcl_refcnt++;
1765 
1766 	pl->pcl_child_pc = child;
1767 	pl->pcl_parent_next = child->pc_parents;
1768 	child->pc_parents = pl;
1769 	pl->pcl_refcnt++;
1770 
1771 	pl->pcl_state = PCL_VALID;
1772 }
1773 
1774 /*
1775  * Mark all child links in a pollcache as stale.  Any invalid child links found
1776  * during iteration are purged.
1777  */
1778 static void
1779 pcachelink_mark_stale(pollcache_t *pcp)
1780 {
1781 	pcachelink_t	*pl, **plpn;
1782 
1783 	ASSERT(MUTEX_HELD(&pcp->pc_lock));
1784 
1785 	plpn = &pcp->pc_children;
1786 	for (pl = pcp->pc_children; pl != NULL; pl = *plpn) {
1787 		mutex_enter(&pl->pcl_lock);
1788 		if (pl->pcl_state == PCL_INVALID) {
1789 			/*
1790 			 * Remove any invalid links while we are going to the
1791 			 * trouble of walking the list.
1792 			 */
1793 			*plpn = pl->pcl_child_next;
1794 			pl->pcl_parent_pc = NULL;
1795 			pl->pcl_child_next = NULL;
1796 			pcachelink_locked_rele(pl);
1797 		} else {
1798 			pl->pcl_state = PCL_STALE;
1799 			plpn = &pl->pcl_child_next;
1800 			mutex_exit(&pl->pcl_lock);
1801 		}
1802 	}
1803 }
1804 
1805 /*
1806  * Purge all stale (or invalid) child links from a pollcache.
1807  */
1808 static void
1809 pcachelink_purge_stale(pollcache_t *pcp)
1810 {
1811 	pcachelink_t	*pl, **plpn;
1812 
1813 	ASSERT(MUTEX_HELD(&pcp->pc_lock));
1814 
1815 	plpn = &pcp->pc_children;
1816 	for (pl = pcp->pc_children; pl != NULL; pl = *plpn) {
1817 		mutex_enter(&pl->pcl_lock);
1818 		switch (pl->pcl_state) {
1819 		case PCL_STALE:
1820 			pl->pcl_state = PCL_INVALID;
1821 			/* FALLTHROUGH */
1822 		case PCL_INVALID:
1823 			*plpn = pl->pcl_child_next;
1824 			pl->pcl_parent_pc = NULL;
1825 			pl->pcl_child_next = NULL;
1826 			pcachelink_locked_rele(pl);
1827 			break;
1828 		default:
1829 			plpn = &pl->pcl_child_next;
1830 			mutex_exit(&pl->pcl_lock);
1831 		}
1832 	}
1833 }
1834 
1835 /*
1836  * Purge all child and parent links from a pollcache, regardless of status.
1837  */
1838 static void
1839 pcachelink_purge_all(pollcache_t *pcp)
1840 {
1841 	pcachelink_t	*pl, **plpn;
1842 
1843 	ASSERT(MUTEX_HELD(&pcp->pc_lock));
1844 
1845 	plpn = &pcp->pc_parents;
1846 	for (pl = pcp->pc_parents; pl != NULL; pl = *plpn) {
1847 		mutex_enter(&pl->pcl_lock);
1848 		pl->pcl_state = PCL_INVALID;
1849 		*plpn = pl->pcl_parent_next;
1850 		pl->pcl_child_pc = NULL;
1851 		pl->pcl_parent_next = NULL;
1852 		pcachelink_locked_rele(pl);
1853 	}
1854 
1855 	plpn = &pcp->pc_children;
1856 	for (pl = pcp->pc_children; pl != NULL; pl = *plpn) {
1857 		mutex_enter(&pl->pcl_lock);
1858 		pl->pcl_state = PCL_INVALID;
1859 		*plpn = pl->pcl_child_next;
1860 		pl->pcl_parent_pc = NULL;
1861 		pl->pcl_child_next = NULL;
1862 		pcachelink_locked_rele(pl);
1863 	}
1864 
1865 	ASSERT(pcp->pc_parents == NULL);
1866 	ASSERT(pcp->pc_children == NULL);
1867 }
1868