/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2012 by Delphix. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 */

#include <sys/types.h>
#include <sys/devops.h>
#include <sys/conf.h>
#include <sys/modctl.h>
#include <sys/open.h>
#include <sys/stat.h>
#include <sys/poll_impl.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/mkdev.h>
#include <sys/debug.h>
#include <sys/file.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/bitmap.h>
#include <sys/devpoll.h>
#include <sys/rctl.h>
#include <sys/resource.h>
#include <sys/schedctl.h>
#include <sys/epoll.h>

#define	RESERVED	1

/* local data struct */
static	dp_entry_t	**devpolltbl;	/* dev poll entries */
static	size_t		dptblsize;

static	kmutex_t	devpoll_lock;	/* lock protecting dev tbl */
int			devpoll_init;	/* is /dev/poll initialized already */

/* device local functions */

static int dpopen(dev_t *devp, int flag, int otyp, cred_t *credp);
static int dpwrite(dev_t dev, struct uio *uiop, cred_t *credp);
static int dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
    int *rvalp);
static int dppoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp);
static int dpclose(dev_t dev, int flag, int otyp, cred_t *credp);
static dev_info_t *dpdevi;

static struct cb_ops    dp_cb_ops = {
	dpopen,			/* open */
	dpclose,		/* close */
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	nodev,			/* read */
	dpwrite,		/* write */
	dpioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	dppoll,			/* poll */
	ddi_prop_op,		/* prop_op */
	(struct streamtab *)0,	/* streamtab */
	D_MP,			/* flags */
	CB_REV,			/* cb_ops revision */
	nodev,			/* aread */
	nodev			/* awrite */
};

static int dpattach(dev_info_t *, ddi_attach_cmd_t);
static int dpdetach(dev_info_t *, ddi_detach_cmd_t);
static int dpinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);

static struct dev_ops dp_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* refcnt */
	dpinfo,			/* info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	dpattach,		/* attach */
	dpdetach,		/* detach */
	nodev,			/* reset */
	&dp_cb_ops,		/* driver operations */
	(struct bus_ops *)NULL,	/* bus operations */
	nulldev,		/* power */
	ddi_quiesce_not_needed,	/* quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops,		/* type of module - a driver */
	"/dev/poll driver",
	&dp_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modldrv,
	NULL
};

static void pcachelink_assoc(pollcache_t *, pollcache_t *);
static void pcachelink_mark_stale(pollcache_t *);
static void pcachelink_purge_stale(pollcache_t *);
static void pcachelink_purge_all(pollcache_t *);

/*
 * Locking Design
 *
 * The /dev/poll driver shares most of its code with the poll(2) system call,
 * whose code lives in common/syscall/poll.c.  In the poll(2) design, the
 * pollcache structure is per lwp.  An implicit assumption made there is that
 * some portion of the pollcache will never be touched by other lwps.  E.g.,
 * in the poll(2) design, no lwp will ever need to grow the bitmap of another
 * lwp.  This assumption does not hold for /dev/poll; hence the need for
 * extra locking.
 *
 * To allow more parallelism, each /dev/poll file descriptor (indexed by
 * minor number) has its own lock.  Since read (dpioctl) is a much more
 * frequent operation than write, we want to allow multiple reads on the
 * same /dev/poll fd.  However, we prevent writes from being starved by
 * giving priority to write operations.  Theoretically writes can starve
 * reads as well, but in a practical sense this is not important because
 * (1) writes happen less often than reads, and (2) a write operation defines
 * the contents of the cached fd set.  If writes happen so often that they
 * can starve reads, the cached set is very unstable and it may not make
 * sense to read such an unstable cache set anyway.  Therefore, the
 * writers-starving-readers case is not handled in this design.
 */
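/*
 * For reference, a typical (non-epoll) userland consumer drives this driver
 * roughly as follows.  This is an illustrative sketch only (see poll(7d));
 * dpfd, sock_fd and MAXEVENTS are stand-ins for application state:
 *
 *	int dpfd = open("/dev/poll", O_RDWR);
 *	pollfd_t pfd = { .fd = sock_fd, .events = POLLIN };
 *	(void) write(dpfd, &pfd, sizeof (pfd));		<- cache the fd once
 *
 *	struct dvpoll dvp;
 *	pollfd_t results[MAXEVENTS];
 *	dvp.dp_fds = results;
 *	dvp.dp_nfds = MAXEVENTS;
 *	dvp.dp_timeout = -1;				<- block indefinitely
 *	int nready = ioctl(dpfd, DP_POLL, &dvp);	<- poll repeatedly
 *
 * This register-once/poll-many pattern is what makes reads (dpioctl) so much
 * more frequent than writes (dpwrite) in the locking discussion above.
 */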
int
_init()
{
	int	error;

	dptblsize = DEVPOLLSIZE;
	devpolltbl = kmem_zalloc(sizeof (caddr_t) * dptblsize, KM_SLEEP);
	mutex_init(&devpoll_lock, NULL, MUTEX_DEFAULT, NULL);
	devpoll_init = 1;
	if ((error = mod_install(&modlinkage)) != 0) {
		kmem_free(devpolltbl, sizeof (caddr_t) * dptblsize);
		devpoll_init = 0;
	}
	return (error);
}

int
_fini()
{
	int error;

	if ((error = mod_remove(&modlinkage)) != 0) {
		return (error);
	}
	mutex_destroy(&devpoll_lock);
	kmem_free(devpolltbl, sizeof (caddr_t) * dptblsize);
	return (0);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

/*ARGSUSED*/
static int
dpattach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	if (ddi_create_minor_node(devi, "poll", S_IFCHR, 0, DDI_PSEUDO, 0)
	    == DDI_FAILURE) {
		ddi_remove_minor_node(devi, NULL);
		return (DDI_FAILURE);
	}
	dpdevi = devi;
	return (DDI_SUCCESS);
}

static int
dpdetach(dev_info_t *devi, ddi_detach_cmd_t cmd)
{
	if (cmd != DDI_DETACH)
		return (DDI_FAILURE);

	ddi_remove_minor_node(devi, NULL);
	return (DDI_SUCCESS);
}

/* ARGSUSED */
static int
dpinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	int error;

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)dpdevi;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
	}
	return (error);
}

/*
 * dp_pcache_poll has similar logic to pcache_poll() in poll.c.  The major
 * differences are: (1) /dev/poll requires scanning the bitmap starting at
 * where it was stopped last time, instead of always starting from 0, and
 * (2) since the user may not have cleaned up the cached fds when they were
 * closed, some polldats in the cache may refer to closed or reused fds.  We
 * need to check for those cases.
 *
 * NOTE: Upon closing an fd, automatic poll cache cleanup is done for
 *	 poll(2) caches but NOT for /dev/poll caches.  So expect some
 *	 stale entries!
 */
static int
dp_pcache_poll(dp_entry_t *dpep, void *dpbuf, pollcache_t *pcp, nfds_t nfds,
    int *fdcntp)
{
	int		start, ostart, end, fdcnt, error = 0;
	boolean_t	done, no_wrap;
	pollfd_t	*pfdp;
	epoll_event_t	*epoll;
	const short	mask = POLLRDHUP | POLLWRBAND;
	const boolean_t	is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0;

	ASSERT(MUTEX_HELD(&pcp->pc_lock));
	if (pcp->pc_bitmap == NULL) {
		/*
		 * No need to search because no poll fd
		 * has been cached.
*/ return (0); } if (is_epoll) { pfdp = NULL; epoll = (epoll_event_t *)dpbuf; } else { pfdp = (pollfd_t *)dpbuf; epoll = NULL; } retry: start = ostart = pcp->pc_mapstart; end = pcp->pc_mapend; if (start == 0) { /* * started from every begining, no need to wrap around. */ no_wrap = B_TRUE; } else { no_wrap = B_FALSE; } done = B_FALSE; fdcnt = 0; while ((fdcnt < nfds) && !done) { pollhead_t *php = NULL; short revent = 0; uf_entry_gen_t gen; int fd; /* * Examine the bit map in a circular fashion * to avoid starvation. Always resume from * last stop. Scan till end of the map. Then * wrap around. */ fd = bt_getlowbit(pcp->pc_bitmap, start, end); ASSERT(fd <= end); if (fd >= 0) { file_t *fp; polldat_t *pdp; if (fd == end) { if (no_wrap) { done = B_TRUE; } else { start = 0; end = ostart - 1; no_wrap = B_TRUE; } } else { start = fd + 1; } pdp = pcache_lookup_fd(pcp, fd); repoll: ASSERT(pdp != NULL); ASSERT(pdp->pd_fd == fd); if (pdp->pd_fp == NULL) { /* * The fd is POLLREMOVed. This fd is * logically no longer cached. So move * on to the next one. */ continue; } if ((fp = getf_gen(fd, &gen)) == NULL) { if (is_epoll) { /* * In the epoll compatibility case, we * actually perform the implicit * removal to remain closer to the * epoll semantics. */ pdp->pd_fp = NULL; pdp->pd_events = 0; if (pdp->pd_php != NULL) { pollhead_delete(pdp->pd_php, pdp); pdp->pd_php = NULL; } BT_CLEAR(pcp->pc_bitmap, fd); } else if (pfdp != NULL) { /* * The fd has been closed, but user has * not done a POLLREMOVE on this fd * yet. Instead of cleaning it here * implicitly, we return POLLNVAL. This * is consistent with poll(2) polling a * closed fd. Hope this will remind * user to do a POLLREMOVE. */ pfdp[fdcnt].fd = fd; pfdp[fdcnt].revents = POLLNVAL; fdcnt++; } continue; } /* * Detect a change to the resource underlying a cached * file descriptor. While the fd generation comparison * will catch nearly all cases, the file_t comparison * is maintained as a failsafe as well. */ if (gen != pdp->pd_gen || fp != pdp->pd_fp) { /* * The user is polling on a cached fd which was * closed and then reused. Unfortunately there * is no good way to communicate this fact to * the consumer. * * When this situation has been detected, it's * likely that any existing pollhead is * ill-suited to perform proper wake-ups. * * Clean up the old entry under the expectation * that a valid one will be provided as part of * the later VOP_POLL. */ if (pdp->pd_php != NULL) { pollhead_delete(pdp->pd_php, pdp); pdp->pd_php = NULL; } /* * Since epoll is expected to act on the * underlying 'struct file' (in Linux terms, * our vnode_t would be a closer analog) rather * than the fd itself, an implicit remove * is necessary under these circumstances to * suppress any results (or errors) from the * new resource occupying the fd. */ if (is_epoll) { pdp->pd_fp = NULL; pdp->pd_events = 0; BT_CLEAR(pcp->pc_bitmap, fd); releasef(fd); continue; } else { /* * Regular /dev/poll is unbothered * about the fd reassignment. */ pdp->pd_fp = fp; pdp->pd_gen = gen; } } /* * Skip entries marked with the sentinal value for * having already fired under oneshot conditions. */ if (pdp->pd_events == POLLONESHOT) { releasef(fd); BT_CLEAR(pcp->pc_bitmap, fd); continue; } /* * XXX - pollrelock() logic needs to know which * which pollcache lock to grab. It'd be a * cleaner solution if we could pass pcp as * an arguement in VOP_POLL interface instead * of implicitly passing it using thread_t * struct. 
On the other hand, changing VOP_POLL * interface will require all driver/file system * poll routine to change. May want to revisit * the tradeoff later. */ curthread->t_pollcache = pcp; error = VOP_POLL(fp->f_vnode, pdp->pd_events, 0, &revent, &php, NULL); /* * Recheck edge-triggered descriptors which lack a * pollhead. While this check is performed when an fd * is added to the pollcache in dpwrite(), subsequent * descriptor manipulation could cause a different * resource to be present now. */ if ((pdp->pd_events & POLLET) && error == 0 && pdp->pd_php == NULL && php == NULL && revent != 0) { short levent = 0; /* * The same POLLET-only VOP_POLL is used in an * attempt to coax a pollhead from older * driver logic. */ error = VOP_POLL(fp->f_vnode, POLLET, 0, &levent, &php, NULL); } curthread->t_pollcache = NULL; releasef(fd); if (error != 0) { break; } /* * layered devices (e.g. console driver) * may change the vnode and thus the pollhead * pointer out from underneath us. */ if (php != NULL && pdp->pd_php != NULL && php != pdp->pd_php) { pollhead_delete(pdp->pd_php, pdp); pdp->pd_php = php; pollhead_insert(php, pdp); /* * The bit should still be set. */ ASSERT(BT_TEST(pcp->pc_bitmap, fd)); goto retry; } if (revent != 0) { if (pfdp != NULL) { pfdp[fdcnt].fd = fd; pfdp[fdcnt].events = pdp->pd_events; pfdp[fdcnt].revents = revent; } else if (epoll != NULL) { epoll_event_t *ep = &epoll[fdcnt]; ASSERT(epoll != NULL); ep->data.u64 = pdp->pd_epolldata; /* * Since POLLNVAL is a legal event for * VOP_POLL handlers to emit, it must * be translated epoll-legal. */ if (revent & POLLNVAL) { revent &= ~POLLNVAL; revent |= POLLERR; } /* * If any of the event bits are set for * which poll and epoll representations * differ, swizzle in the native epoll * values. */ if (revent & mask) { ep->events = (revent & ~mask) | ((revent & POLLRDHUP) ? EPOLLRDHUP : 0) | ((revent & POLLWRBAND) ? EPOLLWRBAND : 0); } else { ep->events = revent; } /* * We define POLLWRNORM to be POLLOUT, * but epoll has separate definitions * for them; if POLLOUT is set and the * user has asked for EPOLLWRNORM, set * that as well. */ if ((revent & POLLOUT) && (pdp->pd_events & EPOLLWRNORM)) { ep->events |= EPOLLWRNORM; } } else { pollstate_t *ps = curthread->t_pollstate; /* * The devpoll handle itself is being * polled. Notify the caller of any * readable event(s), leaving as much * state as possible untouched. */ VERIFY(fdcnt == 0); VERIFY(ps != NULL); /* * If a call to pollunlock() fails * during VOP_POLL, skip over the fd * and continue polling. * * Otherwise, report that there is an * event pending. */ if ((ps->ps_flags & POLLSTATE_ULFAIL) != 0) { ps->ps_flags &= ~POLLSTATE_ULFAIL; continue; } else { fdcnt++; break; } } /* Handle special polling modes. */ if (pdp->pd_events & POLLONESHOT) { /* * Entries operating under POLLONESHOT * will be marked with a sentinel value * to indicate that they have "fired" * when emitting an event. This will * disable them from polling until a * later add/modify event rearms them. */ pdp->pd_events = POLLONESHOT; if (pdp->pd_php != NULL) { pollhead_delete(pdp->pd_php, pdp); pdp->pd_php = NULL; } BT_CLEAR(pcp->pc_bitmap, fd); } else if (pdp->pd_events & POLLET) { /* * Wire up the pollhead which should * have been provided. Edge-triggered * polling cannot function properly * with drivers which do not emit one. 
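 *
 * (For context, edge-triggered interest typically reaches this code
 * because a consumer armed it via epoll_ctl(3C), as in the sketch
 * below, where epfd and fd are illustrative only:
 *
 *	struct epoll_event ev;
 *	ev.events = EPOLLIN | EPOLLET;
 *	ev.data.fd = fd;
 *	(void) epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);
 *
 * The resulting dpwrite() caches POLLET among the events of interest.)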
*/ if (php != NULL && pdp->pd_php == NULL) { pollhead_insert(php, pdp); pdp->pd_php = php; } /* * If the driver has emitted a pollhead, * clear the bit in the bitmap which * effectively latches the edge on a * pollwakeup() from the driver. */ if (pdp->pd_php != NULL) { BT_CLEAR(pcp->pc_bitmap, fd); } } fdcnt++; } else if (php != NULL) { /* * We clear a bit or cache a poll fd if * the driver returns a poll head ptr, * which is expected in the case of 0 * revents. Some buggy driver may return * NULL php pointer with 0 revents. In * this case, we just treat the driver as * "noncachable" and not clearing the bit * in bitmap. */ if ((pdp->pd_php != NULL) && ((pcp->pc_flag & PC_POLLWAKE) == 0)) { BT_CLEAR(pcp->pc_bitmap, fd); } if (pdp->pd_php == NULL) { pollhead_insert(php, pdp); pdp->pd_php = php; /* * An event of interest may have * arrived between the VOP_POLL() and * the pollhead_insert(); check again. */ goto repoll; } } } else { /* * No bit set in the range. Check for wrap around. */ if (!no_wrap) { start = 0; end = ostart - 1; no_wrap = B_TRUE; } else { done = B_TRUE; } } } if (!done) { pcp->pc_mapstart = start; } ASSERT(*fdcntp == 0); *fdcntp = fdcnt; return (error); } /*ARGSUSED*/ static int dpopen(dev_t *devp, int flag, int otyp, cred_t *credp) { minor_t minordev; dp_entry_t *dpep; pollcache_t *pcp; ASSERT(devpoll_init); ASSERT(dptblsize <= MAXMIN); mutex_enter(&devpoll_lock); for (minordev = 0; minordev < dptblsize; minordev++) { if (devpolltbl[minordev] == NULL) { devpolltbl[minordev] = (dp_entry_t *)RESERVED; break; } } if (minordev == dptblsize) { dp_entry_t **newtbl; size_t oldsize; /* * Used up every entry in the existing devpoll table. * Grow the table by DEVPOLLSIZE. */ if ((oldsize = dptblsize) >= MAXMIN) { mutex_exit(&devpoll_lock); return (ENXIO); } dptblsize += DEVPOLLSIZE; if (dptblsize > MAXMIN) { dptblsize = MAXMIN; } newtbl = kmem_zalloc(sizeof (caddr_t) * dptblsize, KM_SLEEP); bcopy(devpolltbl, newtbl, sizeof (caddr_t) * oldsize); kmem_free(devpolltbl, sizeof (caddr_t) * oldsize); devpolltbl = newtbl; devpolltbl[minordev] = (dp_entry_t *)RESERVED; } mutex_exit(&devpoll_lock); dpep = kmem_zalloc(sizeof (dp_entry_t), KM_SLEEP); /* * allocate a pollcache skeleton here. Delay allocating bitmap * structures until dpwrite() time, since we don't know the * optimal size yet. We also delay setting the pid until either * dpwrite() or attempt to poll on the instance, allowing parents * to create instances of /dev/poll for their children. (In the * epoll compatibility case, this check isn't performed to maintain * semantic compatibility.) */ pcp = pcache_alloc(); dpep->dpe_pcache = pcp; pcp->pc_pid = -1; *devp = makedevice(getmajor(*devp), minordev); /* clone the driver */ mutex_enter(&devpoll_lock); ASSERT(minordev < dptblsize); ASSERT(devpolltbl[minordev] == (dp_entry_t *)RESERVED); devpolltbl[minordev] = dpep; mutex_exit(&devpoll_lock); return (0); } /* * Write to dev/poll add/remove fd's to/from a cached poll fd set, * or change poll events for a watched fd. 
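 *
 * For reference, a plain /dev/poll consumer drives this entry point with
 * ordinary write(2) calls; a sketch (dpfd and fd are illustrative only):
 *
 *	pollfd_t pfd;
 *	pfd.fd = fd;
 *	pfd.events = POLLIN | POLLOUT;	<- OR-ed into any cached entry
 *	(void) write(dpfd, &pfd, sizeof (pfd));
 *	pfd.events = POLLREMOVE;	<- drop the fd from the cached set
 *	(void) write(dpfd, &pfd, sizeof (pfd));
 *
 * In epoll compatibility mode, epoll_ctl(3C) reaches the same path by
 * writing dvpoll_epollfd_t entries instead.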
*/ /*ARGSUSED*/ static int dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) { minor_t minor; dp_entry_t *dpep; pollcache_t *pcp; pollfd_t *pollfdp, *pfdp; dvpoll_epollfd_t *epfdp; uintptr_t limit; int error; uint_t size; size_t copysize, uiosize; nfds_t pollfdnum; boolean_t is_epoll, fds_added = B_FALSE; minor = getminor(dev); mutex_enter(&devpoll_lock); ASSERT(minor < dptblsize); dpep = devpolltbl[minor]; ASSERT(dpep != NULL); mutex_exit(&devpoll_lock); mutex_enter(&dpep->dpe_lock); pcp = dpep->dpe_pcache; is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0; size = (is_epoll) ? sizeof (dvpoll_epollfd_t) : sizeof (pollfd_t); mutex_exit(&dpep->dpe_lock); if (!is_epoll && curproc->p_pid != pcp->pc_pid) { if (pcp->pc_pid != -1) { return (EACCES); } pcp->pc_pid = curproc->p_pid; } if (uiop->uio_resid < 0) { /* No one else is this careful, but maybe they should be. */ return (EINVAL); } uiosize = (size_t)uiop->uio_resid; pollfdnum = uiosize / size; /* * For epoll-enabled handles, restrict the allowed write size to 2. * This corresponds to an epoll_ctl(3C) performing an EPOLL_CTL_MOD * operation which is expanded into two operations (DEL and ADD). * * All other operations performed through epoll_ctl(3C) will consist of * a single entry. */ if (is_epoll && pollfdnum > 2) { return (EINVAL); } /* * We want to make sure that pollfdnum isn't large enough to DoS us, * but we also don't want to grab p_lock unnecessarily -- so we * perform the full check against our resource limits if and only if * pollfdnum is larger than the known-to-be-sane value of UINT8_MAX. */ if (pollfdnum > UINT8_MAX) { mutex_enter(&curproc->p_lock); if (pollfdnum > (uint_t)rctl_enforced_value(rctlproc_legacy[RLIMIT_NOFILE], curproc->p_rctls, curproc)) { (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE], curproc->p_rctls, curproc, RCA_SAFE); mutex_exit(&curproc->p_lock); return (EINVAL); } mutex_exit(&curproc->p_lock); } /* * Copy in the pollfd array. Walk through the array and add * each polled fd to the cached set. */ pollfdp = kmem_alloc(uiosize, KM_SLEEP); limit = (uintptr_t)pollfdp + (pollfdnum * size); /* * Although /dev/poll uses the write(2) interface to cache fds, it's * not supposed to function as a seekable device. To prevent offset * from growing and eventually exceed the maximum, reset the offset * here for every call. */ uiop->uio_loffset = 0; /* * Use uiocopy instead of uiomove when populating pollfdp, keeping * uio_resid untouched for now. Write syscalls will translate EINTR * into a success if they detect "successfully transfered" data via an * updated uio_resid. Falsely suppressing such errors is disastrous. */ if ((error = uiocopy((caddr_t)pollfdp, uiosize, UIO_WRITE, uiop, ©size)) != 0) { kmem_free(pollfdp, uiosize); return (error); } /* * We are about to enter the core portion of dpwrite(). Make sure this * write has exclusive access in this portion of the code, i.e., no * other writers in this code. * * Waiting for all readers to drop their references to the dpe is * unecessary since the pollcache itself is protected by pc_lock. */ mutex_enter(&dpep->dpe_lock); dpep->dpe_writerwait++; while ((dpep->dpe_flag & DP_WRITER_PRESENT) != 0) { ASSERT(dpep->dpe_refcnt != 0); /* * The epoll API does not allow EINTR as a result when making * modifications to the set of polled fds. Given that write * activity is relatively quick and the size of accepted writes * is limited above to two entries, a signal-ignorant wait is * used here to avoid the EINTR. 
*/ if (is_epoll) { cv_wait(&dpep->dpe_cv, &dpep->dpe_lock); continue; } /* * Non-epoll writers to /dev/poll handles can tolerate EINTR. */ if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) { dpep->dpe_writerwait--; mutex_exit(&dpep->dpe_lock); kmem_free(pollfdp, uiosize); return (EINTR); } } dpep->dpe_writerwait--; dpep->dpe_flag |= DP_WRITER_PRESENT; dpep->dpe_refcnt++; if (!is_epoll && (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0) { /* * The epoll compat mode was enabled while we were waiting to * establish write access. It is not safe to continue since * state was prepared for non-epoll operation. */ error = EBUSY; goto bypass; } mutex_exit(&dpep->dpe_lock); /* * Since the dpwrite() may recursively walk an added /dev/poll handle, * pollstate_enter() deadlock and loop detection must be used. */ (void) pollstate_create(); VERIFY(pollstate_enter(pcp) == PSE_SUCCESS); if (pcp->pc_bitmap == NULL) { pcache_create(pcp, pollfdnum); } for (pfdp = pollfdp; (uintptr_t)pfdp < limit; pfdp = (pollfd_t *)((uintptr_t)pfdp + size)) { int fd = pfdp->fd; polldat_t *pdp; if ((uint_t)fd >= P_FINFO(curproc)->fi_nfiles) { /* * epoll semantics demand that we return EBADF if our * specified fd is invalid. */ if (is_epoll) { error = EBADF; break; } continue; } pdp = pcache_lookup_fd(pcp, fd); if (pfdp->events != POLLREMOVE) { uf_entry_gen_t gen; file_t *fp = NULL; struct pollhead *php = NULL; /* * If we're in epoll compatibility mode, check that the * fd is valid before allocating anything for it; epoll * semantics demand that we return EBADF if our * specified fd is invalid. */ if (is_epoll) { if ((fp = getf_gen(fd, &gen)) == NULL) { error = EBADF; break; } } if (pdp == NULL) { pdp = pcache_alloc_fd(0); pdp->pd_fd = fd; pdp->pd_pcache = pcp; pcache_insert_fd(pcp, pdp, pollfdnum); } if (is_epoll) { /* * If the fd is already a member of the epoll * set, error emission is needed only when the * fd assignment generation matches the one * recorded in the polldat_t. Absence of such * a generation match indicates that a new * resource has been assigned at that fd. * * Caveat: It is possible to force a generation * update while keeping the same backing * resource. This is possible via dup2, but * does not represent real-world use cases, * making the lack of error acceptable. */ if (pdp->pd_fp != NULL && pdp->pd_gen == gen) { error = EEXIST; releasef(fd); break; } /* * We have decided that the cached information * was stale. Reset pd_events to assure that * we don't mistakenly operate on cached event * disposition. This configures the implicit * subscription to HUP and ERR events which * epoll features. */ pdp->pd_events = POLLERR|POLLHUP; epfdp = (dvpoll_epollfd_t *)pfdp; pdp->pd_epolldata = epfdp->dpep_data; } ASSERT(pdp->pd_fd == fd); ASSERT(pdp->pd_pcache == pcp); if (fd >= pcp->pc_mapsize) { mutex_exit(&pcp->pc_lock); pcache_grow_map(pcp, fd); mutex_enter(&pcp->pc_lock); } if (fd > pcp->pc_mapend) { pcp->pc_mapend = fd; } if (!is_epoll) { ASSERT(fp == NULL); if ((fp = getf_gen(fd, &gen)) == NULL) { /* * The fd is not valid. Since we can't * pass this error back in the write() * call, set the bit in bitmap to force * DP_POLL ioctl to examine it. */ BT_SET(pcp->pc_bitmap, fd); pdp->pd_events |= pfdp->events; continue; } /* * Don't do VOP_POLL for an already cached fd * with same poll events. */ if ((pdp->pd_events == pfdp->events) && (pdp->pd_fp == fp)) { /* * the events are already cached */ releasef(fd); continue; } } /* * do VOP_POLL and cache this poll fd. 
*/ /* * XXX - pollrelock() logic needs to know which * which pollcache lock to grab. It'd be a * cleaner solution if we could pass pcp as * an arguement in VOP_POLL interface instead * of implicitly passing it using thread_t * struct. On the other hand, changing VOP_POLL * interface will require all driver/file system * poll routine to change. May want to revisit * the tradeoff later. */ curthread->t_pollcache = pcp; error = VOP_POLL(fp->f_vnode, pfdp->events, 0, &pfdp->revents, &php, NULL); /* * Edge-triggered polling requires a pollhead in order * to initiate wake-ups properly. Drivers which are * savvy to POLLET presence, which should include * everything in-gate, will always emit one, regardless * of revent status. Older drivers which only emit a * pollhead if 'revents == 0' are given a second chance * here via a second VOP_POLL, with only POLLET set in * the events of interest. These circumstances should * induce any cacheable drivers to emit a pollhead for * wake-ups. * * Drivers which never emit a pollhead will simply * disobey the expectation of edge-triggered behavior. * This includes recursive epoll which, even on Linux, * yields its events in a level-triggered fashion only. */ if ((pfdp->events & POLLET) != 0 && error == 0 && php == NULL) { short levent = 0; error = VOP_POLL(fp->f_vnode, POLLET, 0, &levent, &php, NULL); } curthread->t_pollcache = NULL; /* * We always set the bit when this fd is cached; * this forces the first DP_POLL to poll this fd. * Real performance gain comes from subsequent * DP_POLL. We also attempt a pollhead_insert(); * if it's not possible, we'll do it in dpioctl(). */ BT_SET(pcp->pc_bitmap, fd); if (error != 0) { releasef(fd); break; } pdp->pd_fp = fp; pdp->pd_gen = gen; pdp->pd_events |= pfdp->events; if (php != NULL) { if (pdp->pd_php == NULL) { pollhead_insert(php, pdp); pdp->pd_php = php; } else { if (pdp->pd_php != php) { pollhead_delete(pdp->pd_php, pdp); pollhead_insert(php, pdp); pdp->pd_php = php; } } } fds_added = B_TRUE; releasef(fd); } else { if (pdp == NULL || pdp->pd_fp == NULL) { if (is_epoll) { /* * As with the add case (above), epoll * semantics demand that we error out * in this case. */ error = ENOENT; break; } continue; } ASSERT(pdp->pd_fd == fd); pdp->pd_fp = NULL; pdp->pd_events = 0; ASSERT(pdp->pd_thread == NULL); if (pdp->pd_php != NULL) { pollhead_delete(pdp->pd_php, pdp); pdp->pd_php = NULL; } BT_CLEAR(pcp->pc_bitmap, fd); } } /* * Wake any pollcache waiters so they can check the new descriptors. * * Any fds added to an recursive-capable pollcache could themselves be * /dev/poll handles. To ensure that proper event propagation occurs, * parent pollcaches are woken too, so that they can create any needed * pollcache links. */ if (fds_added) { cv_broadcast(&pcp->pc_cv); pcache_wake_parents(pcp); } pollstate_exit(pcp); mutex_enter(&dpep->dpe_lock); bypass: dpep->dpe_flag &= ~DP_WRITER_PRESENT; dpep->dpe_refcnt--; cv_broadcast(&dpep->dpe_cv); mutex_exit(&dpep->dpe_lock); kmem_free(pollfdp, uiosize); if (error == 0) { /* * The state of uio_resid is updated only after the pollcache * is successfully modified. 
*/ uioskip(uiop, copysize); } return (error); } #define DP_SIGMASK_RESTORE(ksetp) { \ if (ksetp != NULL) { \ mutex_enter(&p->p_lock); \ if (lwp->lwp_cursig == 0) { \ t->t_hold = lwp->lwp_sigoldmask; \ t->t_flag &= ~T_TOMASK; \ } \ mutex_exit(&p->p_lock); \ } \ } /*ARGSUSED*/ static int dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) { minor_t minor; dp_entry_t *dpep; pollcache_t *pcp; hrtime_t now; int error = 0; boolean_t is_epoll; STRUCT_DECL(dvpoll, dvpoll); if (cmd == DP_POLL || cmd == DP_PPOLL) { /* do this now, before we sleep on DP_WRITER_PRESENT */ now = gethrtime(); } minor = getminor(dev); mutex_enter(&devpoll_lock); ASSERT(minor < dptblsize); dpep = devpolltbl[minor]; mutex_exit(&devpoll_lock); ASSERT(dpep != NULL); pcp = dpep->dpe_pcache; mutex_enter(&dpep->dpe_lock); is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0; if (cmd == DP_EPOLLCOMPAT) { if (dpep->dpe_refcnt != 0) { /* * We can't turn on epoll compatibility while there * are outstanding operations. */ mutex_exit(&dpep->dpe_lock); return (EBUSY); } /* * epoll compatibility is a one-way street: there's no way * to turn it off for a particular open. */ dpep->dpe_flag |= DP_ISEPOLLCOMPAT; /* Record the epoll-enabled nature in the pollcache too */ mutex_enter(&pcp->pc_lock); pcp->pc_flag |= PC_EPOLL; mutex_exit(&pcp->pc_lock); mutex_exit(&dpep->dpe_lock); return (0); } if (!is_epoll && curproc->p_pid != pcp->pc_pid) { if (pcp->pc_pid != -1) { mutex_exit(&dpep->dpe_lock); return (EACCES); } pcp->pc_pid = curproc->p_pid; } /* Wait until all writers have cleared the handle before continuing */ while ((dpep->dpe_flag & DP_WRITER_PRESENT) != 0 || (dpep->dpe_writerwait != 0)) { if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) { mutex_exit(&dpep->dpe_lock); return (EINTR); } } dpep->dpe_refcnt++; mutex_exit(&dpep->dpe_lock); switch (cmd) { case DP_POLL: case DP_PPOLL: { pollstate_t *ps; nfds_t nfds; int fdcnt = 0; size_t size, fdsize, dpsize; hrtime_t deadline = 0; k_sigset_t *ksetp = NULL; k_sigset_t kset; sigset_t set; kthread_t *t = curthread; klwp_t *lwp = ttolwp(t); struct proc *p = ttoproc(curthread); STRUCT_INIT(dvpoll, mode); /* * The dp_setp member is only required/consumed for DP_PPOLL, * which otherwise uses the same structure as DP_POLL. */ if (cmd == DP_POLL) { dpsize = (uintptr_t)STRUCT_FADDR(dvpoll, dp_setp) - (uintptr_t)STRUCT_FADDR(dvpoll, dp_fds); } else { ASSERT(cmd == DP_PPOLL); dpsize = STRUCT_SIZE(dvpoll); } if ((mode & FKIOCTL) != 0) { /* Kernel-internal ioctl call */ bcopy((caddr_t)arg, STRUCT_BUF(dvpoll), dpsize); error = 0; } else { error = copyin((caddr_t)arg, STRUCT_BUF(dvpoll), dpsize); } if (error) { DP_REFRELE(dpep); return (EFAULT); } deadline = STRUCT_FGET(dvpoll, dp_timeout); if (deadline > 0) { /* * Convert the deadline from relative milliseconds * to absolute nanoseconds. They must wait for at * least a tick. */ deadline = MSEC2NSEC(deadline); deadline = MAX(deadline, nsec_per_tick); deadline += now; } if (cmd == DP_PPOLL) { void *setp = STRUCT_FGETP(dvpoll, dp_setp); if (setp != NULL) { if ((mode & FKIOCTL) != 0) { /* Use the signal set directly */ ksetp = (k_sigset_t *)setp; } else { if (copyin(setp, &set, sizeof (set))) { DP_REFRELE(dpep); return (EFAULT); } sigutok(&set, &kset); ksetp = &kset; } mutex_enter(&p->p_lock); schedctl_finish_sigblock(t); lwp->lwp_sigoldmask = t->t_hold; t->t_hold = *ksetp; t->t_flag |= T_TOMASK; /* * Like ppoll() with a non-NULL sigset, we'll * call cv_reltimedwait_sig() just to check for * signals. 
This call will return immediately * with either 0 (signalled) or -1 (no signal). * There are some conditions whereby we can * get 0 from cv_reltimedwait_sig() without * a true signal (e.g., a directed stop), so * we restore our signal mask in the unlikely * event that lwp_cursig is 0. */ if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0, TR_CLOCK_TICK)) { if (lwp->lwp_cursig == 0) { t->t_hold = lwp->lwp_sigoldmask; t->t_flag &= ~T_TOMASK; } mutex_exit(&p->p_lock); DP_REFRELE(dpep); return (EINTR); } mutex_exit(&p->p_lock); } } if ((nfds = STRUCT_FGET(dvpoll, dp_nfds)) == 0) { /* * We are just using DP_POLL to sleep, so * we don't any of the devpoll apparatus. * Do not check for signals if we have a zero timeout. */ DP_REFRELE(dpep); if (deadline == 0) { DP_SIGMASK_RESTORE(ksetp); return (0); } mutex_enter(&curthread->t_delay_lock); while ((error = cv_timedwait_sig_hrtime(&curthread->t_delay_cv, &curthread->t_delay_lock, deadline)) > 0) continue; mutex_exit(&curthread->t_delay_lock); DP_SIGMASK_RESTORE(ksetp); return (error == 0 ? EINTR : 0); } if (is_epoll) { size = nfds * (fdsize = sizeof (epoll_event_t)); } else { size = nfds * (fdsize = sizeof (pollfd_t)); } /* * XXX It would be nice not to have to alloc each time, but it * requires another per thread structure hook. This can be * implemented later if data suggests that it's necessary. */ ps = pollstate_create(); if (ps->ps_dpbufsize < size) { /* * If nfds is larger than twice the current maximum * open file count, we'll silently clamp it. This * only limits our exposure to allocating an * inordinate amount of kernel memory; it doesn't * otherwise affect the semantics. (We have this * check at twice the maximum instead of merely the * maximum because some applications pass an nfds that * is only slightly larger than their limit.) */ mutex_enter(&p->p_lock); if ((nfds >> 1) > p->p_fno_ctl) { nfds = p->p_fno_ctl; size = nfds * fdsize; } mutex_exit(&p->p_lock); if (ps->ps_dpbufsize < size) { kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize); ps->ps_dpbuf = kmem_zalloc(size, KM_SLEEP); ps->ps_dpbufsize = size; } } VERIFY(pollstate_enter(pcp) == PSE_SUCCESS); for (;;) { pcp->pc_flag &= ~PC_POLLWAKE; /* * Mark all child pcachelinks as stale. * Those which are still part of the tree will be * marked as valid during the poll. */ pcachelink_mark_stale(pcp); error = dp_pcache_poll(dpep, ps->ps_dpbuf, pcp, nfds, &fdcnt); if (fdcnt > 0 || error != 0) break; /* Purge still-stale child pcachelinks */ pcachelink_purge_stale(pcp); /* * A pollwake has happened since we polled cache. */ if (pcp->pc_flag & PC_POLLWAKE) continue; /* * Sleep until we are notified, signaled, or timed out. */ if (deadline == 0) { /* immediate timeout; do not check signals */ break; } error = cv_timedwait_sig_hrtime(&pcp->pc_cv, &pcp->pc_lock, deadline); /* * If we were awakened by a signal or timeout then * break the loop, else poll again. */ if (error <= 0) { error = (error == 0) ? EINTR : 0; break; } else { error = 0; } } pollstate_exit(pcp); DP_SIGMASK_RESTORE(ksetp); if (error == 0 && fdcnt > 0) { /* * It should be noted that FKIOCTL does not influence * the copyout (vs bcopy) of dp_fds at this time. 
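 *
 * (The buffer copied out here is what a plain /dev/poll
 * consumer reads back through dvpoll.dp_fds after a
 * DP_POLL ioctl, as in this illustrative sketch where
 * dpfd, dvp and handle() are stand-ins:
 *
 *	int n = ioctl(dpfd, DP_POLL, &dvp);
 *	for (int i = 0; i < n; i++)
 *		handle(dvp.dp_fds[i].fd, dvp.dp_fds[i].revents);
 *
 * An epoll-compat consumer receives epoll_event_t entries
 * instead, typically via epoll_wait(3C).)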
*/ if (copyout(ps->ps_dpbuf, STRUCT_FGETP(dvpoll, dp_fds), fdcnt * fdsize)) { DP_REFRELE(dpep); return (EFAULT); } *rvalp = fdcnt; } break; } case DP_ISPOLLED: { pollfd_t pollfd; polldat_t *pdp; STRUCT_INIT(dvpoll, mode); error = copyin((caddr_t)arg, &pollfd, sizeof (pollfd_t)); if (error) { DP_REFRELE(dpep); return (EFAULT); } mutex_enter(&pcp->pc_lock); if (pcp->pc_hash == NULL) { /* * No Need to search because no poll fd * has been cached. */ mutex_exit(&pcp->pc_lock); DP_REFRELE(dpep); return (0); } if (pollfd.fd < 0) { mutex_exit(&pcp->pc_lock); break; } pdp = pcache_lookup_fd(pcp, pollfd.fd); if ((pdp != NULL) && (pdp->pd_fd == pollfd.fd) && (pdp->pd_fp != NULL)) { pollfd.revents = pdp->pd_events; if (copyout(&pollfd, (caddr_t)arg, sizeof (pollfd_t))) { mutex_exit(&pcp->pc_lock); DP_REFRELE(dpep); return (EFAULT); } *rvalp = 1; } mutex_exit(&pcp->pc_lock); break; } default: DP_REFRELE(dpep); return (EINVAL); } DP_REFRELE(dpep); return (error); } /* * Overview of Recursive Polling * * It is possible for /dev/poll to poll for events on file descriptors which * themselves are /dev/poll handles. Pending events in the child handle are * represented as readable data via the POLLIN flag. To limit surface area, * this recursion is presently allowed on only /dev/poll handles which have * been placed in epoll mode via the DP_EPOLLCOMPAT ioctl. Recursion depth is * limited to 5 in order to be consistent with Linux epoll. * * Extending dppoll() for VOP_POLL: * * The recursive /dev/poll implementation begins by extending dppoll() to * report when resources contained in the pollcache have relevant event state. * At the highest level, it means calling dp_pcache_poll() so it indicates if * fd events are present without consuming them or altering the pollcache * bitmap. This ensures that a subsequent DP_POLL operation on the bitmap will * yield the initiating event. Additionally, the VOP_POLL should return in * such a way that dp_pcache_poll() does not clear the parent bitmap entry * which corresponds to the child /dev/poll fd. This means that child * pollcaches will be checked during every poll which facilitates wake-up * behavior detailed below. * * Pollcache Links and Wake Events: * * Recursive /dev/poll avoids complicated pollcache locking constraints during * pollwakeup events by eschewing the traditional pollhead mechanism in favor * of a different approach. For each pollcache at the root of a recursive * /dev/poll "tree", pcachelink_t structures are established to all child * /dev/poll pollcaches. During pollnotify() in a child pollcache, the * linked list of pcachelink_t entries is walked, where those marked as valid * incur a cv_broadcast to their parent pollcache. Most notably, these * pcachelink_t cv wakeups are performed without acquiring pc_lock on the * parent pollcache (which would require careful deadlock avoidance). This * still allows the woken poll on the parent to discover the pertinent events * due to the fact that bitmap entires for the child pollcache are always * maintained by the dppoll() logic above. * * Depth Limiting and Loop Prevention: * * As each pollcache is encountered (either via DP_POLL or dppoll()), depth and * loop constraints are enforced via pollstate_enter(). The pollcache_t * pointer is compared against any existing entries in ps_pc_stack and is added * to the end if no match (and therefore loop) is found. Once poll operations * for a given pollcache_t are complete, pollstate_exit() clears the pointer * from the list. 
The pollstate_enter() and pollstate_exit() functions are * responsible for acquiring and releasing pc_lock, respectively. * * Deadlock Safety: * * Descending through a tree of recursive /dev/poll handles involves the tricky * business of sequentially entering multiple pollcache locks. This tree * topology cannot define a lock acquisition order in such a way that it is * immune to deadlocks between threads. The pollstate_enter() and * pollstate_exit() functions provide an interface for recursive /dev/poll * operations to safely lock pollcaches while failing gracefully in the face of * deadlocking topologies. (See pollstate_contend() for more detail about how * deadlocks are detected and resolved.) */ /*ARGSUSED*/ static int dppoll(dev_t dev, short events, int anyyet, short *reventsp, struct pollhead **phpp) { minor_t minor; dp_entry_t *dpep; pollcache_t *pcp; int res, rc = 0; minor = getminor(dev); mutex_enter(&devpoll_lock); ASSERT(minor < dptblsize); dpep = devpolltbl[minor]; ASSERT(dpep != NULL); mutex_exit(&devpoll_lock); mutex_enter(&dpep->dpe_lock); if ((dpep->dpe_flag & DP_ISEPOLLCOMPAT) == 0) { /* Poll recursion is not yet supported for non-epoll handles */ *reventsp = POLLERR; mutex_exit(&dpep->dpe_lock); return (0); } else { dpep->dpe_refcnt++; pcp = dpep->dpe_pcache; mutex_exit(&dpep->dpe_lock); } res = pollstate_enter(pcp); if (res == PSE_SUCCESS) { nfds_t nfds = 1; int fdcnt = 0; pollstate_t *ps = curthread->t_pollstate; /* * Recursive polling will only emit certain events. Skip a * scan of the pollcache if those events are not of interest. */ if (events & (POLLIN|POLLRDNORM)) { rc = dp_pcache_poll(dpep, NULL, pcp, nfds, &fdcnt); } else { rc = 0; fdcnt = 0; } if (rc == 0 && fdcnt > 0) { *reventsp = POLLIN|POLLRDNORM; } else { *reventsp = 0; } pcachelink_assoc(pcp, ps->ps_pc_stack[0]); pollstate_exit(pcp); } else { switch (res) { case PSE_FAIL_DEPTH: rc = EINVAL; break; case PSE_FAIL_LOOP: case PSE_FAIL_DEADLOCK: rc = ELOOP; break; default: /* * If anything else has gone awry, such as being polled * from an unexpected context, fall back to the * recursion-intolerant response. */ *reventsp = POLLERR; rc = 0; break; } } DP_REFRELE(dpep); return (rc); } /* * devpoll close should do enough clean up before the pollcache is deleted, * i.e., it should ensure no one still references the pollcache later. * There is no "permission" check in here. Any process having the last * reference of this /dev/poll fd can close. */ /*ARGSUSED*/ static int dpclose(dev_t dev, int flag, int otyp, cred_t *credp) { minor_t minor; dp_entry_t *dpep; pollcache_t *pcp; int i; polldat_t **hashtbl; polldat_t *pdp; minor = getminor(dev); mutex_enter(&devpoll_lock); dpep = devpolltbl[minor]; ASSERT(dpep != NULL); devpolltbl[minor] = NULL; mutex_exit(&devpoll_lock); pcp = dpep->dpe_pcache; ASSERT(pcp != NULL); /* * At this point, no other lwp can access this pollcache via the * /dev/poll fd. This pollcache is going away, so do the clean * up without the pc_lock. */ hashtbl = pcp->pc_hash; for (i = 0; i < pcp->pc_hashsize; i++) { for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) { if (pdp->pd_php != NULL) { pollhead_delete(pdp->pd_php, pdp); pdp->pd_php = NULL; pdp->pd_fp = NULL; } } } /* * pollwakeup() may still interact with this pollcache. Wait until * it is done. 
*/ mutex_enter(&pcp->pc_no_exit); ASSERT(pcp->pc_busy >= 0); while (pcp->pc_busy > 0) cv_wait(&pcp->pc_busy_cv, &pcp->pc_no_exit); mutex_exit(&pcp->pc_no_exit); /* Clean up any pollcache links created via recursive /dev/poll */ if (pcp->pc_parents != NULL || pcp->pc_children != NULL) { /* * Because of the locking rules for pcachelink manipulation, * acquring pc_lock is required for this step. */ mutex_enter(&pcp->pc_lock); pcachelink_purge_all(pcp); mutex_exit(&pcp->pc_lock); } pcache_destroy(pcp); ASSERT(dpep->dpe_refcnt == 0); kmem_free(dpep, sizeof (dp_entry_t)); return (0); } static void pcachelink_locked_rele(pcachelink_t *pl) { ASSERT(MUTEX_HELD(&pl->pcl_lock)); VERIFY(pl->pcl_refcnt >= 1); pl->pcl_refcnt--; if (pl->pcl_refcnt == 0) { VERIFY(pl->pcl_state == PCL_INVALID); ASSERT(pl->pcl_parent_pc == NULL); ASSERT(pl->pcl_child_pc == NULL); ASSERT(pl->pcl_parent_next == NULL); ASSERT(pl->pcl_child_next == NULL); pl->pcl_state = PCL_FREE; mutex_destroy(&pl->pcl_lock); kmem_free(pl, sizeof (pcachelink_t)); } else { mutex_exit(&pl->pcl_lock); } } /* * Associate parent and child pollcaches via a pcachelink_t. If an existing * link (stale or valid) between the two is found, it will be reused. If a * suitable link is not found for reuse, a new one will be allocated. */ static void pcachelink_assoc(pollcache_t *child, pollcache_t *parent) { pcachelink_t *pl, **plpn; ASSERT(MUTEX_HELD(&child->pc_lock)); ASSERT(MUTEX_HELD(&parent->pc_lock)); /* Search for an existing link we can reuse. */ plpn = &child->pc_parents; for (pl = child->pc_parents; pl != NULL; pl = *plpn) { mutex_enter(&pl->pcl_lock); if (pl->pcl_state == PCL_INVALID) { /* Clean any invalid links while walking the list */ *plpn = pl->pcl_parent_next; pl->pcl_child_pc = NULL; pl->pcl_parent_next = NULL; pcachelink_locked_rele(pl); } else if (pl->pcl_parent_pc == parent) { /* Successfully found parent link */ ASSERT(pl->pcl_state == PCL_VALID || pl->pcl_state == PCL_STALE); pl->pcl_state = PCL_VALID; mutex_exit(&pl->pcl_lock); return; } else { plpn = &pl->pcl_parent_next; mutex_exit(&pl->pcl_lock); } } /* No existing link to the parent was found. Create a fresh one. */ pl = kmem_zalloc(sizeof (pcachelink_t), KM_SLEEP); mutex_init(&pl->pcl_lock, NULL, MUTEX_DEFAULT, NULL); pl->pcl_parent_pc = parent; pl->pcl_child_next = parent->pc_children; parent->pc_children = pl; pl->pcl_refcnt++; pl->pcl_child_pc = child; pl->pcl_parent_next = child->pc_parents; child->pc_parents = pl; pl->pcl_refcnt++; pl->pcl_state = PCL_VALID; } /* * Mark all child links in a pollcache as stale. Any invalid child links found * during iteration are purged. */ static void pcachelink_mark_stale(pollcache_t *pcp) { pcachelink_t *pl, **plpn; ASSERT(MUTEX_HELD(&pcp->pc_lock)); plpn = &pcp->pc_children; for (pl = pcp->pc_children; pl != NULL; pl = *plpn) { mutex_enter(&pl->pcl_lock); if (pl->pcl_state == PCL_INVALID) { /* * Remove any invalid links while we are going to the * trouble of walking the list. */ *plpn = pl->pcl_child_next; pl->pcl_parent_pc = NULL; pl->pcl_child_next = NULL; pcachelink_locked_rele(pl); } else { pl->pcl_state = PCL_STALE; plpn = &pl->pcl_child_next; mutex_exit(&pl->pcl_lock); } } } /* * Purge all stale (or invalid) child links from a pollcache. 
*/ static void pcachelink_purge_stale(pollcache_t *pcp) { pcachelink_t *pl, **plpn; ASSERT(MUTEX_HELD(&pcp->pc_lock)); plpn = &pcp->pc_children; for (pl = pcp->pc_children; pl != NULL; pl = *plpn) { mutex_enter(&pl->pcl_lock); switch (pl->pcl_state) { case PCL_STALE: pl->pcl_state = PCL_INVALID; /* FALLTHROUGH */ case PCL_INVALID: *plpn = pl->pcl_child_next; pl->pcl_parent_pc = NULL; pl->pcl_child_next = NULL; pcachelink_locked_rele(pl); break; default: plpn = &pl->pcl_child_next; mutex_exit(&pl->pcl_lock); } } } /* * Purge all child and parent links from a pollcache, regardless of status. */ static void pcachelink_purge_all(pollcache_t *pcp) { pcachelink_t *pl, **plpn; ASSERT(MUTEX_HELD(&pcp->pc_lock)); plpn = &pcp->pc_parents; for (pl = pcp->pc_parents; pl != NULL; pl = *plpn) { mutex_enter(&pl->pcl_lock); pl->pcl_state = PCL_INVALID; *plpn = pl->pcl_parent_next; pl->pcl_child_pc = NULL; pl->pcl_parent_next = NULL; pcachelink_locked_rele(pl); } plpn = &pcp->pc_children; for (pl = pcp->pc_children; pl != NULL; pl = *plpn) { mutex_enter(&pl->pcl_lock); pl->pcl_state = PCL_INVALID; *plpn = pl->pcl_child_next; pl->pcl_parent_pc = NULL; pl->pcl_child_next = NULL; pcachelink_locked_rele(pl); } ASSERT(pcp->pc_parents == NULL); ASSERT(pcp->pc_children == NULL); }