/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/errno.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/cmn_err.h>
#include <sys/systm.h>
#include <vm/as.h>
#include <vm/page.h>
#include <sys/uio.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/aio_impl.h>
#include <sys/epm.h>
#include <sys/fs/snode.h>
#include <sys/siginfo.h>
#include <sys/cpuvar.h>
#include <sys/tnf_probe.h>
#include <sys/conf.h>
#include <sys/sdt.h>

int aphysio(int (*)(), int (*)(), dev_t, int, void (*)(), struct aio_req *);
int aio_done(struct buf *);
void aphysio_unlock(aio_req_t *);
void aio_cleanup(int);
void aio_cleanup_exit(void);

/*
 * private functions
 */
static void aio_sigev_send(proc_t *, sigqueue_t *);
static void aio_hash_delete(aio_t *, aio_req_t *);
static void aio_lio_free(aio_t *, aio_lio_t *);
static int aio_cleanup_cleanupq(aio_t *, aio_req_t *, int);
static int aio_cleanup_notifyq(aio_t *, aio_req_t *, int);
static void aio_cleanup_pollq(aio_t *, aio_req_t *, int);
static void aio_cleanup_portq(aio_t *, aio_req_t *, int);

/*
 * async version of physio() that doesn't wait synchronously
 * for the driver's strategy routine to complete.
 */

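/*
 * A driver's aread(9E)/awrite(9E) entry point is the usual caller of
 * aphysio().  A minimal sketch (xx_aread and xxstrategy are illustrative
 * names, not part of this file):
 *
 *	static int
 *	xx_aread(dev_t dev, struct aio_req *aio, cred_t *credp)
 *	{
 *		return (aphysio(xxstrategy, anocancel, dev, B_READ,
 *		    minphys, aio));
 *	}
 */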
int
aphysio(
	int (*strategy)(struct buf *),
	int (*cancel)(struct buf *),
	dev_t dev,
	int rw,
	void (*mincnt)(struct buf *),
	struct aio_req *aio)
{
	struct uio *uio = aio->aio_uio;
	aio_req_t *reqp = (aio_req_t *)aio->aio_private;
	struct buf *bp = &reqp->aio_req_buf;
	struct iovec *iov;
	struct as *as;
	char *a;
	int	error;
	size_t	c;
	struct page **pplist;
	struct dev_ops *ops = devopsp[getmajor(dev)];

	if (uio->uio_loffset < 0)
		return (EINVAL);
#ifdef	_ILP32
	/*
	 * For 32-bit kernels, check against SPEC_MAXOFFSET_T which represents
	 * the maximum size that can be supported by the IO subsystem.
	 * XXX this code assumes a D_64BIT driver.
	 */
	if (uio->uio_loffset > SPEC_MAXOFFSET_T)
		return (EINVAL);
#endif	/* _ILP32 */

	TNF_PROBE_5(aphysio_start, "kaio", /* CSTYLED */,
	    tnf_opaque, bp, bp,
	    tnf_device, device, dev,
	    tnf_offset, blkno, btodt(uio->uio_loffset),
	    tnf_size, size, uio->uio_iov->iov_len,
	    tnf_bioflags, rw, rw);

	if (rw == B_READ) {
		CPU_STATS_ADD_K(sys, phread, 1);
	} else {
		CPU_STATS_ADD_K(sys, phwrite, 1);
	}

	iov = uio->uio_iov;
	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);

	bp->b_error = 0;
	bp->b_flags = B_BUSY | B_PHYS | B_ASYNC | rw;
	bp->b_edev = dev;
	bp->b_dev = cmpdev(dev);
	bp->b_lblkno = btodt(uio->uio_loffset);
	bp->b_offset = uio->uio_loffset;
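	/*
	 * Resolve the dev_t to its dev_info so that bp->b_dip is valid;
	 * consumers such as the DTrace io provider use it to map the buf
	 * back to a device instance.
	 */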
	(void) ops->devo_getinfo(NULL, DDI_INFO_DEVT2DEVINFO,
	    (void *)bp->b_edev, (void **)&bp->b_dip);

	/*
	 * Clustering: Clustering can set the b_iodone, b_forw and
	 * b_proc fields to cluster-specific values.
	 */
	if (bp->b_iodone == NULL) {
		bp->b_iodone = aio_done;
		/* b_forw points at an aio_req_t structure */
		bp->b_forw = (struct buf *)reqp;
		bp->b_proc = curproc;
	}

	a = bp->b_un.b_addr = iov->iov_base;
	c = bp->b_bcount = iov->iov_len;

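	/*
	 * (*mincnt)() may trim b_bcount down to the largest transfer the
	 * driver supports.  Unlike physio(), aphysio() does not loop over
	 * the request in chunks, so anything the driver cannot take in a
	 * single pass is rejected below.
	 */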
	(*mincnt)(bp);
	if (bp->b_bcount != iov->iov_len)
		return (ENOTSUP);

	as = bp->b_proc->p_as;

	error = as_pagelock(as, &pplist, a,
	    c, rw == B_READ? S_WRITE : S_READ);
	if (error != 0) {
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
		bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
		return (error);
	}
	reqp->aio_req_flags |= AIO_PAGELOCKDONE;
	bp->b_shadow = pplist;
	if (pplist != NULL) {
		bp->b_flags |= B_SHADOW;
	}

	if (cancel != anocancel)
		cmn_err(CE_PANIC,
		    "aphysio: cancellation not supported, use anocancel");

	reqp->aio_req_cancel = cancel;

	DTRACE_IO1(start, struct buf *, bp);

	return ((*strategy)(bp));
}

/*ARGSUSED*/
int
anocancel(struct buf *bp)
{
	return (ENXIO);
}

/*
 * Called from biodone().
 * Notify process that a pending AIO has finished.
 */

/*
 * Clustering: This function is made non-static as it is used
 * by clustering software as a contract private interface.
 */

int
aio_done(struct buf *bp)
{
	proc_t *p;
	struct as *as;
	aio_req_t *reqp;
	aio_lio_t *head = NULL;
	aio_t *aiop;
	sigqueue_t *sigev = NULL;
	sigqueue_t *lio_sigev = NULL;
	port_kevent_t *pkevp = NULL;
	port_kevent_t *lio_pkevp = NULL;
	int fd;
	int cleanupqflag;
	int pollqflag;
	int portevpend;
	void (*func)();
	int use_port = 0;
	int reqp_flags = 0;
	int send_signal = 0;

	p = bp->b_proc;
	as = p->p_as;
	reqp = (aio_req_t *)bp->b_forw;
	fd = reqp->aio_req_fd;

	TNF_PROBE_5(aphysio_end, "kaio", /* CSTYLED */,
	    tnf_opaque, bp, bp,
	    tnf_device, device, bp->b_edev,
	    tnf_offset, blkno, btodt(reqp->aio_req_uio.uio_loffset),
	    tnf_size, size, reqp->aio_req_uio.uio_iov->iov_len,
	    tnf_bioflags, rw, (bp->b_flags & (B_READ|B_WRITE)));

	/*
	 * mapout earlier so that more kmem is available when aio is
	 * heavily used. bug #1262082
	 */
	if (bp->b_flags & B_REMAPPED)
		bp_mapout(bp);

	/* decrement fd's ref count by one, now that aio request is done. */
	areleasef(fd, P_FINFO(p));

	aiop = p->p_aio;
	ASSERT(aiop != NULL);

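	/*
	 * Lock ordering: aio_portq_mutex is acquired before aio_mutex,
	 * here as well as in aio_close_port().
	 */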
	mutex_enter(&aiop->aio_portq_mutex);
	mutex_enter(&aiop->aio_mutex);
	ASSERT(aiop->aio_pending > 0);
	ASSERT(reqp->aio_req_flags & AIO_PENDING);
	aiop->aio_pending--;
	reqp->aio_req_flags &= ~AIO_PENDING;
	reqp_flags = reqp->aio_req_flags;
	if ((pkevp = reqp->aio_req_portkev) != NULL) {
		/* Event port notification is desired for this transaction */
		if (reqp->aio_req_flags & AIO_CLOSE_PORT) {
			/*
			 * The port is being closed and it is waiting for
			 * pending asynchronous I/O transactions to complete.
			 */
			portevpend = --aiop->aio_portpendcnt;
			aio_deq(&aiop->aio_portpending, reqp);
			aio_enq(&aiop->aio_portq, reqp, 0);
			mutex_exit(&aiop->aio_mutex);
			mutex_exit(&aiop->aio_portq_mutex);
			port_send_event(pkevp);
			if (portevpend == 0)
				cv_broadcast(&aiop->aio_portcv);
			return (0);
		}

		if (aiop->aio_flags & AIO_CLEANUP) {
			/*
			 * aio_cleanup_thread() is waiting for completion of
			 * transactions.
			 */
			mutex_enter(&as->a_contents);
			aio_deq(&aiop->aio_portpending, reqp);
			aio_enq(&aiop->aio_portcleanupq, reqp, 0);
			cv_signal(&aiop->aio_cleanupcv);
			mutex_exit(&as->a_contents);
			mutex_exit(&aiop->aio_mutex);
			mutex_exit(&aiop->aio_portq_mutex);
			return (0);
		}

		aio_deq(&aiop->aio_portpending, reqp);
		aio_enq(&aiop->aio_portq, reqp, 0);

		use_port = 1;
	} else {
		/*
		 * when the AIO_CLEANUP flag is enabled for this
		 * process, or when the AIO_POLL bit is set for
		 * this request, special handling is required.
		 * otherwise the request is put onto the doneq.
		 */
		cleanupqflag = (aiop->aio_flags & AIO_CLEANUP);
		pollqflag = (reqp->aio_req_flags & AIO_POLL);
		if (cleanupqflag | pollqflag) {

			if (cleanupqflag)
				mutex_enter(&as->a_contents);

			/*
			 * requests with their AIO_POLL bit set are put
			 * on the pollq, requests with sigevent structures
			 * or with listio heads are put on the notifyq, and
			 * the remaining requests don't require any special
			 * cleanup handling, so they're put onto the default
			 * cleanupq.
			 */
			if (pollqflag)
				aio_enq(&aiop->aio_pollq, reqp, AIO_POLLQ);
			else if (reqp->aio_req_sigqp || reqp->aio_req_lio)
				aio_enq(&aiop->aio_notifyq, reqp, AIO_NOTIFYQ);
			else
				aio_enq(&aiop->aio_cleanupq, reqp,
				    AIO_CLEANUPQ);

			if (cleanupqflag) {
				cv_signal(&aiop->aio_cleanupcv);
				mutex_exit(&as->a_contents);
				mutex_exit(&aiop->aio_mutex);
				mutex_exit(&aiop->aio_portq_mutex);
			} else {
				ASSERT(pollqflag);
				/* block aio_cleanup_exit until we're done */
				aiop->aio_flags |= AIO_DONE_ACTIVE;
				mutex_exit(&aiop->aio_mutex);
				mutex_exit(&aiop->aio_portq_mutex);
				/*
				 * let the cleanup processing happen from an
				 * AST; set an AST on all threads in this
				 * process.
				 */
				mutex_enter(&p->p_lock);
				set_proc_ast(p);
				mutex_exit(&p->p_lock);
				mutex_enter(&aiop->aio_mutex);
				/* wakeup anybody waiting in aiowait() */
				cv_broadcast(&aiop->aio_waitcv);

				/* wakeup aio_cleanup_exit if needed */
				if (aiop->aio_flags & AIO_CLEANUP)
					cv_signal(&aiop->aio_cleanupcv);
				aiop->aio_flags &= ~AIO_DONE_ACTIVE;
				mutex_exit(&aiop->aio_mutex);
			}
			return (0);
		}

		/*
		 * save req's sigevent pointer, and check its
		 * value after releasing aio_mutex lock.
		 */
		sigev = reqp->aio_req_sigqp;
		reqp->aio_req_sigqp = NULL;

		/* put request on done queue. */
		aio_enq(&aiop->aio_doneq, reqp, AIO_DONEQ);
	} /* portkevent */

	/*
	 * when list IO notification is enabled, a notification or
	 * signal is sent only when all entries in the list are done.
	 */
	if ((head = reqp->aio_req_lio) != NULL) {
		ASSERT(head->lio_refcnt > 0);
		if (--head->lio_refcnt == 0) {
			/*
			 * save lio's sigevent pointer, and check
			 * its value after releasing aio_mutex lock.
			 */
			lio_sigev = head->lio_sigqp;
			head->lio_sigqp = NULL;
			cv_signal(&head->lio_notify);
			if (head->lio_port >= 0 &&
			    (lio_pkevp = head->lio_portkev) != NULL)
				head->lio_port = -1;
		}
	}

	/*
	 * If AIO_WAITN is set, wake the waiter only once the required
	 * number of I/Os has finished or all I/Os are done.
	 */
	if (aiop->aio_flags & AIO_WAITN) {
		if (aiop->aio_waitncnt > 0)
			aiop->aio_waitncnt--;
		if (aiop->aio_pending == 0 ||
		    aiop->aio_waitncnt == 0)
			cv_broadcast(&aiop->aio_waitcv);
	} else {
		cv_broadcast(&aiop->aio_waitcv);
	}

	/*
	 * No need to set this flag for pollq, portq, lio requests.
	 * If this is an old Solaris aio request, and the process has
	 * a SIGIO signal handler enabled, then send a SIGIO signal.
	 */
	if (!sigev && !use_port && head == NULL &&
	    (reqp->aio_req_flags & AIO_SOLARIS) &&
	    (func = PTOU(p)->u_signal[SIGIO - 1]) != SIG_DFL &&
	    (func != SIG_IGN)) {
		send_signal = 1;
		reqp->aio_req_flags |= AIO_SIGNALLED;
	}

	mutex_exit(&aiop->aio_mutex);
	mutex_exit(&aiop->aio_portq_mutex);

	/*
	 * Could the cleanup thread be waiting for AIO with locked
	 * resources to finish?
	 * Ideally the cleanup thread would block on cleanupcv in that
	 * case, but there is a window where it could miss a new aio
	 * request that sneaked in.
	 */
	mutex_enter(&as->a_contents);
	if ((reqp_flags & AIO_PAGELOCKDONE) && AS_ISUNMAPWAIT(as))
		cv_broadcast(&as->a_cv);
	mutex_exit(&as->a_contents);

	if (sigev)
		aio_sigev_send(p, sigev);
	else if (send_signal)
		psignal(p, SIGIO);

	if (pkevp)
		port_send_event(pkevp);
	if (lio_sigev)
		aio_sigev_send(p, lio_sigev);
	if (lio_pkevp)
		port_send_event(lio_pkevp);

	return (0);
}

/*
 * Send a queued signal to the specified process.  The sigev
 * argument must be non-NULL; callers check for a NULL sigevent
 * before calling this function.
 */
static void
aio_sigev_send(proc_t *p, sigqueue_t *sigev)
{
	ASSERT(sigev != NULL);

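	/*
	 * sigaddqa() queues the preallocated sigqueue_t directly, so the
	 * caller hands over ownership of sigev here; p_lock protects the
	 * process signal queue.
	 */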
	mutex_enter(&p->p_lock);
	sigaddqa(p, NULL, sigev);
	mutex_exit(&p->p_lock);
}

/*
 * special case handling for zero length requests. the aio request
 * short circuits the normal completion path since all that's required
 * to complete this request is to copyout a zero to the aio request's
 * return value.
 */
void
aio_zerolen(aio_req_t *reqp)
{

	struct buf *bp = &reqp->aio_req_buf;

	reqp->aio_req_flags |= AIO_ZEROLEN;

	bp->b_forw = (struct buf *)reqp;
	bp->b_proc = curproc;

	bp->b_resid = 0;
	bp->b_flags = 0;
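	/*
	 * A clean buf (no B_ERROR, no B_REMAPPED, zero residual) makes
	 * aio_done() treat this as a successful transfer of zero bytes.
	 */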

	aio_done(bp);
}

/*
 * unlock pages previously locked by as_pagelock
 */
void
aphysio_unlock(aio_req_t *reqp)
{
	struct buf *bp;
	struct iovec *iov;
	int flags;

	if (reqp->aio_req_flags & AIO_PHYSIODONE)
		return;

	reqp->aio_req_flags |= AIO_PHYSIODONE;

	if (reqp->aio_req_flags & AIO_ZEROLEN)
		return;

	bp = &reqp->aio_req_buf;
	iov = reqp->aio_req_uio.uio_iov;
	flags = (((bp->b_flags & B_READ) == B_READ) ? S_WRITE : S_READ);
	if (reqp->aio_req_flags & AIO_PAGELOCKDONE) {
		as_pageunlock(bp->b_proc->p_as,
		    bp->b_flags & B_SHADOW ? bp->b_shadow : NULL,
		    iov->iov_base, iov->iov_len, flags);
		reqp->aio_req_flags &= ~AIO_PAGELOCKDONE;
	}
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
	bp->b_flags |= B_DONE;
}

/*
 * deletes a request's id from the hash table of outstanding I/Os.
 */
static void
aio_hash_delete(aio_t *aiop, struct aio_req_t *reqp)
{
	long index;
	aio_result_t *resultp = reqp->aio_req_resultp;
	aio_req_t *current;
	aio_req_t **nextp;

	index = AIO_HASH(resultp);
	nextp = (aiop->aio_hash + index);
	while ((current = *nextp) != NULL) {
		if (current->aio_req_resultp == resultp) {
			*nextp = current->aio_hash_next;
			return;
		}
		nextp = &current->aio_hash_next;
	}
}

/*
 * Put a list head struct onto its free list.
 */
static void
aio_lio_free(aio_t *aiop, aio_lio_t *head)
{
	ASSERT(MUTEX_HELD(&aiop->aio_mutex));

	if (head->lio_sigqp != NULL)
		kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
	head->lio_next = aiop->aio_lio_free;
	aiop->aio_lio_free = head;
}

/*
 * Put a reqp onto the freelist.
 */
void
aio_req_free(aio_t *aiop, aio_req_t *reqp)
{
	aio_lio_t *liop;

	ASSERT(MUTEX_HELD(&aiop->aio_mutex));

	if (reqp->aio_req_portkev) {
		port_free_event(reqp->aio_req_portkev);
		reqp->aio_req_portkev = NULL;
	}

	if ((liop = reqp->aio_req_lio) != NULL) {
		if (--liop->lio_nent == 0)
			aio_lio_free(aiop, liop);
		reqp->aio_req_lio = NULL;
	}
	if (reqp->aio_req_sigqp != NULL) {
		kmem_free(reqp->aio_req_sigqp, sizeof (sigqueue_t));
		reqp->aio_req_sigqp = NULL;
	}
	reqp->aio_req_next = aiop->aio_free;
	reqp->aio_req_prev = NULL;
	aiop->aio_free = reqp;
	aiop->aio_outstanding--;
	if (aiop->aio_outstanding == 0)
		cv_broadcast(&aiop->aio_waitcv);
	aio_hash_delete(aiop, reqp);
}

/*
 * Put a reqp onto the freelist (event port variant; the port event
 * has already been freed by the caller).
 */
void
aio_req_free_port(aio_t *aiop, aio_req_t *reqp)
{
	ASSERT(MUTEX_HELD(&aiop->aio_mutex));

	reqp->aio_req_next = aiop->aio_free;
	reqp->aio_req_prev = NULL;
	aiop->aio_free = reqp;
	aiop->aio_outstanding--;
	aio_hash_delete(aiop, reqp);
}


/*
 * Verify the integrity of a queue.
 */
#if defined(DEBUG)
static void
aio_verify_queue(aio_req_t *head,
	aio_req_t *entry_present, aio_req_t *entry_missing)
{
	aio_req_t *reqp;
	int found = 0;
	int present = 0;

	if ((reqp = head) != NULL) {
		do {
			ASSERT(reqp->aio_req_prev->aio_req_next == reqp);
			ASSERT(reqp->aio_req_next->aio_req_prev == reqp);
			if (entry_present == reqp)
				found++;
			if (entry_missing == reqp)
				present++;
		} while ((reqp = reqp->aio_req_next) != head);
	}
	ASSERT(entry_present == NULL || found == 1);
	ASSERT(entry_missing == NULL || present == 0);
}
#else
#define	aio_verify_queue(x, y, z)
#endif

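/*
 * The aio queues are circular, doubly linked lists threaded through
 * aio_req_next/aio_req_prev.  *qhead points at the head element, or is
 * NULL when the queue is empty; a queue of one element points at itself.
 */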
/*
 * Put a request onto the tail of a queue.
 */
void
aio_enq(aio_req_t **qhead, aio_req_t *reqp, int qflg_new)
{
	aio_req_t *head;
	aio_req_t *prev;

	aio_verify_queue(*qhead, NULL, reqp);

	if ((head = *qhead) == NULL) {
		reqp->aio_req_next = reqp;
		reqp->aio_req_prev = reqp;
		*qhead = reqp;
	} else {
		reqp->aio_req_next = head;
		reqp->aio_req_prev = prev = head->aio_req_prev;
		prev->aio_req_next = reqp;
		head->aio_req_prev = reqp;
	}
	reqp->aio_req_flags |= qflg_new;
}

/*
 * Remove a request from its queue.
 */
void
aio_deq(aio_req_t **qhead, aio_req_t *reqp)
{
	aio_verify_queue(*qhead, reqp, NULL);

	if (reqp->aio_req_next == reqp) {
		*qhead = NULL;
	} else {
		reqp->aio_req_prev->aio_req_next = reqp->aio_req_next;
		reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev;
		if (*qhead == reqp)
			*qhead = reqp->aio_req_next;
	}
	reqp->aio_req_next = NULL;
	reqp->aio_req_prev = NULL;
}

/*
 * concatenate a specified queue with the cleanupq. the specified
 * queue is put onto the tail of the cleanupq. all elements on the
 * specified queue should have their aio_req_flags field cleared.
 */
/*ARGSUSED*/
void
aio_cleanupq_concat(aio_t *aiop, aio_req_t *q2, int qflg)
{
	aio_req_t *cleanupqhead, *q2tail;
	aio_req_t *reqp = q2;

	do {
		ASSERT(reqp->aio_req_flags & qflg);
		reqp->aio_req_flags &= ~qflg;
		reqp->aio_req_flags |= AIO_CLEANUPQ;
	} while ((reqp = reqp->aio_req_next) != q2);

	cleanupqhead = aiop->aio_cleanupq;
	if (cleanupqhead == NULL)
		aiop->aio_cleanupq = q2;
	else {
		cleanupqhead->aio_req_prev->aio_req_next = q2;
		q2tail = q2->aio_req_prev;
		q2tail->aio_req_next = cleanupqhead;
		q2->aio_req_prev = cleanupqhead->aio_req_prev;
		cleanupqhead->aio_req_prev = q2tail;
	}
}

/*
 * cleanup aio requests that are on the per-process poll queue.
 */
void
aio_cleanup(int flag)
{
	aio_t *aiop = curproc->p_aio;
	aio_req_t *pollqhead, *cleanupqhead, *notifyqhead;
	aio_req_t *cleanupport;
	aio_req_t *portq = NULL;
	void (*func)();
	int signalled = 0;
	int qflag = 0;
	int exitflg;

	ASSERT(aiop != NULL);

	if (flag == AIO_CLEANUP_EXIT)
		exitflg = AIO_CLEANUP_EXIT;
	else
		exitflg = 0;
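	/*
	 * With AIO_CLEANUP_EXIT set, the per-queue cleanup routines below
	 * free each request instead of moving it to the done queue.
	 */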

	/*
	 * We need to get the aio_cleanupq_mutex because we are calling
	 * aio_cleanup_cleanupq()
	 */
	mutex_enter(&aiop->aio_cleanupq_mutex);
	/*
	 * take all the requests off the cleanupq, the notifyq,
	 * and the pollq.
	 */
	mutex_enter(&aiop->aio_mutex);
	if ((cleanupqhead = aiop->aio_cleanupq) != NULL) {
		aiop->aio_cleanupq = NULL;
		qflag++;
	}
	if ((notifyqhead = aiop->aio_notifyq) != NULL) {
		aiop->aio_notifyq = NULL;
		qflag++;
	}
	if ((pollqhead = aiop->aio_pollq) != NULL) {
		aiop->aio_pollq = NULL;
		qflag++;
	}
	if (flag) {
		if ((portq = aiop->aio_portq) != NULL)
			qflag++;

		if ((cleanupport = aiop->aio_portcleanupq) != NULL) {
			aiop->aio_portcleanupq = NULL;
			qflag++;
		}
	}
	mutex_exit(&aiop->aio_mutex);

	/*
	 * return immediately if cleanupq, pollq, and
	 * notifyq are all empty. someone else must have
	 * emptied them.
	 */
	if (!qflag) {
		mutex_exit(&aiop->aio_cleanupq_mutex);
		return;
	}

	/*
	 * do cleanup for the various queues.
	 */
	if (cleanupqhead)
		signalled = aio_cleanup_cleanupq(aiop, cleanupqhead, exitflg);
	mutex_exit(&aiop->aio_cleanupq_mutex);
	if (notifyqhead)
		signalled = aio_cleanup_notifyq(aiop, notifyqhead, exitflg);
	if (pollqhead)
		aio_cleanup_pollq(aiop, pollqhead, exitflg);
	if (flag && (cleanupport || portq))
		aio_cleanup_portq(aiop, cleanupport, exitflg);

	if (exitflg)
		return;

	/*
	 * If we have an active aio_cleanup_thread it's possible for
	 * this routine to push something on to the done queue after
	 * an aiowait/aiosuspend thread has already decided to block.
	 * This being the case, we need a cv_broadcast here to wake
	 * these threads up. It is simpler and cleaner to do this
	 * broadcast here than in the individual cleanup routines.
	 */

	mutex_enter(&aiop->aio_mutex);
	/*
	 * If there has never been an old Solaris aio request
	 * issued by this process, then do not send a SIGIO signal.
	 */
	if (!(aiop->aio_flags & AIO_SOLARIS_REQ))
		signalled = 1;
	cv_broadcast(&aiop->aio_waitcv);
	mutex_exit(&aiop->aio_mutex);

	/*
	 * Only if the process wasn't already signalled,
	 * determine if a SIGIO signal should be delivered.
	 */
	if (!signalled &&
	    (func = PTOU(curproc)->u_signal[SIGIO - 1]) != SIG_DFL &&
	    func != SIG_IGN)
		psignal(curproc, SIGIO);
}


/*
 * Do cleanup for every element of the port cleanup queue.
 */
static void
aio_cleanup_portq(aio_t *aiop, aio_req_t *cleanupq, int exitflag)
{
	aio_req_t	*reqp;
	aio_req_t	*next;
	aio_req_t	*headp;
	aio_lio_t	*liop;

	/* first check the portq */
	if (exitflag || ((aiop->aio_flags & AIO_CLEANUP_PORT) == 0)) {
		mutex_enter(&aiop->aio_mutex);
		if (aiop->aio_flags & AIO_CLEANUP)
			aiop->aio_flags |= AIO_CLEANUP_PORT;
		mutex_exit(&aiop->aio_mutex);

		/*
		 * It is not allowed to hold locks during aphysio_unlock().
		 * The aio_done() interrupt function will try to acquire
		 * aio_mutex and aio_portq_mutex.  Therefore we disconnect
		 * the portq list from the aiop for the duration of the
		 * aphysio_unlock() loop below.
		 */
		mutex_enter(&aiop->aio_portq_mutex);
		headp = aiop->aio_portq;
		aiop->aio_portq = NULL;
		mutex_exit(&aiop->aio_portq_mutex);
		if ((reqp = headp) != NULL) {
			do {
				next = reqp->aio_req_next;
				aphysio_unlock(reqp);
				if (exitflag) {
					mutex_enter(&aiop->aio_mutex);
					aio_req_free(aiop, reqp);
					mutex_exit(&aiop->aio_mutex);
				}
			} while ((reqp = next) != headp);
		}

		if (headp != NULL && exitflag == 0) {
			/* move unlocked requests back to the port queue */
			aio_req_t *newq;

			mutex_enter(&aiop->aio_portq_mutex);
			if ((newq = aiop->aio_portq) != NULL) {
				aio_req_t *headprev = headp->aio_req_prev;
				aio_req_t *newqprev = newq->aio_req_prev;

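				/*
				 * Splice the requests that completed while
				 * the list was disconnected (newq) onto the
				 * tail of the unlocked list before
				 * reinstalling it as aio_portq.
				 */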
				headp->aio_req_prev = newqprev;
				newq->aio_req_prev = headprev;
				headprev->aio_req_next = newq;
				newqprev->aio_req_next = headp;
			}
			aiop->aio_portq = headp;
			cv_broadcast(&aiop->aio_portcv);
			mutex_exit(&aiop->aio_portq_mutex);
		}
	}

	/* now check the port cleanup queue */
	if ((reqp = cleanupq) == NULL)
		return;
	do {
		next = reqp->aio_req_next;
		aphysio_unlock(reqp);
		if (exitflag) {
			mutex_enter(&aiop->aio_mutex);
			aio_req_free(aiop, reqp);
			mutex_exit(&aiop->aio_mutex);
		} else {
			mutex_enter(&aiop->aio_portq_mutex);
			aio_enq(&aiop->aio_portq, reqp, 0);
			mutex_exit(&aiop->aio_portq_mutex);
			port_send_event(reqp->aio_req_portkev);
			if ((liop = reqp->aio_req_lio) != NULL) {
				int send_event = 0;

				mutex_enter(&aiop->aio_mutex);
				ASSERT(liop->lio_refcnt > 0);
				if (--liop->lio_refcnt == 0) {
					if (liop->lio_port >= 0 &&
					    liop->lio_portkev) {
						liop->lio_port = -1;
						send_event = 1;
					}
				}
				mutex_exit(&aiop->aio_mutex);
				if (send_event)
					port_send_event(liop->lio_portkev);
			}
		}
	} while ((reqp = next) != cleanupq);
}

/*
 * Do cleanup for every element of the cleanupq.
 */
static int
aio_cleanup_cleanupq(aio_t *aiop, aio_req_t *qhead, int exitflg)
{
	aio_req_t *reqp, *next;
	int signalled = 0;

	ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex));

	/*
	 * Since aio_req_done() or aio_req_find() use the HASH list to find
	 * the required requests, they could potentially take away elements
	 * if they are already done (AIO_DONEQ is set).
	 * The aio_cleanupq_mutex protects the queue for the duration of the
	 * loop from aio_req_done() and aio_req_find().
	 */
	if ((reqp = qhead) == NULL)
		return (0);
	do {
		ASSERT(reqp->aio_req_flags & AIO_CLEANUPQ);
		ASSERT(reqp->aio_req_portkev == NULL);
		next = reqp->aio_req_next;
		aphysio_unlock(reqp);
		mutex_enter(&aiop->aio_mutex);
		if (exitflg)
			aio_req_free(aiop, reqp);
		else
			aio_enq(&aiop->aio_doneq, reqp, AIO_DONEQ);
		if (!exitflg) {
			if (reqp->aio_req_flags & AIO_SIGNALLED)
				signalled++;
			else
				reqp->aio_req_flags |= AIO_SIGNALLED;
		}
		mutex_exit(&aiop->aio_mutex);
	} while ((reqp = next) != qhead);
	return (signalled);
}

/*
 * do cleanup for every element of the notify queue.
 */
static int
aio_cleanup_notifyq(aio_t *aiop, aio_req_t *qhead, int exitflg)
{
	aio_req_t *reqp, *next;
	aio_lio_t *liohead;
	sigqueue_t *sigev, *lio_sigev = NULL;
	int signalled = 0;

	if ((reqp = qhead) == NULL)
		return (0);
	do {
		ASSERT(reqp->aio_req_flags & AIO_NOTIFYQ);
		next = reqp->aio_req_next;
		aphysio_unlock(reqp);
		if (exitflg) {
			mutex_enter(&aiop->aio_mutex);
			aio_req_free(aiop, reqp);
			mutex_exit(&aiop->aio_mutex);
		} else {
			mutex_enter(&aiop->aio_mutex);
			aio_enq(&aiop->aio_doneq, reqp, AIO_DONEQ);
			sigev = reqp->aio_req_sigqp;
			reqp->aio_req_sigqp = NULL;
			if ((liohead = reqp->aio_req_lio) != NULL) {
				ASSERT(liohead->lio_refcnt > 0);
				if (--liohead->lio_refcnt == 0) {
					cv_signal(&liohead->lio_notify);
					lio_sigev = liohead->lio_sigqp;
					liohead->lio_sigqp = NULL;
				}
			}
			mutex_exit(&aiop->aio_mutex);
			if (sigev) {
				signalled++;
				aio_sigev_send(reqp->aio_req_buf.b_proc,
				    sigev);
			}
			if (lio_sigev) {
				signalled++;
				aio_sigev_send(reqp->aio_req_buf.b_proc,
				    lio_sigev);
			}
		}
	} while ((reqp = next) != qhead);

	return (signalled);
}

/*
 * Do cleanup for every element of the poll queue.
 */
static void
aio_cleanup_pollq(aio_t *aiop, aio_req_t *qhead, int exitflg)
{
	aio_req_t *reqp, *next;

	/*
	 * As no other threads should be accessing the queue at this point,
	 * it isn't necessary to hold aio_mutex while we traverse its elements.
	 */
	if ((reqp = qhead) == NULL)
		return;
	do {
		ASSERT(reqp->aio_req_flags & AIO_POLLQ);
		next = reqp->aio_req_next;
		aphysio_unlock(reqp);
		if (exitflg) {
			mutex_enter(&aiop->aio_mutex);
			aio_req_free(aiop, reqp);
			mutex_exit(&aiop->aio_mutex);
		} else {
			aio_copyout_result(reqp);
			mutex_enter(&aiop->aio_mutex);
			aio_enq(&aiop->aio_doneq, reqp, AIO_DONEQ);
			mutex_exit(&aiop->aio_mutex);
		}
	} while ((reqp = next) != qhead);
}

/*
 * called by exit(). waits for all outstanding kaio to finish
 * before the kaio resources are freed.
 */
void
aio_cleanup_exit(void)
{
	proc_t *p = curproc;
	aio_t *aiop = p->p_aio;
	aio_req_t *reqp, *next, *head;
	aio_lio_t *nxtlio, *liop;

	/*
	 * wait for all outstanding kaio to complete. process
	 * is now single-threaded; no other kaio requests can
	 * happen once aio_pending is zero.
	 */
	mutex_enter(&aiop->aio_mutex);
	aiop->aio_flags |= AIO_CLEANUP;
	while ((aiop->aio_pending != 0) || (aiop->aio_flags & AIO_DONE_ACTIVE))
		cv_wait(&aiop->aio_cleanupcv, &aiop->aio_mutex);
	mutex_exit(&aiop->aio_mutex);

	/* cleanup the cleanup-thread queues. */
	aio_cleanup(AIO_CLEANUP_EXIT);

	/*
	 * Although this process is now single-threaded, we
	 * still need to protect ourselves against a race with
	 * aio_cleanup_dr_delete_memory().
	 */
	mutex_enter(&p->p_lock);

	/*
	 * free up the done queue's resources.
	 */
	if ((head = aiop->aio_doneq) != NULL) {
		aiop->aio_doneq = NULL;
		reqp = head;
		do {
			next = reqp->aio_req_next;
			aphysio_unlock(reqp);
			kmem_free(reqp, sizeof (struct aio_req_t));
		} while ((reqp = next) != head);
	}
	/*
	 * release aio request freelist.
	 */
	for (reqp = aiop->aio_free; reqp != NULL; reqp = next) {
		next = reqp->aio_req_next;
		kmem_free(reqp, sizeof (struct aio_req_t));
	}

	/*
	 * release io list head freelist.
	 */
	for (liop = aiop->aio_lio_free; liop != NULL; liop = nxtlio) {
		nxtlio = liop->lio_next;
		kmem_free(liop, sizeof (aio_lio_t));
	}

	if (aiop->aio_iocb)
		kmem_free(aiop->aio_iocb, aiop->aio_iocbsz);

	mutex_destroy(&aiop->aio_mutex);
	mutex_destroy(&aiop->aio_portq_mutex);
	mutex_destroy(&aiop->aio_cleanupq_mutex);
	p->p_aio = NULL;
	mutex_exit(&p->p_lock);
	kmem_free(aiop, sizeof (struct aio));
}

/*
 * copy out aio request's result to a user-level result_t buffer.
 */
void
aio_copyout_result(aio_req_t *reqp)
{
	struct buf	*bp;
	struct iovec	*iov;
	void		*resultp;
	int		error;
	size_t		retval;

	if (reqp->aio_req_flags & AIO_COPYOUTDONE)
		return;

	reqp->aio_req_flags |= AIO_COPYOUTDONE;

	iov = reqp->aio_req_uio.uio_iov;
	bp = &reqp->aio_req_buf;
	/* "resultp" points to user-level result_t buffer */
	resultp = (void *)reqp->aio_req_resultp;
	if (bp->b_flags & B_ERROR) {
		if (bp->b_error)
			error = bp->b_error;
		else
			error = EIO;
		retval = (size_t)-1;
	} else {
		error = 0;
		retval = iov->iov_len - bp->b_resid;
	}
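	/*
	 * suword32()/sulword() store directly into the user-level result_t;
	 * failures are deliberately ignored (hence the (void) casts) since
	 * an error here cannot be reported back to the request.
	 */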
#ifdef	_SYSCALL32_IMPL
	if (get_udatamodel() == DATAMODEL_NATIVE) {
		(void) sulword(&((aio_result_t *)resultp)->aio_return, retval);
		(void) suword32(&((aio_result_t *)resultp)->aio_errno, error);
	} else {
		(void) suword32(&((aio_result32_t *)resultp)->aio_return,
		    (int)retval);
		(void) suword32(&((aio_result32_t *)resultp)->aio_errno, error);
	}
#else
	(void) suword32(&((aio_result_t *)resultp)->aio_return, retval);
	(void) suword32(&((aio_result_t *)resultp)->aio_errno, error);
#endif
}


void
aio_copyout_result_port(struct iovec *iov, struct buf *bp, void *resultp)
{
	int errno;
	size_t retval;

	if (bp->b_flags & B_ERROR) {
		if (bp->b_error)
			errno = bp->b_error;
		else
			errno = EIO;
		retval = (size_t)-1;
	} else {
		errno = 0;
		retval = iov->iov_len - bp->b_resid;
	}
#ifdef	_SYSCALL32_IMPL
	if (get_udatamodel() == DATAMODEL_NATIVE) {
		(void) sulword(&((aio_result_t *)resultp)->aio_return, retval);
		(void) suword32(&((aio_result_t *)resultp)->aio_errno, errno);
	} else {
		(void) suword32(&((aio_result32_t *)resultp)->aio_return,
		    (int)retval);
		(void) suword32(&((aio_result32_t *)resultp)->aio_errno, errno);
	}
#else
	(void) suword32(&((aio_result_t *)resultp)->aio_return, retval);
	(void) suword32(&((aio_result_t *)resultp)->aio_errno, errno);
#endif
}

/*
 * This function is used to remove a request from the done queue.
 */

void
aio_req_remove_portq(aio_t *aiop, aio_req_t *reqp)
{
	ASSERT(MUTEX_HELD(&aiop->aio_portq_mutex));
	while (aiop->aio_portq == NULL) {
		/*
		 * aio_portq is set to NULL when aio_cleanup_portq()
		 * is working with the event queue.
		 * The aio_cleanup_thread() uses aio_cleanup_portq()
		 * to unlock all AIO buffers with completed transactions.
		 * Wait here until aio_cleanup_portq() restores the
		 * list of completed transactions in aio_portq.
		 */
		cv_wait(&aiop->aio_portcv, &aiop->aio_portq_mutex);
	}
	aio_deq(&aiop->aio_portq, reqp);
}

/* ARGSUSED */
void
aio_close_port(void *arg, int port, pid_t pid, int lastclose)
{
	aio_t		*aiop;
	aio_req_t	*reqp;
	aio_req_t	*next;
	aio_req_t	*headp;
	int		counter;

	if (arg == NULL)
		aiop = curproc->p_aio;
	else
		aiop = (aio_t *)arg;

	/*
	 * The PORT_SOURCE_AIO source is associated with every newly
	 * created port by default.
	 * If no asynchronous I/O transactions were associated with the
	 * port, then the aiop pointer will still be set to NULL.
	 */
	if (aiop == NULL)
		return;

	/*
	 * Within a process, event ports can be used to collect events other
	 * than PORT_SOURCE_AIO events. At the same time the process can
	 * submit asynchronous I/O transactions which are not associated with
	 * the current port.
	 * The current process-oriented model of AIO uses a single queue for
	 * pending events. On close, the pending queue (the queue of
	 * asynchronous I/O transactions using event port notification) must
	 * be scanned to detect and handle pending I/Os using the current
	 * port.
	 */
	mutex_enter(&aiop->aio_portq_mutex);
	mutex_enter(&aiop->aio_mutex);
	counter = 0;
	if ((headp = aiop->aio_portpending) != NULL) {
		reqp = headp;
		do {
			if (reqp->aio_req_portkev &&
			    reqp->aio_req_port == port) {
				reqp->aio_req_flags |= AIO_CLOSE_PORT;
				counter++;
			}
		} while ((reqp = reqp->aio_req_next) != headp);
	}
	if (counter == 0) {
		/* no AIOs pending */
		mutex_exit(&aiop->aio_mutex);
		mutex_exit(&aiop->aio_portq_mutex);
		return;
	}
	aiop->aio_portpendcnt += counter;
	mutex_exit(&aiop->aio_mutex);
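	/*
	 * aio_done() moves each AIO_CLOSE_PORT request onto aio_portq,
	 * decrements aio_portpendcnt and broadcasts aio_portcv when the
	 * count drops to zero; wait for that here.
	 */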
	while (aiop->aio_portpendcnt)
		cv_wait(&aiop->aio_portcv, &aiop->aio_portq_mutex);

	/*
	 * all pending AIOs are completed.
	 * check port doneq
	 */
	headp = NULL;
	if ((reqp = aiop->aio_portq) != NULL) {
		do {
			next = reqp->aio_req_next;
			if (reqp->aio_req_port == port) {
				/* dequeue request and discard event */
				aio_req_remove_portq(aiop, reqp);
				port_free_event(reqp->aio_req_portkev);
				/* put request in temporary queue */
				reqp->aio_req_next = headp;
				headp = reqp;
			}
		} while ((reqp = next) != aiop->aio_portq);
	}
	mutex_exit(&aiop->aio_portq_mutex);

	/* headp points to the list of requests to be discarded */
	for (reqp = headp; reqp != NULL; reqp = next) {
		next = reqp->aio_req_next;
		aphysio_unlock(reqp);
		mutex_enter(&aiop->aio_mutex);
		aio_req_free_port(aiop, reqp);
		mutex_exit(&aiop->aio_mutex);
	}

	if (aiop->aio_flags & AIO_CLEANUP)
		cv_broadcast(&aiop->aio_waitcv);
}

/*
 * aio_cleanup_dr_delete_memory is used by dr's delete_memory_thread
 * to kick start the aio_cleanup_thread for the given process to do the
 * necessary cleanup.
 * This is needed so that delete_memory_thread can obtain writer locks
 * on pages that need to be relocated during a dr memory delete operation;
 * otherwise a deadly embrace (deadlock) may occur.
 */
int
aio_cleanup_dr_delete_memory(proc_t *procp)
{
	struct aio *aiop = procp->p_aio;
	struct as *as = procp->p_as;
	int ret = 0;

	ASSERT(MUTEX_HELD(&procp->p_lock));

	mutex_enter(&as->a_contents);

	if (aiop != NULL) {
		aiop->aio_rqclnup = 1;
		cv_broadcast(&as->a_cv);
		ret = 1;
	}
	mutex_exit(&as->a_contents);
	return (ret);
}