xref: /illumos-gate/usr/src/uts/common/os/aio.c (revision 15c07adc1c7b828006b5e3c4d528b92229d6bd23)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Copyright (c) 2018, Joyent, Inc.
29  */
30 
31 /*
32  * Kernel asynchronous I/O.
33  * This is only for raw devices now (as of Nov. 1993).
34  */
35 
36 #include <sys/types.h>
37 #include <sys/errno.h>
38 #include <sys/conf.h>
39 #include <sys/file.h>
40 #include <sys/fs/snode.h>
41 #include <sys/unistd.h>
42 #include <sys/cmn_err.h>
43 #include <vm/as.h>
44 #include <vm/faultcode.h>
45 #include <sys/sysmacros.h>
46 #include <sys/procfs.h>
47 #include <sys/kmem.h>
48 #include <sys/autoconf.h>
49 #include <sys/ddi_impldefs.h>
50 #include <sys/sunddi.h>
51 #include <sys/aio_impl.h>
52 #include <sys/debug.h>
53 #include <sys/param.h>
54 #include <sys/systm.h>
55 #include <sys/vmsystm.h>
56 #include <sys/fs/pxfs_ki.h>
57 #include <sys/contract/process_impl.h>
58 
59 /*
60  * external entry point.
61  */
62 #ifdef _LP64
63 static int64_t kaioc(long, long, long, long, long, long);
64 #endif
65 static int kaio(ulong_t *, rval_t *);
66 
67 
68 #define	AIO_64	0
69 #define	AIO_32	1
70 #define	AIO_LARGEFILE	2
71 
72 /*
73  * implementation specific functions (private)
74  */
75 #ifdef _LP64
76 static int alio(int, aiocb_t **, int, struct sigevent *);
77 #endif
78 static int aionotify(void);
79 static int aioinit(void);
80 static int aiostart(void);
81 static void alio_cleanup(aio_t *, aiocb_t **, int, int);
82 static int (*check_vp(struct vnode *, int))(vnode_t *, struct aio_req *,
83     cred_t *);
84 static void lio_set_error(aio_req_t *, int portused);
85 static aio_t *aio_aiop_alloc();
86 static int aio_req_alloc(aio_req_t **, aio_result_t *);
87 static int aio_lio_alloc(aio_lio_t **);
88 static aio_req_t *aio_req_done(void *);
89 static aio_req_t *aio_req_remove(aio_req_t *);
90 static int aio_req_find(aio_result_t *, aio_req_t **);
91 static int aio_hash_insert(struct aio_req_t *, aio_t *);
92 static int aio_req_setup(aio_req_t **, aio_t *, aiocb_t *,
93     aio_result_t *, vnode_t *, int);
94 static int aio_cleanup_thread(aio_t *);
95 static aio_lio_t *aio_list_get(aio_result_t *);
96 static void lio_set_uerror(void *, int);
97 extern void aio_zerolen(aio_req_t *);
98 static int aiowait(struct timeval *, int, long	*);
99 static int aiowaitn(void *, uint_t, uint_t *, timespec_t *);
100 static int aio_unlock_requests(caddr_t iocblist, int iocb_index,
101     aio_req_t *reqlist, aio_t *aiop, model_t model);
102 static int aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max);
103 static int aiosuspend(void *, int, struct  timespec *, int,
104     long	*, int);
105 static int aliowait(int, void *, int, void *, int);
106 static int aioerror(void *, int);
107 static int aio_cancel(int, void *, long	*, int);
108 static int arw(int, int, char *, int, offset_t, aio_result_t *, int);
109 static int aiorw(int, void *, int, int);
110 
111 static int alioLF(int, void *, int, void *);
112 static int aio_req_setupLF(aio_req_t **, aio_t *, aiocb64_32_t *,
113     aio_result_t *, vnode_t *, int);
114 static int alio32(int, void *, int, void *);
115 static int driver_aio_write(vnode_t *vp, struct aio_req *aio, cred_t *cred_p);
116 static int driver_aio_read(vnode_t *vp, struct aio_req *aio, cred_t *cred_p);
117 
118 #ifdef  _SYSCALL32_IMPL
119 static void aiocb_LFton(aiocb64_32_t *, aiocb_t *);
120 void	aiocb_32ton(aiocb32_t *, aiocb_t *);
121 #endif /* _SYSCALL32_IMPL */
122 
123 /*
124  * implementation specific functions (external)
125  */
126 void aio_req_free(aio_t *, aio_req_t *);
127 
128 /*
129  * Event Port framework
130  */
131 
132 void aio_req_free_port(aio_t *, aio_req_t *);
133 static int aio_port_callback(void *, int *, pid_t, int, void *);
134 
135 /*
136  * This is the loadable module wrapper.
137  */
138 #include <sys/modctl.h>
139 #include <sys/syscall.h>
140 
141 #ifdef _LP64
142 
143 static struct sysent kaio_sysent = {
144 	6,
145 	SE_NOUNLOAD | SE_64RVAL | SE_ARGC,
146 	(int (*)())kaioc
147 };
148 
149 #ifdef _SYSCALL32_IMPL
150 static struct sysent kaio_sysent32 = {
151 	7,
152 	SE_NOUNLOAD | SE_64RVAL,
153 	kaio
154 };
155 #endif  /* _SYSCALL32_IMPL */
156 
157 #else   /* _LP64 */
158 
159 static struct sysent kaio_sysent = {
160 	7,
161 	SE_NOUNLOAD | SE_32RVAL1,
162 	kaio
163 };
164 
165 #endif  /* _LP64 */
166 
167 /*
168  * Module linkage information for the kernel.
169  */
170 
171 static struct modlsys modlsys = {
172 	&mod_syscallops,
173 	"kernel Async I/O",
174 	&kaio_sysent
175 };
176 
177 #ifdef  _SYSCALL32_IMPL
178 static struct modlsys modlsys32 = {
179 	&mod_syscallops32,
180 	"kernel Async I/O for 32 bit compatibility",
181 	&kaio_sysent32
182 };
183 #endif  /* _SYSCALL32_IMPL */
184 
185 
186 static struct modlinkage modlinkage = {
187 	MODREV_1,
188 	&modlsys,
189 #ifdef  _SYSCALL32_IMPL
190 	&modlsys32,
191 #endif
192 	NULL
193 };
194 
195 int
196 _init(void)
197 {
198 	int retval;
199 
200 	if ((retval = mod_install(&modlinkage)) != 0)
201 		return (retval);
202 
203 	return (0);
204 }
205 
206 int
207 _fini(void)
208 {
209 	int retval;
210 
211 	retval = mod_remove(&modlinkage);
212 
213 	return (retval);
214 }
215 
216 int
217 _info(struct modinfo *modinfop)
218 {
219 	return (mod_info(&modlinkage, modinfop));
220 }
221 
222 #ifdef	_LP64
223 static int64_t
224 kaioc(
225 	long	a0,
226 	long	a1,
227 	long	a2,
228 	long	a3,
229 	long	a4,
230 	long	a5)
231 {
232 	int	error;
233 	long	rval = 0;
234 
235 	switch ((int)a0 & ~AIO_POLL_BIT) {
236 	case AIOREAD:
237 		error = arw((int)a0, (int)a1, (char *)a2, (int)a3,
238 		    (offset_t)a4, (aio_result_t *)a5, FREAD);
239 		break;
240 	case AIOWRITE:
241 		error = arw((int)a0, (int)a1, (char *)a2, (int)a3,
242 		    (offset_t)a4, (aio_result_t *)a5, FWRITE);
243 		break;
244 	case AIOWAIT:
245 		error = aiowait((struct timeval *)a1, (int)a2, &rval);
246 		break;
247 	case AIOWAITN:
248 		error = aiowaitn((void *)a1, (uint_t)a2, (uint_t *)a3,
249 		    (timespec_t *)a4);
250 		break;
251 	case AIONOTIFY:
252 		error = aionotify();
253 		break;
254 	case AIOINIT:
255 		error = aioinit();
256 		break;
257 	case AIOSTART:
258 		error = aiostart();
259 		break;
260 	case AIOLIO:
261 		error = alio((int)a1, (aiocb_t **)a2, (int)a3,
262 		    (struct sigevent *)a4);
263 		break;
264 	case AIOLIOWAIT:
265 		error = aliowait((int)a1, (void *)a2, (int)a3,
266 		    (struct sigevent *)a4, AIO_64);
267 		break;
268 	case AIOSUSPEND:
269 		error = aiosuspend((void *)a1, (int)a2, (timespec_t *)a3,
270 		    (int)a4, &rval, AIO_64);
271 		break;
272 	case AIOERROR:
273 		error = aioerror((void *)a1, AIO_64);
274 		break;
275 	case AIOAREAD:
276 		error = aiorw((int)a0, (void *)a1, FREAD, AIO_64);
277 		break;
278 	case AIOAWRITE:
279 		error = aiorw((int)a0, (void *)a1, FWRITE, AIO_64);
280 		break;
281 	case AIOCANCEL:
282 		error = aio_cancel((int)a1, (void *)a2, &rval, AIO_64);
283 		break;
284 
285 	/*
286 	 * The large file related stuff is valid only for
287 	 * 32 bit kernel and not for 64 bit kernel
288 	 * On 64 bit kernel we convert large file calls
289 	 * to regular 64bit calls.
290 	 */
291 
292 	default:
293 		error = EINVAL;
294 	}
295 	if (error)
296 		return ((int64_t)set_errno(error));
297 	return (rval);
298 }
299 #endif
300 
301 static int
302 kaio(
303 	ulong_t *uap,
304 	rval_t *rvp)
305 {
306 	long rval = 0;
307 	int	error = 0;
308 	offset_t	off;
309 
310 
311 	rvp->r_vals = 0;
312 #if defined(_LITTLE_ENDIAN)
313 	off = ((u_offset_t)uap[5] << 32) | (u_offset_t)uap[4];
314 #else
315 	off = ((u_offset_t)uap[4] << 32) | (u_offset_t)uap[5];
316 #endif
317 
318 	switch (uap[0] & ~AIO_POLL_BIT) {
319 	/*
320 	 * It must be the 32 bit system call on 64 bit kernel
321 	 */
322 	case AIOREAD:
323 		return (arw((int)uap[0], (int)uap[1], (char *)uap[2],
324 		    (int)uap[3], off, (aio_result_t *)uap[6], FREAD));
325 	case AIOWRITE:
326 		return (arw((int)uap[0], (int)uap[1], (char *)uap[2],
327 		    (int)uap[3], off, (aio_result_t *)uap[6], FWRITE));
328 	case AIOWAIT:
329 		error = aiowait((struct	timeval *)uap[1], (int)uap[2],
330 		    &rval);
331 		break;
332 	case AIOWAITN:
333 		error = aiowaitn((void *)uap[1], (uint_t)uap[2],
334 		    (uint_t *)uap[3], (timespec_t *)uap[4]);
335 		break;
336 	case AIONOTIFY:
337 		return (aionotify());
338 	case AIOINIT:
339 		return (aioinit());
340 	case AIOSTART:
341 		return (aiostart());
342 	case AIOLIO:
343 		return (alio32((int)uap[1], (void *)uap[2], (int)uap[3],
344 		    (void *)uap[4]));
345 	case AIOLIOWAIT:
346 		return (aliowait((int)uap[1], (void *)uap[2],
347 		    (int)uap[3], (struct sigevent *)uap[4], AIO_32));
348 	case AIOSUSPEND:
349 		error = aiosuspend((void *)uap[1], (int)uap[2],
350 		    (timespec_t *)uap[3], (int)uap[4],
351 		    &rval, AIO_32);
352 		break;
353 	case AIOERROR:
354 		return (aioerror((void *)uap[1], AIO_32));
355 	case AIOAREAD:
356 		return (aiorw((int)uap[0], (void *)uap[1],
357 		    FREAD, AIO_32));
358 	case AIOAWRITE:
359 		return (aiorw((int)uap[0], (void *)uap[1],
360 		    FWRITE, AIO_32));
361 	case AIOCANCEL:
362 		error = (aio_cancel((int)uap[1], (void *)uap[2], &rval,
363 		    AIO_32));
364 		break;
365 	case AIOLIO64:
366 		return (alioLF((int)uap[1], (void *)uap[2],
367 		    (int)uap[3], (void *)uap[4]));
368 	case AIOLIOWAIT64:
369 		return (aliowait(uap[1], (void *)uap[2],
370 		    (int)uap[3], (void *)uap[4], AIO_LARGEFILE));
371 	case AIOSUSPEND64:
372 		error = aiosuspend((void *)uap[1], (int)uap[2],
373 		    (timespec_t *)uap[3], (int)uap[4], &rval,
374 		    AIO_LARGEFILE);
375 		break;
376 	case AIOERROR64:
377 		return (aioerror((void *)uap[1], AIO_LARGEFILE));
378 	case AIOAREAD64:
379 		return (aiorw((int)uap[0], (void *)uap[1], FREAD,
380 		    AIO_LARGEFILE));
381 	case AIOAWRITE64:
382 		return (aiorw((int)uap[0], (void *)uap[1], FWRITE,
383 		    AIO_LARGEFILE));
384 	case AIOCANCEL64:
385 		error = (aio_cancel((int)uap[1], (void *)uap[2],
386 		    &rval, AIO_LARGEFILE));
387 		break;
388 	default:
389 		return (EINVAL);
390 	}
391 
392 	rvp->r_val1 = rval;
393 	return (error);
394 }
395 
396 /*
397  * wake up LWPs in this process that are sleeping in
398  * aiowait().
399  */
400 static int
401 aionotify(void)
402 {
403 	aio_t	*aiop;
404 
405 	aiop = curproc->p_aio;
406 	if (aiop == NULL)
407 		return (0);
408 
409 	mutex_enter(&aiop->aio_mutex);
410 	aiop->aio_notifycnt++;
411 	cv_broadcast(&aiop->aio_waitcv);
412 	mutex_exit(&aiop->aio_mutex);
413 
414 	return (0);
415 }
416 
417 static int
418 timeval2reltime(struct timeval *timout, timestruc_t *rqtime,
419 	timestruc_t **rqtp, int *blocking)
420 {
421 #ifdef	_SYSCALL32_IMPL
422 	struct timeval32 wait_time_32;
423 #endif
424 	struct timeval wait_time;
425 	model_t	model = get_udatamodel();
426 
427 	*rqtp = NULL;
428 	if (timout == NULL) {		/* wait indefinitely */
429 		*blocking = 1;
430 		return (0);
431 	}
432 
433 	/*
434 	 * Need to correctly compare with the -1 passed in for a user
435 	 * address pointer, with both 32 bit and 64 bit apps.
436 	 */
437 	if (model == DATAMODEL_NATIVE) {
438 		if ((intptr_t)timout == (intptr_t)-1) {	/* don't wait */
439 			*blocking = 0;
440 			return (0);
441 		}
442 
443 		if (copyin(timout, &wait_time, sizeof (wait_time)))
444 			return (EFAULT);
445 	}
446 #ifdef	_SYSCALL32_IMPL
447 	else {
448 		/*
449 		 * -1 from a 32bit app. It will not get sign extended.
450 		 * don't wait if -1.
451 		 */
452 		if ((intptr_t)timout == (intptr_t)((uint32_t)-1)) {
453 			*blocking = 0;
454 			return (0);
455 		}
456 
457 		if (copyin(timout, &wait_time_32, sizeof (wait_time_32)))
458 			return (EFAULT);
459 		TIMEVAL32_TO_TIMEVAL(&wait_time, &wait_time_32);
460 	}
461 #endif  /* _SYSCALL32_IMPL */
462 
463 	if (wait_time.tv_sec == 0 && wait_time.tv_usec == 0) {	/* don't wait */
464 		*blocking = 0;
465 		return (0);
466 	}
467 
468 	if (wait_time.tv_sec < 0 ||
469 	    wait_time.tv_usec < 0 || wait_time.tv_usec >= MICROSEC)
470 		return (EINVAL);
471 
472 	rqtime->tv_sec = wait_time.tv_sec;
473 	rqtime->tv_nsec = wait_time.tv_usec * 1000;
474 	*rqtp = rqtime;
475 	*blocking = 1;
476 
477 	return (0);
478 }
479 
480 static int
481 timespec2reltime(timespec_t *timout, timestruc_t *rqtime,
482 	timestruc_t **rqtp, int *blocking)
483 {
484 #ifdef	_SYSCALL32_IMPL
485 	timespec32_t wait_time_32;
486 #endif
487 	model_t	model = get_udatamodel();
488 
489 	*rqtp = NULL;
490 	if (timout == NULL) {
491 		*blocking = 1;
492 		return (0);
493 	}
494 
495 	if (model == DATAMODEL_NATIVE) {
496 		if (copyin(timout, rqtime, sizeof (*rqtime)))
497 			return (EFAULT);
498 	}
499 #ifdef	_SYSCALL32_IMPL
500 	else {
501 		if (copyin(timout, &wait_time_32, sizeof (wait_time_32)))
502 			return (EFAULT);
503 		TIMESPEC32_TO_TIMESPEC(rqtime, &wait_time_32);
504 	}
505 #endif  /* _SYSCALL32_IMPL */
506 
507 	if (rqtime->tv_sec == 0 && rqtime->tv_nsec == 0) {
508 		*blocking = 0;
509 		return (0);
510 	}
511 
512 	if (rqtime->tv_sec < 0 ||
513 	    rqtime->tv_nsec < 0 || rqtime->tv_nsec >= NANOSEC)
514 		return (EINVAL);
515 
516 	*rqtp = rqtime;
517 	*blocking = 1;
518 
519 	return (0);
520 }
521 
522 /*ARGSUSED*/
523 static int
524 aiowait(
525 	struct timeval	*timout,
526 	int	dontblockflg,
527 	long	*rval)
528 {
529 	int 		error;
530 	aio_t		*aiop;
531 	aio_req_t	*reqp;
532 	clock_t		status;
533 	int		blocking;
534 	int		timecheck;
535 	timestruc_t	rqtime;
536 	timestruc_t	*rqtp;
537 
538 	aiop = curproc->p_aio;
539 	if (aiop == NULL)
540 		return (EINVAL);
541 
542 	/*
543 	 * Establish the absolute future time for the timeout.
544 	 */
545 	error = timeval2reltime(timout, &rqtime, &rqtp, &blocking);
546 	if (error)
547 		return (error);
548 	if (rqtp) {
549 		timestruc_t now;
550 		timecheck = timechanged;
551 		gethrestime(&now);
552 		timespecadd(rqtp, &now);
553 	}
554 
555 	mutex_enter(&aiop->aio_mutex);
556 	for (;;) {
557 		/* process requests on poll queue */
558 		if (aiop->aio_pollq) {
559 			mutex_exit(&aiop->aio_mutex);
560 			aio_cleanup(0);
561 			mutex_enter(&aiop->aio_mutex);
562 		}
563 		if ((reqp = aio_req_remove(NULL)) != NULL) {
564 			*rval = (long)reqp->aio_req_resultp;
565 			break;
566 		}
567 		/* user-level done queue might not be empty */
568 		if (aiop->aio_notifycnt > 0) {
569 			aiop->aio_notifycnt--;
570 			*rval = 1;
571 			break;
572 		}
573 		/* don't block if no outstanding aio */
574 		if (aiop->aio_outstanding == 0 && dontblockflg) {
575 			error = EINVAL;
576 			break;
577 		}
578 		if (blocking) {
579 			status = cv_waituntil_sig(&aiop->aio_waitcv,
580 			    &aiop->aio_mutex, rqtp, timecheck);
581 
582 			if (status > 0)		/* check done queue again */
583 				continue;
584 			if (status == 0) {	/* interrupted by a signal */
585 				error = EINTR;
586 				*rval = -1;
587 			} else {		/* timer expired */
588 				error = ETIME;
589 			}
590 		}
591 		break;
592 	}
593 	mutex_exit(&aiop->aio_mutex);
594 	if (reqp) {
595 		aphysio_unlock(reqp);
596 		aio_copyout_result(reqp);
597 		mutex_enter(&aiop->aio_mutex);
598 		aio_req_free(aiop, reqp);
599 		mutex_exit(&aiop->aio_mutex);
600 	}
601 	return (error);
602 }
603 
604 /*
605  * aiowaitn can be used to reap completed asynchronous requests submitted with
606  * lio_listio, aio_read or aio_write.
607  * This function only reaps asynchronous raw I/Os.
608  */
609 
610 /*ARGSUSED*/
611 static int
612 aiowaitn(void *uiocb, uint_t nent, uint_t *nwait, timespec_t *timout)
613 {
614 	int 		error = 0;
615 	aio_t		*aiop;
616 	aio_req_t	*reqlist = NULL;
617 	caddr_t		iocblist = NULL;	/* array of iocb ptr's */
618 	uint_t		waitcnt, cnt = 0;	/* iocb cnt */
619 	size_t		iocbsz;			/* users iocb size */
620 	size_t		riocbsz;		/* returned iocb size */
621 	int		iocb_index = 0;
622 	model_t		model = get_udatamodel();
623 	int		blocking = 1;
624 	int		timecheck;
625 	timestruc_t	rqtime;
626 	timestruc_t	*rqtp;
627 
628 	aiop = curproc->p_aio;
629 	if (aiop == NULL || nent == 0 || nent > _AIO_LISTIO_MAX)
630 		return (EINVAL);
631 
632 	if (aiop->aio_outstanding == 0)
633 		return (EAGAIN);
634 
635 	if (copyin(nwait, &waitcnt, sizeof (uint_t)))
636 		return (EFAULT);
637 
638 	/* set *nwait to zero, if we must return prematurely */
639 	if (copyout(&cnt, nwait, sizeof (uint_t)))
640 		return (EFAULT);
641 
642 	if (waitcnt == 0) {
643 		blocking = 0;
644 		rqtp = NULL;
645 		waitcnt = nent;
646 	} else {
647 		error = timespec2reltime(timout, &rqtime, &rqtp, &blocking);
648 		if (error)
649 			return (error);
650 	}
651 
652 	if (model == DATAMODEL_NATIVE)
653 		iocbsz = (sizeof (aiocb_t *) * nent);
654 #ifdef	_SYSCALL32_IMPL
655 	else
656 		iocbsz = (sizeof (caddr32_t) * nent);
657 #endif  /* _SYSCALL32_IMPL */
658 
659 	/*
660 	 * Only one aio_waitn call is allowed at a time.
661 	 * The active aio_waitn will collect all requests
662 	 * out of the "done" list and if necessary it will wait
663 	 * for some/all pending requests to fulfill the nwait
664 	 * parameter.
665 	 * A second or further aio_waitn calls will sleep here
666 	 * until the active aio_waitn finishes and leaves the kernel
667 	 * If the second call does not block (poll), then return
668 	 * immediately with the error code : EAGAIN.
669 	 * If the second call should block, then sleep here, but
670 	 * do not touch the timeout. The timeout starts when this
671 	 * aio_waitn-call becomes active.
672 	 */
673 
674 	mutex_enter(&aiop->aio_mutex);
675 
676 	while (aiop->aio_flags & AIO_WAITN) {
677 		if (blocking == 0) {
678 			mutex_exit(&aiop->aio_mutex);
679 			return (EAGAIN);
680 		}
681 
682 		/* block, no timeout */
683 		aiop->aio_flags |= AIO_WAITN_PENDING;
684 		if (!cv_wait_sig(&aiop->aio_waitncv, &aiop->aio_mutex)) {
685 			mutex_exit(&aiop->aio_mutex);
686 			return (EINTR);
687 		}
688 	}
689 
690 	/*
691 	 * Establish the absolute future time for the timeout.
692 	 */
693 	if (rqtp) {
694 		timestruc_t now;
695 		timecheck = timechanged;
696 		gethrestime(&now);
697 		timespecadd(rqtp, &now);
698 	}
699 
700 	if (iocbsz > aiop->aio_iocbsz && aiop->aio_iocb != NULL) {
701 		kmem_free(aiop->aio_iocb, aiop->aio_iocbsz);
702 		aiop->aio_iocb = NULL;
703 	}
704 
705 	if (aiop->aio_iocb == NULL) {
706 		iocblist = kmem_zalloc(iocbsz, KM_NOSLEEP);
707 		if (iocblist == NULL) {
708 			mutex_exit(&aiop->aio_mutex);
709 			return (ENOMEM);
710 		}
711 		aiop->aio_iocb = (aiocb_t **)iocblist;
712 		aiop->aio_iocbsz = iocbsz;
713 	} else {
714 		iocblist = (char *)aiop->aio_iocb;
715 	}
716 
717 	aiop->aio_waitncnt = waitcnt;
718 	aiop->aio_flags |= AIO_WAITN;
719 
720 	for (;;) {
721 		/* push requests on poll queue to done queue */
722 		if (aiop->aio_pollq) {
723 			mutex_exit(&aiop->aio_mutex);
724 			aio_cleanup(0);
725 			mutex_enter(&aiop->aio_mutex);
726 		}
727 
728 		/* check for requests on done queue */
729 		if (aiop->aio_doneq) {
730 			cnt += aio_reqlist_concat(aiop, &reqlist, nent - cnt);
731 			aiop->aio_waitncnt = waitcnt - cnt;
732 		}
733 
734 		/* user-level done queue might not be empty */
735 		if (aiop->aio_notifycnt > 0) {
736 			aiop->aio_notifycnt--;
737 			error = 0;
738 			break;
739 		}
740 
741 		/*
742 		 * if we are here second time as a result of timer
743 		 * expiration, we reset error if there are enough
744 		 * aiocb's to satisfy request.
745 		 * We return also if all requests are already done
746 		 * and we picked up the whole done queue.
747 		 */
748 
749 		if ((cnt >= waitcnt) || (cnt > 0 && aiop->aio_pending == 0 &&
750 		    aiop->aio_doneq == NULL)) {
751 			error = 0;
752 			break;
753 		}
754 
755 		if ((cnt < waitcnt) && blocking) {
756 			int rval = cv_waituntil_sig(&aiop->aio_waitcv,
757 			    &aiop->aio_mutex, rqtp, timecheck);
758 			if (rval > 0)
759 				continue;
760 			if (rval < 0) {
761 				error = ETIME;
762 				blocking = 0;
763 				continue;
764 			}
765 			error = EINTR;
766 		}
767 		break;
768 	}
769 
770 	mutex_exit(&aiop->aio_mutex);
771 
772 	if (cnt > 0) {
773 
774 		iocb_index = aio_unlock_requests(iocblist, iocb_index, reqlist,
775 		    aiop, model);
776 
777 		if (model == DATAMODEL_NATIVE)
778 			riocbsz = (sizeof (aiocb_t *) * cnt);
779 #ifdef	_SYSCALL32_IMPL
780 		else
781 			riocbsz = (sizeof (caddr32_t) * cnt);
782 #endif  /* _SYSCALL32_IMPL */
783 
784 		if (copyout(iocblist, uiocb, riocbsz) ||
785 		    copyout(&cnt, nwait, sizeof (uint_t)))
786 			error = EFAULT;
787 	}
788 
789 	/* check if there is another thread waiting for execution */
790 	mutex_enter(&aiop->aio_mutex);
791 	aiop->aio_flags &= ~AIO_WAITN;
792 	if (aiop->aio_flags & AIO_WAITN_PENDING) {
793 		aiop->aio_flags &= ~AIO_WAITN_PENDING;
794 		cv_signal(&aiop->aio_waitncv);
795 	}
796 	mutex_exit(&aiop->aio_mutex);
797 
798 	return (error);
799 }
800 
801 /*
802  * aio_unlock_requests
803  * copyouts the result of the request as well as the return value.
804  * It builds the list of completed asynchronous requests,
805  * unlocks the allocated memory ranges and
806  * put the aio request structure back into the free list.
807  */
808 
809 static int
810 aio_unlock_requests(
811 	caddr_t	iocblist,
812 	int	iocb_index,
813 	aio_req_t *reqlist,
814 	aio_t	*aiop,
815 	model_t	model)
816 {
817 	aio_req_t	*reqp, *nreqp;
818 
819 	if (model == DATAMODEL_NATIVE) {
820 		for (reqp = reqlist; reqp != NULL;  reqp = nreqp) {
821 			(((caddr_t *)iocblist)[iocb_index++]) =
822 			    reqp->aio_req_iocb.iocb;
823 			nreqp = reqp->aio_req_next;
824 			aphysio_unlock(reqp);
825 			aio_copyout_result(reqp);
826 			mutex_enter(&aiop->aio_mutex);
827 			aio_req_free(aiop, reqp);
828 			mutex_exit(&aiop->aio_mutex);
829 		}
830 	}
831 #ifdef	_SYSCALL32_IMPL
832 	else {
833 		for (reqp = reqlist; reqp != NULL;  reqp = nreqp) {
834 			((caddr32_t *)iocblist)[iocb_index++] =
835 			    reqp->aio_req_iocb.iocb32;
836 			nreqp = reqp->aio_req_next;
837 			aphysio_unlock(reqp);
838 			aio_copyout_result(reqp);
839 			mutex_enter(&aiop->aio_mutex);
840 			aio_req_free(aiop, reqp);
841 			mutex_exit(&aiop->aio_mutex);
842 		}
843 	}
844 #endif	/* _SYSCALL32_IMPL */
845 	return (iocb_index);
846 }
847 
848 /*
849  * aio_reqlist_concat
850  * moves "max" elements from the done queue to the reqlist queue and removes
851  * the AIO_DONEQ flag.
852  * - reqlist queue is a simple linked list
853  * - done queue is a double linked list
854  */
855 
856 static int
857 aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max)
858 {
859 	aio_req_t *q2, *q2work, *list;
860 	int count = 0;
861 
862 	list = *reqlist;
863 	q2 = aiop->aio_doneq;
864 	q2work = q2;
865 	while (max-- > 0) {
866 		q2work->aio_req_flags &= ~AIO_DONEQ;
867 		q2work = q2work->aio_req_next;
868 		count++;
869 		if (q2work == q2)
870 			break;
871 	}
872 
873 	if (q2work == q2) {
874 		/* all elements revised */
875 		q2->aio_req_prev->aio_req_next = list;
876 		list = q2;
877 		aiop->aio_doneq = NULL;
878 	} else {
879 		/*
880 		 * max < elements in the doneq
881 		 * detach only the required amount of elements
882 		 * out of the doneq
883 		 */
884 		q2work->aio_req_prev->aio_req_next = list;
885 		list = q2;
886 
887 		aiop->aio_doneq = q2work;
888 		q2work->aio_req_prev = q2->aio_req_prev;
889 		q2->aio_req_prev->aio_req_next = q2work;
890 	}
891 	*reqlist = list;
892 	return (count);
893 }
894 
895 /*ARGSUSED*/
896 static int
897 aiosuspend(
898 	void	*aiocb,
899 	int	nent,
900 	struct	timespec	*timout,
901 	int	flag,
902 	long	*rval,
903 	int	run_mode)
904 {
905 	int 		error;
906 	aio_t		*aiop;
907 	aio_req_t	*reqp, *found, *next;
908 	caddr_t		cbplist = NULL;
909 	aiocb_t		*cbp, **ucbp;
910 #ifdef	_SYSCALL32_IMPL
911 	aiocb32_t	*cbp32;
912 	caddr32_t	*ucbp32;
913 #endif  /* _SYSCALL32_IMPL */
914 	aiocb64_32_t	*cbp64;
915 	int		rv;
916 	int		i;
917 	size_t		ssize;
918 	model_t		model = get_udatamodel();
919 	int		blocking;
920 	int		timecheck;
921 	timestruc_t	rqtime;
922 	timestruc_t	*rqtp;
923 
924 	aiop = curproc->p_aio;
925 	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
926 		return (EINVAL);
927 
928 	/*
929 	 * Establish the absolute future time for the timeout.
930 	 */
931 	error = timespec2reltime(timout, &rqtime, &rqtp, &blocking);
932 	if (error)
933 		return (error);
934 	if (rqtp) {
935 		timestruc_t now;
936 		timecheck = timechanged;
937 		gethrestime(&now);
938 		timespecadd(rqtp, &now);
939 	}
940 
941 	/*
942 	 * If we are not blocking and there's no IO complete
943 	 * skip aiocb copyin.
944 	 */
945 	if (!blocking && (aiop->aio_pollq == NULL) &&
946 	    (aiop->aio_doneq == NULL)) {
947 		return (EAGAIN);
948 	}
949 
950 	if (model == DATAMODEL_NATIVE)
951 		ssize = (sizeof (aiocb_t *) * nent);
952 #ifdef	_SYSCALL32_IMPL
953 	else
954 		ssize = (sizeof (caddr32_t) * nent);
955 #endif  /* _SYSCALL32_IMPL */
956 
957 	cbplist = kmem_alloc(ssize, KM_NOSLEEP);
958 	if (cbplist == NULL)
959 		return (ENOMEM);
960 
961 	if (copyin(aiocb, cbplist, ssize)) {
962 		error = EFAULT;
963 		goto done;
964 	}
965 
966 	found = NULL;
967 	/*
968 	 * we need to get the aio_cleanupq_mutex since we call
969 	 * aio_req_done().
970 	 */
971 	mutex_enter(&aiop->aio_cleanupq_mutex);
972 	mutex_enter(&aiop->aio_mutex);
973 	for (;;) {
974 		/* push requests on poll queue to done queue */
975 		if (aiop->aio_pollq) {
976 			mutex_exit(&aiop->aio_mutex);
977 			mutex_exit(&aiop->aio_cleanupq_mutex);
978 			aio_cleanup(0);
979 			mutex_enter(&aiop->aio_cleanupq_mutex);
980 			mutex_enter(&aiop->aio_mutex);
981 		}
982 		/* check for requests on done queue */
983 		if (aiop->aio_doneq) {
984 			if (model == DATAMODEL_NATIVE)
985 				ucbp = (aiocb_t **)cbplist;
986 #ifdef	_SYSCALL32_IMPL
987 			else
988 				ucbp32 = (caddr32_t *)cbplist;
989 #endif  /* _SYSCALL32_IMPL */
990 			for (i = 0; i < nent; i++) {
991 				if (model == DATAMODEL_NATIVE) {
992 					if ((cbp = *ucbp++) == NULL)
993 						continue;
994 					if (run_mode != AIO_LARGEFILE)
995 						reqp = aio_req_done(
996 						    &cbp->aio_resultp);
997 					else {
998 						cbp64 = (aiocb64_32_t *)cbp;
999 						reqp = aio_req_done(
1000 						    &cbp64->aio_resultp);
1001 					}
1002 				}
1003 #ifdef	_SYSCALL32_IMPL
1004 				else {
1005 					if (run_mode == AIO_32) {
1006 						if ((cbp32 =
1007 						    (aiocb32_t *)(uintptr_t)
1008 						    *ucbp32++) == NULL)
1009 							continue;
1010 						reqp = aio_req_done(
1011 						    &cbp32->aio_resultp);
1012 					} else if (run_mode == AIO_LARGEFILE) {
1013 						if ((cbp64 =
1014 						    (aiocb64_32_t *)(uintptr_t)
1015 						    *ucbp32++) == NULL)
1016 							continue;
1017 						reqp = aio_req_done(
1018 						    &cbp64->aio_resultp);
1019 					}
1020 
1021 				}
1022 #endif  /* _SYSCALL32_IMPL */
1023 				if (reqp) {
1024 					reqp->aio_req_next = found;
1025 					found = reqp;
1026 				}
1027 				if (aiop->aio_doneq == NULL)
1028 					break;
1029 			}
1030 			if (found)
1031 				break;
1032 		}
1033 		if (aiop->aio_notifycnt > 0) {
1034 			/*
1035 			 * nothing on the kernel's queue. the user
1036 			 * has notified the kernel that it has items
1037 			 * on a user-level queue.
1038 			 */
1039 			aiop->aio_notifycnt--;
1040 			*rval = 1;
1041 			error = 0;
1042 			break;
1043 		}
1044 		/* don't block if nothing is outstanding */
1045 		if (aiop->aio_outstanding == 0) {
1046 			error = EAGAIN;
1047 			break;
1048 		}
1049 		if (blocking) {
1050 			/*
1051 			 * drop the aio_cleanupq_mutex as we are
1052 			 * going to block.
1053 			 */
1054 			mutex_exit(&aiop->aio_cleanupq_mutex);
1055 			rv = cv_waituntil_sig(&aiop->aio_waitcv,
1056 			    &aiop->aio_mutex, rqtp, timecheck);
1057 			/*
1058 			 * we have to drop aio_mutex and
1059 			 * grab it in the right order.
1060 			 */
1061 			mutex_exit(&aiop->aio_mutex);
1062 			mutex_enter(&aiop->aio_cleanupq_mutex);
1063 			mutex_enter(&aiop->aio_mutex);
1064 			if (rv > 0)	/* check done queue again */
1065 				continue;
1066 			if (rv == 0)	/* interrupted by a signal */
1067 				error = EINTR;
1068 			else		/* timer expired */
1069 				error = ETIME;
1070 		} else {
1071 			error = EAGAIN;
1072 		}
1073 		break;
1074 	}
1075 	mutex_exit(&aiop->aio_mutex);
1076 	mutex_exit(&aiop->aio_cleanupq_mutex);
1077 	for (reqp = found; reqp != NULL; reqp = next) {
1078 		next = reqp->aio_req_next;
1079 		aphysio_unlock(reqp);
1080 		aio_copyout_result(reqp);
1081 		mutex_enter(&aiop->aio_mutex);
1082 		aio_req_free(aiop, reqp);
1083 		mutex_exit(&aiop->aio_mutex);
1084 	}
1085 done:
1086 	kmem_free(cbplist, ssize);
1087 	return (error);
1088 }
1089 
1090 /*
1091  * initialize aio by allocating an aio_t struct for this
1092  * process.
1093  */
1094 static int
1095 aioinit(void)
1096 {
1097 	proc_t *p = curproc;
1098 	aio_t *aiop;
1099 	mutex_enter(&p->p_lock);
1100 	if ((aiop = p->p_aio) == NULL) {
1101 		aiop = aio_aiop_alloc();
1102 		p->p_aio = aiop;
1103 	}
1104 	mutex_exit(&p->p_lock);
1105 	if (aiop == NULL)
1106 		return (ENOMEM);
1107 	return (0);
1108 }
1109 
1110 /*
1111  * start a special thread that will cleanup after aio requests
1112  * that are preventing a segment from being unmapped. as_unmap()
1113  * blocks until all phsyio to this segment is completed. this
1114  * doesn't happen until all the pages in this segment are not
1115  * SOFTLOCKed. Some pages will be SOFTLOCKed when there are aio
1116  * requests still outstanding. this special thread will make sure
1117  * that these SOFTLOCKed pages will eventually be SOFTUNLOCKed.
1118  *
1119  * this function will return an error if the process has only
1120  * one LWP. the assumption is that the caller is a separate LWP
1121  * that remains blocked in the kernel for the life of this process.
1122  */
1123 static int
1124 aiostart(void)
1125 {
1126 	proc_t *p = curproc;
1127 	aio_t *aiop;
1128 	int first, error = 0;
1129 
1130 	if (p->p_lwpcnt == 1)
1131 		return (EDEADLK);
1132 	mutex_enter(&p->p_lock);
1133 	if ((aiop = p->p_aio) == NULL)
1134 		error = EINVAL;
1135 	else {
1136 		first = aiop->aio_ok;
1137 		if (aiop->aio_ok == 0)
1138 			aiop->aio_ok = 1;
1139 	}
1140 	mutex_exit(&p->p_lock);
1141 	if (error == 0 && first == 0) {
1142 		return (aio_cleanup_thread(aiop));
1143 		/* should return only to exit */
1144 	}
1145 	return (error);
1146 }
1147 
1148 /*
1149  * Associate an aiocb with a port.
1150  * This function is used by aiorw() to associate a transaction with a port.
1151  * Allocate an event port structure (port_alloc_event()) and store the
1152  * delivered user pointer (portnfy_user) in the portkev_user field of the
1153  * port_kevent_t structure..
1154  * The aio_req_portkev pointer in the aio_req_t structure was added to identify
1155  * the port association.
1156  */
1157 
1158 static int
1159 aio_req_assoc_port_rw(port_notify_t *pntfy, aiocb_t *cbp,
1160 	aio_req_t *reqp, int event)
1161 {
1162 	port_kevent_t	*pkevp = NULL;
1163 	int		error;
1164 
1165 	error = port_alloc_event(pntfy->portnfy_port, PORT_ALLOC_DEFAULT,
1166 	    PORT_SOURCE_AIO, &pkevp);
1167 	if (error) {
1168 		if ((error == ENOMEM) || (error == EAGAIN))
1169 			error = EAGAIN;
1170 		else
1171 			error = EINVAL;
1172 	} else {
1173 		port_init_event(pkevp, (uintptr_t)cbp, pntfy->portnfy_user,
1174 		    aio_port_callback, reqp);
1175 		pkevp->portkev_events = event;
1176 		reqp->aio_req_portkev = pkevp;
1177 		reqp->aio_req_port = pntfy->portnfy_port;
1178 	}
1179 	return (error);
1180 }
1181 
1182 #ifdef _LP64
1183 
1184 /*
1185  * Asynchronous list IO. A chain of aiocb's are copied in
1186  * one at a time. If the aiocb is invalid, it is skipped.
1187  * For each aiocb, the appropriate driver entry point is
1188  * called. Optimize for the common case where the list
1189  * of requests is to the same file descriptor.
1190  *
1191  * One possible optimization is to define a new driver entry
1192  * point that supports a list of IO requests. Whether this
1193  * improves performance depends somewhat on the driver's
1194  * locking strategy. Processing a list could adversely impact
1195  * the driver's interrupt latency.
1196  */
1197 static int
1198 alio(
1199 	int		mode_arg,
1200 	aiocb_t		**aiocb_arg,
1201 	int		nent,
1202 	struct sigevent	*sigev)
1203 {
1204 	file_t		*fp;
1205 	file_t		*prev_fp = NULL;
1206 	int		prev_mode = -1;
1207 	struct vnode	*vp;
1208 	aio_lio_t	*head;
1209 	aio_req_t	*reqp;
1210 	aio_t		*aiop;
1211 	caddr_t		cbplist;
1212 	aiocb_t		cb;
1213 	aiocb_t		*aiocb = &cb;
1214 	aiocb_t		*cbp;
1215 	aiocb_t		**ucbp;
1216 	struct sigevent sigevk;
1217 	sigqueue_t	*sqp;
1218 	int		(*aio_func)();
1219 	int		mode;
1220 	int		error = 0;
1221 	int		aio_errors = 0;
1222 	int		i;
1223 	size_t		ssize;
1224 	int		deadhead = 0;
1225 	int		aio_notsupported = 0;
1226 	int		lio_head_port;
1227 	int		aio_port;
1228 	int		aio_thread;
1229 	port_kevent_t	*pkevtp = NULL;
1230 	int		portused = 0;
1231 	port_notify_t	pnotify;
1232 	int		event;
1233 
1234 	aiop = curproc->p_aio;
1235 	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
1236 		return (EINVAL);
1237 
1238 	ssize = (sizeof (aiocb_t *) * nent);
1239 	cbplist = kmem_alloc(ssize, KM_SLEEP);
1240 	ucbp = (aiocb_t **)cbplist;
1241 
1242 	if (copyin(aiocb_arg, cbplist, ssize) ||
1243 	    (sigev && copyin(sigev, &sigevk, sizeof (struct sigevent)))) {
1244 		kmem_free(cbplist, ssize);
1245 		return (EFAULT);
1246 	}
1247 
1248 	/* Event Ports  */
1249 	if (sigev &&
1250 	    (sigevk.sigev_notify == SIGEV_THREAD ||
1251 	    sigevk.sigev_notify == SIGEV_PORT)) {
1252 		if (sigevk.sigev_notify == SIGEV_THREAD) {
1253 			pnotify.portnfy_port = sigevk.sigev_signo;
1254 			pnotify.portnfy_user = sigevk.sigev_value.sival_ptr;
1255 		} else if (copyin(sigevk.sigev_value.sival_ptr,
1256 		    &pnotify, sizeof (pnotify))) {
1257 			kmem_free(cbplist, ssize);
1258 			return (EFAULT);
1259 		}
1260 		error = port_alloc_event(pnotify.portnfy_port,
1261 		    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp);
1262 		if (error) {
1263 			if (error == ENOMEM || error == EAGAIN)
1264 				error = EAGAIN;
1265 			else
1266 				error = EINVAL;
1267 			kmem_free(cbplist, ssize);
1268 			return (error);
1269 		}
1270 		lio_head_port = pnotify.portnfy_port;
1271 		portused = 1;
1272 	}
1273 
1274 	/*
1275 	 * a list head should be allocated if notification is
1276 	 * enabled for this list.
1277 	 */
1278 	head = NULL;
1279 
1280 	if (mode_arg == LIO_WAIT || sigev) {
1281 		mutex_enter(&aiop->aio_mutex);
1282 		error = aio_lio_alloc(&head);
1283 		mutex_exit(&aiop->aio_mutex);
1284 		if (error)
1285 			goto done;
1286 		deadhead = 1;
1287 		head->lio_nent = nent;
1288 		head->lio_refcnt = nent;
1289 		head->lio_port = -1;
1290 		head->lio_portkev = NULL;
1291 		if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL &&
1292 		    sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) {
1293 			sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
1294 			if (sqp == NULL) {
1295 				error = EAGAIN;
1296 				goto done;
1297 			}
1298 			sqp->sq_func = NULL;
1299 			sqp->sq_next = NULL;
1300 			sqp->sq_info.si_code = SI_ASYNCIO;
1301 			sqp->sq_info.si_pid = curproc->p_pid;
1302 			sqp->sq_info.si_ctid = PRCTID(curproc);
1303 			sqp->sq_info.si_zoneid = getzoneid();
1304 			sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
1305 			sqp->sq_info.si_signo = sigevk.sigev_signo;
1306 			sqp->sq_info.si_value = sigevk.sigev_value;
1307 			head->lio_sigqp = sqp;
1308 		} else {
1309 			head->lio_sigqp = NULL;
1310 		}
1311 		if (pkevtp) {
1312 			/*
1313 			 * Prepare data to send when list of aiocb's
1314 			 * has completed.
1315 			 */
1316 			port_init_event(pkevtp, (uintptr_t)sigev,
1317 			    (void *)(uintptr_t)pnotify.portnfy_user,
1318 			    NULL, head);
1319 			pkevtp->portkev_events = AIOLIO;
1320 			head->lio_portkev = pkevtp;
1321 			head->lio_port = pnotify.portnfy_port;
1322 		}
1323 	}
1324 
1325 	for (i = 0; i < nent; i++, ucbp++) {
1326 
1327 		cbp = *ucbp;
1328 		/* skip entry if it can't be copied. */
1329 		if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb))) {
1330 			if (head) {
1331 				mutex_enter(&aiop->aio_mutex);
1332 				head->lio_nent--;
1333 				head->lio_refcnt--;
1334 				mutex_exit(&aiop->aio_mutex);
1335 			}
1336 			continue;
1337 		}
1338 
1339 		/* skip if opcode for aiocb is LIO_NOP */
1340 		mode = aiocb->aio_lio_opcode;
1341 		if (mode == LIO_NOP) {
1342 			cbp = NULL;
1343 			if (head) {
1344 				mutex_enter(&aiop->aio_mutex);
1345 				head->lio_nent--;
1346 				head->lio_refcnt--;
1347 				mutex_exit(&aiop->aio_mutex);
1348 			}
1349 			continue;
1350 		}
1351 
1352 		/* increment file descriptor's ref count. */
1353 		if ((fp = getf(aiocb->aio_fildes)) == NULL) {
1354 			lio_set_uerror(&cbp->aio_resultp, EBADF);
1355 			if (head) {
1356 				mutex_enter(&aiop->aio_mutex);
1357 				head->lio_nent--;
1358 				head->lio_refcnt--;
1359 				mutex_exit(&aiop->aio_mutex);
1360 			}
1361 			aio_errors++;
1362 			continue;
1363 		}
1364 
1365 		/*
1366 		 * check the permission of the partition
1367 		 */
1368 		if ((fp->f_flag & mode) == 0) {
1369 			releasef(aiocb->aio_fildes);
1370 			lio_set_uerror(&cbp->aio_resultp, EBADF);
1371 			if (head) {
1372 				mutex_enter(&aiop->aio_mutex);
1373 				head->lio_nent--;
1374 				head->lio_refcnt--;
1375 				mutex_exit(&aiop->aio_mutex);
1376 			}
1377 			aio_errors++;
1378 			continue;
1379 		}
1380 
1381 		/*
1382 		 * common case where requests are to the same fd
1383 		 * for the same r/w operation.
1384 		 * for UFS, need to set EBADFD
1385 		 */
1386 		vp = fp->f_vnode;
1387 		if (fp != prev_fp || mode != prev_mode) {
1388 			aio_func = check_vp(vp, mode);
1389 			if (aio_func == NULL) {
1390 				prev_fp = NULL;
1391 				releasef(aiocb->aio_fildes);
1392 				lio_set_uerror(&cbp->aio_resultp, EBADFD);
1393 				aio_notsupported++;
1394 				if (head) {
1395 					mutex_enter(&aiop->aio_mutex);
1396 					head->lio_nent--;
1397 					head->lio_refcnt--;
1398 					mutex_exit(&aiop->aio_mutex);
1399 				}
1400 				continue;
1401 			} else {
1402 				prev_fp = fp;
1403 				prev_mode = mode;
1404 			}
1405 		}
1406 
1407 		error = aio_req_setup(&reqp, aiop, aiocb,
1408 		    &cbp->aio_resultp, vp, 0);
1409 		if (error) {
1410 			releasef(aiocb->aio_fildes);
1411 			lio_set_uerror(&cbp->aio_resultp, error);
1412 			if (head) {
1413 				mutex_enter(&aiop->aio_mutex);
1414 				head->lio_nent--;
1415 				head->lio_refcnt--;
1416 				mutex_exit(&aiop->aio_mutex);
1417 			}
1418 			aio_errors++;
1419 			continue;
1420 		}
1421 
1422 		reqp->aio_req_lio = head;
1423 		deadhead = 0;
1424 
1425 		/*
1426 		 * Set the errno field now before sending the request to
1427 		 * the driver to avoid a race condition
1428 		 */
1429 		(void) suword32(&cbp->aio_resultp.aio_errno,
1430 		    EINPROGRESS);
1431 
1432 		reqp->aio_req_iocb.iocb = (caddr_t)cbp;
1433 
1434 		event = (mode == LIO_READ)? AIOAREAD : AIOAWRITE;
1435 		aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT);
1436 		aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD);
1437 		if (aio_port | aio_thread) {
1438 			port_kevent_t *lpkevp;
1439 			/*
1440 			 * Prepare data to send with each aiocb completed.
1441 			 */
1442 			if (aio_port) {
1443 				void *paddr =
1444 				    aiocb->aio_sigevent.sigev_value.sival_ptr;
1445 				if (copyin(paddr, &pnotify, sizeof (pnotify)))
1446 					error = EFAULT;
1447 			} else {	/* aio_thread */
1448 				pnotify.portnfy_port =
1449 				    aiocb->aio_sigevent.sigev_signo;
1450 				pnotify.portnfy_user =
1451 				    aiocb->aio_sigevent.sigev_value.sival_ptr;
1452 			}
1453 			if (error)
1454 				/* EMPTY */;
1455 			else if (pkevtp != NULL &&
1456 			    pnotify.portnfy_port == lio_head_port)
1457 				error = port_dup_event(pkevtp, &lpkevp,
1458 				    PORT_ALLOC_DEFAULT);
1459 			else
1460 				error = port_alloc_event(pnotify.portnfy_port,
1461 				    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO,
1462 				    &lpkevp);
1463 			if (error == 0) {
1464 				port_init_event(lpkevp, (uintptr_t)cbp,
1465 				    (void *)(uintptr_t)pnotify.portnfy_user,
1466 				    aio_port_callback, reqp);
1467 				lpkevp->portkev_events = event;
1468 				reqp->aio_req_portkev = lpkevp;
1469 				reqp->aio_req_port = pnotify.portnfy_port;
1470 			}
1471 		}
1472 
1473 		/*
1474 		 * send the request to driver.
1475 		 */
1476 		if (error == 0) {
1477 			if (aiocb->aio_nbytes == 0) {
1478 				clear_active_fd(aiocb->aio_fildes);
1479 				aio_zerolen(reqp);
1480 				continue;
1481 			}
1482 			error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
1483 			    CRED());
1484 		}
1485 
1486 		/*
1487 		 * the fd's ref count is not decremented until the IO has
1488 		 * completed unless there was an error.
1489 		 */
1490 		if (error) {
1491 			releasef(aiocb->aio_fildes);
1492 			lio_set_uerror(&cbp->aio_resultp, error);
1493 			if (head) {
1494 				mutex_enter(&aiop->aio_mutex);
1495 				head->lio_nent--;
1496 				head->lio_refcnt--;
1497 				mutex_exit(&aiop->aio_mutex);
1498 			}
1499 			if (error == ENOTSUP)
1500 				aio_notsupported++;
1501 			else
1502 				aio_errors++;
1503 			lio_set_error(reqp, portused);
1504 		} else {
1505 			clear_active_fd(aiocb->aio_fildes);
1506 		}
1507 	}
1508 
1509 	if (aio_notsupported) {
1510 		error = ENOTSUP;
1511 	} else if (aio_errors) {
1512 		/*
1513 		 * return EIO if any request failed
1514 		 */
1515 		error = EIO;
1516 	}
1517 
1518 	if (mode_arg == LIO_WAIT) {
1519 		mutex_enter(&aiop->aio_mutex);
1520 		while (head->lio_refcnt > 0) {
1521 			if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
1522 				mutex_exit(&aiop->aio_mutex);
1523 				error = EINTR;
1524 				goto done;
1525 			}
1526 		}
1527 		mutex_exit(&aiop->aio_mutex);
1528 		alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_64);
1529 	}
1530 
1531 done:
1532 	kmem_free(cbplist, ssize);
1533 	if (deadhead) {
1534 		if (head->lio_sigqp)
1535 			kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
1536 		if (head->lio_portkev)
1537 			port_free_event(head->lio_portkev);
1538 		kmem_free(head, sizeof (aio_lio_t));
1539 	}
1540 	return (error);
1541 }
1542 
1543 #endif /* _LP64 */
1544 
1545 /*
1546  * Asynchronous list IO.
1547  * If list I/O is called with LIO_WAIT it can still return
1548  * before all the I/O's are completed if a signal is caught
1549  * or if the list include UFS I/O requests. If this happens,
1550  * libaio will call aliowait() to wait for the I/O's to
1551  * complete
1552  */
1553 /*ARGSUSED*/
1554 static int
1555 aliowait(
1556 	int	mode,
1557 	void	*aiocb,
1558 	int	nent,
1559 	void	*sigev,
1560 	int	run_mode)
1561 {
1562 	aio_lio_t	*head;
1563 	aio_t		*aiop;
1564 	caddr_t		cbplist;
1565 	aiocb_t		*cbp, **ucbp;
1566 #ifdef	_SYSCALL32_IMPL
1567 	aiocb32_t	*cbp32;
1568 	caddr32_t	*ucbp32;
1569 	aiocb64_32_t	*cbp64;
1570 #endif
1571 	int		error = 0;
1572 	int		i;
1573 	size_t		ssize = 0;
1574 	model_t		model = get_udatamodel();
1575 
1576 	aiop = curproc->p_aio;
1577 	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
1578 		return (EINVAL);
1579 
1580 	if (model == DATAMODEL_NATIVE)
1581 		ssize = (sizeof (aiocb_t *) * nent);
1582 #ifdef	_SYSCALL32_IMPL
1583 	else
1584 		ssize = (sizeof (caddr32_t) * nent);
1585 #endif  /* _SYSCALL32_IMPL */
1586 
1587 	if (ssize == 0)
1588 		return (EINVAL);
1589 
1590 	cbplist = kmem_alloc(ssize, KM_SLEEP);
1591 
1592 	if (model == DATAMODEL_NATIVE)
1593 		ucbp = (aiocb_t **)cbplist;
1594 #ifdef	_SYSCALL32_IMPL
1595 	else
1596 		ucbp32 = (caddr32_t *)cbplist;
1597 #endif  /* _SYSCALL32_IMPL */
1598 
1599 	if (copyin(aiocb, cbplist, ssize)) {
1600 		error = EFAULT;
1601 		goto done;
1602 	}
1603 
1604 	/*
1605 	 * To find the list head, we go through the
1606 	 * list of aiocb structs, find the request
1607 	 * its for, then get the list head that reqp
1608 	 * points to
1609 	 */
1610 	head = NULL;
1611 
1612 	for (i = 0; i < nent; i++) {
1613 		if (model == DATAMODEL_NATIVE) {
1614 			/*
1615 			 * Since we are only checking for a NULL pointer
1616 			 * Following should work on both native data sizes
1617 			 * as well as for largefile aiocb.
1618 			 */
1619 			if ((cbp = *ucbp++) == NULL)
1620 				continue;
1621 			if (run_mode != AIO_LARGEFILE)
1622 				if (head = aio_list_get(&cbp->aio_resultp))
1623 					break;
1624 			else {
1625 				/*
1626 				 * This is a case when largefile call is
1627 				 * made on 32 bit kernel.
1628 				 * Treat each pointer as pointer to
1629 				 * aiocb64_32
1630 				 */
1631 				if (head = aio_list_get((aio_result_t *)
1632 				    &(((aiocb64_32_t *)cbp)->aio_resultp)))
1633 					break;
1634 			}
1635 		}
1636 #ifdef	_SYSCALL32_IMPL
1637 		else {
1638 			if (run_mode == AIO_LARGEFILE) {
1639 				if ((cbp64 = (aiocb64_32_t *)
1640 				    (uintptr_t)*ucbp32++) == NULL)
1641 					continue;
1642 				if (head = aio_list_get((aio_result_t *)
1643 				    &cbp64->aio_resultp))
1644 					break;
1645 			} else if (run_mode == AIO_32) {
1646 				if ((cbp32 = (aiocb32_t *)
1647 				    (uintptr_t)*ucbp32++) == NULL)
1648 					continue;
1649 				if (head = aio_list_get((aio_result_t *)
1650 				    &cbp32->aio_resultp))
1651 					break;
1652 			}
1653 		}
1654 #endif	/* _SYSCALL32_IMPL */
1655 	}
1656 
1657 	if (head == NULL) {
1658 		error = EINVAL;
1659 		goto done;
1660 	}
1661 
1662 	mutex_enter(&aiop->aio_mutex);
1663 	while (head->lio_refcnt > 0) {
1664 		if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
1665 			mutex_exit(&aiop->aio_mutex);
1666 			error = EINTR;
1667 			goto done;
1668 		}
1669 	}
1670 	mutex_exit(&aiop->aio_mutex);
1671 	alio_cleanup(aiop, (aiocb_t **)cbplist, nent, run_mode);
1672 done:
1673 	kmem_free(cbplist, ssize);
1674 	return (error);
1675 }
1676 
1677 aio_lio_t *
1678 aio_list_get(aio_result_t *resultp)
1679 {
1680 	aio_lio_t	*head = NULL;
1681 	aio_t		*aiop;
1682 	aio_req_t 	**bucket;
1683 	aio_req_t 	*reqp;
1684 	long		index;
1685 
1686 	aiop = curproc->p_aio;
1687 	if (aiop == NULL)
1688 		return (NULL);
1689 
1690 	if (resultp) {
1691 		index = AIO_HASH(resultp);
1692 		bucket = &aiop->aio_hash[index];
1693 		for (reqp = *bucket; reqp != NULL;
1694 		    reqp = reqp->aio_hash_next) {
1695 			if (reqp->aio_req_resultp == resultp) {
1696 				head = reqp->aio_req_lio;
1697 				return (head);
1698 			}
1699 		}
1700 	}
1701 	return (NULL);
1702 }
1703 
1704 
1705 static void
1706 lio_set_uerror(void *resultp, int error)
1707 {
1708 	/*
1709 	 * the resultp field is a pointer to where the
1710 	 * error should be written out to the user's
1711 	 * aiocb.
1712 	 *
1713 	 */
1714 	if (get_udatamodel() == DATAMODEL_NATIVE) {
1715 		(void) sulword(&((aio_result_t *)resultp)->aio_return,
1716 		    (ssize_t)-1);
1717 		(void) suword32(&((aio_result_t *)resultp)->aio_errno, error);
1718 	}
1719 #ifdef	_SYSCALL32_IMPL
1720 	else {
1721 		(void) suword32(&((aio_result32_t *)resultp)->aio_return,
1722 		    (uint_t)-1);
1723 		(void) suword32(&((aio_result32_t *)resultp)->aio_errno, error);
1724 	}
1725 #endif  /* _SYSCALL32_IMPL */
1726 }
1727 
1728 /*
1729  * do cleanup completion for all requests in list. memory for
1730  * each request is also freed.
1731  */
1732 static void
1733 alio_cleanup(aio_t *aiop, aiocb_t **cbp, int nent, int run_mode)
1734 {
1735 	int i;
1736 	aio_req_t *reqp;
1737 	aio_result_t *resultp;
1738 	aiocb64_32_t *aiocb_64;
1739 
1740 	for (i = 0; i < nent; i++) {
1741 		if (get_udatamodel() == DATAMODEL_NATIVE) {
1742 			if (cbp[i] == NULL)
1743 				continue;
1744 			if (run_mode == AIO_LARGEFILE) {
1745 				aiocb_64 = (aiocb64_32_t *)cbp[i];
1746 				resultp = (aio_result_t *)
1747 				    &aiocb_64->aio_resultp;
1748 			} else
1749 				resultp = &cbp[i]->aio_resultp;
1750 		}
1751 #ifdef	_SYSCALL32_IMPL
1752 		else {
1753 			aiocb32_t *aiocb_32;
1754 			caddr32_t *cbp32;
1755 
1756 			cbp32 = (caddr32_t *)cbp;
1757 			if (cbp32[i] == NULL)
1758 				continue;
1759 			if (run_mode == AIO_32) {
1760 				aiocb_32 = (aiocb32_t *)(uintptr_t)cbp32[i];
1761 				resultp = (aio_result_t *)&aiocb_32->
1762 				    aio_resultp;
1763 			} else if (run_mode == AIO_LARGEFILE) {
1764 				aiocb_64 = (aiocb64_32_t *)(uintptr_t)cbp32[i];
1765 				resultp = (aio_result_t *)&aiocb_64->
1766 				    aio_resultp;
1767 			}
1768 		}
1769 #endif  /* _SYSCALL32_IMPL */
1770 		/*
1771 		 * we need to get the aio_cleanupq_mutex since we call
1772 		 * aio_req_done().
1773 		 */
1774 		mutex_enter(&aiop->aio_cleanupq_mutex);
1775 		mutex_enter(&aiop->aio_mutex);
1776 		reqp = aio_req_done(resultp);
1777 		mutex_exit(&aiop->aio_mutex);
1778 		mutex_exit(&aiop->aio_cleanupq_mutex);
1779 		if (reqp != NULL) {
1780 			aphysio_unlock(reqp);
1781 			aio_copyout_result(reqp);
1782 			mutex_enter(&aiop->aio_mutex);
1783 			aio_req_free(aiop, reqp);
1784 			mutex_exit(&aiop->aio_mutex);
1785 		}
1786 	}
1787 }
1788 
1789 /*
1790  * Write out the results for an aio request that is done.
1791  */
1792 static int
1793 aioerror(void *cb, int run_mode)
1794 {
1795 	aio_result_t *resultp;
1796 	aio_t *aiop;
1797 	aio_req_t *reqp;
1798 	int retval;
1799 
1800 	aiop = curproc->p_aio;
1801 	if (aiop == NULL || cb == NULL)
1802 		return (EINVAL);
1803 
1804 	if (get_udatamodel() == DATAMODEL_NATIVE) {
1805 		if (run_mode == AIO_LARGEFILE)
1806 			resultp = (aio_result_t *)&((aiocb64_32_t *)cb)->
1807 			    aio_resultp;
1808 		else
1809 			resultp = &((aiocb_t *)cb)->aio_resultp;
1810 	}
1811 #ifdef	_SYSCALL32_IMPL
1812 	else {
1813 		if (run_mode == AIO_LARGEFILE)
1814 			resultp = (aio_result_t *)&((aiocb64_32_t *)cb)->
1815 			    aio_resultp;
1816 		else if (run_mode == AIO_32)
1817 			resultp = (aio_result_t *)&((aiocb32_t *)cb)->
1818 			    aio_resultp;
1819 	}
1820 #endif  /* _SYSCALL32_IMPL */
1821 	/*
1822 	 * we need to get the aio_cleanupq_mutex since we call
1823 	 * aio_req_find().
1824 	 */
1825 	mutex_enter(&aiop->aio_cleanupq_mutex);
1826 	mutex_enter(&aiop->aio_mutex);
1827 	retval = aio_req_find(resultp, &reqp);
1828 	mutex_exit(&aiop->aio_mutex);
1829 	mutex_exit(&aiop->aio_cleanupq_mutex);
1830 	if (retval == 0) {
1831 		aphysio_unlock(reqp);
1832 		aio_copyout_result(reqp);
1833 		mutex_enter(&aiop->aio_mutex);
1834 		aio_req_free(aiop, reqp);
1835 		mutex_exit(&aiop->aio_mutex);
1836 		return (0);
1837 	} else if (retval == 1)
1838 		return (EINPROGRESS);
1839 	else if (retval == 2)
1840 		return (EINVAL);
1841 	return (0);
1842 }
1843 
1844 /*
1845  * 	aio_cancel - if no requests outstanding,
1846  *			return AIO_ALLDONE
1847  *			else
1848  *			return AIO_NOTCANCELED
1849  */
1850 static int
1851 aio_cancel(
1852 	int	fildes,
1853 	void 	*cb,
1854 	long	*rval,
1855 	int	run_mode)
1856 {
1857 	aio_t *aiop;
1858 	void *resultp;
1859 	int index;
1860 	aio_req_t **bucket;
1861 	aio_req_t *ent;
1862 
1863 
1864 	/*
1865 	 * Verify valid file descriptor
1866 	 */
1867 	if ((getf(fildes)) == NULL) {
1868 		return (EBADF);
1869 	}
1870 	releasef(fildes);
1871 
1872 	aiop = curproc->p_aio;
1873 	if (aiop == NULL)
1874 		return (EINVAL);
1875 
1876 	if (aiop->aio_outstanding == 0) {
1877 		*rval = AIO_ALLDONE;
1878 		return (0);
1879 	}
1880 
1881 	mutex_enter(&aiop->aio_mutex);
1882 	if (cb != NULL) {
1883 		if (get_udatamodel() == DATAMODEL_NATIVE) {
1884 			if (run_mode == AIO_LARGEFILE)
1885 				resultp = (aio_result_t *)&((aiocb64_32_t *)cb)
1886 				    ->aio_resultp;
1887 			else
1888 				resultp = &((aiocb_t *)cb)->aio_resultp;
1889 		}
1890 #ifdef	_SYSCALL32_IMPL
1891 		else {
1892 			if (run_mode == AIO_LARGEFILE)
1893 				resultp = (aio_result_t *)&((aiocb64_32_t *)cb)
1894 				    ->aio_resultp;
1895 			else if (run_mode == AIO_32)
1896 				resultp = (aio_result_t *)&((aiocb32_t *)cb)
1897 				    ->aio_resultp;
1898 		}
1899 #endif  /* _SYSCALL32_IMPL */
1900 		index = AIO_HASH(resultp);
1901 		bucket = &aiop->aio_hash[index];
1902 		for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
1903 			if (ent->aio_req_resultp == resultp) {
1904 				if ((ent->aio_req_flags & AIO_PENDING) == 0) {
1905 					mutex_exit(&aiop->aio_mutex);
1906 					*rval = AIO_ALLDONE;
1907 					return (0);
1908 				}
1909 				mutex_exit(&aiop->aio_mutex);
1910 				*rval = AIO_NOTCANCELED;
1911 				return (0);
1912 			}
1913 		}
1914 		mutex_exit(&aiop->aio_mutex);
1915 		*rval = AIO_ALLDONE;
1916 		return (0);
1917 	}
1918 
1919 	for (index = 0; index < AIO_HASHSZ; index++) {
1920 		bucket = &aiop->aio_hash[index];
1921 		for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
1922 			if (ent->aio_req_fd == fildes) {
1923 				if ((ent->aio_req_flags & AIO_PENDING) != 0) {
1924 					mutex_exit(&aiop->aio_mutex);
1925 					*rval = AIO_NOTCANCELED;
1926 					return (0);
1927 				}
1928 			}
1929 		}
1930 	}
1931 	mutex_exit(&aiop->aio_mutex);
1932 	*rval = AIO_ALLDONE;
1933 	return (0);
1934 }
1935 
1936 /*
1937  * solaris version of asynchronous read and write
1938  */
1939 static int
1940 arw(
1941 	int	opcode,
1942 	int	fdes,
1943 	char	*bufp,
1944 	int	bufsize,
1945 	offset_t	offset,
1946 	aio_result_t	*resultp,
1947 	int		mode)
1948 {
1949 	file_t		*fp;
1950 	int		error;
1951 	struct vnode	*vp;
1952 	aio_req_t	*reqp;
1953 	aio_t		*aiop;
1954 	int		(*aio_func)();
1955 #ifdef _LP64
1956 	aiocb_t		aiocb;
1957 #else
1958 	aiocb64_32_t	aiocb64;
1959 #endif
1960 
1961 	aiop = curproc->p_aio;
1962 	if (aiop == NULL)
1963 		return (EINVAL);
1964 
1965 	if ((fp = getf(fdes)) == NULL) {
1966 		return (EBADF);
1967 	}
1968 
1969 	/*
1970 	 * check the permission of the partition
1971 	 */
1972 	if ((fp->f_flag & mode) == 0) {
1973 		releasef(fdes);
1974 		return (EBADF);
1975 	}
1976 
1977 	vp = fp->f_vnode;
1978 	aio_func = check_vp(vp, mode);
1979 	if (aio_func == NULL) {
1980 		releasef(fdes);
1981 		return (EBADFD);
1982 	}
1983 #ifdef _LP64
1984 	aiocb.aio_fildes = fdes;
1985 	aiocb.aio_buf = bufp;
1986 	aiocb.aio_nbytes = bufsize;
1987 	aiocb.aio_offset = offset;
1988 	aiocb.aio_sigevent.sigev_notify = 0;
1989 	error = aio_req_setup(&reqp, aiop, &aiocb, resultp, vp, 1);
1990 #else
1991 	aiocb64.aio_fildes = fdes;
1992 	aiocb64.aio_buf = (caddr32_t)bufp;
1993 	aiocb64.aio_nbytes = bufsize;
1994 	aiocb64.aio_offset = offset;
1995 	aiocb64.aio_sigevent.sigev_notify = 0;
1996 	error = aio_req_setupLF(&reqp, aiop, &aiocb64, resultp, vp, 1);
1997 #endif
1998 	if (error) {
1999 		releasef(fdes);
2000 		return (error);
2001 	}
2002 
2003 	/*
2004 	 * enable polling on this request if the opcode has
2005 	 * the AIO poll bit set
2006 	 */
2007 	if (opcode & AIO_POLL_BIT)
2008 		reqp->aio_req_flags |= AIO_POLL;
2009 
2010 	if (bufsize == 0) {
2011 		clear_active_fd(fdes);
2012 		aio_zerolen(reqp);
2013 		return (0);
2014 	}
2015 	/*
2016 	 * send the request to driver.
2017 	 */
2018 	error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, CRED());
2019 	/*
2020 	 * the fd is stored in the aio_req_t by aio_req_setup(), and
2021 	 * is released by the aio_cleanup_thread() when the IO has
2022 	 * completed.
2023 	 */
2024 	if (error) {
2025 		releasef(fdes);
2026 		mutex_enter(&aiop->aio_mutex);
2027 		aio_req_free(aiop, reqp);
2028 		aiop->aio_pending--;
2029 		if (aiop->aio_flags & AIO_REQ_BLOCK)
2030 			cv_signal(&aiop->aio_cleanupcv);
2031 		mutex_exit(&aiop->aio_mutex);
2032 		return (error);
2033 	}
2034 	clear_active_fd(fdes);
2035 	return (0);
2036 }
2037 
2038 /*
2039  * posix version of asynchronous read and write
2040  */
2041 static int
2042 aiorw(
2043 	int		opcode,
2044 	void		*aiocb_arg,
2045 	int		mode,
2046 	int		run_mode)
2047 {
2048 #ifdef _SYSCALL32_IMPL
2049 	aiocb32_t	aiocb32;
2050 	struct	sigevent32 *sigev32;
2051 	port_notify32_t	pntfy32;
2052 #endif
2053 	aiocb64_32_t	aiocb64;
2054 	aiocb_t		aiocb;
2055 	file_t		*fp;
2056 	int		error, fd;
2057 	size_t		bufsize;
2058 	struct vnode	*vp;
2059 	aio_req_t	*reqp;
2060 	aio_t		*aiop;
2061 	int		(*aio_func)();
2062 	aio_result_t	*resultp;
2063 	struct	sigevent *sigev;
2064 	model_t		model;
2065 	int		aio_use_port = 0;
2066 	port_notify_t	pntfy;
2067 
2068 	model = get_udatamodel();
2069 	aiop = curproc->p_aio;
2070 	if (aiop == NULL)
2071 		return (EINVAL);
2072 
2073 	if (model == DATAMODEL_NATIVE) {
2074 		if (run_mode != AIO_LARGEFILE) {
2075 			if (copyin(aiocb_arg, &aiocb, sizeof (aiocb_t)))
2076 				return (EFAULT);
2077 			bufsize = aiocb.aio_nbytes;
2078 			resultp = &(((aiocb_t *)aiocb_arg)->aio_resultp);
2079 			if ((fp = getf(fd = aiocb.aio_fildes)) == NULL) {
2080 				return (EBADF);
2081 			}
2082 			sigev = &aiocb.aio_sigevent;
2083 		} else {
2084 			/*
2085 			 * We come here only when we make largefile
2086 			 * call on 32 bit kernel using 32 bit library.
2087 			 */
2088 			if (copyin(aiocb_arg, &aiocb64, sizeof (aiocb64_32_t)))
2089 				return (EFAULT);
2090 			bufsize = aiocb64.aio_nbytes;
2091 			resultp = (aio_result_t *)&(((aiocb64_32_t *)aiocb_arg)
2092 			    ->aio_resultp);
2093 			if ((fp = getf(fd = aiocb64.aio_fildes)) == NULL)
2094 				return (EBADF);
2095 			sigev = (struct sigevent *)&aiocb64.aio_sigevent;
2096 		}
2097 
2098 		if (sigev->sigev_notify == SIGEV_PORT) {
2099 			if (copyin((void *)sigev->sigev_value.sival_ptr,
2100 			    &pntfy, sizeof (port_notify_t))) {
2101 				releasef(fd);
2102 				return (EFAULT);
2103 			}
2104 			aio_use_port = 1;
2105 		} else if (sigev->sigev_notify == SIGEV_THREAD) {
2106 			pntfy.portnfy_port = aiocb.aio_sigevent.sigev_signo;
2107 			pntfy.portnfy_user =
2108 			    aiocb.aio_sigevent.sigev_value.sival_ptr;
2109 			aio_use_port = 1;
2110 		}
2111 	}
2112 #ifdef	_SYSCALL32_IMPL
2113 	else {
2114 		if (run_mode == AIO_32) {
2115 			/* 32 bit system call is being made on 64 bit kernel */
2116 			if (copyin(aiocb_arg, &aiocb32, sizeof (aiocb32_t)))
2117 				return (EFAULT);
2118 
2119 			bufsize = aiocb32.aio_nbytes;
2120 			aiocb_32ton(&aiocb32, &aiocb);
2121 			resultp = (aio_result_t *)&(((aiocb32_t *)aiocb_arg)->
2122 			    aio_resultp);
2123 			if ((fp = getf(fd = aiocb32.aio_fildes)) == NULL) {
2124 				return (EBADF);
2125 			}
2126 			sigev32 = &aiocb32.aio_sigevent;
2127 		} else if (run_mode == AIO_LARGEFILE) {
2128 			/*
2129 			 * We come here only when we make largefile
2130 			 * call on 64 bit kernel using 32 bit library.
2131 			 */
2132 			if (copyin(aiocb_arg, &aiocb64, sizeof (aiocb64_32_t)))
2133 				return (EFAULT);
2134 			bufsize = aiocb64.aio_nbytes;
2135 			aiocb_LFton(&aiocb64, &aiocb);
2136 			resultp = (aio_result_t *)&(((aiocb64_32_t *)aiocb_arg)
2137 			    ->aio_resultp);
2138 			if ((fp = getf(fd = aiocb64.aio_fildes)) == NULL)
2139 				return (EBADF);
2140 			sigev32 = &aiocb64.aio_sigevent;
2141 		}
2142 
2143 		if (sigev32->sigev_notify == SIGEV_PORT) {
2144 			if (copyin(
2145 			    (void *)(uintptr_t)sigev32->sigev_value.sival_ptr,
2146 			    &pntfy32, sizeof (port_notify32_t))) {
2147 				releasef(fd);
2148 				return (EFAULT);
2149 			}
2150 			pntfy.portnfy_port = pntfy32.portnfy_port;
2151 			pntfy.portnfy_user = (void *)(uintptr_t)
2152 			    pntfy32.portnfy_user;
2153 			aio_use_port = 1;
2154 		} else if (sigev32->sigev_notify == SIGEV_THREAD) {
2155 			pntfy.portnfy_port = sigev32->sigev_signo;
2156 			pntfy.portnfy_user = (void *)(uintptr_t)
2157 			    sigev32->sigev_value.sival_ptr;
2158 			aio_use_port = 1;
2159 		}
2160 	}
2161 #endif  /* _SYSCALL32_IMPL */
2162 
2163 	/*
2164 	 * check the permission of the partition
2165 	 */
2166 
2167 	if ((fp->f_flag & mode) == 0) {
2168 		releasef(fd);
2169 		return (EBADF);
2170 	}
2171 
2172 	vp = fp->f_vnode;
2173 	aio_func = check_vp(vp, mode);
2174 	if (aio_func == NULL) {
2175 		releasef(fd);
2176 		return (EBADFD);
2177 	}
2178 	if (run_mode == AIO_LARGEFILE)
2179 		error = aio_req_setupLF(&reqp, aiop, &aiocb64, resultp, vp, 0);
2180 	else
2181 		error = aio_req_setup(&reqp, aiop, &aiocb, resultp, vp, 0);
2182 
2183 	if (error) {
2184 		releasef(fd);
2185 		return (error);
2186 	}
2187 	/*
2188 	 * enable polling on this request if the opcode has
2189 	 * the AIO poll bit set
2190 	 */
2191 	if (opcode & AIO_POLL_BIT)
2192 		reqp->aio_req_flags |= AIO_POLL;
2193 
2194 	if (model == DATAMODEL_NATIVE)
2195 		reqp->aio_req_iocb.iocb = aiocb_arg;
2196 #ifdef  _SYSCALL32_IMPL
2197 	else
2198 		reqp->aio_req_iocb.iocb32 = (caddr32_t)(uintptr_t)aiocb_arg;
2199 #endif
2200 
2201 	if (aio_use_port) {
2202 		int event = (run_mode == AIO_LARGEFILE)?
2203 		    ((mode == FREAD)? AIOAREAD64 : AIOAWRITE64) :
2204 		    ((mode == FREAD)? AIOAREAD : AIOAWRITE);
2205 		error = aio_req_assoc_port_rw(&pntfy, aiocb_arg, reqp, event);
2206 	}
2207 
2208 	/*
2209 	 * send the request to driver.
2210 	 */
2211 	if (error == 0) {
2212 		if (bufsize == 0) {
2213 			clear_active_fd(fd);
2214 			aio_zerolen(reqp);
2215 			return (0);
2216 		}
2217 		error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, CRED());
2218 	}
2219 
2220 	/*
2221 	 * the fd is stored in the aio_req_t by aio_req_setup(), and
2222 	 * is released by the aio_cleanup_thread() when the IO has
2223 	 * completed.
2224 	 */
2225 	if (error) {
2226 		releasef(fd);
2227 		mutex_enter(&aiop->aio_mutex);
2228 		if (aio_use_port)
2229 			aio_deq(&aiop->aio_portpending, reqp);
2230 		aio_req_free(aiop, reqp);
2231 		aiop->aio_pending--;
2232 		if (aiop->aio_flags & AIO_REQ_BLOCK)
2233 			cv_signal(&aiop->aio_cleanupcv);
2234 		mutex_exit(&aiop->aio_mutex);
2235 		return (error);
2236 	}
2237 	clear_active_fd(fd);
2238 	return (0);
2239 }
2240 
2241 
2242 /*
2243  * set error for a list IO entry that failed.
2244  */
2245 static void
2246 lio_set_error(aio_req_t *reqp, int portused)
2247 {
2248 	aio_t *aiop = curproc->p_aio;
2249 
2250 	if (aiop == NULL)
2251 		return;
2252 
2253 	mutex_enter(&aiop->aio_mutex);
2254 	if (portused)
2255 		aio_deq(&aiop->aio_portpending, reqp);
2256 	aiop->aio_pending--;
2257 	/* request failed, AIO_PHYSIODONE set to aviod physio cleanup. */
2258 	reqp->aio_req_flags |= AIO_PHYSIODONE;
2259 	/*
2260 	 * Need to free the request now as its never
2261 	 * going to get on the done queue
2262 	 *
2263 	 * Note: aio_outstanding is decremented in
2264 	 *	 aio_req_free()
2265 	 */
2266 	aio_req_free(aiop, reqp);
2267 	if (aiop->aio_flags & AIO_REQ_BLOCK)
2268 		cv_signal(&aiop->aio_cleanupcv);
2269 	mutex_exit(&aiop->aio_mutex);
2270 }
2271 
2272 /*
2273  * check if a specified request is done, and remove it from
2274  * the done queue. otherwise remove anybody from the done queue
2275  * if NULL is specified.
2276  */
2277 static aio_req_t *
2278 aio_req_done(void *resultp)
2279 {
2280 	aio_req_t **bucket;
2281 	aio_req_t *ent;
2282 	aio_t *aiop = curproc->p_aio;
2283 	long index;
2284 
2285 	ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex));
2286 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2287 
2288 	if (resultp) {
2289 		index = AIO_HASH(resultp);
2290 		bucket = &aiop->aio_hash[index];
2291 		for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
2292 			if (ent->aio_req_resultp == (aio_result_t *)resultp) {
2293 				if (ent->aio_req_flags & AIO_DONEQ) {
2294 					return (aio_req_remove(ent));
2295 				}
2296 				return (NULL);
2297 			}
2298 		}
2299 		/* no match, resultp is invalid */
2300 		return (NULL);
2301 	}
2302 	return (aio_req_remove(NULL));
2303 }
2304 
2305 /*
2306  * determine if a user-level resultp pointer is associated with an
2307  * active IO request. Zero is returned when the request is done,
2308  * and the request is removed from the done queue. Only when the
2309  * return value is zero, is the "reqp" pointer valid. One is returned
2310  * when the request is inprogress. Two is returned when the request
2311  * is invalid.
2312  */
2313 static int
2314 aio_req_find(aio_result_t *resultp, aio_req_t **reqp)
2315 {
2316 	aio_req_t **bucket;
2317 	aio_req_t *ent;
2318 	aio_t *aiop = curproc->p_aio;
2319 	long index;
2320 
2321 	ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex));
2322 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2323 
2324 	index = AIO_HASH(resultp);
2325 	bucket = &aiop->aio_hash[index];
2326 	for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
2327 		if (ent->aio_req_resultp == resultp) {
2328 			if (ent->aio_req_flags & AIO_DONEQ) {
2329 				*reqp = aio_req_remove(ent);
2330 				return (0);
2331 			}
2332 			return (1);
2333 		}
2334 	}
2335 	/* no match, resultp is invalid */
2336 	return (2);
2337 }
2338 
2339 /*
2340  * remove a request from the done queue.
2341  */
2342 static aio_req_t *
2343 aio_req_remove(aio_req_t *reqp)
2344 {
2345 	aio_t *aiop = curproc->p_aio;
2346 
2347 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2348 
2349 	if (reqp != NULL) {
2350 		ASSERT(reqp->aio_req_flags & AIO_DONEQ);
2351 		if (reqp->aio_req_next == reqp) {
2352 			/* only one request on queue */
2353 			if (reqp ==  aiop->aio_doneq) {
2354 				aiop->aio_doneq = NULL;
2355 			} else {
2356 				ASSERT(reqp == aiop->aio_cleanupq);
2357 				aiop->aio_cleanupq = NULL;
2358 			}
2359 		} else {
2360 			reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev;
2361 			reqp->aio_req_prev->aio_req_next = reqp->aio_req_next;
2362 			/*
2363 			 * The request can be either on the aio_doneq or the
2364 			 * aio_cleanupq
2365 			 */
2366 			if (reqp == aiop->aio_doneq)
2367 				aiop->aio_doneq = reqp->aio_req_next;
2368 
2369 			if (reqp == aiop->aio_cleanupq)
2370 				aiop->aio_cleanupq = reqp->aio_req_next;
2371 		}
2372 		reqp->aio_req_flags &= ~AIO_DONEQ;
2373 		reqp->aio_req_next = NULL;
2374 		reqp->aio_req_prev = NULL;
2375 	} else if ((reqp = aiop->aio_doneq) != NULL) {
2376 		ASSERT(reqp->aio_req_flags & AIO_DONEQ);
2377 		if (reqp == reqp->aio_req_next) {
2378 			/* only one request on queue */
2379 			aiop->aio_doneq = NULL;
2380 		} else {
2381 			reqp->aio_req_prev->aio_req_next = reqp->aio_req_next;
2382 			reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev;
2383 			aiop->aio_doneq = reqp->aio_req_next;
2384 		}
2385 		reqp->aio_req_flags &= ~AIO_DONEQ;
2386 		reqp->aio_req_next = NULL;
2387 		reqp->aio_req_prev = NULL;
2388 	}
2389 	if (aiop->aio_doneq == NULL && (aiop->aio_flags & AIO_WAITN))
2390 		cv_broadcast(&aiop->aio_waitcv);
2391 	return (reqp);
2392 }
2393 
2394 static int
2395 aio_req_setup(
2396 	aio_req_t	**reqpp,
2397 	aio_t 		*aiop,
2398 	aiocb_t 	*arg,
2399 	aio_result_t 	*resultp,
2400 	vnode_t		*vp,
2401 	int		old_solaris_req)
2402 {
2403 	sigqueue_t	*sqp = NULL;
2404 	aio_req_t 	*reqp;
2405 	struct uio 	*uio;
2406 	struct sigevent *sigev;
2407 	int		error;
2408 
2409 	sigev = &arg->aio_sigevent;
2410 	if (sigev->sigev_notify == SIGEV_SIGNAL &&
2411 	    sigev->sigev_signo > 0 && sigev->sigev_signo < NSIG) {
2412 		sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
2413 		if (sqp == NULL)
2414 			return (EAGAIN);
2415 		sqp->sq_func = NULL;
2416 		sqp->sq_next = NULL;
2417 		sqp->sq_info.si_code = SI_ASYNCIO;
2418 		sqp->sq_info.si_pid = curproc->p_pid;
2419 		sqp->sq_info.si_ctid = PRCTID(curproc);
2420 		sqp->sq_info.si_zoneid = getzoneid();
2421 		sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
2422 		sqp->sq_info.si_signo = sigev->sigev_signo;
2423 		sqp->sq_info.si_value = sigev->sigev_value;
2424 	}
2425 
2426 	mutex_enter(&aiop->aio_mutex);
2427 
2428 	if (aiop->aio_flags & AIO_REQ_BLOCK) {
2429 		mutex_exit(&aiop->aio_mutex);
2430 		if (sqp)
2431 			kmem_free(sqp, sizeof (sigqueue_t));
2432 		return (EIO);
2433 	}
2434 	/*
2435 	 * get an aio_reqp from the free list or allocate one
2436 	 * from dynamic memory.
2437 	 */
2438 	if (error = aio_req_alloc(&reqp, resultp)) {
2439 		mutex_exit(&aiop->aio_mutex);
2440 		if (sqp)
2441 			kmem_free(sqp, sizeof (sigqueue_t));
2442 		return (error);
2443 	}
2444 	aiop->aio_pending++;
2445 	aiop->aio_outstanding++;
2446 	reqp->aio_req_flags = AIO_PENDING;
2447 	if (old_solaris_req) {
2448 		/* this is an old solaris aio request */
2449 		reqp->aio_req_flags |= AIO_SOLARIS;
2450 		aiop->aio_flags |= AIO_SOLARIS_REQ;
2451 	}
2452 	if (sigev->sigev_notify == SIGEV_THREAD ||
2453 	    sigev->sigev_notify == SIGEV_PORT)
2454 		aio_enq(&aiop->aio_portpending, reqp, 0);
2455 	mutex_exit(&aiop->aio_mutex);
2456 	/*
2457 	 * initialize aio request.
2458 	 */
2459 	reqp->aio_req_fd = arg->aio_fildes;
2460 	reqp->aio_req_sigqp = sqp;
2461 	reqp->aio_req_iocb.iocb = NULL;
2462 	reqp->aio_req_lio = NULL;
2463 	reqp->aio_req_buf.b_file = vp;
2464 	uio = reqp->aio_req.aio_uio;
2465 	uio->uio_iovcnt = 1;
2466 	uio->uio_iov->iov_base = (caddr_t)arg->aio_buf;
2467 	uio->uio_iov->iov_len = arg->aio_nbytes;
2468 	uio->uio_loffset = arg->aio_offset;
2469 	*reqpp = reqp;
2470 	return (0);
2471 }
2472 
2473 /*
2474  * Allocate p_aio struct.
2475  */
2476 static aio_t *
2477 aio_aiop_alloc(void)
2478 {
2479 	aio_t	*aiop;
2480 
2481 	ASSERT(MUTEX_HELD(&curproc->p_lock));
2482 
2483 	aiop = kmem_zalloc(sizeof (struct aio), KM_NOSLEEP);
2484 	if (aiop) {
2485 		mutex_init(&aiop->aio_mutex, NULL, MUTEX_DEFAULT, NULL);
2486 		mutex_init(&aiop->aio_cleanupq_mutex, NULL, MUTEX_DEFAULT,
2487 		    NULL);
2488 		mutex_init(&aiop->aio_portq_mutex, NULL, MUTEX_DEFAULT, NULL);
2489 	}
2490 	return (aiop);
2491 }
2492 
2493 /*
2494  * Allocate an aio_req struct.
2495  */
2496 static int
2497 aio_req_alloc(aio_req_t **nreqp, aio_result_t *resultp)
2498 {
2499 	aio_req_t *reqp;
2500 	aio_t *aiop = curproc->p_aio;
2501 
2502 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2503 
2504 	if ((reqp = aiop->aio_free) != NULL) {
2505 		aiop->aio_free = reqp->aio_req_next;
2506 		bzero(reqp, sizeof (*reqp));
2507 	} else {
2508 		/*
2509 		 * Check whether memory is getting tight.
2510 		 * This is a temporary mechanism to avoid memory
2511 		 * exhaustion by a single process until we come up
2512 		 * with a per process solution such as setrlimit().
2513 		 */
2514 		if (freemem < desfree)
2515 			return (EAGAIN);
2516 		reqp = kmem_zalloc(sizeof (struct aio_req_t), KM_NOSLEEP);
2517 		if (reqp == NULL)
2518 			return (EAGAIN);
2519 	}
2520 	reqp->aio_req.aio_uio = &reqp->aio_req_uio;
2521 	reqp->aio_req.aio_uio->uio_iov = &reqp->aio_req_iov;
2522 	reqp->aio_req.aio_private = reqp;
2523 	reqp->aio_req_buf.b_offset = -1;
2524 	reqp->aio_req_resultp = resultp;
2525 	if (aio_hash_insert(reqp, aiop)) {
2526 		reqp->aio_req_next = aiop->aio_free;
2527 		aiop->aio_free = reqp;
2528 		return (EBUSY);
2529 	}
2530 	*nreqp = reqp;
2531 	return (0);
2532 }
2533 
2534 /*
2535  * Allocate an aio_lio_t struct.
2536  */
2537 static int
2538 aio_lio_alloc(aio_lio_t **head)
2539 {
2540 	aio_lio_t *liop;
2541 	aio_t *aiop = curproc->p_aio;
2542 
2543 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2544 
2545 	if ((liop = aiop->aio_lio_free) != NULL) {
2546 		aiop->aio_lio_free = liop->lio_next;
2547 	} else {
2548 		/*
2549 		 * Check whether memory is getting tight.
2550 		 * This is a temporary mechanism to avoid memory
2551 		 * exhaustion by a single process until we come up
2552 		 * with a per process solution such as setrlimit().
2553 		 */
2554 		if (freemem < desfree)
2555 			return (EAGAIN);
2556 
2557 		liop = kmem_zalloc(sizeof (aio_lio_t), KM_NOSLEEP);
2558 		if (liop == NULL)
2559 			return (EAGAIN);
2560 	}
2561 	*head = liop;
2562 	return (0);
2563 }
2564 
2565 /*
2566  * this is a special per-process thread that is only activated if
2567  * the process is unmapping a segment with outstanding aio. normally,
2568  * the process will have completed the aio before unmapping the
2569  * segment. If the process does unmap a segment with outstanding aio,
2570  * this special thread will guarentee that the locked pages due to
2571  * aphysio() are released, thereby permitting the segment to be
2572  * unmapped. In addition to this, the cleanup thread is woken up
2573  * during DR operations to release the locked pages.
2574  */
2575 
2576 static int
2577 aio_cleanup_thread(aio_t *aiop)
2578 {
2579 	proc_t *p = curproc;
2580 	struct as *as = p->p_as;
2581 	int poked = 0;
2582 	kcondvar_t *cvp;
2583 	int exit_flag = 0;
2584 	int rqclnup = 0;
2585 
2586 	sigfillset(&curthread->t_hold);
2587 	sigdiffset(&curthread->t_hold, &cantmask);
2588 	for (;;) {
2589 		/*
2590 		 * if a segment is being unmapped, and the current
2591 		 * process's done queue is not empty, then every request
2592 		 * on the doneq with locked resources should be forced
2593 		 * to release their locks. By moving the doneq request
2594 		 * to the cleanupq, aio_cleanup() will process the cleanupq,
2595 		 * and place requests back onto the doneq. All requests
2596 		 * processed by aio_cleanup() will have their physical
2597 		 * resources unlocked.
2598 		 */
2599 		mutex_enter(&aiop->aio_mutex);
2600 		if ((aiop->aio_flags & AIO_CLEANUP) == 0) {
2601 			aiop->aio_flags |= AIO_CLEANUP;
2602 			mutex_enter(&as->a_contents);
2603 			if (aiop->aio_rqclnup) {
2604 				aiop->aio_rqclnup = 0;
2605 				rqclnup = 1;
2606 			}
2607 			mutex_exit(&as->a_contents);
2608 			if (aiop->aio_doneq) {
2609 				aio_req_t *doneqhead = aiop->aio_doneq;
2610 				aiop->aio_doneq = NULL;
2611 				aio_cleanupq_concat(aiop, doneqhead, AIO_DONEQ);
2612 			}
2613 		}
2614 		mutex_exit(&aiop->aio_mutex);
2615 		aio_cleanup(AIO_CLEANUP_THREAD);
2616 		/*
2617 		 * thread should block on the cleanupcv while
2618 		 * AIO_CLEANUP is set.
2619 		 */
2620 		cvp = &aiop->aio_cleanupcv;
2621 		mutex_enter(&aiop->aio_mutex);
2622 
2623 		if (aiop->aio_pollq != NULL || aiop->aio_cleanupq != NULL ||
2624 		    aiop->aio_notifyq != NULL ||
2625 		    aiop->aio_portcleanupq != NULL) {
2626 			mutex_exit(&aiop->aio_mutex);
2627 			continue;
2628 		}
2629 		mutex_enter(&as->a_contents);
2630 
2631 		/*
2632 		 * AIO_CLEANUP determines when the cleanup thread
2633 		 * should be active. This flag is set when
2634 		 * the cleanup thread is awakened by as_unmap() or
2635 		 * due to DR operations.
2636 		 * The flag is cleared when the blocking as_unmap()
2637 		 * that originally awakened us is allowed to
2638 		 * complete. as_unmap() blocks when trying to
2639 		 * unmap a segment that has SOFTLOCKed pages. when
2640 		 * the segment's pages are all SOFTUNLOCKed,
2641 		 * as->a_flags & AS_UNMAPWAIT should be zero.
2642 		 *
2643 		 * In case of cleanup request by DR, the flag is cleared
2644 		 * once all the pending aio requests have been processed.
2645 		 *
2646 		 * The flag shouldn't be cleared right away if the
2647 		 * cleanup thread was interrupted because the process
2648 		 * is doing forkall(). This happens when cv_wait_sig()
2649 		 * returns zero, because it was awakened by a pokelwps().
2650 		 * If the process is not exiting, it must be doing forkall().
2651 		 */
2652 		if ((poked == 0) &&
2653 		    ((!rqclnup && (AS_ISUNMAPWAIT(as) == 0)) ||
2654 		    (aiop->aio_pending == 0))) {
2655 			aiop->aio_flags &= ~(AIO_CLEANUP | AIO_CLEANUP_PORT);
2656 			cvp = &as->a_cv;
2657 			rqclnup = 0;
2658 		}
2659 		mutex_exit(&aiop->aio_mutex);
2660 		if (poked) {
2661 			/*
2662 			 * If the process is exiting/killed, don't return
2663 			 * immediately without waiting for pending I/O's
2664 			 * and releasing the page locks.
2665 			 */
2666 			if (p->p_flag & (SEXITLWPS|SKILLED)) {
2667 				/*
2668 				 * If exit_flag is set, then it is
2669 				 * safe to exit because we have released
2670 				 * page locks of completed I/O's.
2671 				 */
2672 				if (exit_flag)
2673 					break;
2674 
2675 				mutex_exit(&as->a_contents);
2676 
2677 				/*
2678 				 * Wait for all the pending aio to complete.
2679 				 */
2680 				mutex_enter(&aiop->aio_mutex);
2681 				aiop->aio_flags |= AIO_REQ_BLOCK;
2682 				while (aiop->aio_pending != 0)
2683 					cv_wait(&aiop->aio_cleanupcv,
2684 					    &aiop->aio_mutex);
2685 				mutex_exit(&aiop->aio_mutex);
2686 				exit_flag = 1;
2687 				continue;
2688 			} else if (p->p_flag &
2689 			    (SHOLDFORK|SHOLDFORK1|SHOLDWATCH)) {
2690 				/*
2691 				 * hold LWP until it
2692 				 * is continued.
2693 				 */
2694 				mutex_exit(&as->a_contents);
2695 				mutex_enter(&p->p_lock);
2696 				stop(PR_SUSPENDED, SUSPEND_NORMAL);
2697 				mutex_exit(&p->p_lock);
2698 				poked = 0;
2699 				continue;
2700 			}
2701 		} else {
2702 			/*
2703 			 * When started this thread will sleep on as->a_cv.
2704 			 * as_unmap will awake this thread if the
2705 			 * segment has SOFTLOCKed pages (poked = 0).
2706 			 * 1. pokelwps() awakes this thread =>
2707 			 *    break the loop to check SEXITLWPS, SHOLDFORK, etc
2708 			 * 2. as_unmap awakes this thread =>
2709 			 *    to break the loop it is necessary that
2710 			 *    - AS_UNMAPWAIT is set (as_unmap is waiting for
2711 			 *	memory to be unlocked)
2712 			 *    - AIO_CLEANUP is not set
2713 			 *	(if AIO_CLEANUP is set we have to wait for
2714 			 *	pending requests. aio_done will send a signal
2715 			 *	for every request which completes to continue
2716 			 *	unmapping the corresponding address range)
2717 			 * 3. A cleanup request will wake this thread up, ex.
2718 			 *    by the DR operations. The aio_rqclnup flag will
2719 			 *    be set.
2720 			 */
2721 			while (poked == 0) {
2722 				/*
2723 				 * The clean up requests that came in
2724 				 * after we had just cleaned up, couldn't
2725 				 * be causing the unmap thread to block - as
2726 				 * unmap event happened first.
2727 				 * Let aio_done() wake us up if it sees a need.
2728 				 */
2729 				if (aiop->aio_rqclnup &&
2730 				    (aiop->aio_flags & AIO_CLEANUP) == 0)
2731 					break;
2732 				poked = !cv_wait_sig(cvp, &as->a_contents);
2733 				if (AS_ISUNMAPWAIT(as) == 0)
2734 					cv_signal(cvp);
2735 				if (aiop->aio_outstanding != 0)
2736 					break;
2737 			}
2738 		}
2739 		mutex_exit(&as->a_contents);
2740 	}
2741 exit:
2742 	mutex_exit(&as->a_contents);
2743 	ASSERT((curproc->p_flag & (SEXITLWPS|SKILLED)));
2744 	aston(curthread);	/* make thread do post_syscall */
2745 	return (0);
2746 }
2747 
2748 /*
2749  * save a reference to a user's outstanding aio in a hash list.
2750  */
2751 static int
2752 aio_hash_insert(
2753 	aio_req_t *aio_reqp,
2754 	aio_t *aiop)
2755 {
2756 	long index;
2757 	aio_result_t *resultp = aio_reqp->aio_req_resultp;
2758 	aio_req_t *current;
2759 	aio_req_t **nextp;
2760 
2761 	index = AIO_HASH(resultp);
2762 	nextp = &aiop->aio_hash[index];
2763 	while ((current = *nextp) != NULL) {
2764 		if (current->aio_req_resultp == resultp)
2765 			return (DUPLICATE);
2766 		nextp = &current->aio_hash_next;
2767 	}
2768 	*nextp = aio_reqp;
2769 	aio_reqp->aio_hash_next = NULL;
2770 	return (0);
2771 }
2772 
2773 static int
2774 (*check_vp(struct vnode *vp, int mode))(vnode_t *, struct aio_req *,
2775     cred_t *)
2776 {
2777 	struct snode *sp;
2778 	dev_t		dev;
2779 	struct cb_ops  	*cb;
2780 	major_t		major;
2781 	int		(*aio_func)();
2782 
2783 	dev = vp->v_rdev;
2784 	major = getmajor(dev);
2785 
2786 	/*
2787 	 * return NULL for requests to files and STREAMs so
2788 	 * that libaio takes care of them.
2789 	 */
2790 	if (vp->v_type == VCHR) {
2791 		/* no stream device for kaio */
2792 		if (STREAMSTAB(major)) {
2793 			return (NULL);
2794 		}
2795 	} else {
2796 		return (NULL);
2797 	}
2798 
2799 	/*
2800 	 * Check old drivers which do not have async I/O entry points.
2801 	 */
2802 	if (devopsp[major]->devo_rev < 3)
2803 		return (NULL);
2804 
2805 	cb = devopsp[major]->devo_cb_ops;
2806 
2807 	if (cb->cb_rev < 1)
2808 		return (NULL);
2809 
2810 	/*
2811 	 * Check whether this device is a block device.
2812 	 * Kaio is not supported for devices like tty.
2813 	 */
2814 	if (cb->cb_strategy == nodev || cb->cb_strategy == NULL)
2815 		return (NULL);
2816 
2817 	/*
2818 	 * Clustering: If vnode is a PXFS vnode, then the device may be remote.
2819 	 * We cannot call the driver directly. Instead return the
2820 	 * PXFS functions.
2821 	 */
2822 
2823 	if (IS_PXFSVP(vp)) {
2824 		if (mode & FREAD)
2825 			return (clpxfs_aio_read);
2826 		else
2827 			return (clpxfs_aio_write);
2828 	}
2829 	if (mode & FREAD)
2830 		aio_func = (cb->cb_aread == nodev) ? NULL : driver_aio_read;
2831 	else
2832 		aio_func = (cb->cb_awrite == nodev) ? NULL : driver_aio_write;
2833 
2834 	/*
2835 	 * Do we need this ?
2836 	 * nodev returns ENXIO anyway.
2837 	 */
2838 	if (aio_func == nodev)
2839 		return (NULL);
2840 
2841 	sp = VTOS(vp);
2842 	smark(sp, SACC);
2843 	return (aio_func);
2844 }
2845 
2846 /*
2847  * Clustering: We want check_vp to return a function prototyped
2848  * correctly that will be common to both PXFS and regular case.
2849  * We define this intermediate function that will do the right
2850  * thing for driver cases.
2851  */
2852 
2853 static int
2854 driver_aio_write(vnode_t *vp, struct aio_req *aio, cred_t *cred_p)
2855 {
2856 	dev_t dev;
2857 	struct cb_ops  	*cb;
2858 
2859 	ASSERT(vp->v_type == VCHR);
2860 	ASSERT(!IS_PXFSVP(vp));
2861 	dev = VTOS(vp)->s_dev;
2862 	ASSERT(STREAMSTAB(getmajor(dev)) == NULL);
2863 
2864 	cb = devopsp[getmajor(dev)]->devo_cb_ops;
2865 
2866 	ASSERT(cb->cb_awrite != nodev);
2867 	return ((*cb->cb_awrite)(dev, aio, cred_p));
2868 }
2869 
2870 /*
2871  * Clustering: We want check_vp to return a function prototyped
2872  * correctly that will be common to both PXFS and regular case.
2873  * We define this intermediate function that will do the right
2874  * thing for driver cases.
2875  */
2876 
2877 static int
2878 driver_aio_read(vnode_t *vp, struct aio_req *aio, cred_t *cred_p)
2879 {
2880 	dev_t dev;
2881 	struct cb_ops  	*cb;
2882 
2883 	ASSERT(vp->v_type == VCHR);
2884 	ASSERT(!IS_PXFSVP(vp));
2885 	dev = VTOS(vp)->s_dev;
2886 	ASSERT(!STREAMSTAB(getmajor(dev)));
2887 
2888 	cb = devopsp[getmajor(dev)]->devo_cb_ops;
2889 
2890 	ASSERT(cb->cb_aread != nodev);
2891 	return ((*cb->cb_aread)(dev, aio, cred_p));
2892 }
2893 
2894 /*
2895  * This routine is called when a largefile call is made by a 32bit
2896  * process on a ILP32 or LP64 kernel. All 64bit processes are large
2897  * file by definition and will call alio() instead.
2898  */
2899 static int
2900 alioLF(
2901 	int		mode_arg,
2902 	void		*aiocb_arg,
2903 	int		nent,
2904 	void		*sigev)
2905 {
2906 	file_t		*fp;
2907 	file_t		*prev_fp = NULL;
2908 	int		prev_mode = -1;
2909 	struct vnode	*vp;
2910 	aio_lio_t	*head;
2911 	aio_req_t	*reqp;
2912 	aio_t		*aiop;
2913 	caddr_t		cbplist;
2914 	aiocb64_32_t	cb64;
2915 	aiocb64_32_t	*aiocb = &cb64;
2916 	aiocb64_32_t	*cbp;
2917 	caddr32_t	*ucbp;
2918 #ifdef _LP64
2919 	aiocb_t		aiocb_n;
2920 #endif
2921 	struct sigevent32	sigevk;
2922 	sigqueue_t	*sqp;
2923 	int		(*aio_func)();
2924 	int		mode;
2925 	int		error = 0;
2926 	int		aio_errors = 0;
2927 	int		i;
2928 	size_t		ssize;
2929 	int		deadhead = 0;
2930 	int		aio_notsupported = 0;
2931 	int		lio_head_port;
2932 	int		aio_port;
2933 	int		aio_thread;
2934 	port_kevent_t	*pkevtp = NULL;
2935 	int		portused = 0;
2936 	port_notify32_t	pnotify;
2937 	int		event;
2938 
2939 	aiop = curproc->p_aio;
2940 	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
2941 		return (EINVAL);
2942 
2943 	ASSERT(get_udatamodel() == DATAMODEL_ILP32);
2944 
2945 	ssize = (sizeof (caddr32_t) * nent);
2946 	cbplist = kmem_alloc(ssize, KM_SLEEP);
2947 	ucbp = (caddr32_t *)cbplist;
2948 
2949 	if (copyin(aiocb_arg, cbplist, ssize) ||
2950 	    (sigev && copyin(sigev, &sigevk, sizeof (sigevk)))) {
2951 		kmem_free(cbplist, ssize);
2952 		return (EFAULT);
2953 	}
2954 
2955 	/* Event Ports  */
2956 	if (sigev &&
2957 	    (sigevk.sigev_notify == SIGEV_THREAD ||
2958 	    sigevk.sigev_notify == SIGEV_PORT)) {
2959 		if (sigevk.sigev_notify == SIGEV_THREAD) {
2960 			pnotify.portnfy_port = sigevk.sigev_signo;
2961 			pnotify.portnfy_user = sigevk.sigev_value.sival_ptr;
2962 		} else if (copyin(
2963 		    (void *)(uintptr_t)sigevk.sigev_value.sival_ptr,
2964 		    &pnotify, sizeof (pnotify))) {
2965 			kmem_free(cbplist, ssize);
2966 			return (EFAULT);
2967 		}
2968 		error = port_alloc_event(pnotify.portnfy_port,
2969 		    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp);
2970 		if (error) {
2971 			if (error == ENOMEM || error == EAGAIN)
2972 				error = EAGAIN;
2973 			else
2974 				error = EINVAL;
2975 			kmem_free(cbplist, ssize);
2976 			return (error);
2977 		}
2978 		lio_head_port = pnotify.portnfy_port;
2979 		portused = 1;
2980 	}
2981 
2982 	/*
2983 	 * a list head should be allocated if notification is
2984 	 * enabled for this list.
2985 	 */
2986 	head = NULL;
2987 
2988 	if (mode_arg == LIO_WAIT || sigev) {
2989 		mutex_enter(&aiop->aio_mutex);
2990 		error = aio_lio_alloc(&head);
2991 		mutex_exit(&aiop->aio_mutex);
2992 		if (error)
2993 			goto done;
2994 		deadhead = 1;
2995 		head->lio_nent = nent;
2996 		head->lio_refcnt = nent;
2997 		head->lio_port = -1;
2998 		head->lio_portkev = NULL;
2999 		if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL &&
3000 		    sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) {
3001 			sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
3002 			if (sqp == NULL) {
3003 				error = EAGAIN;
3004 				goto done;
3005 			}
3006 			sqp->sq_func = NULL;
3007 			sqp->sq_next = NULL;
3008 			sqp->sq_info.si_code = SI_ASYNCIO;
3009 			sqp->sq_info.si_pid = curproc->p_pid;
3010 			sqp->sq_info.si_ctid = PRCTID(curproc);
3011 			sqp->sq_info.si_zoneid = getzoneid();
3012 			sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
3013 			sqp->sq_info.si_signo = sigevk.sigev_signo;
3014 			sqp->sq_info.si_value.sival_int =
3015 			    sigevk.sigev_value.sival_int;
3016 			head->lio_sigqp = sqp;
3017 		} else {
3018 			head->lio_sigqp = NULL;
3019 		}
3020 		if (pkevtp) {
3021 			/*
3022 			 * Prepare data to send when list of aiocb's
3023 			 * has completed.
3024 			 */
3025 			port_init_event(pkevtp, (uintptr_t)sigev,
3026 			    (void *)(uintptr_t)pnotify.portnfy_user,
3027 			    NULL, head);
3028 			pkevtp->portkev_events = AIOLIO64;
3029 			head->lio_portkev = pkevtp;
3030 			head->lio_port = pnotify.portnfy_port;
3031 		}
3032 	}
3033 
3034 	for (i = 0; i < nent; i++, ucbp++) {
3035 
3036 		cbp = (aiocb64_32_t *)(uintptr_t)*ucbp;
3037 		/* skip entry if it can't be copied. */
3038 		if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb))) {
3039 			if (head) {
3040 				mutex_enter(&aiop->aio_mutex);
3041 				head->lio_nent--;
3042 				head->lio_refcnt--;
3043 				mutex_exit(&aiop->aio_mutex);
3044 			}
3045 			continue;
3046 		}
3047 
3048 		/* skip if opcode for aiocb is LIO_NOP */
3049 		mode = aiocb->aio_lio_opcode;
3050 		if (mode == LIO_NOP) {
3051 			cbp = NULL;
3052 			if (head) {
3053 				mutex_enter(&aiop->aio_mutex);
3054 				head->lio_nent--;
3055 				head->lio_refcnt--;
3056 				mutex_exit(&aiop->aio_mutex);
3057 			}
3058 			continue;
3059 		}
3060 
3061 		/* increment file descriptor's ref count. */
3062 		if ((fp = getf(aiocb->aio_fildes)) == NULL) {
3063 			lio_set_uerror(&cbp->aio_resultp, EBADF);
3064 			if (head) {
3065 				mutex_enter(&aiop->aio_mutex);
3066 				head->lio_nent--;
3067 				head->lio_refcnt--;
3068 				mutex_exit(&aiop->aio_mutex);
3069 			}
3070 			aio_errors++;
3071 			continue;
3072 		}
3073 
3074 		/*
3075 		 * check the permission of the partition
3076 		 */
3077 		if ((fp->f_flag & mode) == 0) {
3078 			releasef(aiocb->aio_fildes);
3079 			lio_set_uerror(&cbp->aio_resultp, EBADF);
3080 			if (head) {
3081 				mutex_enter(&aiop->aio_mutex);
3082 				head->lio_nent--;
3083 				head->lio_refcnt--;
3084 				mutex_exit(&aiop->aio_mutex);
3085 			}
3086 			aio_errors++;
3087 			continue;
3088 		}
3089 
3090 		/*
3091 		 * common case where requests are to the same fd
3092 		 * for the same r/w operation
3093 		 * for UFS, need to set EBADFD
3094 		 */
3095 		vp = fp->f_vnode;
3096 		if (fp != prev_fp || mode != prev_mode) {
3097 			aio_func = check_vp(vp, mode);
3098 			if (aio_func == NULL) {
3099 				prev_fp = NULL;
3100 				releasef(aiocb->aio_fildes);
3101 				lio_set_uerror(&cbp->aio_resultp, EBADFD);
3102 				aio_notsupported++;
3103 				if (head) {
3104 					mutex_enter(&aiop->aio_mutex);
3105 					head->lio_nent--;
3106 					head->lio_refcnt--;
3107 					mutex_exit(&aiop->aio_mutex);
3108 				}
3109 				continue;
3110 			} else {
3111 				prev_fp = fp;
3112 				prev_mode = mode;
3113 			}
3114 		}
3115 
3116 #ifdef	_LP64
3117 		aiocb_LFton(aiocb, &aiocb_n);
3118 		error = aio_req_setup(&reqp, aiop, &aiocb_n,
3119 		    (aio_result_t *)&cbp->aio_resultp, vp, 0);
3120 #else
3121 		error = aio_req_setupLF(&reqp, aiop, aiocb,
3122 		    (aio_result_t *)&cbp->aio_resultp, vp, 0);
3123 #endif  /* _LP64 */
3124 		if (error) {
3125 			releasef(aiocb->aio_fildes);
3126 			lio_set_uerror(&cbp->aio_resultp, error);
3127 			if (head) {
3128 				mutex_enter(&aiop->aio_mutex);
3129 				head->lio_nent--;
3130 				head->lio_refcnt--;
3131 				mutex_exit(&aiop->aio_mutex);
3132 			}
3133 			aio_errors++;
3134 			continue;
3135 		}
3136 
3137 		reqp->aio_req_lio = head;
3138 		deadhead = 0;
3139 
3140 		/*
3141 		 * Set the errno field now before sending the request to
3142 		 * the driver to avoid a race condition
3143 		 */
3144 		(void) suword32(&cbp->aio_resultp.aio_errno,
3145 		    EINPROGRESS);
3146 
3147 		reqp->aio_req_iocb.iocb32 = *ucbp;
3148 
3149 		event = (mode == LIO_READ)? AIOAREAD64 : AIOAWRITE64;
3150 		aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT);
3151 		aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD);
3152 		if (aio_port | aio_thread) {
3153 			port_kevent_t *lpkevp;
3154 			/*
3155 			 * Prepare data to send with each aiocb completed.
3156 			 */
3157 			if (aio_port) {
3158 				void *paddr = (void *)(uintptr_t)
3159 				    aiocb->aio_sigevent.sigev_value.sival_ptr;
3160 				if (copyin(paddr, &pnotify, sizeof (pnotify)))
3161 					error = EFAULT;
3162 			} else {	/* aio_thread */
3163 				pnotify.portnfy_port =
3164 				    aiocb->aio_sigevent.sigev_signo;
3165 				pnotify.portnfy_user =
3166 				    aiocb->aio_sigevent.sigev_value.sival_ptr;
3167 			}
3168 			if (error)
3169 				/* EMPTY */;
3170 			else if (pkevtp != NULL &&
3171 			    pnotify.portnfy_port == lio_head_port)
3172 				error = port_dup_event(pkevtp, &lpkevp,
3173 				    PORT_ALLOC_DEFAULT);
3174 			else
3175 				error = port_alloc_event(pnotify.portnfy_port,
3176 				    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO,
3177 				    &lpkevp);
3178 			if (error == 0) {
3179 				port_init_event(lpkevp, (uintptr_t)*ucbp,
3180 				    (void *)(uintptr_t)pnotify.portnfy_user,
3181 				    aio_port_callback, reqp);
3182 				lpkevp->portkev_events = event;
3183 				reqp->aio_req_portkev = lpkevp;
3184 				reqp->aio_req_port = pnotify.portnfy_port;
3185 			}
3186 		}
3187 
3188 		/*
3189 		 * send the request to driver.
3190 		 */
3191 		if (error == 0) {
3192 			if (aiocb->aio_nbytes == 0) {
3193 				clear_active_fd(aiocb->aio_fildes);
3194 				aio_zerolen(reqp);
3195 				continue;
3196 			}
3197 			error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
3198 			    CRED());
3199 		}
3200 
3201 		/*
3202 		 * the fd's ref count is not decremented until the IO has
3203 		 * completed unless there was an error.
3204 		 */
3205 		if (error) {
3206 			releasef(aiocb->aio_fildes);
3207 			lio_set_uerror(&cbp->aio_resultp, error);
3208 			if (head) {
3209 				mutex_enter(&aiop->aio_mutex);
3210 				head->lio_nent--;
3211 				head->lio_refcnt--;
3212 				mutex_exit(&aiop->aio_mutex);
3213 			}
3214 			if (error == ENOTSUP)
3215 				aio_notsupported++;
3216 			else
3217 				aio_errors++;
3218 			lio_set_error(reqp, portused);
3219 		} else {
3220 			clear_active_fd(aiocb->aio_fildes);
3221 		}
3222 	}
3223 
3224 	if (aio_notsupported) {
3225 		error = ENOTSUP;
3226 	} else if (aio_errors) {
3227 		/*
3228 		 * return EIO if any request failed
3229 		 */
3230 		error = EIO;
3231 	}
3232 
3233 	if (mode_arg == LIO_WAIT) {
3234 		mutex_enter(&aiop->aio_mutex);
3235 		while (head->lio_refcnt > 0) {
3236 			if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
3237 				mutex_exit(&aiop->aio_mutex);
3238 				error = EINTR;
3239 				goto done;
3240 			}
3241 		}
3242 		mutex_exit(&aiop->aio_mutex);
3243 		alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_LARGEFILE);
3244 	}
3245 
3246 done:
3247 	kmem_free(cbplist, ssize);
3248 	if (deadhead) {
3249 		if (head->lio_sigqp)
3250 			kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
3251 		if (head->lio_portkev)
3252 			port_free_event(head->lio_portkev);
3253 		kmem_free(head, sizeof (aio_lio_t));
3254 	}
3255 	return (error);
3256 }
3257 
3258 #ifdef  _SYSCALL32_IMPL
3259 static void
3260 aiocb_LFton(aiocb64_32_t *src, aiocb_t *dest)
3261 {
3262 	dest->aio_fildes = src->aio_fildes;
3263 	dest->aio_buf = (void *)(uintptr_t)src->aio_buf;
3264 	dest->aio_nbytes = (size_t)src->aio_nbytes;
3265 	dest->aio_offset = (off_t)src->aio_offset;
3266 	dest->aio_reqprio = src->aio_reqprio;
3267 	dest->aio_sigevent.sigev_notify = src->aio_sigevent.sigev_notify;
3268 	dest->aio_sigevent.sigev_signo = src->aio_sigevent.sigev_signo;
3269 
3270 	/*
3271 	 * See comment in sigqueue32() on handling of 32-bit
3272 	 * sigvals in a 64-bit kernel.
3273 	 */
3274 	dest->aio_sigevent.sigev_value.sival_int =
3275 	    (int)src->aio_sigevent.sigev_value.sival_int;
3276 	dest->aio_sigevent.sigev_notify_function = (void (*)(union sigval))
3277 	    (uintptr_t)src->aio_sigevent.sigev_notify_function;
3278 	dest->aio_sigevent.sigev_notify_attributes = (pthread_attr_t *)
3279 	    (uintptr_t)src->aio_sigevent.sigev_notify_attributes;
3280 	dest->aio_sigevent.__sigev_pad2 = src->aio_sigevent.__sigev_pad2;
3281 	dest->aio_lio_opcode = src->aio_lio_opcode;
3282 	dest->aio_state = src->aio_state;
3283 	dest->aio__pad[0] = src->aio__pad[0];
3284 }
3285 #endif
3286 
3287 /*
3288  * This function is used only for largefile calls made by
3289  * 32 bit applications.
3290  */
3291 static int
3292 aio_req_setupLF(
3293 	aio_req_t	**reqpp,
3294 	aio_t		*aiop,
3295 	aiocb64_32_t	*arg,
3296 	aio_result_t	*resultp,
3297 	vnode_t		*vp,
3298 	int		old_solaris_req)
3299 {
3300 	sigqueue_t	*sqp = NULL;
3301 	aio_req_t	*reqp;
3302 	struct uio	*uio;
3303 	struct sigevent32 *sigev;
3304 	int 		error;
3305 
3306 	sigev = &arg->aio_sigevent;
3307 	if (sigev->sigev_notify == SIGEV_SIGNAL &&
3308 	    sigev->sigev_signo > 0 && sigev->sigev_signo < NSIG) {
3309 		sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
3310 		if (sqp == NULL)
3311 			return (EAGAIN);
3312 		sqp->sq_func = NULL;
3313 		sqp->sq_next = NULL;
3314 		sqp->sq_info.si_code = SI_ASYNCIO;
3315 		sqp->sq_info.si_pid = curproc->p_pid;
3316 		sqp->sq_info.si_ctid = PRCTID(curproc);
3317 		sqp->sq_info.si_zoneid = getzoneid();
3318 		sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
3319 		sqp->sq_info.si_signo = sigev->sigev_signo;
3320 		sqp->sq_info.si_value.sival_int = sigev->sigev_value.sival_int;
3321 	}
3322 
3323 	mutex_enter(&aiop->aio_mutex);
3324 
3325 	if (aiop->aio_flags & AIO_REQ_BLOCK) {
3326 		mutex_exit(&aiop->aio_mutex);
3327 		if (sqp)
3328 			kmem_free(sqp, sizeof (sigqueue_t));
3329 		return (EIO);
3330 	}
3331 	/*
3332 	 * get an aio_reqp from the free list or allocate one
3333 	 * from dynamic memory.
3334 	 */
3335 	if (error = aio_req_alloc(&reqp, resultp)) {
3336 		mutex_exit(&aiop->aio_mutex);
3337 		if (sqp)
3338 			kmem_free(sqp, sizeof (sigqueue_t));
3339 		return (error);
3340 	}
3341 	aiop->aio_pending++;
3342 	aiop->aio_outstanding++;
3343 	reqp->aio_req_flags = AIO_PENDING;
3344 	if (old_solaris_req) {
3345 		/* this is an old solaris aio request */
3346 		reqp->aio_req_flags |= AIO_SOLARIS;
3347 		aiop->aio_flags |= AIO_SOLARIS_REQ;
3348 	}
3349 	if (sigev->sigev_notify == SIGEV_THREAD ||
3350 	    sigev->sigev_notify == SIGEV_PORT)
3351 		aio_enq(&aiop->aio_portpending, reqp, 0);
3352 	mutex_exit(&aiop->aio_mutex);
3353 	/*
3354 	 * initialize aio request.
3355 	 */
3356 	reqp->aio_req_fd = arg->aio_fildes;
3357 	reqp->aio_req_sigqp = sqp;
3358 	reqp->aio_req_iocb.iocb = NULL;
3359 	reqp->aio_req_lio = NULL;
3360 	reqp->aio_req_buf.b_file = vp;
3361 	uio = reqp->aio_req.aio_uio;
3362 	uio->uio_iovcnt = 1;
3363 	uio->uio_iov->iov_base = (caddr_t)(uintptr_t)arg->aio_buf;
3364 	uio->uio_iov->iov_len = arg->aio_nbytes;
3365 	uio->uio_loffset = arg->aio_offset;
3366 	*reqpp = reqp;
3367 	return (0);
3368 }
3369 
3370 /*
3371  * This routine is called when a non largefile call is made by a 32bit
3372  * process on a ILP32 or LP64 kernel.
3373  */
3374 static int
3375 alio32(
3376 	int		mode_arg,
3377 	void		*aiocb_arg,
3378 	int		nent,
3379 	void		*sigev)
3380 {
3381 	file_t		*fp;
3382 	file_t		*prev_fp = NULL;
3383 	int		prev_mode = -1;
3384 	struct vnode	*vp;
3385 	aio_lio_t	*head;
3386 	aio_req_t	*reqp;
3387 	aio_t		*aiop;
3388 	caddr_t		cbplist;
3389 	aiocb_t		cb;
3390 	aiocb_t		*aiocb = &cb;
3391 #ifdef	_LP64
3392 	aiocb32_t	*cbp;
3393 	caddr32_t	*ucbp;
3394 	aiocb32_t	cb32;
3395 	aiocb32_t	*aiocb32 = &cb32;
3396 	struct sigevent32	sigevk;
3397 #else
3398 	aiocb_t		*cbp, **ucbp;
3399 	struct sigevent	sigevk;
3400 #endif
3401 	sigqueue_t	*sqp;
3402 	int		(*aio_func)();
3403 	int		mode;
3404 	int		error = 0;
3405 	int		aio_errors = 0;
3406 	int		i;
3407 	size_t		ssize;
3408 	int		deadhead = 0;
3409 	int		aio_notsupported = 0;
3410 	int		lio_head_port;
3411 	int		aio_port;
3412 	int		aio_thread;
3413 	port_kevent_t	*pkevtp = NULL;
3414 	int		portused = 0;
3415 #ifdef	_LP64
3416 	port_notify32_t	pnotify;
3417 #else
3418 	port_notify_t	pnotify;
3419 #endif
3420 	int		event;
3421 
3422 	aiop = curproc->p_aio;
3423 	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
3424 		return (EINVAL);
3425 
3426 #ifdef	_LP64
3427 	ssize = (sizeof (caddr32_t) * nent);
3428 #else
3429 	ssize = (sizeof (aiocb_t *) * nent);
3430 #endif
3431 	cbplist = kmem_alloc(ssize, KM_SLEEP);
3432 	ucbp = (void *)cbplist;
3433 
3434 	if (copyin(aiocb_arg, cbplist, ssize) ||
3435 	    (sigev && copyin(sigev, &sigevk, sizeof (struct sigevent32)))) {
3436 		kmem_free(cbplist, ssize);
3437 		return (EFAULT);
3438 	}
3439 
3440 	/* Event Ports  */
3441 	if (sigev &&
3442 	    (sigevk.sigev_notify == SIGEV_THREAD ||
3443 	    sigevk.sigev_notify == SIGEV_PORT)) {
3444 		if (sigevk.sigev_notify == SIGEV_THREAD) {
3445 			pnotify.portnfy_port = sigevk.sigev_signo;
3446 			pnotify.portnfy_user = sigevk.sigev_value.sival_ptr;
3447 		} else if (copyin(
3448 		    (void *)(uintptr_t)sigevk.sigev_value.sival_ptr,
3449 		    &pnotify, sizeof (pnotify))) {
3450 			kmem_free(cbplist, ssize);
3451 			return (EFAULT);
3452 		}
3453 		error = port_alloc_event(pnotify.portnfy_port,
3454 		    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp);
3455 		if (error) {
3456 			if (error == ENOMEM || error == EAGAIN)
3457 				error = EAGAIN;
3458 			else
3459 				error = EINVAL;
3460 			kmem_free(cbplist, ssize);
3461 			return (error);
3462 		}
3463 		lio_head_port = pnotify.portnfy_port;
3464 		portused = 1;
3465 	}
3466 
3467 	/*
3468 	 * a list head should be allocated if notification is
3469 	 * enabled for this list.
3470 	 */
3471 	head = NULL;
3472 
3473 	if (mode_arg == LIO_WAIT || sigev) {
3474 		mutex_enter(&aiop->aio_mutex);
3475 		error = aio_lio_alloc(&head);
3476 		mutex_exit(&aiop->aio_mutex);
3477 		if (error)
3478 			goto done;
3479 		deadhead = 1;
3480 		head->lio_nent = nent;
3481 		head->lio_refcnt = nent;
3482 		head->lio_port = -1;
3483 		head->lio_portkev = NULL;
3484 		if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL &&
3485 		    sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) {
3486 			sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
3487 			if (sqp == NULL) {
3488 				error = EAGAIN;
3489 				goto done;
3490 			}
3491 			sqp->sq_func = NULL;
3492 			sqp->sq_next = NULL;
3493 			sqp->sq_info.si_code = SI_ASYNCIO;
3494 			sqp->sq_info.si_pid = curproc->p_pid;
3495 			sqp->sq_info.si_ctid = PRCTID(curproc);
3496 			sqp->sq_info.si_zoneid = getzoneid();
3497 			sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
3498 			sqp->sq_info.si_signo = sigevk.sigev_signo;
3499 			sqp->sq_info.si_value.sival_int =
3500 			    sigevk.sigev_value.sival_int;
3501 			head->lio_sigqp = sqp;
3502 		} else {
3503 			head->lio_sigqp = NULL;
3504 		}
3505 		if (pkevtp) {
3506 			/*
3507 			 * Prepare data to send when list of aiocb's has
3508 			 * completed.
3509 			 */
3510 			port_init_event(pkevtp, (uintptr_t)sigev,
3511 			    (void *)(uintptr_t)pnotify.portnfy_user,
3512 			    NULL, head);
3513 			pkevtp->portkev_events = AIOLIO;
3514 			head->lio_portkev = pkevtp;
3515 			head->lio_port = pnotify.portnfy_port;
3516 		}
3517 	}
3518 
3519 	for (i = 0; i < nent; i++, ucbp++) {
3520 
3521 		/* skip entry if it can't be copied. */
3522 #ifdef	_LP64
3523 		cbp = (aiocb32_t *)(uintptr_t)*ucbp;
3524 		if (cbp == NULL || copyin(cbp, aiocb32, sizeof (*aiocb32)))
3525 #else
3526 		cbp = (aiocb_t *)*ucbp;
3527 		if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb)))
3528 #endif
3529 		{
3530 			if (head) {
3531 				mutex_enter(&aiop->aio_mutex);
3532 				head->lio_nent--;
3533 				head->lio_refcnt--;
3534 				mutex_exit(&aiop->aio_mutex);
3535 			}
3536 			continue;
3537 		}
3538 #ifdef	_LP64
3539 		/*
3540 		 * copy 32 bit structure into 64 bit structure
3541 		 */
3542 		aiocb_32ton(aiocb32, aiocb);
3543 #endif /* _LP64 */
3544 
3545 		/* skip if opcode for aiocb is LIO_NOP */
3546 		mode = aiocb->aio_lio_opcode;
3547 		if (mode == LIO_NOP) {
3548 			cbp = NULL;
3549 			if (head) {
3550 				mutex_enter(&aiop->aio_mutex);
3551 				head->lio_nent--;
3552 				head->lio_refcnt--;
3553 				mutex_exit(&aiop->aio_mutex);
3554 			}
3555 			continue;
3556 		}
3557 
3558 		/* increment file descriptor's ref count. */
3559 		if ((fp = getf(aiocb->aio_fildes)) == NULL) {
3560 			lio_set_uerror(&cbp->aio_resultp, EBADF);
3561 			if (head) {
3562 				mutex_enter(&aiop->aio_mutex);
3563 				head->lio_nent--;
3564 				head->lio_refcnt--;
3565 				mutex_exit(&aiop->aio_mutex);
3566 			}
3567 			aio_errors++;
3568 			continue;
3569 		}
3570 
3571 		/*
3572 		 * check the permission of the partition
3573 		 */
3574 		if ((fp->f_flag & mode) == 0) {
3575 			releasef(aiocb->aio_fildes);
3576 			lio_set_uerror(&cbp->aio_resultp, EBADF);
3577 			if (head) {
3578 				mutex_enter(&aiop->aio_mutex);
3579 				head->lio_nent--;
3580 				head->lio_refcnt--;
3581 				mutex_exit(&aiop->aio_mutex);
3582 			}
3583 			aio_errors++;
3584 			continue;
3585 		}
3586 
3587 		/*
3588 		 * common case where requests are to the same fd
3589 		 * for the same r/w operation
3590 		 * for UFS, need to set EBADFD
3591 		 */
3592 		vp = fp->f_vnode;
3593 		if (fp != prev_fp || mode != prev_mode) {
3594 			aio_func = check_vp(vp, mode);
3595 			if (aio_func == NULL) {
3596 				prev_fp = NULL;
3597 				releasef(aiocb->aio_fildes);
3598 				lio_set_uerror(&cbp->aio_resultp, EBADFD);
3599 				aio_notsupported++;
3600 				if (head) {
3601 					mutex_enter(&aiop->aio_mutex);
3602 					head->lio_nent--;
3603 					head->lio_refcnt--;
3604 					mutex_exit(&aiop->aio_mutex);
3605 				}
3606 				continue;
3607 			} else {
3608 				prev_fp = fp;
3609 				prev_mode = mode;
3610 			}
3611 		}
3612 
3613 		error = aio_req_setup(&reqp, aiop, aiocb,
3614 		    (aio_result_t *)&cbp->aio_resultp, vp, 0);
3615 		if (error) {
3616 			releasef(aiocb->aio_fildes);
3617 			lio_set_uerror(&cbp->aio_resultp, error);
3618 			if (head) {
3619 				mutex_enter(&aiop->aio_mutex);
3620 				head->lio_nent--;
3621 				head->lio_refcnt--;
3622 				mutex_exit(&aiop->aio_mutex);
3623 			}
3624 			aio_errors++;
3625 			continue;
3626 		}
3627 
3628 		reqp->aio_req_lio = head;
3629 		deadhead = 0;
3630 
3631 		/*
3632 		 * Set the errno field now before sending the request to
3633 		 * the driver to avoid a race condition
3634 		 */
3635 		(void) suword32(&cbp->aio_resultp.aio_errno,
3636 		    EINPROGRESS);
3637 
3638 		reqp->aio_req_iocb.iocb32 = (caddr32_t)(uintptr_t)cbp;
3639 
3640 		event = (mode == LIO_READ)? AIOAREAD : AIOAWRITE;
3641 		aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT);
3642 		aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD);
3643 		if (aio_port | aio_thread) {
3644 			port_kevent_t *lpkevp;
3645 			/*
3646 			 * Prepare data to send with each aiocb completed.
3647 			 */
3648 #ifdef _LP64
3649 			if (aio_port) {
3650 				void *paddr = (void  *)(uintptr_t)
3651 				    aiocb32->aio_sigevent.sigev_value.sival_ptr;
3652 				if (copyin(paddr, &pnotify, sizeof (pnotify)))
3653 					error = EFAULT;
3654 			} else {	/* aio_thread */
3655 				pnotify.portnfy_port =
3656 				    aiocb32->aio_sigevent.sigev_signo;
3657 				pnotify.portnfy_user =
3658 				    aiocb32->aio_sigevent.sigev_value.sival_ptr;
3659 			}
3660 #else
3661 			if (aio_port) {
3662 				void *paddr =
3663 				    aiocb->aio_sigevent.sigev_value.sival_ptr;
3664 				if (copyin(paddr, &pnotify, sizeof (pnotify)))
3665 					error = EFAULT;
3666 			} else {	/* aio_thread */
3667 				pnotify.portnfy_port =
3668 				    aiocb->aio_sigevent.sigev_signo;
3669 				pnotify.portnfy_user =
3670 				    aiocb->aio_sigevent.sigev_value.sival_ptr;
3671 			}
3672 #endif
3673 			if (error)
3674 				/* EMPTY */;
3675 			else if (pkevtp != NULL &&
3676 			    pnotify.portnfy_port == lio_head_port)
3677 				error = port_dup_event(pkevtp, &lpkevp,
3678 				    PORT_ALLOC_DEFAULT);
3679 			else
3680 				error = port_alloc_event(pnotify.portnfy_port,
3681 				    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO,
3682 				    &lpkevp);
3683 			if (error == 0) {
3684 				port_init_event(lpkevp, (uintptr_t)cbp,
3685 				    (void *)(uintptr_t)pnotify.portnfy_user,
3686 				    aio_port_callback, reqp);
3687 				lpkevp->portkev_events = event;
3688 				reqp->aio_req_portkev = lpkevp;
3689 				reqp->aio_req_port = pnotify.portnfy_port;
3690 			}
3691 		}
3692 
3693 		/*
3694 		 * send the request to driver.
3695 		 */
3696 		if (error == 0) {
3697 			if (aiocb->aio_nbytes == 0) {
3698 				clear_active_fd(aiocb->aio_fildes);
3699 				aio_zerolen(reqp);
3700 				continue;
3701 			}
3702 			error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
3703 			    CRED());
3704 		}
3705 
3706 		/*
3707 		 * the fd's ref count is not decremented until the IO has
3708 		 * completed unless there was an error.
3709 		 */
3710 		if (error) {
3711 			releasef(aiocb->aio_fildes);
3712 			lio_set_uerror(&cbp->aio_resultp, error);
3713 			if (head) {
3714 				mutex_enter(&aiop->aio_mutex);
3715 				head->lio_nent--;
3716 				head->lio_refcnt--;
3717 				mutex_exit(&aiop->aio_mutex);
3718 			}
3719 			if (error == ENOTSUP)
3720 				aio_notsupported++;
3721 			else
3722 				aio_errors++;
3723 			lio_set_error(reqp, portused);
3724 		} else {
3725 			clear_active_fd(aiocb->aio_fildes);
3726 		}
3727 	}
3728 
3729 	if (aio_notsupported) {
3730 		error = ENOTSUP;
3731 	} else if (aio_errors) {
3732 		/*
3733 		 * return EIO if any request failed
3734 		 */
3735 		error = EIO;
3736 	}
3737 
3738 	if (mode_arg == LIO_WAIT) {
3739 		mutex_enter(&aiop->aio_mutex);
3740 		while (head->lio_refcnt > 0) {
3741 			if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
3742 				mutex_exit(&aiop->aio_mutex);
3743 				error = EINTR;
3744 				goto done;
3745 			}
3746 		}
3747 		mutex_exit(&aiop->aio_mutex);
3748 		alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_32);
3749 	}
3750 
3751 done:
3752 	kmem_free(cbplist, ssize);
3753 	if (deadhead) {
3754 		if (head->lio_sigqp)
3755 			kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
3756 		if (head->lio_portkev)
3757 			port_free_event(head->lio_portkev);
3758 		kmem_free(head, sizeof (aio_lio_t));
3759 	}
3760 	return (error);
3761 }
3762 
3763 
3764 #ifdef  _SYSCALL32_IMPL
3765 void
3766 aiocb_32ton(aiocb32_t *src, aiocb_t *dest)
3767 {
3768 	dest->aio_fildes = src->aio_fildes;
3769 	dest->aio_buf = (caddr_t)(uintptr_t)src->aio_buf;
3770 	dest->aio_nbytes = (size_t)src->aio_nbytes;
3771 	dest->aio_offset = (off_t)src->aio_offset;
3772 	dest->aio_reqprio = src->aio_reqprio;
3773 	dest->aio_sigevent.sigev_notify = src->aio_sigevent.sigev_notify;
3774 	dest->aio_sigevent.sigev_signo = src->aio_sigevent.sigev_signo;
3775 
3776 	/*
3777 	 * See comment in sigqueue32() on handling of 32-bit
3778 	 * sigvals in a 64-bit kernel.
3779 	 */
3780 	dest->aio_sigevent.sigev_value.sival_int =
3781 	    (int)src->aio_sigevent.sigev_value.sival_int;
3782 	dest->aio_sigevent.sigev_notify_function = (void (*)(union sigval))
3783 	    (uintptr_t)src->aio_sigevent.sigev_notify_function;
3784 	dest->aio_sigevent.sigev_notify_attributes = (pthread_attr_t *)
3785 	    (uintptr_t)src->aio_sigevent.sigev_notify_attributes;
3786 	dest->aio_sigevent.__sigev_pad2 = src->aio_sigevent.__sigev_pad2;
3787 	dest->aio_lio_opcode = src->aio_lio_opcode;
3788 	dest->aio_state = src->aio_state;
3789 	dest->aio__pad[0] = src->aio__pad[0];
3790 }
3791 #endif /* _SYSCALL32_IMPL */
3792 
3793 /*
3794  * aio_port_callback() is called just before the event is retrieved from the
3795  * port. The task of this callback function is to finish the work of the
3796  * transaction for the application, it means :
3797  * - copyout transaction data to the application
3798  *	(this thread is running in the right process context)
3799  * - keep trace of the transaction (update of counters).
3800  * - free allocated buffers
3801  * The aiocb pointer is the object element of the port_kevent_t structure.
3802  *
3803  * flag :
3804  *	PORT_CALLBACK_DEFAULT : do copyout and free resources
3805  *	PORT_CALLBACK_CLOSE   : don't do copyout, free resources
3806  */
3807 
3808 /*ARGSUSED*/
3809 int
3810 aio_port_callback(void *arg, int *events, pid_t pid, int flag, void *evp)
3811 {
3812 	aio_t		*aiop = curproc->p_aio;
3813 	aio_req_t	*reqp = arg;
3814 	struct	iovec	*iov;
3815 	struct	buf	*bp;
3816 	void		*resultp;
3817 
3818 	if (pid != curproc->p_pid) {
3819 		/* wrong proc !!, can not deliver data here ... */
3820 		return (EACCES);
3821 	}
3822 
3823 	mutex_enter(&aiop->aio_portq_mutex);
3824 	reqp->aio_req_portkev = NULL;
3825 	aio_req_remove_portq(aiop, reqp); /* remove request from portq */
3826 	mutex_exit(&aiop->aio_portq_mutex);
3827 	aphysio_unlock(reqp);		/* unlock used pages */
3828 	mutex_enter(&aiop->aio_mutex);
3829 	if (reqp->aio_req_flags & AIO_COPYOUTDONE) {
3830 		aio_req_free_port(aiop, reqp);	/* back to free list */
3831 		mutex_exit(&aiop->aio_mutex);
3832 		return (0);
3833 	}
3834 
3835 	iov = reqp->aio_req_uio.uio_iov;
3836 	bp = &reqp->aio_req_buf;
3837 	resultp = (void *)reqp->aio_req_resultp;
3838 	if (flag == PORT_CALLBACK_DEFAULT)
3839 		aio_copyout_result_port(iov, bp, resultp);
3840 	aio_req_free_port(aiop, reqp);	/* request struct back to free list */
3841 	mutex_exit(&aiop->aio_mutex);
3842 	return (0);
3843 }
3844