/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2018, Joyent, Inc.
 */

/*
 * Kernel asynchronous I/O.
 * This is only for raw devices now (as of Nov. 1993).
 */
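
/*
 * Userland (libaio) reaches this module through the kaio system call,
 * which is demultiplexed on its first argument (AIOREAD, AIOWRITE,
 * AIOWAIT, AIOLIO, and so on); see kaioc() and kaio() below.
 */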

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/fs/snode.h>
#include <sys/unistd.h>
#include <sys/cmn_err.h>
#include <vm/as.h>
#include <vm/faultcode.h>
#include <sys/sysmacros.h>
#include <sys/procfs.h>
#include <sys/kmem.h>
#include <sys/autoconf.h>
#include <sys/ddi_impldefs.h>
#include <sys/sunddi.h>
#include <sys/aio_impl.h>
#include <sys/debug.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/vmsystm.h>
#include <sys/fs/pxfs_ki.h>
#include <sys/contract/process_impl.h>

/*
 * external entry points.
 */
#ifdef _LP64
static int64_t kaioc(long, long, long, long, long, long);
#endif
static int kaio(ulong_t *, rval_t *);


#define	AIO_64	0
#define	AIO_32	1
#define	AIO_LARGEFILE	2

/*
 * implementation specific functions (private)
 */
#ifdef _LP64
static int alio(int, aiocb_t **, int, struct sigevent *);
#endif
static int aionotify(void);
static int aioinit(void);
static int aiostart(void);
static void alio_cleanup(aio_t *, aiocb_t **, int, int);
static int (*check_vp(struct vnode *, int))(vnode_t *, struct aio_req *,
    cred_t *);
static void lio_set_error(aio_req_t *, int portused);
static aio_t *aio_aiop_alloc();
static int aio_req_alloc(aio_req_t **, aio_result_t *);
static int aio_lio_alloc(aio_lio_t **);
static aio_req_t *aio_req_done(void *);
static aio_req_t *aio_req_remove(aio_req_t *);
static int aio_req_find(aio_result_t *, aio_req_t **);
static int aio_hash_insert(struct aio_req_t *, aio_t *);
static int aio_req_setup(aio_req_t **, aio_t *, aiocb_t *,
    aio_result_t *, vnode_t *, int);
static int aio_cleanup_thread(aio_t *);
static aio_lio_t *aio_list_get(aio_result_t *);
static void lio_set_uerror(void *, int);
extern void aio_zerolen(aio_req_t *);
static int aiowait(struct timeval *, int, long	*);
static int aiowaitn(void *, uint_t, uint_t *, timespec_t *);
static int aio_unlock_requests(caddr_t iocblist, int iocb_index,
    aio_req_t *reqlist, aio_t *aiop, model_t model);
static int aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max);
static int aiosuspend(void *, int, struct  timespec *, int,
    long	*, int);
static int aliowait(int, void *, int, void *, int);
static int aioerror(void *, int);
static int aio_cancel(int, void *, long	*, int);
static int arw(int, int, char *, int, offset_t, aio_result_t *, int);
static int aiorw(int, void *, int, int);

static int alioLF(int, void *, int, void *);
static int aio_req_setupLF(aio_req_t **, aio_t *, aiocb64_32_t *,
    aio_result_t *, vnode_t *, int);
static int alio32(int, void *, int, void *);
static int driver_aio_write(vnode_t *vp, struct aio_req *aio, cred_t *cred_p);
static int driver_aio_read(vnode_t *vp, struct aio_req *aio, cred_t *cred_p);

#ifdef  _SYSCALL32_IMPL
static void aiocb_LFton(aiocb64_32_t *, aiocb_t *);
void	aiocb_32ton(aiocb32_t *, aiocb_t *);
#endif /* _SYSCALL32_IMPL */

/*
 * implementation specific functions (external)
 */
void aio_req_free(aio_t *, aio_req_t *);

/*
 * Event Port framework
 */

void aio_req_free_port(aio_t *, aio_req_t *);
static int aio_port_callback(void *, int *, pid_t, int, void *);

/*
 * This is the loadable module wrapper.
 */
#include <sys/modctl.h>
#include <sys/syscall.h>

#ifdef _LP64

static struct sysent kaio_sysent = {
	6,
	SE_NOUNLOAD | SE_64RVAL | SE_ARGC,
	(int (*)())(uintptr_t)kaioc
};

#ifdef _SYSCALL32_IMPL
static struct sysent kaio_sysent32 = {
	7,
	SE_NOUNLOAD | SE_64RVAL,
	kaio
};
#endif  /* _SYSCALL32_IMPL */

#else   /* _LP64 */

static struct sysent kaio_sysent = {
	7,
	SE_NOUNLOAD | SE_32RVAL1,
	kaio
};

#endif  /* _LP64 */

/*
 * Module linkage information for the kernel.
 */

static struct modlsys modlsys = {
	&mod_syscallops,
	"kernel Async I/O",
	&kaio_sysent
};

#ifdef  _SYSCALL32_IMPL
static struct modlsys modlsys32 = {
	&mod_syscallops32,
	"kernel Async I/O for 32 bit compatibility",
	&kaio_sysent32
};
#endif  /* _SYSCALL32_IMPL */


static struct modlinkage modlinkage = {
	MODREV_1,
	&modlsys,
#ifdef  _SYSCALL32_IMPL
	&modlsys32,
#endif
	NULL
};

int
_init(void)
{
	int retval;

	if ((retval = mod_install(&modlinkage)) != 0)
		return (retval);

	return (0);
}

int
_fini(void)
{
	int retval;

	retval = mod_remove(&modlinkage);

	return (retval);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

#ifdef	_LP64
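/*
 * Native LP64 system call entry point.  The command is carried in a0
 * (possibly with AIO_POLL_BIT set) and the remaining arguments are
 * interpreted according to that command.
 */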
static int64_t
kaioc(
	long	a0,
	long	a1,
	long	a2,
	long	a3,
	long	a4,
	long	a5)
{
	int	error;
	long	rval = 0;

	switch ((int)a0 & ~AIO_POLL_BIT) {
	case AIOREAD:
		error = arw((int)a0, (int)a1, (char *)a2, (int)a3,
		    (offset_t)a4, (aio_result_t *)a5, FREAD);
		break;
	case AIOWRITE:
		error = arw((int)a0, (int)a1, (char *)a2, (int)a3,
		    (offset_t)a4, (aio_result_t *)a5, FWRITE);
		break;
	case AIOWAIT:
		error = aiowait((struct timeval *)a1, (int)a2, &rval);
		break;
	case AIOWAITN:
		error = aiowaitn((void *)a1, (uint_t)a2, (uint_t *)a3,
		    (timespec_t *)a4);
		break;
	case AIONOTIFY:
		error = aionotify();
		break;
	case AIOINIT:
		error = aioinit();
		break;
	case AIOSTART:
		error = aiostart();
		break;
	case AIOLIO:
		error = alio((int)a1, (aiocb_t **)a2, (int)a3,
		    (struct sigevent *)a4);
		break;
	case AIOLIOWAIT:
		error = aliowait((int)a1, (void *)a2, (int)a3,
		    (struct sigevent *)a4, AIO_64);
		break;
	case AIOSUSPEND:
		error = aiosuspend((void *)a1, (int)a2, (timespec_t *)a3,
		    (int)a4, &rval, AIO_64);
		break;
	case AIOERROR:
		error = aioerror((void *)a1, AIO_64);
		break;
	case AIOAREAD:
		error = aiorw((int)a0, (void *)a1, FREAD, AIO_64);
		break;
	case AIOAWRITE:
		error = aiorw((int)a0, (void *)a1, FWRITE, AIO_64);
		break;
	case AIOCANCEL:
		error = aio_cancel((int)a1, (void *)a2, &rval, AIO_64);
		break;

	/*
	 * The largefile related calls are valid only on a 32-bit
	 * kernel, not on a 64-bit kernel.  On a 64-bit kernel we
	 * convert largefile calls to regular 64-bit calls.
	 */

	default:
		error = EINVAL;
	}
	if (error)
		return ((int64_t)set_errno(error));
	return (rval);
}
#endif

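/*
 * ILP32 entry point: the native system call on a 32-bit kernel and the
 * 32-bit compatibility system call on a 64-bit kernel.  The 64-bit file
 * offset for AIOREAD/AIOWRITE is reassembled from two 32-bit arguments
 * (uap[4] and uap[5]), in endian-dependent order.
 */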
static int
kaio(
	ulong_t *uap,
	rval_t *rvp)
{
	long rval = 0;
	int	error = 0;
	offset_t	off;


	rvp->r_vals = 0;
#if defined(_LITTLE_ENDIAN)
	off = ((u_offset_t)uap[5] << 32) | (u_offset_t)uap[4];
#else
	off = ((u_offset_t)uap[4] << 32) | (u_offset_t)uap[5];
#endif

	switch (uap[0] & ~AIO_POLL_BIT) {
	/*
	 * These are the 32-bit system calls when running on a 64-bit
	 * kernel (or the native calls on a 32-bit kernel).
	 */
	case AIOREAD:
		return (arw((int)uap[0], (int)uap[1], (char *)uap[2],
		    (int)uap[3], off, (aio_result_t *)uap[6], FREAD));
	case AIOWRITE:
		return (arw((int)uap[0], (int)uap[1], (char *)uap[2],
		    (int)uap[3], off, (aio_result_t *)uap[6], FWRITE));
	case AIOWAIT:
		error = aiowait((struct timeval *)uap[1], (int)uap[2],
		    &rval);
		break;
	case AIOWAITN:
		error = aiowaitn((void *)uap[1], (uint_t)uap[2],
		    (uint_t *)uap[3], (timespec_t *)uap[4]);
		break;
	case AIONOTIFY:
		return (aionotify());
	case AIOINIT:
		return (aioinit());
	case AIOSTART:
		return (aiostart());
	case AIOLIO:
		return (alio32((int)uap[1], (void *)uap[2], (int)uap[3],
		    (void *)uap[4]));
	case AIOLIOWAIT:
		return (aliowait((int)uap[1], (void *)uap[2],
		    (int)uap[3], (struct sigevent *)uap[4], AIO_32));
	case AIOSUSPEND:
		error = aiosuspend((void *)uap[1], (int)uap[2],
		    (timespec_t *)uap[3], (int)uap[4],
		    &rval, AIO_32);
		break;
	case AIOERROR:
		return (aioerror((void *)uap[1], AIO_32));
	case AIOAREAD:
		return (aiorw((int)uap[0], (void *)uap[1],
		    FREAD, AIO_32));
	case AIOAWRITE:
		return (aiorw((int)uap[0], (void *)uap[1],
		    FWRITE, AIO_32));
	case AIOCANCEL:
		error = (aio_cancel((int)uap[1], (void *)uap[2], &rval,
		    AIO_32));
		break;
	case AIOLIO64:
		return (alioLF((int)uap[1], (void *)uap[2],
		    (int)uap[3], (void *)uap[4]));
	case AIOLIOWAIT64:
		return (aliowait(uap[1], (void *)uap[2],
		    (int)uap[3], (void *)uap[4], AIO_LARGEFILE));
	case AIOSUSPEND64:
		error = aiosuspend((void *)uap[1], (int)uap[2],
		    (timespec_t *)uap[3], (int)uap[4], &rval,
		    AIO_LARGEFILE);
		break;
	case AIOERROR64:
		return (aioerror((void *)uap[1], AIO_LARGEFILE));
	case AIOAREAD64:
		return (aiorw((int)uap[0], (void *)uap[1], FREAD,
		    AIO_LARGEFILE));
	case AIOAWRITE64:
		return (aiorw((int)uap[0], (void *)uap[1], FWRITE,
		    AIO_LARGEFILE));
	case AIOCANCEL64:
		error = (aio_cancel((int)uap[1], (void *)uap[2],
		    &rval, AIO_LARGEFILE));
		break;
	default:
		return (EINVAL);
	}

	rvp->r_val1 = rval;
	return (error);
}

/*
 * wake up LWPs in this process that are sleeping in
 * aiowait().
 */
static int
aionotify(void)
{
	aio_t	*aiop;

	aiop = curproc->p_aio;
	if (aiop == NULL)
		return (0);

	mutex_enter(&aiop->aio_mutex);
	aiop->aio_notifycnt++;
	cv_broadcast(&aiop->aio_waitcv);
	mutex_exit(&aiop->aio_mutex);

	return (0);
}

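/*
 * Convert a user-supplied timeval into a relative timestruc_t, setting
 * *rqtp to NULL when no timeout applies.  A NULL timout pointer means
 * block indefinitely; a pointer of -1 means don't block at all.
 */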
static int
timeval2reltime(struct timeval *timout, timestruc_t *rqtime,
    timestruc_t **rqtp, int *blocking)
{
#ifdef	_SYSCALL32_IMPL
	struct timeval32 wait_time_32;
#endif
	struct timeval wait_time;
	model_t	model = get_udatamodel();

	*rqtp = NULL;
	if (timout == NULL) {		/* wait indefinitely */
		*blocking = 1;
		return (0);
	}

	/*
	 * Need to correctly compare with the -1 passed in for a user
	 * address pointer, with both 32 bit and 64 bit apps.
	 */
	if (model == DATAMODEL_NATIVE) {
		if ((intptr_t)timout == (intptr_t)-1) {	/* don't wait */
			*blocking = 0;
			return (0);
		}

		if (copyin(timout, &wait_time, sizeof (wait_time)))
			return (EFAULT);
	}
#ifdef	_SYSCALL32_IMPL
	else {
		/*
		 * A -1 from a 32-bit app will not get sign extended;
		 * don't wait in that case.
		 */
		if ((intptr_t)timout == (intptr_t)((uint32_t)-1)) {
			*blocking = 0;
			return (0);
		}

		if (copyin(timout, &wait_time_32, sizeof (wait_time_32)))
			return (EFAULT);
		TIMEVAL32_TO_TIMEVAL(&wait_time, &wait_time_32);
	}
#endif  /* _SYSCALL32_IMPL */

	if (wait_time.tv_sec == 0 && wait_time.tv_usec == 0) {	/* don't wait */
		*blocking = 0;
		return (0);
	}

	if (wait_time.tv_sec < 0 ||
	    wait_time.tv_usec < 0 || wait_time.tv_usec >= MICROSEC)
		return (EINVAL);

	rqtime->tv_sec = wait_time.tv_sec;
	rqtime->tv_nsec = wait_time.tv_usec * 1000;
	*rqtp = rqtime;
	*blocking = 1;

	return (0);
}

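/*
 * Convert a user-supplied timespec into a relative timestruc_t.  As with
 * timeval2reltime(), *rqtp is set to NULL when no timeout applies and
 * *blocking indicates whether the caller should wait; a zero timespec
 * means poll.
 */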
static int
timespec2reltime(timespec_t *timout, timestruc_t *rqtime,
    timestruc_t **rqtp, int *blocking)
{
#ifdef	_SYSCALL32_IMPL
	timespec32_t wait_time_32;
#endif
	model_t	model = get_udatamodel();

	*rqtp = NULL;
	if (timout == NULL) {
		*blocking = 1;
		return (0);
	}

	if (model == DATAMODEL_NATIVE) {
		if (copyin(timout, rqtime, sizeof (*rqtime)))
			return (EFAULT);
	}
#ifdef	_SYSCALL32_IMPL
	else {
		if (copyin(timout, &wait_time_32, sizeof (wait_time_32)))
			return (EFAULT);
		TIMESPEC32_TO_TIMESPEC(rqtime, &wait_time_32);
	}
#endif  /* _SYSCALL32_IMPL */

	if (rqtime->tv_sec == 0 && rqtime->tv_nsec == 0) {
		*blocking = 0;
		return (0);
	}

	if (rqtime->tv_sec < 0 ||
	    rqtime->tv_nsec < 0 || rqtime->tv_nsec >= NANOSEC)
		return (EINVAL);

	*rqtp = rqtime;
	*blocking = 1;

	return (0);
}

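/*
 * Wait for an outstanding asynchronous request to complete and return
 * a pointer to its result structure in *rval.  The timeout semantics
 * (block, poll, or bounded wait) come from timeval2reltime() above.
 */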
/*ARGSUSED*/
static int
aiowait(struct timeval *timout, int dontblockflg, long *rval)
{
	int		error;
	aio_t		*aiop;
	aio_req_t	*reqp;
	clock_t		status;
	int		blocking;
	int		timecheck;
	timestruc_t	rqtime;
	timestruc_t	*rqtp;

	aiop = curproc->p_aio;
	if (aiop == NULL)
		return (EINVAL);

	/*
	 * Establish the absolute future time for the timeout.
	 */
	error = timeval2reltime(timout, &rqtime, &rqtp, &blocking);
	if (error)
		return (error);
	if (rqtp) {
		timestruc_t now;
		timecheck = timechanged;
		gethrestime(&now);
		timespecadd(rqtp, &now);
	}

	mutex_enter(&aiop->aio_mutex);
	for (;;) {
		/* process requests on poll queue */
		if (aiop->aio_pollq) {
			mutex_exit(&aiop->aio_mutex);
			aio_cleanup(0);
			mutex_enter(&aiop->aio_mutex);
		}
		if ((reqp = aio_req_remove(NULL)) != NULL) {
			*rval = (long)reqp->aio_req_resultp;
			break;
		}
		/* user-level done queue might not be empty */
		if (aiop->aio_notifycnt > 0) {
			aiop->aio_notifycnt--;
			*rval = 1;
			break;
		}
		/* don't block if no outstanding aio */
		if (aiop->aio_outstanding == 0 && dontblockflg) {
			error = EINVAL;
			break;
		}
		if (blocking) {
			status = cv_waituntil_sig(&aiop->aio_waitcv,
			    &aiop->aio_mutex, rqtp, timecheck);

			if (status > 0)		/* check done queue again */
				continue;
			if (status == 0) {	/* interrupted by a signal */
				error = EINTR;
				*rval = -1;
			} else {		/* timer expired */
				error = ETIME;
			}
		}
		break;
	}
	mutex_exit(&aiop->aio_mutex);
	if (reqp) {
		aphysio_unlock(reqp);
		aio_copyout_result(reqp);
		mutex_enter(&aiop->aio_mutex);
		aio_req_free(aiop, reqp);
		mutex_exit(&aiop->aio_mutex);
	}
	return (error);
}

/*
 * aiowaitn can be used to reap completed asynchronous requests submitted with
 * lio_listio, aio_read or aio_write.
 * This function only reaps asynchronous raw I/Os.
 */

/*ARGSUSED*/
static int
aiowaitn(void *uiocb, uint_t nent, uint_t *nwait, timespec_t *timout)
{
	int		error = 0;
	aio_t		*aiop;
	aio_req_t	*reqlist = NULL;
	caddr_t		iocblist = NULL;	/* array of iocb ptr's */
	uint_t		waitcnt, cnt = 0;	/* iocb cnt */
	size_t		iocbsz;			/* user's iocb size */
	size_t		riocbsz;		/* returned iocb size */
	int		iocb_index = 0;
	model_t		model = get_udatamodel();
	int		blocking = 1;
	int		timecheck;
	timestruc_t	rqtime;
	timestruc_t	*rqtp;

	aiop = curproc->p_aio;
	if (aiop == NULL || nent == 0 || nent > _AIO_LISTIO_MAX)
		return (EINVAL);

	if (aiop->aio_outstanding == 0)
		return (EAGAIN);

	if (copyin(nwait, &waitcnt, sizeof (uint_t)))
		return (EFAULT);

	/* set *nwait to zero, if we must return prematurely */
	if (copyout(&cnt, nwait, sizeof (uint_t)))
		return (EFAULT);

	if (waitcnt == 0) {
		blocking = 0;
		rqtp = NULL;
		waitcnt = nent;
	} else {
		error = timespec2reltime(timout, &rqtime, &rqtp, &blocking);
		if (error)
			return (error);
	}

	if (model == DATAMODEL_NATIVE)
		iocbsz = (sizeof (aiocb_t *) * nent);
#ifdef	_SYSCALL32_IMPL
	else
		iocbsz = (sizeof (caddr32_t) * nent);
#endif  /* _SYSCALL32_IMPL */

	/*
	 * Only one aio_waitn call is allowed at a time.
	 * The active aio_waitn will collect all requests
	 * out of the "done" list and, if necessary, it will wait
	 * for some/all pending requests to fulfill the nwait
	 * parameter.
	 * Second and further aio_waitn calls will sleep here
	 * until the active aio_waitn finishes and leaves the kernel.
	 * If the second call does not block (poll), then return
	 * immediately with the error code EAGAIN.
	 * If the second call should block, then sleep here, but
	 * do not touch the timeout.  The timeout starts when this
	 * aio_waitn call becomes active.
	 */

	mutex_enter(&aiop->aio_mutex);

	while (aiop->aio_flags & AIO_WAITN) {
		if (blocking == 0) {
			mutex_exit(&aiop->aio_mutex);
			return (EAGAIN);
		}

		/* block, no timeout */
		aiop->aio_flags |= AIO_WAITN_PENDING;
		if (!cv_wait_sig(&aiop->aio_waitncv, &aiop->aio_mutex)) {
			mutex_exit(&aiop->aio_mutex);
			return (EINTR);
		}
	}

	/*
	 * Establish the absolute future time for the timeout.
	 */
	if (rqtp) {
		timestruc_t now;
		timecheck = timechanged;
		gethrestime(&now);
		timespecadd(rqtp, &now);
	}

	if (iocbsz > aiop->aio_iocbsz && aiop->aio_iocb != NULL) {
		kmem_free(aiop->aio_iocb, aiop->aio_iocbsz);
		aiop->aio_iocb = NULL;
	}

	if (aiop->aio_iocb == NULL) {
		iocblist = kmem_zalloc(iocbsz, KM_NOSLEEP);
		if (iocblist == NULL) {
			mutex_exit(&aiop->aio_mutex);
			return (ENOMEM);
		}
		aiop->aio_iocb = (aiocb_t **)iocblist;
		aiop->aio_iocbsz = iocbsz;
	} else {
		iocblist = (char *)aiop->aio_iocb;
	}

	aiop->aio_waitncnt = waitcnt;
	aiop->aio_flags |= AIO_WAITN;

	for (;;) {
		/* push requests on poll queue to done queue */
		if (aiop->aio_pollq) {
			mutex_exit(&aiop->aio_mutex);
			aio_cleanup(0);
			mutex_enter(&aiop->aio_mutex);
		}

		/* check for requests on done queue */
		if (aiop->aio_doneq) {
			cnt += aio_reqlist_concat(aiop, &reqlist, nent - cnt);
			aiop->aio_waitncnt = waitcnt - cnt;
		}

		/* user-level done queue might not be empty */
		if (aiop->aio_notifycnt > 0) {
			aiop->aio_notifycnt--;
			error = 0;
			break;
		}

		/*
		 * If we are here a second time as a result of timer
		 * expiration, we reset the error if there are enough
		 * aiocb's to satisfy the request.
		 * We also return if all requests are already done
		 * and we picked up the whole done queue.
		 */

		if ((cnt >= waitcnt) || (cnt > 0 && aiop->aio_pending == 0 &&
		    aiop->aio_doneq == NULL)) {
			error = 0;
			break;
		}

		if ((cnt < waitcnt) && blocking) {
			int rval = cv_waituntil_sig(&aiop->aio_waitcv,
			    &aiop->aio_mutex, rqtp, timecheck);
			if (rval > 0)
				continue;
			if (rval < 0) {
				error = ETIME;
				blocking = 0;
				continue;
			}
			error = EINTR;
		}
		break;
	}

	mutex_exit(&aiop->aio_mutex);

	if (cnt > 0) {

		iocb_index = aio_unlock_requests(iocblist, iocb_index, reqlist,
		    aiop, model);

		if (model == DATAMODEL_NATIVE)
			riocbsz = (sizeof (aiocb_t *) * cnt);
#ifdef	_SYSCALL32_IMPL
		else
			riocbsz = (sizeof (caddr32_t) * cnt);
#endif  /* _SYSCALL32_IMPL */

		if (copyout(iocblist, uiocb, riocbsz) ||
		    copyout(&cnt, nwait, sizeof (uint_t)))
			error = EFAULT;
	}

	/* check if there is another thread waiting for execution */
	mutex_enter(&aiop->aio_mutex);
	aiop->aio_flags &= ~AIO_WAITN;
	if (aiop->aio_flags & AIO_WAITN_PENDING) {
		aiop->aio_flags &= ~AIO_WAITN_PENDING;
		cv_signal(&aiop->aio_waitncv);
	}
	mutex_exit(&aiop->aio_mutex);

	return (error);
}

/*
 * aio_unlock_requests
 * copies out the result of each request as well as its return value.
 * It builds the list of completed asynchronous requests,
 * unlocks the allocated memory ranges and
 * puts the aio request structures back into the free list.
 */

static int
aio_unlock_requests(
	caddr_t	iocblist,
	int	iocb_index,
	aio_req_t *reqlist,
	aio_t	*aiop,
	model_t	model)
{
	aio_req_t	*reqp, *nreqp;

	if (model == DATAMODEL_NATIVE) {
		for (reqp = reqlist; reqp != NULL;  reqp = nreqp) {
			(((caddr_t *)iocblist)[iocb_index++]) =
			    reqp->aio_req_iocb.iocb;
			nreqp = reqp->aio_req_next;
			aphysio_unlock(reqp);
			aio_copyout_result(reqp);
			mutex_enter(&aiop->aio_mutex);
			aio_req_free(aiop, reqp);
			mutex_exit(&aiop->aio_mutex);
		}
	}
#ifdef	_SYSCALL32_IMPL
	else {
		for (reqp = reqlist; reqp != NULL;  reqp = nreqp) {
			((caddr32_t *)iocblist)[iocb_index++] =
			    reqp->aio_req_iocb.iocb32;
			nreqp = reqp->aio_req_next;
			aphysio_unlock(reqp);
			aio_copyout_result(reqp);
			mutex_enter(&aiop->aio_mutex);
			aio_req_free(aiop, reqp);
			mutex_exit(&aiop->aio_mutex);
		}
	}
#endif	/* _SYSCALL32_IMPL */
	return (iocb_index);
}

/*
 * aio_reqlist_concat
 * moves "max" elements from the done queue to the reqlist queue and removes
 * the AIO_DONEQ flag.
 * - the reqlist queue is a singly linked list
 * - the done queue is a doubly linked list
 */

static int
aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max)
{
	aio_req_t *q2, *q2work, *list;
	int count = 0;

	list = *reqlist;
	q2 = aiop->aio_doneq;
	q2work = q2;
	while (max-- > 0) {
		q2work->aio_req_flags &= ~AIO_DONEQ;
		q2work = q2work->aio_req_next;
		count++;
		if (q2work == q2)
			break;
	}

	if (q2work == q2) {
		/* the entire done queue has been consumed */
		q2->aio_req_prev->aio_req_next = list;
		list = q2;
		aiop->aio_doneq = NULL;
	} else {
		/*
		 * max < number of elements in the done queue;
		 * detach only the required number of elements
		 * out of the done queue
		 */
		q2work->aio_req_prev->aio_req_next = list;
		list = q2;

		aiop->aio_doneq = q2work;
		q2work->aio_req_prev = q2->aio_req_prev;
		q2->aio_req_prev->aio_req_next = q2work;
	}
	*reqlist = list;
	return (count);
}

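/*
 * Suspend the calling thread until at least one of the asynchronous
 * requests named by the user's aiocb list has completed, the timeout
 * expires, or a signal is caught.  Completed requests are unlocked,
 * their results copied out, and their aio_req_t structures freed.
 */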
/*ARGSUSED*/
static int
aiosuspend(void *aiocb, int nent, struct timespec *timout, int flag,
    long *rval, int run_mode)
{
	int		error;
	aio_t		*aiop;
	aio_req_t	*reqp, *found, *next;
	caddr_t		cbplist = NULL;
	aiocb_t		*cbp, **ucbp;
#ifdef	_SYSCALL32_IMPL
	aiocb32_t	*cbp32;
	caddr32_t	*ucbp32;
#endif  /* _SYSCALL32_IMPL */
	aiocb64_32_t	*cbp64;
	int		rv;
	int		i;
	size_t		ssize;
	model_t		model = get_udatamodel();
	int		blocking;
	int		timecheck;
	timestruc_t	rqtime;
	timestruc_t	*rqtp;

	aiop = curproc->p_aio;
	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
		return (EINVAL);

	/*
	 * Establish the absolute future time for the timeout.
	 */
	error = timespec2reltime(timout, &rqtime, &rqtp, &blocking);
	if (error)
		return (error);
	if (rqtp) {
		timestruc_t now;
		timecheck = timechanged;
		gethrestime(&now);
		timespecadd(rqtp, &now);
	}

	/*
	 * If we are not blocking and there's no I/O complete,
	 * skip the aiocb copyin.
	 */
	if (!blocking && (aiop->aio_pollq == NULL) &&
	    (aiop->aio_doneq == NULL)) {
		return (EAGAIN);
	}

	if (model == DATAMODEL_NATIVE)
		ssize = (sizeof (aiocb_t *) * nent);
#ifdef	_SYSCALL32_IMPL
	else
		ssize = (sizeof (caddr32_t) * nent);
#endif  /* _SYSCALL32_IMPL */

	cbplist = kmem_alloc(ssize, KM_NOSLEEP);
	if (cbplist == NULL)
		return (ENOMEM);

	if (copyin(aiocb, cbplist, ssize)) {
		error = EFAULT;
		goto done;
	}

	found = NULL;
	/*
	 * we need to get the aio_cleanupq_mutex since we call
	 * aio_req_done().
	 */
	mutex_enter(&aiop->aio_cleanupq_mutex);
	mutex_enter(&aiop->aio_mutex);
	for (;;) {
		/* push requests on poll queue to done queue */
		if (aiop->aio_pollq) {
			mutex_exit(&aiop->aio_mutex);
			mutex_exit(&aiop->aio_cleanupq_mutex);
			aio_cleanup(0);
			mutex_enter(&aiop->aio_cleanupq_mutex);
			mutex_enter(&aiop->aio_mutex);
		}
		/* check for requests on done queue */
		if (aiop->aio_doneq) {
			if (model == DATAMODEL_NATIVE)
				ucbp = (aiocb_t **)cbplist;
#ifdef	_SYSCALL32_IMPL
			else
				ucbp32 = (caddr32_t *)cbplist;
#endif  /* _SYSCALL32_IMPL */
			for (i = 0; i < nent; i++) {
				if (model == DATAMODEL_NATIVE) {
					if ((cbp = *ucbp++) == NULL)
						continue;
					if (run_mode != AIO_LARGEFILE)
						reqp = aio_req_done(
						    &cbp->aio_resultp);
					else {
						cbp64 = (aiocb64_32_t *)cbp;
						reqp = aio_req_done(
						    &cbp64->aio_resultp);
					}
				}
#ifdef	_SYSCALL32_IMPL
				else {
					if (run_mode == AIO_32) {
						if ((cbp32 =
						    (aiocb32_t *)(uintptr_t)
						    *ucbp32++) == NULL)
							continue;
						reqp = aio_req_done(
						    &cbp32->aio_resultp);
					} else if (run_mode == AIO_LARGEFILE) {
						if ((cbp64 =
						    (aiocb64_32_t *)(uintptr_t)
						    *ucbp32++) == NULL)
							continue;
						reqp = aio_req_done(
						    &cbp64->aio_resultp);
					}

				}
#endif  /* _SYSCALL32_IMPL */
				if (reqp) {
					reqp->aio_req_next = found;
					found = reqp;
				}
				if (aiop->aio_doneq == NULL)
					break;
			}
			if (found)
				break;
		}
		if (aiop->aio_notifycnt > 0) {
			/*
			 * nothing on the kernel's queue. the user
			 * has notified the kernel that it has items
			 * on a user-level queue.
			 */
			aiop->aio_notifycnt--;
			*rval = 1;
			error = 0;
			break;
		}
		/* don't block if nothing is outstanding */
		if (aiop->aio_outstanding == 0) {
			error = EAGAIN;
			break;
		}
		if (blocking) {
			/*
			 * drop the aio_cleanupq_mutex as we are
			 * going to block.
			 */
			mutex_exit(&aiop->aio_cleanupq_mutex);
			rv = cv_waituntil_sig(&aiop->aio_waitcv,
			    &aiop->aio_mutex, rqtp, timecheck);
			/*
			 * we have to drop aio_mutex and
			 * grab it in the right order.
			 */
			mutex_exit(&aiop->aio_mutex);
			mutex_enter(&aiop->aio_cleanupq_mutex);
			mutex_enter(&aiop->aio_mutex);
			if (rv > 0)	/* check done queue again */
				continue;
			if (rv == 0)	/* interrupted by a signal */
				error = EINTR;
			else		/* timer expired */
				error = ETIME;
		} else {
			error = EAGAIN;
		}
		break;
	}
	mutex_exit(&aiop->aio_mutex);
	mutex_exit(&aiop->aio_cleanupq_mutex);
	for (reqp = found; reqp != NULL; reqp = next) {
		next = reqp->aio_req_next;
		aphysio_unlock(reqp);
		aio_copyout_result(reqp);
		mutex_enter(&aiop->aio_mutex);
		aio_req_free(aiop, reqp);
		mutex_exit(&aiop->aio_mutex);
	}
done:
	kmem_free(cbplist, ssize);
	return (error);
}

/*
 * initialize aio by allocating an aio_t struct for this
 * process.
 */
static int
aioinit(void)
{
	proc_t *p = curproc;
	aio_t *aiop;
	mutex_enter(&p->p_lock);
	if ((aiop = p->p_aio) == NULL) {
		aiop = aio_aiop_alloc();
		p->p_aio = aiop;
	}
	mutex_exit(&p->p_lock);
	if (aiop == NULL)
		return (ENOMEM);
	return (0);
}

/*
 * start a special thread that will cleanup after aio requests
 * that are preventing a segment from being unmapped. as_unmap()
 * blocks until all physio to this segment is completed. this
 * doesn't happen until all the pages in this segment are not
 * SOFTLOCKed. Some pages will be SOFTLOCKed when there are aio
 * requests still outstanding. this special thread will make sure
 * that these SOFTLOCKed pages will eventually be SOFTUNLOCKed.
 *
 * this function will return an error if the process has only
 * one LWP. the assumption is that the caller is a separate LWP
 * that remains blocked in the kernel for the life of this process.
 */
static int
aiostart(void)
{
	proc_t *p = curproc;
	aio_t *aiop;
	int first, error = 0;

	if (p->p_lwpcnt == 1)
		return (EDEADLK);
	mutex_enter(&p->p_lock);
	if ((aiop = p->p_aio) == NULL)
		error = EINVAL;
	else {
		first = aiop->aio_ok;
		if (aiop->aio_ok == 0)
			aiop->aio_ok = 1;
	}
	mutex_exit(&p->p_lock);
	if (error == 0 && first == 0) {
		return (aio_cleanup_thread(aiop));
		/* should return only to exit */
	}
	return (error);
}

/*
 * Associate an aiocb with a port.
 * This function is used by aiorw() to associate a transaction with a port.
 * Allocate an event port structure (port_alloc_event()) and store the
 * delivered user pointer (portnfy_user) in the portkev_user field of the
 * port_kevent_t structure.
 * The aio_req_portkev pointer in the aio_req_t structure was added to
 * identify the port association.
 */

static int
aio_req_assoc_port_rw(port_notify_t *pntfy, aiocb_t *cbp,
    aio_req_t *reqp, int event)
{
	port_kevent_t	*pkevp = NULL;
	int		error;

	error = port_alloc_event(pntfy->portnfy_port, PORT_ALLOC_DEFAULT,
	    PORT_SOURCE_AIO, &pkevp);
	if (error) {
		if ((error == ENOMEM) || (error == EAGAIN))
			error = EAGAIN;
		else
			error = EINVAL;
	} else {
		port_init_event(pkevp, (uintptr_t)cbp, pntfy->portnfy_user,
		    aio_port_callback, reqp);
		pkevp->portkev_events = event;
		reqp->aio_req_portkev = pkevp;
		reqp->aio_req_port = pntfy->portnfy_port;
	}
	return (error);
}

#ifdef _LP64

/*
 * Asynchronous list IO. A chain of aiocb's is copied in
 * one at a time. If an aiocb is invalid, it is skipped.
 * For each aiocb, the appropriate driver entry point is
 * called. Optimize for the common case where the list
 * of requests is to the same file descriptor.
 *
 * One possible optimization is to define a new driver entry
 * point that supports a list of IO requests. Whether this
 * improves performance depends somewhat on the driver's
 * locking strategy. Processing a list could adversely impact
 * the driver's interrupt latency.
 */
static int
alio(
	int		mode_arg,
	aiocb_t		**aiocb_arg,
	int		nent,
	struct sigevent	*sigev)
{
	file_t		*fp;
	file_t		*prev_fp = NULL;
	int		prev_mode = -1;
	struct vnode	*vp;
	aio_lio_t	*head;
	aio_req_t	*reqp;
	aio_t		*aiop;
	caddr_t		cbplist;
	aiocb_t		cb;
	aiocb_t		*aiocb = &cb;
	aiocb_t		*cbp;
	aiocb_t		**ucbp;
	struct sigevent sigevk;
	sigqueue_t	*sqp;
	int		(*aio_func)();
	int		mode;
	int		error = 0;
	int		aio_errors = 0;
	int		i;
	size_t		ssize;
	int		deadhead = 0;
	int		aio_notsupported = 0;
	int		lio_head_port;
	int		aio_port;
	int		aio_thread;
	port_kevent_t	*pkevtp = NULL;
	int		portused = 0;
	port_notify_t	pnotify;
	int		event;

	aiop = curproc->p_aio;
	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
		return (EINVAL);

	ssize = (sizeof (aiocb_t *) * nent);
	cbplist = kmem_alloc(ssize, KM_SLEEP);
	ucbp = (aiocb_t **)cbplist;

	if (copyin(aiocb_arg, cbplist, ssize) ||
	    (sigev && copyin(sigev, &sigevk, sizeof (struct sigevent)))) {
		kmem_free(cbplist, ssize);
		return (EFAULT);
	}

	/* Event Ports  */
	if (sigev &&
	    (sigevk.sigev_notify == SIGEV_THREAD ||
	    sigevk.sigev_notify == SIGEV_PORT)) {
		if (sigevk.sigev_notify == SIGEV_THREAD) {
			pnotify.portnfy_port = sigevk.sigev_signo;
			pnotify.portnfy_user = sigevk.sigev_value.sival_ptr;
		} else if (copyin(sigevk.sigev_value.sival_ptr,
		    &pnotify, sizeof (pnotify))) {
			kmem_free(cbplist, ssize);
			return (EFAULT);
		}
		error = port_alloc_event(pnotify.portnfy_port,
		    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp);
		if (error) {
			if (error == ENOMEM || error == EAGAIN)
				error = EAGAIN;
			else
				error = EINVAL;
			kmem_free(cbplist, ssize);
			return (error);
		}
		lio_head_port = pnotify.portnfy_port;
		portused = 1;
	}

	/*
	 * a list head should be allocated if notification is
	 * enabled for this list.
	 */
	head = NULL;

	if (mode_arg == LIO_WAIT || sigev) {
		mutex_enter(&aiop->aio_mutex);
		error = aio_lio_alloc(&head);
		mutex_exit(&aiop->aio_mutex);
		if (error)
			goto done;
		deadhead = 1;
		head->lio_nent = nent;
		head->lio_refcnt = nent;
		head->lio_port = -1;
		head->lio_portkev = NULL;
		if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL &&
		    sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) {
			sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
			if (sqp == NULL) {
				error = EAGAIN;
				goto done;
			}
			sqp->sq_func = NULL;
			sqp->sq_next = NULL;
			sqp->sq_info.si_code = SI_ASYNCIO;
			sqp->sq_info.si_pid = curproc->p_pid;
			sqp->sq_info.si_ctid = PRCTID(curproc);
			sqp->sq_info.si_zoneid = getzoneid();
			sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
			sqp->sq_info.si_signo = sigevk.sigev_signo;
			sqp->sq_info.si_value = sigevk.sigev_value;
			head->lio_sigqp = sqp;
		} else {
			head->lio_sigqp = NULL;
		}
		if (pkevtp) {
			/*
			 * Prepare data to send when list of aiocb's
			 * has completed.
			 */
			port_init_event(pkevtp, (uintptr_t)sigev,
			    (void *)(uintptr_t)pnotify.portnfy_user,
			    NULL, head);
			pkevtp->portkev_events = AIOLIO;
			head->lio_portkev = pkevtp;
			head->lio_port = pnotify.portnfy_port;
		}
	}

	for (i = 0; i < nent; i++, ucbp++) {

		cbp = *ucbp;
		/* skip entry if it can't be copied. */
		if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb))) {
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			continue;
		}

		/* skip if opcode for aiocb is LIO_NOP */
		mode = aiocb->aio_lio_opcode;
		if (mode == LIO_NOP) {
			cbp = NULL;
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			continue;
		}

		/* increment file descriptor's ref count. */
		if ((fp = getf(aiocb->aio_fildes)) == NULL) {
			lio_set_uerror(&cbp->aio_resultp, EBADF);
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			aio_errors++;
			continue;
		}

		/*
		 * check the permission of the partition
		 */
		if ((fp->f_flag & mode) == 0) {
			releasef(aiocb->aio_fildes);
			lio_set_uerror(&cbp->aio_resultp, EBADF);
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			aio_errors++;
			continue;
		}

		/*
		 * common case where requests are to the same fd
		 * for the same r/w operation.
		 * for UFS, need to set EBADFD
		 */
		vp = fp->f_vnode;
		if (fp != prev_fp || mode != prev_mode) {
			aio_func = check_vp(vp, mode);
			if (aio_func == NULL) {
				prev_fp = NULL;
				releasef(aiocb->aio_fildes);
				lio_set_uerror(&cbp->aio_resultp, EBADFD);
				aio_notsupported++;
				if (head) {
					mutex_enter(&aiop->aio_mutex);
					head->lio_nent--;
					head->lio_refcnt--;
					mutex_exit(&aiop->aio_mutex);
				}
				continue;
			} else {
				prev_fp = fp;
				prev_mode = mode;
			}
		}

		error = aio_req_setup(&reqp, aiop, aiocb,
		    &cbp->aio_resultp, vp, 0);
		if (error) {
			releasef(aiocb->aio_fildes);
			lio_set_uerror(&cbp->aio_resultp, error);
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			aio_errors++;
			continue;
		}

		reqp->aio_req_lio = head;
		deadhead = 0;

		/*
		 * Set the errno field now before sending the request to
		 * the driver to avoid a race condition
		 */
		(void) suword32(&cbp->aio_resultp.aio_errno,
		    EINPROGRESS);

		reqp->aio_req_iocb.iocb = (caddr_t)cbp;

		event = (mode == LIO_READ)? AIOAREAD : AIOAWRITE;
		aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT);
		aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD);
		if (aio_port | aio_thread) {
			port_kevent_t *lpkevp;
			/*
			 * Prepare data to send with each aiocb completed.
			 */
			if (aio_port) {
				void *paddr =
				    aiocb->aio_sigevent.sigev_value.sival_ptr;
				if (copyin(paddr, &pnotify, sizeof (pnotify)))
					error = EFAULT;
			} else {	/* aio_thread */
				pnotify.portnfy_port =
				    aiocb->aio_sigevent.sigev_signo;
				pnotify.portnfy_user =
				    aiocb->aio_sigevent.sigev_value.sival_ptr;
			}
			if (error)
				/* EMPTY */;
			else if (pkevtp != NULL &&
			    pnotify.portnfy_port == lio_head_port)
				error = port_dup_event(pkevtp, &lpkevp,
				    PORT_ALLOC_DEFAULT);
			else
				error = port_alloc_event(pnotify.portnfy_port,
				    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO,
				    &lpkevp);
			if (error == 0) {
				port_init_event(lpkevp, (uintptr_t)cbp,
				    (void *)(uintptr_t)pnotify.portnfy_user,
				    aio_port_callback, reqp);
				lpkevp->portkev_events = event;
				reqp->aio_req_portkev = lpkevp;
				reqp->aio_req_port = pnotify.portnfy_port;
			}
		}

		/*
		 * send the request to driver.
		 */
		if (error == 0) {
			if (aiocb->aio_nbytes == 0) {
				clear_active_fd(aiocb->aio_fildes);
				aio_zerolen(reqp);
				continue;
			}
			error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
			    CRED());
		}

		/*
		 * the fd's ref count is not decremented until the IO has
		 * completed unless there was an error.
		 */
		if (error) {
			releasef(aiocb->aio_fildes);
			lio_set_uerror(&cbp->aio_resultp, error);
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			if (error == ENOTSUP)
				aio_notsupported++;
			else
				aio_errors++;
			lio_set_error(reqp, portused);
		} else {
			clear_active_fd(aiocb->aio_fildes);
		}
	}

	if (aio_notsupported) {
		error = ENOTSUP;
	} else if (aio_errors) {
		/*
		 * return EIO if any request failed
		 */
		error = EIO;
	}

	if (mode_arg == LIO_WAIT) {
		mutex_enter(&aiop->aio_mutex);
		while (head->lio_refcnt > 0) {
			if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
				mutex_exit(&aiop->aio_mutex);
				error = EINTR;
				goto done;
			}
		}
		mutex_exit(&aiop->aio_mutex);
		alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_64);
	}

done:
	kmem_free(cbplist, ssize);
	if (deadhead) {
		if (head->lio_sigqp)
			kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
		if (head->lio_portkev)
			port_free_event(head->lio_portkev);
		kmem_free(head, sizeof (aio_lio_t));
	}
	return (error);
}

#endif /* _LP64 */

/*
 * Asynchronous list IO.
 * If list I/O is called with LIO_WAIT it can still return
 * before all the I/Os are completed if a signal is caught
 * or if the list includes UFS I/O requests. If this happens,
 * libaio will call aliowait() to wait for the I/Os to
 * complete.
 */
/*ARGSUSED*/
static int
aliowait(
	int	mode,
	void	*aiocb,
	int	nent,
	void	*sigev,
	int	run_mode)
{
	aio_lio_t	*head;
	aio_t		*aiop;
	caddr_t		cbplist;
	aiocb_t		*cbp, **ucbp;
#ifdef	_SYSCALL32_IMPL
	aiocb32_t	*cbp32;
	caddr32_t	*ucbp32;
	aiocb64_32_t	*cbp64;
#endif
	int		error = 0;
	int		i;
	size_t		ssize = 0;
	model_t		model = get_udatamodel();

	aiop = curproc->p_aio;
	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
		return (EINVAL);

	if (model == DATAMODEL_NATIVE)
		ssize = (sizeof (aiocb_t *) * nent);
#ifdef	_SYSCALL32_IMPL
	else
		ssize = (sizeof (caddr32_t) * nent);
#endif  /* _SYSCALL32_IMPL */

	if (ssize == 0)
		return (EINVAL);

	cbplist = kmem_alloc(ssize, KM_SLEEP);

	if (model == DATAMODEL_NATIVE)
		ucbp = (aiocb_t **)cbplist;
#ifdef	_SYSCALL32_IMPL
	else
		ucbp32 = (caddr32_t *)cbplist;
#endif  /* _SYSCALL32_IMPL */

	if (copyin(aiocb, cbplist, ssize)) {
		error = EFAULT;
		goto done;
	}

	/*
	 * To find the list head, we go through the
	 * list of aiocb structs, find the request
	 * it's for, then get the list head that reqp
	 * points to.
	 */
	head = NULL;

	for (i = 0; i < nent; i++) {
		if (model == DATAMODEL_NATIVE) {
			/*
			 * Since we are only checking for a NULL pointer,
			 * the following should work for both native-size
			 * and largefile aiocbs.
			 */
			if ((cbp = *ucbp++) == NULL)
				continue;
			if (run_mode != AIO_LARGEFILE) {
				if (head = aio_list_get(&cbp->aio_resultp))
					break;
			} else {
				/*
				 * This is the case when a largefile call is
				 * made on a 32-bit kernel.
				 * Treat each pointer as a pointer to
				 * aiocb64_32.
				 */
				if (head = aio_list_get((aio_result_t *)
				    &(((aiocb64_32_t *)cbp)->aio_resultp)))
					break;
			}
		}
#ifdef	_SYSCALL32_IMPL
		else {
			if (run_mode == AIO_LARGEFILE) {
				if ((cbp64 = (aiocb64_32_t *)
				    (uintptr_t)*ucbp32++) == NULL)
					continue;
				if (head = aio_list_get((aio_result_t *)
				    &cbp64->aio_resultp))
					break;
			} else if (run_mode == AIO_32) {
				if ((cbp32 = (aiocb32_t *)
				    (uintptr_t)*ucbp32++) == NULL)
					continue;
				if (head = aio_list_get((aio_result_t *)
				    &cbp32->aio_resultp))
					break;
			}
		}
#endif	/* _SYSCALL32_IMPL */
	}

	if (head == NULL) {
		error = EINVAL;
		goto done;
	}

	mutex_enter(&aiop->aio_mutex);
	while (head->lio_refcnt > 0) {
		if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
			mutex_exit(&aiop->aio_mutex);
			error = EINTR;
			goto done;
		}
	}
	mutex_exit(&aiop->aio_mutex);
	alio_cleanup(aiop, (aiocb_t **)cbplist, nent, run_mode);
done:
	kmem_free(cbplist, ssize);
	return (error);
}

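/*
 * Look up the list-I/O head, if any, that the request identified by
 * resultp belongs to, using the process's aio request hash.
 */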
aio_lio_t *
aio_list_get(aio_result_t *resultp)
{
	aio_lio_t	*head = NULL;
	aio_t		*aiop;
	aio_req_t	**bucket;
	aio_req_t	*reqp;
	long		index;

	aiop = curproc->p_aio;
	if (aiop == NULL)
		return (NULL);

	if (resultp) {
		index = AIO_HASH(resultp);
		bucket = &aiop->aio_hash[index];
		for (reqp = *bucket; reqp != NULL;
		    reqp = reqp->aio_hash_next) {
			if (reqp->aio_req_resultp == resultp) {
				head = reqp->aio_req_lio;
				return (head);
			}
		}
	}
	return (NULL);
}


static void
lio_set_uerror(void *resultp, int error)
{
	/*
	 * the resultp field is a pointer to where the
	 * error should be written out to the user's
	 * aiocb.
	 */
	if (get_udatamodel() == DATAMODEL_NATIVE) {
		(void) sulword(&((aio_result_t *)resultp)->aio_return,
		    (ssize_t)-1);
		(void) suword32(&((aio_result_t *)resultp)->aio_errno, error);
	}
#ifdef	_SYSCALL32_IMPL
	else {
		(void) suword32(&((aio_result32_t *)resultp)->aio_return,
		    (uint_t)-1);
		(void) suword32(&((aio_result32_t *)resultp)->aio_errno, error);
	}
#endif  /* _SYSCALL32_IMPL */
}


/*
 * do cleanup completion for all requests in list. memory for
 * each request is also freed.
 */
static void
alio_cleanup(aio_t *aiop, aiocb_t **cbp, int nent, int run_mode)
{
	int i;
	aio_req_t *reqp;
	aio_result_t *resultp;
	aiocb64_32_t *aiocb_64;

	for (i = 0; i < nent; i++) {
		if (get_udatamodel() == DATAMODEL_NATIVE) {
			if (cbp[i] == NULL)
				continue;
			if (run_mode == AIO_LARGEFILE) {
				aiocb_64 = (aiocb64_32_t *)cbp[i];
				resultp = (aio_result_t *)
				    &aiocb_64->aio_resultp;
			} else
				resultp = &cbp[i]->aio_resultp;
		}
#ifdef	_SYSCALL32_IMPL
		else {
			aiocb32_t *aiocb_32;
			caddr32_t *cbp32;

			cbp32 = (caddr32_t *)cbp;
			if (cbp32[i] == 0)
				continue;
			if (run_mode == AIO_32) {
				aiocb_32 = (aiocb32_t *)(uintptr_t)cbp32[i];
				resultp = (aio_result_t *)&aiocb_32->
				    aio_resultp;
			} else if (run_mode == AIO_LARGEFILE) {
				aiocb_64 = (aiocb64_32_t *)(uintptr_t)cbp32[i];
				resultp = (aio_result_t *)&aiocb_64->
				    aio_resultp;
			}
		}
#endif  /* _SYSCALL32_IMPL */
		/*
		 * we need to get the aio_cleanupq_mutex since we call
		 * aio_req_done().
		 */
		mutex_enter(&aiop->aio_cleanupq_mutex);
		mutex_enter(&aiop->aio_mutex);
		reqp = aio_req_done(resultp);
		mutex_exit(&aiop->aio_mutex);
		mutex_exit(&aiop->aio_cleanupq_mutex);
		if (reqp != NULL) {
			aphysio_unlock(reqp);
			aio_copyout_result(reqp);
			mutex_enter(&aiop->aio_mutex);
			aio_req_free(aiop, reqp);
			mutex_exit(&aiop->aio_mutex);
		}
	}
}

/*
 * Write out the results for an aio request that is done.
 */
static int
aioerror(void *cb, int run_mode)
{
	aio_result_t *resultp;
	aio_t *aiop;
	aio_req_t *reqp;
	int retval;

	aiop = curproc->p_aio;
	if (aiop == NULL || cb == NULL)
		return (EINVAL);

	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (run_mode == AIO_LARGEFILE)
			resultp = (aio_result_t *)&((aiocb64_32_t *)cb)->
			    aio_resultp;
		else
			resultp = &((aiocb_t *)cb)->aio_resultp;
	}
#ifdef	_SYSCALL32_IMPL
	else {
		if (run_mode == AIO_LARGEFILE)
			resultp = (aio_result_t *)&((aiocb64_32_t *)cb)->
			    aio_resultp;
		else if (run_mode == AIO_32)
			resultp = (aio_result_t *)&((aiocb32_t *)cb)->
			    aio_resultp;
	}
#endif  /* _SYSCALL32_IMPL */
	/*
	 * we need to get the aio_cleanupq_mutex since we call
	 * aio_req_find().
	 */
	mutex_enter(&aiop->aio_cleanupq_mutex);
	mutex_enter(&aiop->aio_mutex);
	retval = aio_req_find(resultp, &reqp);
	mutex_exit(&aiop->aio_mutex);
	mutex_exit(&aiop->aio_cleanupq_mutex);
	if (retval == 0) {
		aphysio_unlock(reqp);
		aio_copyout_result(reqp);
		mutex_enter(&aiop->aio_mutex);
		aio_req_free(aiop, reqp);
		mutex_exit(&aiop->aio_mutex);
		return (0);
	} else if (retval == 1)
		return (EINPROGRESS);
	else if (retval == 2)
		return (EINVAL);
	return (0);
}

/*
 *	aio_cancel - if no requests outstanding,
 *			return AIO_ALLDONE
 *			else
 *			return AIO_NOTCANCELED
 */
static int
aio_cancel(int fildes, void *cb, long *rval, int run_mode)
{
	aio_t *aiop;
	void *resultp;
	int index;
	aio_req_t **bucket;
	aio_req_t *ent;


	/*
	 * Verify valid file descriptor
	 */
	if ((getf(fildes)) == NULL) {
		return (EBADF);
	}
	releasef(fildes);

	aiop = curproc->p_aio;
	if (aiop == NULL)
		return (EINVAL);

	if (aiop->aio_outstanding == 0) {
		*rval = AIO_ALLDONE;
		return (0);
	}

	mutex_enter(&aiop->aio_mutex);
	if (cb != NULL) {
		if (get_udatamodel() == DATAMODEL_NATIVE) {
			if (run_mode == AIO_LARGEFILE)
				resultp = (aio_result_t *)&((aiocb64_32_t *)cb)
				    ->aio_resultp;
			else
				resultp = &((aiocb_t *)cb)->aio_resultp;
		}
#ifdef	_SYSCALL32_IMPL
		else {
			if (run_mode == AIO_LARGEFILE)
				resultp = (aio_result_t *)&((aiocb64_32_t *)cb)
				    ->aio_resultp;
			else if (run_mode == AIO_32)
				resultp = (aio_result_t *)&((aiocb32_t *)cb)
				    ->aio_resultp;
		}
#endif  /* _SYSCALL32_IMPL */
		index = AIO_HASH(resultp);
		bucket = &aiop->aio_hash[index];
		for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
			if (ent->aio_req_resultp == resultp) {
				if ((ent->aio_req_flags & AIO_PENDING) == 0) {
					mutex_exit(&aiop->aio_mutex);
					*rval = AIO_ALLDONE;
					return (0);
				}
				mutex_exit(&aiop->aio_mutex);
				*rval = AIO_NOTCANCELED;
				return (0);
			}
		}
		mutex_exit(&aiop->aio_mutex);
		*rval = AIO_ALLDONE;
		return (0);
	}

	for (index = 0; index < AIO_HASHSZ; index++) {
		bucket = &aiop->aio_hash[index];
		for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
			if (ent->aio_req_fd == fildes) {
				if ((ent->aio_req_flags & AIO_PENDING) != 0) {
					mutex_exit(&aiop->aio_mutex);
					*rval = AIO_NOTCANCELED;
					return (0);
				}
			}
		}
	}
	mutex_exit(&aiop->aio_mutex);
	*rval = AIO_ALLDONE;
	return (0);
}

/*
 * solaris version of asynchronous read and write
 */
static int
arw(
	int	opcode,
	int	fdes,
	char	*bufp,
	int	bufsize,
	offset_t	offset,
	aio_result_t	*resultp,
	int		mode)
{
	file_t		*fp;
	int		error;
	struct vnode	*vp;
	aio_req_t	*reqp;
	aio_t		*aiop;
	int		(*aio_func)();
#ifdef _LP64
	aiocb_t		aiocb;
#else
	aiocb64_32_t	aiocb64;
#endif

	aiop = curproc->p_aio;
	if (aiop == NULL)
		return (EINVAL);

	if ((fp = getf(fdes)) == NULL) {
		return (EBADF);
	}

	/*
	 * check the permission of the partition
	 */
	if ((fp->f_flag & mode) == 0) {
		releasef(fdes);
		return (EBADF);
	}

	vp = fp->f_vnode;
	aio_func = check_vp(vp, mode);
	if (aio_func == NULL) {
		releasef(fdes);
		return (EBADFD);
	}
#ifdef _LP64
	aiocb.aio_fildes = fdes;
	aiocb.aio_buf = bufp;
	aiocb.aio_nbytes = bufsize;
	aiocb.aio_offset = offset;
	aiocb.aio_sigevent.sigev_notify = 0;
	error = aio_req_setup(&reqp, aiop, &aiocb, resultp, vp, 1);
#else
	aiocb64.aio_fildes = fdes;
	aiocb64.aio_buf = (caddr32_t)bufp;
	aiocb64.aio_nbytes = bufsize;
	aiocb64.aio_offset = offset;
	aiocb64.aio_sigevent.sigev_notify = 0;
	error = aio_req_setupLF(&reqp, aiop, &aiocb64, resultp, vp, 1);
#endif
	if (error) {
		releasef(fdes);
		return (error);
	}

	/*
	 * enable polling on this request if the opcode has
	 * the AIO poll bit set
	 */
	if (opcode & AIO_POLL_BIT)
		reqp->aio_req_flags |= AIO_POLL;

	if (bufsize == 0) {
		clear_active_fd(fdes);
		aio_zerolen(reqp);
		return (0);
	}
	/*
	 * send the request to driver.
	 */
	error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, CRED());
	/*
	 * the fd is stored in the aio_req_t by aio_req_setup(), and
	 * is released by the aio_cleanup_thread() when the IO has
	 * completed.
	 */
	if (error) {
		releasef(fdes);
		mutex_enter(&aiop->aio_mutex);
		aio_req_free(aiop, reqp);
		aiop->aio_pending--;
		if (aiop->aio_flags & AIO_REQ_BLOCK)
			cv_signal(&aiop->aio_cleanupcv);
		mutex_exit(&aiop->aio_mutex);
		return (error);
	}
	clear_active_fd(fdes);
	return (0);
}

/*
 * posix version of asynchronous read and write
 */
static int
aiorw(
	int		opcode,
	void		*aiocb_arg,
	int		mode,
	int		run_mode)
{
#ifdef _SYSCALL32_IMPL
	aiocb32_t	aiocb32;
	struct	sigevent32 *sigev32;
	port_notify32_t	pntfy32;
#endif
	aiocb64_32_t	aiocb64;
	aiocb_t		aiocb;
	file_t		*fp;
	int		error, fd;
	size_t		bufsize;
	struct vnode	*vp;
	aio_req_t	*reqp;
	aio_t		*aiop;
	int		(*aio_func)();
	aio_result_t	*resultp;
	struct	sigevent *sigev;
	model_t		model;
	int		aio_use_port = 0;
	port_notify_t	pntfy;

	model = get_udatamodel();
	aiop = curproc->p_aio;
	if (aiop == NULL)
		return (EINVAL);

	if (model == DATAMODEL_NATIVE) {
		if (run_mode != AIO_LARGEFILE) {
			if (copyin(aiocb_arg, &aiocb, sizeof (aiocb_t)))
				return (EFAULT);
			bufsize = aiocb.aio_nbytes;
			resultp = &(((aiocb_t *)aiocb_arg)->aio_resultp);
			if ((fp = getf(fd = aiocb.aio_fildes)) == NULL) {
				return (EBADF);
			}
			sigev = &aiocb.aio_sigevent;
		} else {
			/*
			 * We come here only when we make a largefile
			 * call on a 32-bit kernel using the 32-bit
			 * library.
			 */
			if (copyin(aiocb_arg, &aiocb64, sizeof (aiocb64_32_t)))
				return (EFAULT);
			bufsize = aiocb64.aio_nbytes;
			resultp = (aio_result_t *)&(((aiocb64_32_t *)aiocb_arg)
			    ->aio_resultp);
			if ((fp = getf(fd = aiocb64.aio_fildes)) == NULL)
				return (EBADF);
			sigev = (struct sigevent *)&aiocb64.aio_sigevent;
		}

		if (sigev->sigev_notify == SIGEV_PORT) {
			if (copyin((void *)sigev->sigev_value.sival_ptr,
			    &pntfy, sizeof (port_notify_t))) {
				releasef(fd);
				return (EFAULT);
			}
			aio_use_port = 1;
		} else if (sigev->sigev_notify == SIGEV_THREAD) {
			pntfy.portnfy_port = aiocb.aio_sigevent.sigev_signo;
			pntfy.portnfy_user =
			    aiocb.aio_sigevent.sigev_value.sival_ptr;
			aio_use_port = 1;
		}
	}
#ifdef	_SYSCALL32_IMPL
	else {
		if (run_mode == AIO_32) {
			/* 32 bit system call is being made on 64 bit kernel */
			if (copyin(aiocb_arg, &aiocb32, sizeof (aiocb32_t)))
				return (EFAULT);

			bufsize = aiocb32.aio_nbytes;
			aiocb_32ton(&aiocb32, &aiocb);
			resultp = (aio_result_t *)&(((aiocb32_t *)aiocb_arg)->
			    aio_resultp);
			if ((fp = getf(fd = aiocb32.aio_fildes)) == NULL) {
				return (EBADF);
			}
			sigev32 = &aiocb32.aio_sigevent;
		} else if (run_mode == AIO_LARGEFILE) {
			/*
			 * We come here only when we make a largefile
			 * call on a 64-bit kernel using the 32-bit
			 * library.
			 */
			if (copyin(aiocb_arg, &aiocb64, sizeof (aiocb64_32_t)))
				return (EFAULT);
			bufsize = aiocb64.aio_nbytes;
			aiocb_LFton(&aiocb64, &aiocb);
			resultp = (aio_result_t *)&(((aiocb64_32_t *)aiocb_arg)
			    ->aio_resultp);
			if ((fp = getf(fd = aiocb64.aio_fildes)) == NULL)
				return (EBADF);
			sigev32 = &aiocb64.aio_sigevent;
		}

		if (sigev32->sigev_notify == SIGEV_PORT) {
			if (copyin(
			    (void *)(uintptr_t)sigev32->sigev_value.sival_ptr,
			    &pntfy32, sizeof (port_notify32_t))) {
				releasef(fd);
				return (EFAULT);
			}
			pntfy.portnfy_port = pntfy32.portnfy_port;
			pntfy.portnfy_user = (void *)(uintptr_t)
			    pntfy32.portnfy_user;
			aio_use_port = 1;
		} else if (sigev32->sigev_notify == SIGEV_THREAD) {
			pntfy.portnfy_port = sigev32->sigev_signo;
			pntfy.portnfy_user = (void *)(uintptr_t)
			    sigev32->sigev_value.sival_ptr;
			aio_use_port = 1;
		}
	}
#endif  /* _SYSCALL32_IMPL */
	/*
	 * verify that the file was opened with the requested access mode
	 */

	if ((fp->f_flag & mode) == 0) {
		releasef(fd);
		return (EBADF);
	}

	vp = fp->f_vnode;
	aio_func = check_vp(vp, mode);
	if (aio_func == NULL) {
		releasef(fd);
		return (EBADFD);
	}
	if (run_mode == AIO_LARGEFILE)
		error = aio_req_setupLF(&reqp, aiop, &aiocb64, resultp, vp, 0);
	else
		error = aio_req_setup(&reqp, aiop, &aiocb, resultp, vp, 0);

	if (error) {
		releasef(fd);
		return (error);
	}
	/*
	 * enable polling on this request if the opcode has
	 * the AIO poll bit set
	 */
	if (opcode & AIO_POLL_BIT)
		reqp->aio_req_flags |= AIO_POLL;

	if (model == DATAMODEL_NATIVE)
		reqp->aio_req_iocb.iocb = aiocb_arg;
#ifdef  _SYSCALL32_IMPL
	else
		reqp->aio_req_iocb.iocb32 = (caddr32_t)(uintptr_t)aiocb_arg;
#endif

	if (aio_use_port) {
		int event = (run_mode == AIO_LARGEFILE)?
		    ((mode == FREAD)? AIOAREAD64 : AIOAWRITE64) :
		    ((mode == FREAD)? AIOAREAD : AIOAWRITE);
		error = aio_req_assoc_port_rw(&pntfy, aiocb_arg, reqp, event);
	}

	/*
	 * send the request to the driver.
	 */
	if (error == 0) {
		if (bufsize == 0) {
			clear_active_fd(fd);
			aio_zerolen(reqp);
			return (0);
		}
		error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, CRED());
	}

	/*
	 * the fd is stored in the aio_req_t by aio_req_setup(), and
	 * is released by the aio_cleanup_thread() when the IO has
	 * completed.
	 */
	if (error) {
		releasef(fd);
		mutex_enter(&aiop->aio_mutex);
		if (aio_use_port)
			aio_deq(&aiop->aio_portpending, reqp);
		aio_req_free(aiop, reqp);
		aiop->aio_pending--;
		if (aiop->aio_flags & AIO_REQ_BLOCK)
			cv_signal(&aiop->aio_cleanupcv);
		mutex_exit(&aiop->aio_mutex);
		return (error);
	}
	clear_active_fd(fd);
	return (0);
}
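
/*
 * Illustrative sketch, kept out of the build with #if 0: the user-level
 * side of the aiorw() path.  A minimal POSIX consumer fills in an aiocb,
 * submits it with aio_read(3C), and polls completion with aio_error(3C)
 * and aio_return(3C).  The device path is an assumption for the example.
 */
#if 0
#include <aio.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	static char buf[512];
	struct aiocb cb;
	int fd = open("/dev/rdsk/c0t0d0s2", O_RDONLY);	/* assumed device */

	if (fd == -1)
		return (1);
	(void) memset(&cb, 0, sizeof (cb));
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = sizeof (buf);
	cb.aio_offset = 0;
	cb.aio_sigevent.sigev_notify = SIGEV_NONE;

	if (aio_read(&cb) != 0)
		return (1);
	while (aio_error(&cb) == EINPROGRESS)	/* busy-poll for brevity */
		(void) usleep(1000);
	(void) printf("read %zd bytes\n", aio_return(&cb));
	(void) close(fd);
	return (0);
}
#endif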


/*
 * set error for a list IO entry that failed.
 */
static void
lio_set_error(aio_req_t *reqp, int portused)
{
	aio_t *aiop = curproc->p_aio;

	if (aiop == NULL)
		return;

	mutex_enter(&aiop->aio_mutex);
	if (portused)
		aio_deq(&aiop->aio_portpending, reqp);
	aiop->aio_pending--;
	/* request failed, AIO_PHYSIODONE set to avoid physio cleanup. */
	reqp->aio_req_flags |= AIO_PHYSIODONE;
	/*
	 * Need to free the request now as it's never
	 * going to get on the done queue
	 *
	 * Note: aio_outstanding is decremented in
	 *	 aio_req_free()
	 */
	aio_req_free(aiop, reqp);
	if (aiop->aio_flags & AIO_REQ_BLOCK)
		cv_signal(&aiop->aio_cleanupcv);
	mutex_exit(&aiop->aio_mutex);
}

/*
 * Check if the specified request is done; if so, remove it from
 * the done queue.  If NULL is specified instead, remove any request
 * from the done queue.
 */
static aio_req_t *
aio_req_done(void *resultp)
{
	aio_req_t **bucket;
	aio_req_t *ent;
	aio_t *aiop = curproc->p_aio;
	long index;

	ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex));
	ASSERT(MUTEX_HELD(&aiop->aio_mutex));

	if (resultp) {
		index = AIO_HASH(resultp);
		bucket = &aiop->aio_hash[index];
		for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
			if (ent->aio_req_resultp == (aio_result_t *)resultp) {
				if (ent->aio_req_flags & AIO_DONEQ) {
					return (aio_req_remove(ent));
				}
				return (NULL);
			}
		}
		/* no match, resultp is invalid */
		return (NULL);
	}
	return (aio_req_remove(NULL));
}

/*
 * determine if a user-level resultp pointer is associated with an
 * active IO request. Zero is returned when the request is done,
 * and the request is removed from the done queue. Only when the
 * return value is zero is the "reqp" pointer valid. One is returned
 * when the request is in progress. Two is returned when the request
 * is invalid.
 */
static int
aio_req_find(aio_result_t *resultp, aio_req_t **reqp)
{
	aio_req_t **bucket;
	aio_req_t *ent;
	aio_t *aiop = curproc->p_aio;
	long index;

	ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex));
	ASSERT(MUTEX_HELD(&aiop->aio_mutex));

	index = AIO_HASH(resultp);
	bucket = &aiop->aio_hash[index];
	for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
		if (ent->aio_req_resultp == resultp) {
			if (ent->aio_req_flags & AIO_DONEQ) {
				*reqp = aio_req_remove(ent);
				return (0);
			}
			return (1);
		}
	}
	/* no match, resultp is invalid */
	return (2);
}
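
/*
 * Illustrative sketch (#if 0): how a hypothetical in-kernel caller would
 * interpret aio_req_find()'s tri-state return.  Both mutexes named in the
 * ASSERTs above must be held across the call; the function and variable
 * names below are invented for the example.
 */
#if 0
static int
resultp_state(aio_result_t *resultp)
{
	aio_t *aiop = curproc->p_aio;
	aio_req_t *reqp;
	int state;

	mutex_enter(&aiop->aio_cleanupq_mutex);
	mutex_enter(&aiop->aio_mutex);
	state = aio_req_find(resultp, &reqp);
	switch (state) {
	case 0:		/* done; reqp was unlinked from the done queue */
		break;
	case 1:		/* still in progress; reqp was not set */
		break;
	default:	/* 2: resultp matches no outstanding request */
		break;
	}
	mutex_exit(&aiop->aio_mutex);
	mutex_exit(&aiop->aio_cleanupq_mutex);
	return (state);
}
#endif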

/*
 * remove a request from the done queue.
 */
static aio_req_t *
aio_req_remove(aio_req_t *reqp)
{
	aio_t *aiop = curproc->p_aio;

	ASSERT(MUTEX_HELD(&aiop->aio_mutex));

	if (reqp != NULL) {
		ASSERT(reqp->aio_req_flags & AIO_DONEQ);
		if (reqp->aio_req_next == reqp) {
			/* only one request on queue */
			if (reqp == aiop->aio_doneq) {
				aiop->aio_doneq = NULL;
			} else {
				ASSERT(reqp == aiop->aio_cleanupq);
				aiop->aio_cleanupq = NULL;
			}
		} else {
			reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev;
			reqp->aio_req_prev->aio_req_next = reqp->aio_req_next;
			/*
			 * The request can be either on the aio_doneq or the
			 * aio_cleanupq
			 */
			if (reqp == aiop->aio_doneq)
				aiop->aio_doneq = reqp->aio_req_next;

			if (reqp == aiop->aio_cleanupq)
				aiop->aio_cleanupq = reqp->aio_req_next;
		}
		reqp->aio_req_flags &= ~AIO_DONEQ;
		reqp->aio_req_next = NULL;
		reqp->aio_req_prev = NULL;
	} else if ((reqp = aiop->aio_doneq) != NULL) {
		ASSERT(reqp->aio_req_flags & AIO_DONEQ);
		if (reqp == reqp->aio_req_next) {
			/* only one request on queue */
			aiop->aio_doneq = NULL;
		} else {
			reqp->aio_req_prev->aio_req_next = reqp->aio_req_next;
			reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev;
			aiop->aio_doneq = reqp->aio_req_next;
		}
		reqp->aio_req_flags &= ~AIO_DONEQ;
		reqp->aio_req_next = NULL;
		reqp->aio_req_prev = NULL;
	}
	if (aiop->aio_doneq == NULL && (aiop->aio_flags & AIO_WAITN))
		cv_broadcast(&aiop->aio_waitcv);
	return (reqp);
}
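
/*
 * Illustrative sketch (#if 0, standalone): the unlink step used by
 * aio_req_remove() on a circular doubly-linked queue.  A one-element
 * queue points at itself, so "next == node" detects it; otherwise the
 * head is advanced past the departing node.  Types and names here are
 * invented for the demo.
 */
#if 0
#include <stdio.h>

typedef struct node {
	struct node *next;
	struct node *prev;
} node_t;

static void
cq_remove(node_t **headp, node_t *np)
{
	if (np->next == np) {			/* only element on queue */
		*headp = NULL;
	} else {
		np->next->prev = np->prev;
		np->prev->next = np->next;
		if (*headp == np)		/* head departs: advance it */
			*headp = np->next;
	}
	np->next = np->prev = NULL;
}

int
main(void)
{
	node_t a = { &a, &a };			/* one-element circular queue */
	node_t *head = &a;

	cq_remove(&head, &a);
	(void) printf("%s\n", head == NULL ? "empty" : "non-empty");
	return (0);
}
#endif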

static int
aio_req_setup(aio_req_t **reqpp, aio_t *aiop, aiocb_t *arg,
    aio_result_t *resultp, vnode_t *vp, int old_solaris_req)
{
	sigqueue_t	*sqp = NULL;
	aio_req_t	*reqp;
	struct uio	*uio;
	struct sigevent *sigev;
	int		error;

	sigev = &arg->aio_sigevent;
	if (sigev->sigev_notify == SIGEV_SIGNAL &&
	    sigev->sigev_signo > 0 && sigev->sigev_signo < NSIG) {
		sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
		if (sqp == NULL)
			return (EAGAIN);
		sqp->sq_func = NULL;
		sqp->sq_next = NULL;
		sqp->sq_info.si_code = SI_ASYNCIO;
		sqp->sq_info.si_pid = curproc->p_pid;
		sqp->sq_info.si_ctid = PRCTID(curproc);
		sqp->sq_info.si_zoneid = getzoneid();
		sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
		sqp->sq_info.si_signo = sigev->sigev_signo;
		sqp->sq_info.si_value = sigev->sigev_value;
	}

	mutex_enter(&aiop->aio_mutex);

	if (aiop->aio_flags & AIO_REQ_BLOCK) {
		mutex_exit(&aiop->aio_mutex);
		if (sqp)
			kmem_free(sqp, sizeof (sigqueue_t));
		return (EIO);
	}
	/*
	 * get an aio_reqp from the free list or allocate one
	 * from dynamic memory.
	 */
	if ((error = aio_req_alloc(&reqp, resultp)) != 0) {
		mutex_exit(&aiop->aio_mutex);
		if (sqp)
			kmem_free(sqp, sizeof (sigqueue_t));
		return (error);
	}
	aiop->aio_pending++;
	aiop->aio_outstanding++;
	reqp->aio_req_flags = AIO_PENDING;
	if (old_solaris_req) {
		/* this is an old solaris aio request */
		reqp->aio_req_flags |= AIO_SOLARIS;
		aiop->aio_flags |= AIO_SOLARIS_REQ;
	}
	if (sigev->sigev_notify == SIGEV_THREAD ||
	    sigev->sigev_notify == SIGEV_PORT)
		aio_enq(&aiop->aio_portpending, reqp, 0);
	mutex_exit(&aiop->aio_mutex);
	/*
	 * initialize aio request.
	 */
	reqp->aio_req_fd = arg->aio_fildes;
	reqp->aio_req_sigqp = sqp;
	reqp->aio_req_iocb.iocb = NULL;
	reqp->aio_req_lio = NULL;
	reqp->aio_req_buf.b_file = vp;
	uio = reqp->aio_req.aio_uio;
	uio->uio_iovcnt = 1;
	uio->uio_iov->iov_base = (caddr_t)arg->aio_buf;
	uio->uio_iov->iov_len = arg->aio_nbytes;
	uio->uio_loffset = arg->aio_offset;
	*reqpp = reqp;
	return (0);
}
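
/*
 * Illustrative sketch (#if 0): the user-level view of the SIGEV_SIGNAL
 * path prepared above.  The handler sees si_code == SI_ASYNCIO and the
 * sival value stored in the aiocb.  Signal choice and device path are
 * assumptions; a production program would block the signal and use
 * sigsuspend() instead of the racy pause() loop.
 */
#if 0
#include <aio.h>
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static volatile sig_atomic_t done;

/*ARGSUSED*/
static void
on_aio(int sig, siginfo_t *sip, void *ctx)
{
	if (sip->si_code == SI_ASYNCIO)
		done = 1;
}

int
main(void)
{
	struct sigaction sa;
	struct aiocb cb;
	static char buf[512];
	int fd = open("/dev/rdsk/c0t0d0s2", O_RDONLY);	/* assumed device */

	(void) memset(&sa, 0, sizeof (sa));
	sa.sa_sigaction = on_aio;
	sa.sa_flags = SA_SIGINFO;
	(void) sigaction(SIGUSR1, &sa, NULL);

	(void) memset(&cb, 0, sizeof (cb));
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = sizeof (buf);
	cb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
	cb.aio_sigevent.sigev_signo = SIGUSR1;
	cb.aio_sigevent.sigev_value.sival_ptr = &cb;

	if (aio_read(&cb) != 0)
		return (1);
	while (!done)
		(void) pause();
	(void) printf("done, %zd bytes\n", aio_return(&cb));
	return (0);
}
#endif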

/*
 * Allocate p_aio struct.
 */
static aio_t *
aio_aiop_alloc(void)
{
	aio_t	*aiop;

	ASSERT(MUTEX_HELD(&curproc->p_lock));

	aiop = kmem_zalloc(sizeof (struct aio), KM_NOSLEEP);
	if (aiop) {
		mutex_init(&aiop->aio_mutex, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&aiop->aio_cleanupq_mutex, NULL, MUTEX_DEFAULT,
		    NULL);
		mutex_init(&aiop->aio_portq_mutex, NULL, MUTEX_DEFAULT, NULL);
	}
	return (aiop);
}

/*
 * Allocate an aio_req struct.
 */
static int
aio_req_alloc(aio_req_t **nreqp, aio_result_t *resultp)
{
	aio_req_t *reqp;
	aio_t *aiop = curproc->p_aio;

	ASSERT(MUTEX_HELD(&aiop->aio_mutex));

	if ((reqp = aiop->aio_free) != NULL) {
		aiop->aio_free = reqp->aio_req_next;
		bzero(reqp, sizeof (*reqp));
	} else {
		/*
		 * Check whether memory is getting tight.
		 * This is a temporary mechanism to avoid memory
		 * exhaustion by a single process until we come up
		 * with a per process solution such as setrlimit().
		 */
		if (freemem < desfree)
			return (EAGAIN);
		reqp = kmem_zalloc(sizeof (struct aio_req_t), KM_NOSLEEP);
		if (reqp == NULL)
			return (EAGAIN);
	}
	reqp->aio_req.aio_uio = &reqp->aio_req_uio;
	reqp->aio_req.aio_uio->uio_iov = &reqp->aio_req_iov;
	reqp->aio_req.aio_private = reqp;
	reqp->aio_req_buf.b_offset = -1;
	reqp->aio_req_resultp = resultp;
	if (aio_hash_insert(reqp, aiop)) {
		reqp->aio_req_next = aiop->aio_free;
		aiop->aio_free = reqp;
		return (EBUSY);
	}
	*nreqp = reqp;
	return (0);
}
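
/*
 * Illustrative sketch (#if 0, standalone): the free-list-first pattern
 * used by aio_req_alloc() above -- recycle a cached object when one is
 * available, fall back to the allocator otherwise, and push objects back
 * onto the list instead of freeing them.  Names are invented for the demo.
 */
#if 0
#include <stdlib.h>
#include <string.h>

typedef struct obj {
	struct obj *next;
	char payload[64];
} obj_t;

static obj_t *freelist;

static obj_t *
obj_alloc(void)
{
	obj_t *op;

	if ((op = freelist) != NULL) {
		freelist = op->next;		/* reuse a cached object */
		(void) memset(op, 0, sizeof (*op));
	} else if ((op = calloc(1, sizeof (*op))) == NULL) {
		return (NULL);			/* analogous to EAGAIN above */
	}
	return (op);
}

static void
obj_free(obj_t *op)
{
	op->next = freelist;			/* push back, never free(3C) */
	freelist = op;
}
#endif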

/*
 * Allocate an aio_lio_t struct.
 */
static int
aio_lio_alloc(aio_lio_t **head)
{
	aio_lio_t *liop;
	aio_t *aiop = curproc->p_aio;

	ASSERT(MUTEX_HELD(&aiop->aio_mutex));

	if ((liop = aiop->aio_lio_free) != NULL) {
		aiop->aio_lio_free = liop->lio_next;
	} else {
		/*
		 * Check whether memory is getting tight.
		 * This is a temporary mechanism to avoid memory
		 * exhaustion by a single process until we come up
		 * with a per process solution such as setrlimit().
		 */
		if (freemem < desfree)
			return (EAGAIN);

		liop = kmem_zalloc(sizeof (aio_lio_t), KM_NOSLEEP);
		if (liop == NULL)
			return (EAGAIN);
	}
	*head = liop;
	return (0);
}

/*
 * this is a special per-process thread that is only activated if
 * the process is unmapping a segment with outstanding aio. Normally,
 * the process will have completed the aio before unmapping the
 * segment. If the process does unmap a segment with outstanding aio,
 * this special thread will guarantee that the locked pages due to
 * aphysio() are released, thereby permitting the segment to be
 * unmapped. In addition to this, the cleanup thread is woken up
 * during DR operations to release the locked pages.
 */

static int
aio_cleanup_thread(aio_t *aiop)
{
	proc_t *p = curproc;
	struct as *as = p->p_as;
	int poked = 0;
	kcondvar_t *cvp;
	int exit_flag = 0;
	int rqclnup = 0;

	sigfillset(&curthread->t_hold);
	sigdiffset(&curthread->t_hold, &cantmask);
	for (;;) {
		/*
		 * if a segment is being unmapped, and the current
		 * process's done queue is not empty, then every request
		 * on the doneq with locked resources should be forced
		 * to release their locks. By moving the doneq request
		 * to the cleanupq, aio_cleanup() will process the cleanupq,
		 * and place requests back onto the doneq. All requests
		 * processed by aio_cleanup() will have their physical
		 * resources unlocked.
		 */
		mutex_enter(&aiop->aio_mutex);
		if ((aiop->aio_flags & AIO_CLEANUP) == 0) {
			aiop->aio_flags |= AIO_CLEANUP;
			mutex_enter(&as->a_contents);
			if (aiop->aio_rqclnup) {
				aiop->aio_rqclnup = 0;
				rqclnup = 1;
			}
			mutex_exit(&as->a_contents);
			if (aiop->aio_doneq) {
				aio_req_t *doneqhead = aiop->aio_doneq;
				aiop->aio_doneq = NULL;
				aio_cleanupq_concat(aiop, doneqhead, AIO_DONEQ);
			}
		}
		mutex_exit(&aiop->aio_mutex);
		aio_cleanup(AIO_CLEANUP_THREAD);
		/*
		 * thread should block on the cleanupcv while
		 * AIO_CLEANUP is set.
		 */
		cvp = &aiop->aio_cleanupcv;
		mutex_enter(&aiop->aio_mutex);

		if (aiop->aio_pollq != NULL || aiop->aio_cleanupq != NULL ||
		    aiop->aio_notifyq != NULL ||
		    aiop->aio_portcleanupq != NULL) {
			mutex_exit(&aiop->aio_mutex);
			continue;
		}
		mutex_enter(&as->a_contents);

		/*
		 * AIO_CLEANUP determines when the cleanup thread
		 * should be active. This flag is set when
		 * the cleanup thread is awakened by as_unmap() or
		 * due to DR operations.
		 * The flag is cleared when the blocking as_unmap()
		 * that originally awakened us is allowed to
		 * complete. as_unmap() blocks when trying to
		 * unmap a segment that has SOFTLOCKed pages. When
		 * the segment's pages are all SOFTUNLOCKed,
		 * as->a_flags & AS_UNMAPWAIT should be zero.
		 *
		 * In case of cleanup request by DR, the flag is cleared
		 * once all the pending aio requests have been processed.
		 *
		 * The flag shouldn't be cleared right away if the
		 * cleanup thread was interrupted because the process
		 * is doing forkall(). This happens when cv_wait_sig()
		 * returns zero, because it was awakened by a pokelwps().
		 * If the process is not exiting, it must be doing forkall().
		 */
		if ((poked == 0) &&
		    ((!rqclnup && (AS_ISUNMAPWAIT(as) == 0)) ||
		    (aiop->aio_pending == 0))) {
			aiop->aio_flags &= ~(AIO_CLEANUP | AIO_CLEANUP_PORT);
			cvp = &as->a_cv;
			rqclnup = 0;
		}
		mutex_exit(&aiop->aio_mutex);
		if (poked) {
			/*
			 * If the process is exiting/killed, don't return
			 * immediately without waiting for pending I/O's
			 * and releasing the page locks.
			 */
			if (p->p_flag & (SEXITLWPS|SKILLED)) {
				/*
				 * If exit_flag is set, then it is
				 * safe to exit because we have released
				 * page locks of completed I/O's.
				 */
				if (exit_flag)
					break;

				mutex_exit(&as->a_contents);

				/*
				 * Wait for all the pending aio to complete.
				 */
				mutex_enter(&aiop->aio_mutex);
				aiop->aio_flags |= AIO_REQ_BLOCK;
				while (aiop->aio_pending != 0)
					cv_wait(&aiop->aio_cleanupcv,
					    &aiop->aio_mutex);
				mutex_exit(&aiop->aio_mutex);
				exit_flag = 1;
				continue;
			} else if (p->p_flag &
			    (SHOLDFORK|SHOLDFORK1|SHOLDWATCH)) {
				/*
				 * hold LWP until it
				 * is continued.
				 */
				mutex_exit(&as->a_contents);
				mutex_enter(&p->p_lock);
				stop(PR_SUSPENDED, SUSPEND_NORMAL);
				mutex_exit(&p->p_lock);
				poked = 0;
				continue;
			}
		} else {
			/*
			 * When started this thread will sleep on as->a_cv.
			 * as_unmap will awake this thread if the
			 * segment has SOFTLOCKed pages (poked = 0).
			 * 1. pokelwps() awakes this thread =>
			 *    break the loop to check SEXITLWPS, SHOLDFORK, etc
			 * 2. as_unmap awakes this thread =>
			 *    to break the loop it is necessary that
			 *    - AS_UNMAPWAIT is set (as_unmap is waiting for
			 *	memory to be unlocked)
			 *    - AIO_CLEANUP is not set
			 *	(if AIO_CLEANUP is set we have to wait for
			 *	pending requests. aio_done will send a signal
			 *	for every request which completes to continue
			 *	unmapping the corresponding address range)
			 * 3. A cleanup request will wake this thread up, ex.
			 *    by the DR operations. The aio_rqclnup flag will
			 *    be set.
			 */
			while (poked == 0) {
				/*
				 * Cleanup requests that arrived after we
				 * had just cleaned up cannot be what is
				 * blocking the unmap thread, because the
				 * unmap event happened first.
				 * Let aio_done() wake us up if it sees a need.
				 */
				if (aiop->aio_rqclnup &&
				    (aiop->aio_flags & AIO_CLEANUP) == 0)
					break;
				poked = !cv_wait_sig(cvp, &as->a_contents);
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_signal(cvp);
				if (aiop->aio_outstanding != 0)
					break;
			}
		}
		mutex_exit(&as->a_contents);
	}
exit:
	mutex_exit(&as->a_contents);
	ASSERT((curproc->p_flag & (SEXITLWPS|SKILLED)));
	aston(curthread);	/* make thread do post_syscall */
	return (0);
}
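
/*
 * Illustrative sketch (#if 0, standalone): the wait-loop discipline the
 * cleanup thread follows above, restated in pthread terms -- re-test the
 * wakeup condition under the lock after every wait, since a wakeup by
 * itself carries no guarantee about which condition changed.
 */
#if 0
#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static int cleanup_needed;

static void
wait_for_work(void)
{
	(void) pthread_mutex_lock(&lock);
	while (!cleanup_needed)			/* always re-test after wakeup */
		(void) pthread_cond_wait(&cv, &lock);
	cleanup_needed = 0;
	(void) pthread_mutex_unlock(&lock);
}
#endif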

/*
 * save a reference to a user's outstanding aio in a hash list.
 */
static int
aio_hash_insert(
	aio_req_t *aio_reqp,
	aio_t *aiop)
{
	long index;
	aio_result_t *resultp = aio_reqp->aio_req_resultp;
	aio_req_t *current;
	aio_req_t **nextp;

	index = AIO_HASH(resultp);
	nextp = &aiop->aio_hash[index];
	while ((current = *nextp) != NULL) {
		if (current->aio_req_resultp == resultp)
			return (DUPLICATE);
		nextp = &current->aio_hash_next;
	}
	*nextp = aio_reqp;
	aio_reqp->aio_hash_next = NULL;
	return (0);
}
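
/*
 * Illustrative sketch (#if 0, standalone): the chained-hash insert used
 * by aio_hash_insert() above -- walk the bucket through a pointer to the
 * "next" slot, reject duplicates, and append at the tail.  Table size,
 * hash function, and key type are invented for the demo.
 */
#if 0
#include <stddef.h>

#define	NBUCKETS	64
#define	HASH(k)		(((unsigned long)(k) >> 4) % NBUCKETS)

typedef struct ent {
	struct ent *next;
	void *key;
} ent_t;

static ent_t *table[NBUCKETS];

static int
hash_insert(ent_t *ep)
{
	ent_t **nextp = &table[HASH(ep->key)];
	ent_t *cur;

	while ((cur = *nextp) != NULL) {
		if (cur->key == ep->key)
			return (-1);		/* duplicate key */
		nextp = &cur->next;
	}
	*nextp = ep;				/* append at chain tail */
	ep->next = NULL;
	return (0);
}
#endif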

static int
(*check_vp(struct vnode *vp, int mode))(vnode_t *, struct aio_req *,
    cred_t *)
{
	struct snode *sp;
	dev_t		dev;
	struct cb_ops	*cb;
	major_t		major;
	int		(*aio_func)();

	dev = vp->v_rdev;
	major = getmajor(dev);

	/*
	 * return NULL for requests to files and STREAMs so
	 * that libaio takes care of them.
	 */
	if (vp->v_type == VCHR) {
		/* no stream device for kaio */
		if (STREAMSTAB(major)) {
			return (NULL);
		}
	} else {
		return (NULL);
	}

	/*
	 * Check old drivers which do not have async I/O entry points.
	 */
	if (devopsp[major]->devo_rev < 3)
		return (NULL);

	cb = devopsp[major]->devo_cb_ops;

	if (cb->cb_rev < 1)
		return (NULL);

	/*
	 * Check whether this device is a block device.
	 * Kaio is not supported for devices like tty.
	 */
	if (cb->cb_strategy == nodev || cb->cb_strategy == NULL)
		return (NULL);

	/*
	 * Clustering: If vnode is a PXFS vnode, then the device may be remote.
	 * We cannot call the driver directly. Instead return the
	 * PXFS functions.
	 */

	if (IS_PXFSVP(vp)) {
		if (mode & FREAD)
			return (clpxfs_aio_read);
		else
			return (clpxfs_aio_write);
	}
	if (mode & FREAD)
		aio_func = (cb->cb_aread == nodev) ? NULL : driver_aio_read;
	else
		aio_func = (cb->cb_awrite == nodev) ? NULL : driver_aio_write;

	/*
	 * Do we need this ?
	 * nodev returns ENXIO anyway.
	 */
	if (aio_func == nodev)
		return (NULL);

	sp = VTOS(vp);
	smark(sp, SACC);
	return (aio_func);
}
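
/*
 * Reading aid (#if 0): the gating chain above, flattened.  Every test
 * that fails returns NULL, which tells the caller to route the request
 * through the user-level emulation instead of kaio.  The authoritative
 * checks are the ones in check_vp(); this is a summary only.
 */
#if 0
	if (vp->v_type != VCHR)			/* files, directories, ... */
		return (NULL);
	if (STREAMSTAB(major))			/* STREAMS devices */
		return (NULL);
	if (devopsp[major]->devo_rev < 3)	/* pre-async-I/O driver ABI */
		return (NULL);
	if (cb->cb_rev < 1)			/* old cb_ops layout */
		return (NULL);
	if (cb->cb_strategy == nodev || cb->cb_strategy == NULL)
		return (NULL);			/* no strategy: tty and friends */
#endif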

/*
 * Clustering: We want check_vp to return a function prototyped
 * correctly that will be common to both PXFS and regular case.
 * We define this intermediate function that will do the right
 * thing for driver cases.
 */

static int
driver_aio_write(vnode_t *vp, struct aio_req *aio, cred_t *cred_p)
{
	dev_t dev;
	struct cb_ops	*cb;

	ASSERT(vp->v_type == VCHR);
	ASSERT(!IS_PXFSVP(vp));
	dev = VTOS(vp)->s_dev;
	ASSERT(STREAMSTAB(getmajor(dev)) == NULL);

	cb = devopsp[getmajor(dev)]->devo_cb_ops;

	ASSERT(cb->cb_awrite != nodev);
	return ((*cb->cb_awrite)(dev, aio, cred_p));
}

/*
 * Clustering: We want check_vp to return a function prototyped
 * correctly that will be common to both PXFS and regular case.
 * We define this intermediate function that will do the right
 * thing for driver cases.
 */

static int
driver_aio_read(vnode_t *vp, struct aio_req *aio, cred_t *cred_p)
{
	dev_t dev;
	struct cb_ops	*cb;

	ASSERT(vp->v_type == VCHR);
	ASSERT(!IS_PXFSVP(vp));
	dev = VTOS(vp)->s_dev;
	ASSERT(!STREAMSTAB(getmajor(dev)));

	cb = devopsp[getmajor(dev)]->devo_cb_ops;

	ASSERT(cb->cb_aread != nodev);
	return ((*cb->cb_aread)(dev, aio, cred_p));
}

/*
 * This routine is called when a largefile call is made by a 32-bit
 * process on an ILP32 or LP64 kernel. All 64-bit processes are large
 * file by definition and will call alio() instead.
 */
static int
alioLF(
	int		mode_arg,
	void		*aiocb_arg,
	int		nent,
	void		*sigev)
{
	file_t		*fp;
	file_t		*prev_fp = NULL;
	int		prev_mode = -1;
	struct vnode	*vp;
	aio_lio_t	*head;
	aio_req_t	*reqp;
	aio_t		*aiop;
	caddr_t		cbplist;
	aiocb64_32_t	cb64;
	aiocb64_32_t	*aiocb = &cb64;
	aiocb64_32_t	*cbp;
	caddr32_t	*ucbp;
#ifdef _LP64
	aiocb_t		aiocb_n;
#endif
	struct sigevent32	sigevk;
	sigqueue_t	*sqp;
	int		(*aio_func)();
	int		mode;
	int		error = 0;
	int		aio_errors = 0;
	int		i;
	size_t		ssize;
	int		deadhead = 0;
	int		aio_notsupported = 0;
	int		lio_head_port;
	int		aio_port;
	int		aio_thread;
	port_kevent_t	*pkevtp = NULL;
	int		portused = 0;
	port_notify32_t	pnotify;
	int		event;

	aiop = curproc->p_aio;
	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
		return (EINVAL);

	ASSERT(get_udatamodel() == DATAMODEL_ILP32);

	ssize = (sizeof (caddr32_t) * nent);
	cbplist = kmem_alloc(ssize, KM_SLEEP);
	ucbp = (caddr32_t *)cbplist;

	if (copyin(aiocb_arg, cbplist, ssize) ||
	    (sigev && copyin(sigev, &sigevk, sizeof (sigevk)))) {
		kmem_free(cbplist, ssize);
		return (EFAULT);
	}

	/* Event Ports  */
	if (sigev &&
	    (sigevk.sigev_notify == SIGEV_THREAD ||
	    sigevk.sigev_notify == SIGEV_PORT)) {
		if (sigevk.sigev_notify == SIGEV_THREAD) {
			pnotify.portnfy_port = sigevk.sigev_signo;
			pnotify.portnfy_user = sigevk.sigev_value.sival_ptr;
		} else if (copyin(
		    (void *)(uintptr_t)sigevk.sigev_value.sival_ptr,
		    &pnotify, sizeof (pnotify))) {
			kmem_free(cbplist, ssize);
			return (EFAULT);
		}
		error = port_alloc_event(pnotify.portnfy_port,
		    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp);
		if (error) {
			if (error == ENOMEM || error == EAGAIN)
				error = EAGAIN;
			else
				error = EINVAL;
			kmem_free(cbplist, ssize);
			return (error);
		}
		lio_head_port = pnotify.portnfy_port;
		portused = 1;
	}

	/*
	 * a list head should be allocated if notification is
	 * enabled for this list.
	 */
	head = NULL;

	if (mode_arg == LIO_WAIT || sigev) {
		mutex_enter(&aiop->aio_mutex);
		error = aio_lio_alloc(&head);
		mutex_exit(&aiop->aio_mutex);
		if (error)
			goto done;
		deadhead = 1;
		head->lio_nent = nent;
		head->lio_refcnt = nent;
		head->lio_port = -1;
		head->lio_portkev = NULL;
		if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL &&
		    sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) {
			sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
			if (sqp == NULL) {
				error = EAGAIN;
				goto done;
			}
			sqp->sq_func = NULL;
			sqp->sq_next = NULL;
			sqp->sq_info.si_code = SI_ASYNCIO;
			sqp->sq_info.si_pid = curproc->p_pid;
			sqp->sq_info.si_ctid = PRCTID(curproc);
			sqp->sq_info.si_zoneid = getzoneid();
			sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
			sqp->sq_info.si_signo = sigevk.sigev_signo;
			sqp->sq_info.si_value.sival_int =
			    sigevk.sigev_value.sival_int;
			head->lio_sigqp = sqp;
		} else {
			head->lio_sigqp = NULL;
		}
		if (pkevtp) {
			/*
			 * Prepare data to send when list of aiocb's
			 * has completed.
			 */
			port_init_event(pkevtp, (uintptr_t)sigev,
			    (void *)(uintptr_t)pnotify.portnfy_user,
			    NULL, head);
			pkevtp->portkev_events = AIOLIO64;
			head->lio_portkev = pkevtp;
			head->lio_port = pnotify.portnfy_port;
		}
	}

	for (i = 0; i < nent; i++, ucbp++) {

		cbp = (aiocb64_32_t *)(uintptr_t)*ucbp;
		/* skip entry if it can't be copied. */
		if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb))) {
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			continue;
		}

		/* skip if opcode for aiocb is LIO_NOP */
		mode = aiocb->aio_lio_opcode;
		if (mode == LIO_NOP) {
			cbp = NULL;
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			continue;
		}

		/* increment file descriptor's ref count. */
		if ((fp = getf(aiocb->aio_fildes)) == NULL) {
			lio_set_uerror(&cbp->aio_resultp, EBADF);
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			aio_errors++;
			continue;
		}

		/*
		 * verify that the file was opened with the requested
		 * access mode
		 */
		if ((fp->f_flag & mode) == 0) {
			releasef(aiocb->aio_fildes);
			lio_set_uerror(&cbp->aio_resultp, EBADF);
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			aio_errors++;
			continue;
		}

		/*
		 * Common case: requests are to the same fd
		 * for the same r/w operation.
		 * For UFS, EBADFD needs to be set.
		 */
		vp = fp->f_vnode;
		if (fp != prev_fp || mode != prev_mode) {
			aio_func = check_vp(vp, mode);
			if (aio_func == NULL) {
				prev_fp = NULL;
				releasef(aiocb->aio_fildes);
				lio_set_uerror(&cbp->aio_resultp, EBADFD);
				aio_notsupported++;
				if (head) {
					mutex_enter(&aiop->aio_mutex);
					head->lio_nent--;
					head->lio_refcnt--;
					mutex_exit(&aiop->aio_mutex);
				}
				continue;
			} else {
				prev_fp = fp;
				prev_mode = mode;
			}
		}

#ifdef	_LP64
		aiocb_LFton(aiocb, &aiocb_n);
		error = aio_req_setup(&reqp, aiop, &aiocb_n,
		    (aio_result_t *)&cbp->aio_resultp, vp, 0);
#else
		error = aio_req_setupLF(&reqp, aiop, aiocb,
		    (aio_result_t *)&cbp->aio_resultp, vp, 0);
#endif  /* _LP64 */
		if (error) {
			releasef(aiocb->aio_fildes);
			lio_set_uerror(&cbp->aio_resultp, error);
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			aio_errors++;
			continue;
		}

		reqp->aio_req_lio = head;
		deadhead = 0;

		/*
		 * Set the errno field now before sending the request to
		 * the driver to avoid a race condition
		 */
		(void) suword32(&cbp->aio_resultp.aio_errno,
		    EINPROGRESS);

		reqp->aio_req_iocb.iocb32 = *ucbp;

		event = (mode == LIO_READ)? AIOAREAD64 : AIOAWRITE64;
		aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT);
		aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD);
		if (aio_port | aio_thread) {
			port_kevent_t *lpkevp;
			/*
			 * Prepare data to send with each aiocb completed.
			 */
			if (aio_port) {
				void *paddr = (void *)(uintptr_t)
				    aiocb->aio_sigevent.sigev_value.sival_ptr;
				if (copyin(paddr, &pnotify, sizeof (pnotify)))
					error = EFAULT;
			} else {	/* aio_thread */
				pnotify.portnfy_port =
				    aiocb->aio_sigevent.sigev_signo;
				pnotify.portnfy_user =
				    aiocb->aio_sigevent.sigev_value.sival_ptr;
			}
			if (error)
				/* EMPTY */;
			else if (pkevtp != NULL &&
			    pnotify.portnfy_port == lio_head_port)
				error = port_dup_event(pkevtp, &lpkevp,
				    PORT_ALLOC_DEFAULT);
			else
				error = port_alloc_event(pnotify.portnfy_port,
				    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO,
				    &lpkevp);
			if (error == 0) {
				port_init_event(lpkevp, (uintptr_t)*ucbp,
				    (void *)(uintptr_t)pnotify.portnfy_user,
				    aio_port_callback, reqp);
				lpkevp->portkev_events = event;
				reqp->aio_req_portkev = lpkevp;
				reqp->aio_req_port = pnotify.portnfy_port;
			}
		}

		/*
		 * send the request to the driver.
		 */
		if (error == 0) {
			if (aiocb->aio_nbytes == 0) {
				clear_active_fd(aiocb->aio_fildes);
				aio_zerolen(reqp);
				continue;
			}
			error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
			    CRED());
		}

		/*
		 * the fd's ref count is not decremented until the IO has
		 * completed unless there was an error.
		 */
		if (error) {
			releasef(aiocb->aio_fildes);
			lio_set_uerror(&cbp->aio_resultp, error);
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			if (error == ENOTSUP)
				aio_notsupported++;
			else
				aio_errors++;
			lio_set_error(reqp, portused);
		} else {
			clear_active_fd(aiocb->aio_fildes);
		}
	}

	if (aio_notsupported) {
		error = ENOTSUP;
	} else if (aio_errors) {
		/*
		 * return EIO if any request failed
		 */
		error = EIO;
	}

	if (mode_arg == LIO_WAIT) {
		mutex_enter(&aiop->aio_mutex);
		while (head->lio_refcnt > 0) {
			if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
				mutex_exit(&aiop->aio_mutex);
				error = EINTR;
				goto done;
			}
		}
		mutex_exit(&aiop->aio_mutex);
		alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_LARGEFILE);
	}

done:
	kmem_free(cbplist, ssize);
	if (deadhead) {
		if (head->lio_sigqp)
			kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
		if (head->lio_portkev)
			port_free_event(head->lio_portkev);
		kmem_free(head, sizeof (aio_lio_t));
	}
	return (error);
}
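
/*
 * Illustrative sketch (#if 0): the user-level call that reaches alioLF()
 * -- a 32-bit, largefile-compiled program issuing a batch through
 * lio_listio(3C).  LIO_WAIT corresponds to the mode_arg == LIO_WAIT path
 * above, so the call blocks until every request in the list is complete.
 * Device path and request count are assumptions.
 */
#if 0
#include <aio.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	static char b0[512], b1[512];
	struct aiocb cb0, cb1;
	struct aiocb *list[2] = { &cb0, &cb1 };
	int fd = open("/dev/rdsk/c0t0d0s2", O_RDONLY);	/* assumed device */

	(void) memset(&cb0, 0, sizeof (cb0));
	cb0.aio_fildes = fd;
	cb0.aio_buf = b0;
	cb0.aio_nbytes = sizeof (b0);
	cb0.aio_offset = 0;
	cb0.aio_lio_opcode = LIO_READ;

	cb1 = cb0;				/* second read, next block */
	cb1.aio_buf = b1;
	cb1.aio_offset = 512;

	if (lio_listio(LIO_WAIT, list, 2, NULL) != 0)
		perror("lio_listio");
	(void) close(fd);
	return (0);
}
#endif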

#ifdef  _SYSCALL32_IMPL
static void
aiocb_LFton(aiocb64_32_t *src, aiocb_t *dest)
{
	dest->aio_fildes = src->aio_fildes;
	dest->aio_buf = (void *)(uintptr_t)src->aio_buf;
	dest->aio_nbytes = (size_t)src->aio_nbytes;
	dest->aio_offset = (off_t)src->aio_offset;
	dest->aio_reqprio = src->aio_reqprio;
	dest->aio_sigevent.sigev_notify = src->aio_sigevent.sigev_notify;
	dest->aio_sigevent.sigev_signo = src->aio_sigevent.sigev_signo;

	/*
	 * See comment in sigqueue32() on handling of 32-bit
	 * sigvals in a 64-bit kernel.
	 */
	dest->aio_sigevent.sigev_value.sival_int =
	    (int)src->aio_sigevent.sigev_value.sival_int;
	dest->aio_sigevent.sigev_notify_function = (void (*)(union sigval))
	    (uintptr_t)src->aio_sigevent.sigev_notify_function;
	dest->aio_sigevent.sigev_notify_attributes = (pthread_attr_t *)
	    (uintptr_t)src->aio_sigevent.sigev_notify_attributes;
	dest->aio_sigevent.__sigev_pad2 = src->aio_sigevent.__sigev_pad2;
	dest->aio_lio_opcode = src->aio_lio_opcode;
	dest->aio_state = src->aio_state;
	dest->aio__pad[0] = src->aio__pad[0];
}
#endif

/*
 * This function is used only for largefile calls made by
 * 32-bit applications.
 */
static int
aio_req_setupLF(
	aio_req_t	**reqpp,
	aio_t		*aiop,
	aiocb64_32_t	*arg,
	aio_result_t	*resultp,
	vnode_t		*vp,
	int		old_solaris_req)
{
	sigqueue_t	*sqp = NULL;
	aio_req_t	*reqp;
	struct uio	*uio;
	struct sigevent32 *sigev;
	int		error;

	sigev = &arg->aio_sigevent;
	if (sigev->sigev_notify == SIGEV_SIGNAL &&
	    sigev->sigev_signo > 0 && sigev->sigev_signo < NSIG) {
		sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
		if (sqp == NULL)
			return (EAGAIN);
		sqp->sq_func = NULL;
		sqp->sq_next = NULL;
		sqp->sq_info.si_code = SI_ASYNCIO;
		sqp->sq_info.si_pid = curproc->p_pid;
		sqp->sq_info.si_ctid = PRCTID(curproc);
		sqp->sq_info.si_zoneid = getzoneid();
		sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
		sqp->sq_info.si_signo = sigev->sigev_signo;
		sqp->sq_info.si_value.sival_int = sigev->sigev_value.sival_int;
	}

	mutex_enter(&aiop->aio_mutex);

	if (aiop->aio_flags & AIO_REQ_BLOCK) {
		mutex_exit(&aiop->aio_mutex);
		if (sqp)
			kmem_free(sqp, sizeof (sigqueue_t));
		return (EIO);
	}
	/*
	 * get an aio_reqp from the free list or allocate one
	 * from dynamic memory.
	 */
	if ((error = aio_req_alloc(&reqp, resultp)) != 0) {
		mutex_exit(&aiop->aio_mutex);
		if (sqp)
			kmem_free(sqp, sizeof (sigqueue_t));
		return (error);
	}
	aiop->aio_pending++;
	aiop->aio_outstanding++;
	reqp->aio_req_flags = AIO_PENDING;
	if (old_solaris_req) {
		/* this is an old solaris aio request */
		reqp->aio_req_flags |= AIO_SOLARIS;
		aiop->aio_flags |= AIO_SOLARIS_REQ;
	}
	if (sigev->sigev_notify == SIGEV_THREAD ||
	    sigev->sigev_notify == SIGEV_PORT)
		aio_enq(&aiop->aio_portpending, reqp, 0);
	mutex_exit(&aiop->aio_mutex);
	/*
	 * initialize aio request.
	 */
	reqp->aio_req_fd = arg->aio_fildes;
	reqp->aio_req_sigqp = sqp;
	reqp->aio_req_iocb.iocb = NULL;
	reqp->aio_req_lio = NULL;
	reqp->aio_req_buf.b_file = vp;
	uio = reqp->aio_req.aio_uio;
	uio->uio_iovcnt = 1;
	uio->uio_iov->iov_base = (caddr_t)(uintptr_t)arg->aio_buf;
	uio->uio_iov->iov_len = arg->aio_nbytes;
	uio->uio_loffset = arg->aio_offset;
	*reqpp = reqp;
	return (0);
}

/*
 * This routine is called when a non-largefile call is made by a 32-bit
 * process on an ILP32 or LP64 kernel.
 */
static int
alio32(
	int		mode_arg,
	void		*aiocb_arg,
	int		nent,
	void		*sigev)
{
	file_t		*fp;
	file_t		*prev_fp = NULL;
	int		prev_mode = -1;
	struct vnode	*vp;
	aio_lio_t	*head;
	aio_req_t	*reqp;
	aio_t		*aiop;
	caddr_t		cbplist;
	aiocb_t		cb;
	aiocb_t		*aiocb = &cb;
#ifdef	_LP64
	aiocb32_t	*cbp;
	caddr32_t	*ucbp;
	aiocb32_t	cb32;
	aiocb32_t	*aiocb32 = &cb32;
	struct sigevent32	sigevk;
#else
	aiocb_t		*cbp, **ucbp;
	struct sigevent	sigevk;
#endif
	sigqueue_t	*sqp;
	int		(*aio_func)();
	int		mode;
	int		error = 0;
	int		aio_errors = 0;
	int		i;
	size_t		ssize;
	int		deadhead = 0;
	int		aio_notsupported = 0;
	int		lio_head_port;
	int		aio_port;
	int		aio_thread;
	port_kevent_t	*pkevtp = NULL;
	int		portused = 0;
#ifdef	_LP64
	port_notify32_t	pnotify;
#else
	port_notify_t	pnotify;
#endif
	int		event;

	aiop = curproc->p_aio;
	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
		return (EINVAL);

#ifdef	_LP64
	ssize = (sizeof (caddr32_t) * nent);
#else
	ssize = (sizeof (aiocb_t *) * nent);
#endif
	cbplist = kmem_alloc(ssize, KM_SLEEP);
	ucbp = (void *)cbplist;

	if (copyin(aiocb_arg, cbplist, ssize) ||
	    (sigev && copyin(sigev, &sigevk, sizeof (struct sigevent32)))) {
		kmem_free(cbplist, ssize);
		return (EFAULT);
	}

	/* Event Ports  */
	if (sigev &&
	    (sigevk.sigev_notify == SIGEV_THREAD ||
	    sigevk.sigev_notify == SIGEV_PORT)) {
		if (sigevk.sigev_notify == SIGEV_THREAD) {
			pnotify.portnfy_port = sigevk.sigev_signo;
			pnotify.portnfy_user = sigevk.sigev_value.sival_ptr;
		} else if (copyin(
		    (void *)(uintptr_t)sigevk.sigev_value.sival_ptr,
		    &pnotify, sizeof (pnotify))) {
			kmem_free(cbplist, ssize);
			return (EFAULT);
		}
		error = port_alloc_event(pnotify.portnfy_port,
		    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp);
		if (error) {
			if (error == ENOMEM || error == EAGAIN)
				error = EAGAIN;
			else
				error = EINVAL;
			kmem_free(cbplist, ssize);
			return (error);
		}
		lio_head_port = pnotify.portnfy_port;
		portused = 1;
	}

	/*
	 * a list head should be allocated if notification is
	 * enabled for this list.
	 */
	head = NULL;

	if (mode_arg == LIO_WAIT || sigev) {
		mutex_enter(&aiop->aio_mutex);
		error = aio_lio_alloc(&head);
		mutex_exit(&aiop->aio_mutex);
		if (error)
			goto done;
		deadhead = 1;
		head->lio_nent = nent;
		head->lio_refcnt = nent;
		head->lio_port = -1;
		head->lio_portkev = NULL;
		if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL &&
		    sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) {
			sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
			if (sqp == NULL) {
				error = EAGAIN;
				goto done;
			}
			sqp->sq_func = NULL;
			sqp->sq_next = NULL;
			sqp->sq_info.si_code = SI_ASYNCIO;
			sqp->sq_info.si_pid = curproc->p_pid;
			sqp->sq_info.si_ctid = PRCTID(curproc);
			sqp->sq_info.si_zoneid = getzoneid();
			sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
			sqp->sq_info.si_signo = sigevk.sigev_signo;
			sqp->sq_info.si_value.sival_int =
			    sigevk.sigev_value.sival_int;
			head->lio_sigqp = sqp;
		} else {
			head->lio_sigqp = NULL;
		}
		if (pkevtp) {
			/*
			 * Prepare data to send when list of aiocb's has
			 * completed.
			 */
			port_init_event(pkevtp, (uintptr_t)sigev,
			    (void *)(uintptr_t)pnotify.portnfy_user,
			    NULL, head);
			pkevtp->portkev_events = AIOLIO;
			head->lio_portkev = pkevtp;
			head->lio_port = pnotify.portnfy_port;
		}
	}

	for (i = 0; i < nent; i++, ucbp++) {

		/* skip entry if it can't be copied. */
#ifdef	_LP64
		cbp = (aiocb32_t *)(uintptr_t)*ucbp;
		if (cbp == NULL || copyin(cbp, aiocb32, sizeof (*aiocb32)))
#else
		cbp = (aiocb_t *)*ucbp;
		if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb)))
#endif
		{
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			continue;
		}
#ifdef	_LP64
		/*
		 * copy 32 bit structure into 64 bit structure
		 */
		aiocb_32ton(aiocb32, aiocb);
#endif /* _LP64 */

		/* skip if opcode for aiocb is LIO_NOP */
		mode = aiocb->aio_lio_opcode;
		if (mode == LIO_NOP) {
			cbp = NULL;
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			continue;
		}

		/* increment file descriptor's ref count. */
		if ((fp = getf(aiocb->aio_fildes)) == NULL) {
			lio_set_uerror(&cbp->aio_resultp, EBADF);
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			aio_errors++;
			continue;
		}

		/*
		 * verify that the file was opened with the requested
		 * access mode
		 */
		if ((fp->f_flag & mode) == 0) {
			releasef(aiocb->aio_fildes);
			lio_set_uerror(&cbp->aio_resultp, EBADF);
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			aio_errors++;
			continue;
		}

		/*
		 * Common case: requests are to the same fd
		 * for the same r/w operation.
		 * For UFS, EBADFD needs to be set.
		 */
		vp = fp->f_vnode;
		if (fp != prev_fp || mode != prev_mode) {
			aio_func = check_vp(vp, mode);
			if (aio_func == NULL) {
				prev_fp = NULL;
				releasef(aiocb->aio_fildes);
				lio_set_uerror(&cbp->aio_resultp, EBADFD);
				aio_notsupported++;
				if (head) {
					mutex_enter(&aiop->aio_mutex);
					head->lio_nent--;
					head->lio_refcnt--;
					mutex_exit(&aiop->aio_mutex);
				}
				continue;
			} else {
				prev_fp = fp;
				prev_mode = mode;
			}
		}

		error = aio_req_setup(&reqp, aiop, aiocb,
		    (aio_result_t *)&cbp->aio_resultp, vp, 0);
		if (error) {
			releasef(aiocb->aio_fildes);
			lio_set_uerror(&cbp->aio_resultp, error);
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			aio_errors++;
			continue;
		}

		reqp->aio_req_lio = head;
		deadhead = 0;

		/*
		 * Set the errno field now before sending the request to
		 * the driver to avoid a race condition
		 */
		(void) suword32(&cbp->aio_resultp.aio_errno,
		    EINPROGRESS);

		reqp->aio_req_iocb.iocb32 = (caddr32_t)(uintptr_t)cbp;

		event = (mode == LIO_READ)? AIOAREAD : AIOAWRITE;
		aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT);
		aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD);
		if (aio_port | aio_thread) {
			port_kevent_t *lpkevp;
			/*
			 * Prepare data to send with each aiocb completed.
			 */
#ifdef _LP64
			if (aio_port) {
				void *paddr = (void *)(uintptr_t)
				    aiocb32->aio_sigevent.sigev_value.sival_ptr;
				if (copyin(paddr, &pnotify, sizeof (pnotify)))
					error = EFAULT;
			} else {	/* aio_thread */
				pnotify.portnfy_port =
				    aiocb32->aio_sigevent.sigev_signo;
				pnotify.portnfy_user =
				    aiocb32->aio_sigevent.sigev_value.sival_ptr;
			}
#else
			if (aio_port) {
				void *paddr =
				    aiocb->aio_sigevent.sigev_value.sival_ptr;
				if (copyin(paddr, &pnotify, sizeof (pnotify)))
					error = EFAULT;
			} else {	/* aio_thread */
				pnotify.portnfy_port =
				    aiocb->aio_sigevent.sigev_signo;
				pnotify.portnfy_user =
				    aiocb->aio_sigevent.sigev_value.sival_ptr;
			}
#endif
			if (error)
				/* EMPTY */;
			else if (pkevtp != NULL &&
			    pnotify.portnfy_port == lio_head_port)
				error = port_dup_event(pkevtp, &lpkevp,
				    PORT_ALLOC_DEFAULT);
			else
				error = port_alloc_event(pnotify.portnfy_port,
				    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO,
				    &lpkevp);
			if (error == 0) {
				port_init_event(lpkevp, (uintptr_t)cbp,
				    (void *)(uintptr_t)pnotify.portnfy_user,
				    aio_port_callback, reqp);
				lpkevp->portkev_events = event;
				reqp->aio_req_portkev = lpkevp;
				reqp->aio_req_port = pnotify.portnfy_port;
			}
		}

		/*
		 * send the request to the driver.
		 */
		if (error == 0) {
			if (aiocb->aio_nbytes == 0) {
				clear_active_fd(aiocb->aio_fildes);
				aio_zerolen(reqp);
				continue;
			}
			error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
			    CRED());
		}

		/*
		 * the fd's ref count is not decremented until the IO has
		 * completed unless there was an error.
		 */
		if (error) {
			releasef(aiocb->aio_fildes);
			lio_set_uerror(&cbp->aio_resultp, error);
			if (head) {
				mutex_enter(&aiop->aio_mutex);
				head->lio_nent--;
				head->lio_refcnt--;
				mutex_exit(&aiop->aio_mutex);
			}
			if (error == ENOTSUP)
				aio_notsupported++;
			else
				aio_errors++;
			lio_set_error(reqp, portused);
		} else {
			clear_active_fd(aiocb->aio_fildes);
		}
	}

	if (aio_notsupported) {
		error = ENOTSUP;
	} else if (aio_errors) {
		/*
		 * return EIO if any request failed
		 */
		error = EIO;
	}

	if (mode_arg == LIO_WAIT) {
		mutex_enter(&aiop->aio_mutex);
		while (head->lio_refcnt > 0) {
			if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
				mutex_exit(&aiop->aio_mutex);
				error = EINTR;
				goto done;
			}
		}
		mutex_exit(&aiop->aio_mutex);
		alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_32);
	}

done:
	kmem_free(cbplist, ssize);
	if (deadhead) {
		if (head->lio_sigqp)
			kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
		if (head->lio_portkev)
			port_free_event(head->lio_portkev);
		kmem_free(head, sizeof (aio_lio_t));
	}
	return (error);
}


#ifdef  _SYSCALL32_IMPL
void
aiocb_32ton(aiocb32_t *src, aiocb_t *dest)
{
	dest->aio_fildes = src->aio_fildes;
	dest->aio_buf = (caddr_t)(uintptr_t)src->aio_buf;
	dest->aio_nbytes = (size_t)src->aio_nbytes;
	dest->aio_offset = (off_t)src->aio_offset;
	dest->aio_reqprio = src->aio_reqprio;
	dest->aio_sigevent.sigev_notify = src->aio_sigevent.sigev_notify;
	dest->aio_sigevent.sigev_signo = src->aio_sigevent.sigev_signo;

	/*
	 * See comment in sigqueue32() on handling of 32-bit
	 * sigvals in a 64-bit kernel.
	 */
	dest->aio_sigevent.sigev_value.sival_int =
	    (int)src->aio_sigevent.sigev_value.sival_int;
	dest->aio_sigevent.sigev_notify_function = (void (*)(union sigval))
	    (uintptr_t)src->aio_sigevent.sigev_notify_function;
	dest->aio_sigevent.sigev_notify_attributes = (pthread_attr_t *)
	    (uintptr_t)src->aio_sigevent.sigev_notify_attributes;
	dest->aio_sigevent.__sigev_pad2 = src->aio_sigevent.__sigev_pad2;
	dest->aio_lio_opcode = src->aio_lio_opcode;
	dest->aio_state = src->aio_state;
	dest->aio__pad[0] = src->aio__pad[0];
}
#endif /* _SYSCALL32_IMPL */

/*
 * aio_port_callback() is called just before the event is retrieved from the
 * port. The task of this callback function is to finish the work of the
 * transaction for the application, that is:
 * - copyout transaction data to the application
 *	(this thread is running in the right process context)
 * - keep track of the transaction (update of counters).
 * - free allocated buffers
 * The aiocb pointer is the object element of the port_kevent_t structure.
 *
 * flag :
 *	PORT_CALLBACK_DEFAULT : do copyout and free resources
 *	PORT_CALLBACK_CLOSE   : don't do copyout, free resources
 */

/*ARGSUSED*/
int
aio_port_callback(void *arg, int *events, pid_t pid, int flag, void *evp)
{
	aio_t		*aiop = curproc->p_aio;
	aio_req_t	*reqp = arg;
	struct	iovec	*iov;
	struct	buf	*bp;
	void		*resultp;

	if (pid != curproc->p_pid) {
		/* wrong process; cannot deliver data here */
		return (EACCES);
	}

	mutex_enter(&aiop->aio_portq_mutex);
	reqp->aio_req_portkev = NULL;
	aio_req_remove_portq(aiop, reqp); /* remove request from portq */
	mutex_exit(&aiop->aio_portq_mutex);
	aphysio_unlock(reqp);		/* unlock used pages */
	mutex_enter(&aiop->aio_mutex);
	if (reqp->aio_req_flags & AIO_COPYOUTDONE) {
		aio_req_free_port(aiop, reqp);	/* back to free list */
		mutex_exit(&aiop->aio_mutex);
		return (0);
	}

	iov = reqp->aio_req_uio.uio_iov;
	bp = &reqp->aio_req_buf;
	resultp = (void *)reqp->aio_req_resultp;
	if (flag == PORT_CALLBACK_DEFAULT)
		aio_copyout_result_port(iov, bp, resultp);
	aio_req_free_port(aiop, reqp);	/* request struct back to free list */
	mutex_exit(&aiop->aio_mutex);
	return (0);
}
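
/*
 * Illustrative sketch (#if 0): the user-level consumer of the callback
 * above.  A program asking for SIGEV_PORT notification points sigev_value
 * at a port_notify_t and collects the completion with port_get(3C); this
 * callback runs in the process context just before the event is handed
 * back, and the portnfy_user cookie comes back in portev_user.  Device
 * path is an assumption.
 */
#if 0
#include <aio.h>
#include <port.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	static char buf[512];
	struct aiocb cb;
	port_notify_t pn;
	port_event_t pe;
	int port = port_create();
	int fd = open("/dev/rdsk/c0t0d0s2", O_RDONLY);	/* assumed device */

	if (port == -1 || fd == -1)
		return (1);
	pn.portnfy_port = port;
	pn.portnfy_user = &cb;			/* returned in portev_user */

	(void) memset(&cb, 0, sizeof (cb));
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = sizeof (buf);
	cb.aio_sigevent.sigev_notify = SIGEV_PORT;
	cb.aio_sigevent.sigev_value.sival_ptr = &pn;

	if (aio_read(&cb) != 0)
		return (1);
	if (port_get(port, &pe, NULL) == 0)	/* blocks until completion */
		(void) printf("aio done, %zd bytes\n",
		    aio_return((struct aiocb *)pe.portev_user));
	return (0);
}
#endif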