xref: /illumos-gate/usr/src/uts/common/os/aio.c (revision 8ab00936)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Copyright (c) 2018, Joyent, Inc.
29  */
30 
31 /*
32  * Kernel asynchronous I/O.
33  * This is only for raw devices now (as of Nov. 1993).
34  */
35 
36 #include <sys/types.h>
37 #include <sys/errno.h>
38 #include <sys/conf.h>
39 #include <sys/file.h>
40 #include <sys/fs/snode.h>
41 #include <sys/unistd.h>
42 #include <sys/cmn_err.h>
43 #include <vm/as.h>
44 #include <vm/faultcode.h>
45 #include <sys/sysmacros.h>
46 #include <sys/procfs.h>
47 #include <sys/kmem.h>
48 #include <sys/autoconf.h>
49 #include <sys/ddi_impldefs.h>
50 #include <sys/sunddi.h>
51 #include <sys/aio_impl.h>
52 #include <sys/debug.h>
53 #include <sys/param.h>
54 #include <sys/systm.h>
55 #include <sys/vmsystm.h>
56 #include <sys/fs/pxfs_ki.h>
57 #include <sys/contract/process_impl.h>
58 
59 /*
60  * external entry point.
61  */
62 #ifdef _LP64
63 static int64_t kaioc(long, long, long, long, long, long);
64 #endif
65 static int kaio(ulong_t *, rval_t *);
66 
67 
68 #define	AIO_64	0
69 #define	AIO_32	1
70 #define	AIO_LARGEFILE	2
71 
72 /*
73  * implementation specific functions (private)
74  */
75 #ifdef _LP64
76 static int alio(int, aiocb_t **, int, struct sigevent *);
77 #endif
78 static int aionotify(void);
79 static int aioinit(void);
80 static int aiostart(void);
81 static void alio_cleanup(aio_t *, aiocb_t **, int, int);
82 static int (*check_vp(struct vnode *, int))(vnode_t *, struct aio_req *,
83     cred_t *);
84 static void lio_set_error(aio_req_t *, int portused);
85 static aio_t *aio_aiop_alloc();
86 static int aio_req_alloc(aio_req_t **, aio_result_t *);
87 static int aio_lio_alloc(aio_lio_t **);
88 static aio_req_t *aio_req_done(void *);
89 static aio_req_t *aio_req_remove(aio_req_t *);
90 static int aio_req_find(aio_result_t *, aio_req_t **);
91 static int aio_hash_insert(struct aio_req_t *, aio_t *);
92 static int aio_req_setup(aio_req_t **, aio_t *, aiocb_t *,
93     aio_result_t *, vnode_t *, int);
94 static int aio_cleanup_thread(aio_t *);
95 static aio_lio_t *aio_list_get(aio_result_t *);
96 static void lio_set_uerror(void *, int);
97 extern void aio_zerolen(aio_req_t *);
98 static int aiowait(struct timeval *, int, long	*);
99 static int aiowaitn(void *, uint_t, uint_t *, timespec_t *);
100 static int aio_unlock_requests(caddr_t iocblist, int iocb_index,
101     aio_req_t *reqlist, aio_t *aiop, model_t model);
102 static int aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max);
103 static int aiosuspend(void *, int, struct  timespec *, int,
104     long	*, int);
105 static int aliowait(int, void *, int, void *, int);
106 static int aioerror(void *, int);
107 static int aio_cancel(int, void *, long	*, int);
108 static int arw(int, int, char *, int, offset_t, aio_result_t *, int);
109 static int aiorw(int, void *, int, int);
110 
111 static int alioLF(int, void *, int, void *);
112 static int aio_req_setupLF(aio_req_t **, aio_t *, aiocb64_32_t *,
113     aio_result_t *, vnode_t *, int);
114 static int alio32(int, void *, int, void *);
115 static int driver_aio_write(vnode_t *vp, struct aio_req *aio, cred_t *cred_p);
116 static int driver_aio_read(vnode_t *vp, struct aio_req *aio, cred_t *cred_p);
117 
118 #ifdef  _SYSCALL32_IMPL
119 static void aiocb_LFton(aiocb64_32_t *, aiocb_t *);
120 void	aiocb_32ton(aiocb32_t *, aiocb_t *);
121 #endif /* _SYSCALL32_IMPL */
122 
123 /*
124  * implementation specific functions (external)
125  */
126 void aio_req_free(aio_t *, aio_req_t *);
127 
128 /*
129  * Event Port framework
130  */
131 
132 void aio_req_free_port(aio_t *, aio_req_t *);
133 static int aio_port_callback(void *, int *, pid_t, int, void *);
134 
135 /*
136  * This is the loadable module wrapper.
137  */
138 #include <sys/modctl.h>
139 #include <sys/syscall.h>
140 
#ifdef _LP64

/*
 * System call table entry for native callers on a 64-bit kernel.
 * It registers kaioc(), which takes its six arguments directly as longs;
 * the leading 6 presumably is the argument count (confirm against
 * struct sysent).
 */
static struct sysent kaio_sysent = {
	6,
	SE_NOUNLOAD | SE_64RVAL | SE_ARGC,
	(int (*)())(uintptr_t)kaioc
};

#ifdef _SYSCALL32_IMPL
/*
 * System call table entry for 32-bit callers on a 64-bit kernel.
 * It registers kaio(), which reads up to seven values (uap[0]..uap[6])
 * from its argument vector.
 */
static struct sysent kaio_sysent32 = {
	7,
	SE_NOUNLOAD | SE_64RVAL,
	kaio
};
#endif  /* _SYSCALL32_IMPL */

#else   /* _LP64 */

/*
 * System call table entry for a 32-bit kernel: kaio() is the only
 * entry point and reads up to seven values from its argument vector.
 */
static struct sysent kaio_sysent = {
	7,
	SE_NOUNLOAD | SE_32RVAL1,
	kaio
};

#endif  /* _LP64 */
166 
/*
 * Module linkage information for the kernel.
 *
 * modlsys ties the native system call entry (kaio_sysent) to the
 * mod_syscallops operations; modlsys32, when _SYSCALL32_IMPL is defined,
 * does the same for the 32-bit compatibility entry (kaio_sysent32) using
 * mod_syscallops32.
 */

static struct modlsys modlsys = {
	&mod_syscallops,
	"kernel Async I/O",
	&kaio_sysent
};

#ifdef  _SYSCALL32_IMPL
static struct modlsys modlsys32 = {
	&mod_syscallops32,
	"kernel Async I/O for 32 bit compatibility",
	&kaio_sysent32
};
#endif  /* _SYSCALL32_IMPL */


/*
 * NULL-terminated list of the linkage structures above; this is the object
 * handed to mod_install(), mod_remove() and mod_info() by _init(), _fini()
 * and _info().
 */
static struct modlinkage modlinkage = {
	MODREV_1,
	&modlsys,
#ifdef  _SYSCALL32_IMPL
	&modlsys32,
#endif
	NULL
};
194 
195 int
_init(void)196 _init(void)
197 {
198 	int retval;
199 
200 	if ((retval = mod_install(&modlinkage)) != 0)
201 		return (retval);
202 
203 	return (0);
204 }
205 
206 int
_fini(void)207 _fini(void)
208 {
209 	int retval;
210 
211 	retval = mod_remove(&modlinkage);
212 
213 	return (retval);
214 }
215 
216 int
_info(struct modinfo * modinfop)217 _info(struct modinfo *modinfop)
218 {
219 	return (mod_info(&modlinkage, modinfop));
220 }
221 
222 #ifdef	_LP64
223 static int64_t
kaioc(long a0,long a1,long a2,long a3,long a4,long a5)224 kaioc(
225 	long	a0,
226 	long	a1,
227 	long	a2,
228 	long	a3,
229 	long	a4,
230 	long	a5)
231 {
232 	int	error;
233 	long	rval = 0;
234 
235 	switch ((int)a0 & ~AIO_POLL_BIT) {
236 	case AIOREAD:
237 		error = arw((int)a0, (int)a1, (char *)a2, (int)a3,
238 		    (offset_t)a4, (aio_result_t *)a5, FREAD);
239 		break;
240 	case AIOWRITE:
241 		error = arw((int)a0, (int)a1, (char *)a2, (int)a3,
242 		    (offset_t)a4, (aio_result_t *)a5, FWRITE);
243 		break;
244 	case AIOWAIT:
245 		error = aiowait((struct timeval *)a1, (int)a2, &rval);
246 		break;
247 	case AIOWAITN:
248 		error = aiowaitn((void *)a1, (uint_t)a2, (uint_t *)a3,
249 		    (timespec_t *)a4);
250 		break;
251 	case AIONOTIFY:
252 		error = aionotify();
253 		break;
254 	case AIOINIT:
255 		error = aioinit();
256 		break;
257 	case AIOSTART:
258 		error = aiostart();
259 		break;
260 	case AIOLIO:
261 		error = alio((int)a1, (aiocb_t **)a2, (int)a3,
262 		    (struct sigevent *)a4);
263 		break;
264 	case AIOLIOWAIT:
265 		error = aliowait((int)a1, (void *)a2, (int)a3,
266 		    (struct sigevent *)a4, AIO_64);
267 		break;
268 	case AIOSUSPEND:
269 		error = aiosuspend((void *)a1, (int)a2, (timespec_t *)a3,
270 		    (int)a4, &rval, AIO_64);
271 		break;
272 	case AIOERROR:
273 		error = aioerror((void *)a1, AIO_64);
274 		break;
275 	case AIOAREAD:
276 		error = aiorw((int)a0, (void *)a1, FREAD, AIO_64);
277 		break;
278 	case AIOAWRITE:
279 		error = aiorw((int)a0, (void *)a1, FWRITE, AIO_64);
280 		break;
281 	case AIOCANCEL:
282 		error = aio_cancel((int)a1, (void *)a2, &rval, AIO_64);
283 		break;
284 
285 	/*
286 	 * The large file related stuff is valid only for
287 	 * 32 bit kernel and not for 64 bit kernel
288 	 * On 64 bit kernel we convert large file calls
289 	 * to regular 64bit calls.
290 	 */
291 
292 	default:
293 		error = EINVAL;
294 	}
295 	if (error)
296 		return ((int64_t)set_errno(error));
297 	return (rval);
298 }
299 #endif
300 
301 static int
kaio(ulong_t * uap,rval_t * rvp)302 kaio(
303 	ulong_t *uap,
304 	rval_t *rvp)
305 {
306 	long rval = 0;
307 	int	error = 0;
308 	offset_t	off;
309 
310 
311 	rvp->r_vals = 0;
312 #if defined(_LITTLE_ENDIAN)
313 	off = ((u_offset_t)uap[5] << 32) | (u_offset_t)uap[4];
314 #else
315 	off = ((u_offset_t)uap[4] << 32) | (u_offset_t)uap[5];
316 #endif
317 
318 	switch (uap[0] & ~AIO_POLL_BIT) {
319 	/*
320 	 * It must be the 32 bit system call on 64 bit kernel
321 	 */
322 	case AIOREAD:
323 		return (arw((int)uap[0], (int)uap[1], (char *)uap[2],
324 		    (int)uap[3], off, (aio_result_t *)uap[6], FREAD));
325 	case AIOWRITE:
326 		return (arw((int)uap[0], (int)uap[1], (char *)uap[2],
327 		    (int)uap[3], off, (aio_result_t *)uap[6], FWRITE));
328 	case AIOWAIT:
329 		error = aiowait((struct	timeval *)uap[1], (int)uap[2],
330 		    &rval);
331 		break;
332 	case AIOWAITN:
333 		error = aiowaitn((void *)uap[1], (uint_t)uap[2],
334 		    (uint_t *)uap[3], (timespec_t *)uap[4]);
335 		break;
336 	case AIONOTIFY:
337 		return (aionotify());
338 	case AIOINIT:
339 		return (aioinit());
340 	case AIOSTART:
341 		return (aiostart());
342 	case AIOLIO:
343 		return (alio32((int)uap[1], (void *)uap[2], (int)uap[3],
344 		    (void *)uap[4]));
345 	case AIOLIOWAIT:
346 		return (aliowait((int)uap[1], (void *)uap[2],
347 		    (int)uap[3], (struct sigevent *)uap[4], AIO_32));
348 	case AIOSUSPEND:
349 		error = aiosuspend((void *)uap[1], (int)uap[2],
350 		    (timespec_t *)uap[3], (int)uap[4],
351 		    &rval, AIO_32);
352 		break;
353 	case AIOERROR:
354 		return (aioerror((void *)uap[1], AIO_32));
355 	case AIOAREAD:
356 		return (aiorw((int)uap[0], (void *)uap[1],
357 		    FREAD, AIO_32));
358 	case AIOAWRITE:
359 		return (aiorw((int)uap[0], (void *)uap[1],
360 		    FWRITE, AIO_32));
361 	case AIOCANCEL:
362 		error = (aio_cancel((int)uap[1], (void *)uap[2], &rval,
363 		    AIO_32));
364 		break;
365 	case AIOLIO64:
366 		return (alioLF((int)uap[1], (void *)uap[2],
367 		    (int)uap[3], (void *)uap[4]));
368 	case AIOLIOWAIT64:
369 		return (aliowait(uap[1], (void *)uap[2],
370 		    (int)uap[3], (void *)uap[4], AIO_LARGEFILE));
371 	case AIOSUSPEND64:
372 		error = aiosuspend((void *)uap[1], (int)uap[2],
373 		    (timespec_t *)uap[3], (int)uap[4], &rval,
374 		    AIO_LARGEFILE);
375 		break;
376 	case AIOERROR64:
377 		return (aioerror((void *)uap[1], AIO_LARGEFILE));
378 	case AIOAREAD64:
379 		return (aiorw((int)uap[0], (void *)uap[1], FREAD,
380 		    AIO_LARGEFILE));
381 	case AIOAWRITE64:
382 		return (aiorw((int)uap[0], (void *)uap[1], FWRITE,
383 		    AIO_LARGEFILE));
384 	case AIOCANCEL64:
385 		error = (aio_cancel((int)uap[1], (void *)uap[2],
386 		    &rval, AIO_LARGEFILE));
387 		break;
388 	default:
389 		return (EINVAL);
390 	}
391 
392 	rvp->r_val1 = rval;
393 	return (error);
394 }
395 
396 /*
397  * wake up LWPs in this process that are sleeping in
398  * aiowait().
399  */
400 static int
aionotify(void)401 aionotify(void)
402 {
403 	aio_t	*aiop;
404 
405 	aiop = curproc->p_aio;
406 	if (aiop == NULL)
407 		return (0);
408 
409 	mutex_enter(&aiop->aio_mutex);
410 	aiop->aio_notifycnt++;
411 	cv_broadcast(&aiop->aio_waitcv);
412 	mutex_exit(&aiop->aio_mutex);
413 
414 	return (0);
415 }
416 
417 static int
timeval2reltime(struct timeval * timout,timestruc_t * rqtime,timestruc_t ** rqtp,int * blocking)418 timeval2reltime(struct timeval *timout, timestruc_t *rqtime,
419     timestruc_t **rqtp, int *blocking)
420 {
421 #ifdef	_SYSCALL32_IMPL
422 	struct timeval32 wait_time_32;
423 #endif
424 	struct timeval wait_time;
425 	model_t	model = get_udatamodel();
426 
427 	*rqtp = NULL;
428 	if (timout == NULL) {		/* wait indefinitely */
429 		*blocking = 1;
430 		return (0);
431 	}
432 
433 	/*
434 	 * Need to correctly compare with the -1 passed in for a user
435 	 * address pointer, with both 32 bit and 64 bit apps.
436 	 */
437 	if (model == DATAMODEL_NATIVE) {
438 		if ((intptr_t)timout == (intptr_t)-1) {	/* don't wait */
439 			*blocking = 0;
440 			return (0);
441 		}
442 
443 		if (copyin(timout, &wait_time, sizeof (wait_time)))
444 			return (EFAULT);
445 	}
446 #ifdef	_SYSCALL32_IMPL
447 	else {
448 		/*
449 		 * -1 from a 32bit app. It will not get sign extended.
450 		 * don't wait if -1.
451 		 */
452 		if ((intptr_t)timout == (intptr_t)((uint32_t)-1)) {
453 			*blocking = 0;
454 			return (0);
455 		}
456 
457 		if (copyin(timout, &wait_time_32, sizeof (wait_time_32)))
458 			return (EFAULT);
459 		TIMEVAL32_TO_TIMEVAL(&wait_time, &wait_time_32);
460 	}
461 #endif  /* _SYSCALL32_IMPL */
462 
463 	if (wait_time.tv_sec == 0 && wait_time.tv_usec == 0) {	/* don't wait */
464 		*blocking = 0;
465 		return (0);
466 	}
467 
468 	if (wait_time.tv_sec < 0 ||
469 	    wait_time.tv_usec < 0 || wait_time.tv_usec >= MICROSEC)
470 		return (EINVAL);
471 
472 	rqtime->tv_sec = wait_time.tv_sec;
473 	rqtime->tv_nsec = wait_time.tv_usec * 1000;
474 	*rqtp = rqtime;
475 	*blocking = 1;
476 
477 	return (0);
478 }
479 
480 static int
timespec2reltime(timespec_t * timout,timestruc_t * rqtime,timestruc_t ** rqtp,int * blocking)481 timespec2reltime(timespec_t *timout, timestruc_t *rqtime,
482     timestruc_t **rqtp, int *blocking)
483 {
484 #ifdef	_SYSCALL32_IMPL
485 	timespec32_t wait_time_32;
486 #endif
487 	model_t	model = get_udatamodel();
488 
489 	*rqtp = NULL;
490 	if (timout == NULL) {
491 		*blocking = 1;
492 		return (0);
493 	}
494 
495 	if (model == DATAMODEL_NATIVE) {
496 		if (copyin(timout, rqtime, sizeof (*rqtime)))
497 			return (EFAULT);
498 	}
499 #ifdef	_SYSCALL32_IMPL
500 	else {
501 		if (copyin(timout, &wait_time_32, sizeof (wait_time_32)))
502 			return (EFAULT);
503 		TIMESPEC32_TO_TIMESPEC(rqtime, &wait_time_32);
504 	}
505 #endif  /* _SYSCALL32_IMPL */
506 
507 	if (rqtime->tv_sec == 0 && rqtime->tv_nsec == 0) {
508 		*blocking = 0;
509 		return (0);
510 	}
511 
512 	if (rqtime->tv_sec < 0 ||
513 	    rqtime->tv_nsec < 0 || rqtime->tv_nsec >= NANOSEC)
514 		return (EINVAL);
515 
516 	*rqtp = rqtime;
517 	*blocking = 1;
518 
519 	return (0);
520 }
521 
522 /*ARGSUSED*/
523 static int
aiowait(struct timeval * timout,int dontblockflg,long * rval)524 aiowait(struct timeval *timout, int dontblockflg, long *rval)
525 {
526 	int		error;
527 	aio_t		*aiop;
528 	aio_req_t	*reqp;
529 	clock_t		status;
530 	int		blocking;
531 	int		timecheck;
532 	timestruc_t	rqtime;
533 	timestruc_t	*rqtp;
534 
535 	aiop = curproc->p_aio;
536 	if (aiop == NULL)
537 		return (EINVAL);
538 
539 	/*
540 	 * Establish the absolute future time for the timeout.
541 	 */
542 	error = timeval2reltime(timout, &rqtime, &rqtp, &blocking);
543 	if (error)
544 		return (error);
545 	if (rqtp) {
546 		timestruc_t now;
547 		timecheck = timechanged;
548 		gethrestime(&now);
549 		timespecadd(rqtp, &now);
550 	}
551 
552 	mutex_enter(&aiop->aio_mutex);
553 	for (;;) {
554 		/* process requests on poll queue */
555 		if (aiop->aio_pollq) {
556 			mutex_exit(&aiop->aio_mutex);
557 			aio_cleanup(0);
558 			mutex_enter(&aiop->aio_mutex);
559 		}
560 		if ((reqp = aio_req_remove(NULL)) != NULL) {
561 			*rval = (long)reqp->aio_req_resultp;
562 			break;
563 		}
564 		/* user-level done queue might not be empty */
565 		if (aiop->aio_notifycnt > 0) {
566 			aiop->aio_notifycnt--;
567 			*rval = 1;
568 			break;
569 		}
570 		/* don't block if no outstanding aio */
571 		if (aiop->aio_outstanding == 0 && dontblockflg) {
572 			error = EINVAL;
573 			break;
574 		}
575 		if (blocking) {
576 			status = cv_waituntil_sig(&aiop->aio_waitcv,
577 			    &aiop->aio_mutex, rqtp, timecheck);
578 
579 			if (status > 0)		/* check done queue again */
580 				continue;
581 			if (status == 0) {	/* interrupted by a signal */
582 				error = EINTR;
583 				*rval = -1;
584 			} else {		/* timer expired */
585 				error = ETIME;
586 			}
587 		}
588 		break;
589 	}
590 	mutex_exit(&aiop->aio_mutex);
591 	if (reqp) {
592 		aphysio_unlock(reqp);
593 		aio_copyout_result(reqp);
594 		mutex_enter(&aiop->aio_mutex);
595 		aio_req_free(aiop, reqp);
596 		mutex_exit(&aiop->aio_mutex);
597 	}
598 	return (error);
599 }
600 
601 /*
602  * aiowaitn can be used to reap completed asynchronous requests submitted with
603  * lio_listio, aio_read or aio_write.
604  * This function only reaps asynchronous raw I/Os.
605  */
606 
607 /*ARGSUSED*/
608 static int
aiowaitn(void * uiocb,uint_t nent,uint_t * nwait,timespec_t * timout)609 aiowaitn(void *uiocb, uint_t nent, uint_t *nwait, timespec_t *timout)
610 {
611 	int		error = 0;
612 	aio_t		*aiop;
613 	aio_req_t	*reqlist = NULL;
614 	caddr_t		iocblist = NULL;	/* array of iocb ptr's */
615 	uint_t		waitcnt, cnt = 0;	/* iocb cnt */
616 	size_t		iocbsz;			/* users iocb size */
617 	size_t		riocbsz;		/* returned iocb size */
618 	int		iocb_index = 0;
619 	model_t		model = get_udatamodel();
620 	int		blocking = 1;
621 	int		timecheck;
622 	timestruc_t	rqtime;
623 	timestruc_t	*rqtp;
624 
625 	aiop = curproc->p_aio;
626 	if (aiop == NULL || nent == 0 || nent > _AIO_LISTIO_MAX)
627 		return (EINVAL);
628 
629 	if (aiop->aio_outstanding == 0)
630 		return (EAGAIN);
631 
632 	if (copyin(nwait, &waitcnt, sizeof (uint_t)))
633 		return (EFAULT);
634 
635 	/* set *nwait to zero, if we must return prematurely */
636 	if (copyout(&cnt, nwait, sizeof (uint_t)))
637 		return (EFAULT);
638 
639 	if (waitcnt == 0) {
640 		blocking = 0;
641 		rqtp = NULL;
642 		waitcnt = nent;
643 	} else {
644 		error = timespec2reltime(timout, &rqtime, &rqtp, &blocking);
645 		if (error)
646 			return (error);
647 	}
648 
649 	if (model == DATAMODEL_NATIVE)
650 		iocbsz = (sizeof (aiocb_t *) * nent);
651 #ifdef	_SYSCALL32_IMPL
652 	else
653 		iocbsz = (sizeof (caddr32_t) * nent);
654 #endif  /* _SYSCALL32_IMPL */
655 
656 	/*
657 	 * Only one aio_waitn call is allowed at a time.
658 	 * The active aio_waitn will collect all requests
659 	 * out of the "done" list and if necessary it will wait
660 	 * for some/all pending requests to fulfill the nwait
661 	 * parameter.
662 	 * A second or further aio_waitn calls will sleep here
663 	 * until the active aio_waitn finishes and leaves the kernel
664 	 * If the second call does not block (poll), then return
665 	 * immediately with the error code : EAGAIN.
666 	 * If the second call should block, then sleep here, but
667 	 * do not touch the timeout. The timeout starts when this
668 	 * aio_waitn-call becomes active.
669 	 */
670 
671 	mutex_enter(&aiop->aio_mutex);
672 
673 	while (aiop->aio_flags & AIO_WAITN) {
674 		if (blocking == 0) {
675 			mutex_exit(&aiop->aio_mutex);
676 			return (EAGAIN);
677 		}
678 
679 		/* block, no timeout */
680 		aiop->aio_flags |= AIO_WAITN_PENDING;
681 		if (!cv_wait_sig(&aiop->aio_waitncv, &aiop->aio_mutex)) {
682 			mutex_exit(&aiop->aio_mutex);
683 			return (EINTR);
684 		}
685 	}
686 
687 	/*
688 	 * Establish the absolute future time for the timeout.
689 	 */
690 	if (rqtp) {
691 		timestruc_t now;
692 		timecheck = timechanged;
693 		gethrestime(&now);
694 		timespecadd(rqtp, &now);
695 	}
696 
697 	if (iocbsz > aiop->aio_iocbsz && aiop->aio_iocb != NULL) {
698 		kmem_free(aiop->aio_iocb, aiop->aio_iocbsz);
699 		aiop->aio_iocb = NULL;
700 	}
701 
702 	if (aiop->aio_iocb == NULL) {
703 		iocblist = kmem_zalloc(iocbsz, KM_NOSLEEP);
704 		if (iocblist == NULL) {
705 			mutex_exit(&aiop->aio_mutex);
706 			return (ENOMEM);
707 		}
708 		aiop->aio_iocb = (aiocb_t **)iocblist;
709 		aiop->aio_iocbsz = iocbsz;
710 	} else {
711 		iocblist = (char *)aiop->aio_iocb;
712 	}
713 
714 	aiop->aio_waitncnt = waitcnt;
715 	aiop->aio_flags |= AIO_WAITN;
716 
717 	for (;;) {
718 		/* push requests on poll queue to done queue */
719 		if (aiop->aio_pollq) {
720 			mutex_exit(&aiop->aio_mutex);
721 			aio_cleanup(0);
722 			mutex_enter(&aiop->aio_mutex);
723 		}
724 
725 		/* check for requests on done queue */
726 		if (aiop->aio_doneq) {
727 			cnt += aio_reqlist_concat(aiop, &reqlist, nent - cnt);
728 			aiop->aio_waitncnt = waitcnt - cnt;
729 		}
730 
731 		/* user-level done queue might not be empty */
732 		if (aiop->aio_notifycnt > 0) {
733 			aiop->aio_notifycnt--;
734 			error = 0;
735 			break;
736 		}
737 
738 		/*
739 		 * if we are here second time as a result of timer
740 		 * expiration, we reset error if there are enough
741 		 * aiocb's to satisfy request.
742 		 * We return also if all requests are already done
743 		 * and we picked up the whole done queue.
744 		 */
745 
746 		if ((cnt >= waitcnt) || (cnt > 0 && aiop->aio_pending == 0 &&
747 		    aiop->aio_doneq == NULL)) {
748 			error = 0;
749 			break;
750 		}
751 
752 		if ((cnt < waitcnt) && blocking) {
753 			int rval = cv_waituntil_sig(&aiop->aio_waitcv,
754 			    &aiop->aio_mutex, rqtp, timecheck);
755 			if (rval > 0)
756 				continue;
757 			if (rval < 0) {
758 				error = ETIME;
759 				blocking = 0;
760 				continue;
761 			}
762 			error = EINTR;
763 		}
764 		break;
765 	}
766 
767 	mutex_exit(&aiop->aio_mutex);
768 
769 	if (cnt > 0) {
770 
771 		iocb_index = aio_unlock_requests(iocblist, iocb_index, reqlist,
772 		    aiop, model);
773 
774 		if (model == DATAMODEL_NATIVE)
775 			riocbsz = (sizeof (aiocb_t *) * cnt);
776 #ifdef	_SYSCALL32_IMPL
777 		else
778 			riocbsz = (sizeof (caddr32_t) * cnt);
779 #endif  /* _SYSCALL32_IMPL */
780 
781 		if (copyout(iocblist, uiocb, riocbsz) ||
782 		    copyout(&cnt, nwait, sizeof (uint_t)))
783 			error = EFAULT;
784 	}
785 
786 	/* check if there is another thread waiting for execution */
787 	mutex_enter(&aiop->aio_mutex);
788 	aiop->aio_flags &= ~AIO_WAITN;
789 	if (aiop->aio_flags & AIO_WAITN_PENDING) {
790 		aiop->aio_flags &= ~AIO_WAITN_PENDING;
791 		cv_signal(&aiop->aio_waitncv);
792 	}
793 	mutex_exit(&aiop->aio_mutex);
794 
795 	return (error);
796 }
797 
798 /*
799  * aio_unlock_requests
800  * copyouts the result of the request as well as the return value.
801  * It builds the list of completed asynchronous requests,
802  * unlocks the allocated memory ranges and
803  * put the aio request structure back into the free list.
804  */
805 
806 static int
aio_unlock_requests(caddr_t iocblist,int iocb_index,aio_req_t * reqlist,aio_t * aiop,model_t model)807 aio_unlock_requests(
808 	caddr_t	iocblist,
809 	int	iocb_index,
810 	aio_req_t *reqlist,
811 	aio_t	*aiop,
812 	model_t	model)
813 {
814 	aio_req_t	*reqp, *nreqp;
815 
816 	if (model == DATAMODEL_NATIVE) {
817 		for (reqp = reqlist; reqp != NULL;  reqp = nreqp) {
818 			(((caddr_t *)iocblist)[iocb_index++]) =
819 			    reqp->aio_req_iocb.iocb;
820 			nreqp = reqp->aio_req_next;
821 			aphysio_unlock(reqp);
822 			aio_copyout_result(reqp);
823 			mutex_enter(&aiop->aio_mutex);
824 			aio_req_free(aiop, reqp);
825 			mutex_exit(&aiop->aio_mutex);
826 		}
827 	}
828 #ifdef	_SYSCALL32_IMPL
829 	else {
830 		for (reqp = reqlist; reqp != NULL;  reqp = nreqp) {
831 			((caddr32_t *)iocblist)[iocb_index++] =
832 			    reqp->aio_req_iocb.iocb32;
833 			nreqp = reqp->aio_req_next;
834 			aphysio_unlock(reqp);
835 			aio_copyout_result(reqp);
836 			mutex_enter(&aiop->aio_mutex);
837 			aio_req_free(aiop, reqp);
838 			mutex_exit(&aiop->aio_mutex);
839 		}
840 	}
841 #endif	/* _SYSCALL32_IMPL */
842 	return (iocb_index);
843 }
844 
845 /*
846  * aio_reqlist_concat
847  * moves "max" elements from the done queue to the reqlist queue and removes
848  * the AIO_DONEQ flag.
849  * - reqlist queue is a simple linked list
850  * - done queue is a double linked list
851  */
852 
853 static int
aio_reqlist_concat(aio_t * aiop,aio_req_t ** reqlist,int max)854 aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max)
855 {
856 	aio_req_t *q2, *q2work, *list;
857 	int count = 0;
858 
859 	list = *reqlist;
860 	q2 = aiop->aio_doneq;
861 	q2work = q2;
862 	while (max-- > 0) {
863 		q2work->aio_req_flags &= ~AIO_DONEQ;
864 		q2work = q2work->aio_req_next;
865 		count++;
866 		if (q2work == q2)
867 			break;
868 	}
869 
870 	if (q2work == q2) {
871 		/* all elements revised */
872 		q2->aio_req_prev->aio_req_next = list;
873 		list = q2;
874 		aiop->aio_doneq = NULL;
875 	} else {
876 		/*
877 		 * max < elements in the doneq
878 		 * detach only the required amount of elements
879 		 * out of the doneq
880 		 */
881 		q2work->aio_req_prev->aio_req_next = list;
882 		list = q2;
883 
884 		aiop->aio_doneq = q2work;
885 		q2work->aio_req_prev = q2->aio_req_prev;
886 		q2->aio_req_prev->aio_req_next = q2work;
887 	}
888 	*reqlist = list;
889 	return (count);
890 }
891 
892 /*ARGSUSED*/
893 static int
aiosuspend(void * aiocb,int nent,struct timespec * timout,int flag,long * rval,int run_mode)894 aiosuspend(void	*aiocb, int nent, struct timespec *timout, int flag,
895     long *rval, int run_mode)
896 {
897 	int		error;
898 	aio_t		*aiop;
899 	aio_req_t	*reqp, *found, *next;
900 	caddr_t		cbplist = NULL;
901 	aiocb_t		*cbp, **ucbp;
902 #ifdef	_SYSCALL32_IMPL
903 	aiocb32_t	*cbp32;
904 	caddr32_t	*ucbp32;
905 #endif  /* _SYSCALL32_IMPL */
906 	aiocb64_32_t	*cbp64;
907 	int		rv;
908 	int		i;
909 	size_t		ssize;
910 	model_t		model = get_udatamodel();
911 	int		blocking;
912 	int		timecheck;
913 	timestruc_t	rqtime;
914 	timestruc_t	*rqtp;
915 
916 	aiop = curproc->p_aio;
917 	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
918 		return (EINVAL);
919 
920 	/*
921 	 * Establish the absolute future time for the timeout.
922 	 */
923 	error = timespec2reltime(timout, &rqtime, &rqtp, &blocking);
924 	if (error)
925 		return (error);
926 	if (rqtp) {
927 		timestruc_t now;
928 		timecheck = timechanged;
929 		gethrestime(&now);
930 		timespecadd(rqtp, &now);
931 	}
932 
933 	/*
934 	 * If we are not blocking and there's no IO complete
935 	 * skip aiocb copyin.
936 	 */
937 	if (!blocking && (aiop->aio_pollq == NULL) &&
938 	    (aiop->aio_doneq == NULL)) {
939 		return (EAGAIN);
940 	}
941 
942 	if (model == DATAMODEL_NATIVE)
943 		ssize = (sizeof (aiocb_t *) * nent);
944 #ifdef	_SYSCALL32_IMPL
945 	else
946 		ssize = (sizeof (caddr32_t) * nent);
947 #endif  /* _SYSCALL32_IMPL */
948 
949 	cbplist = kmem_alloc(ssize, KM_NOSLEEP);
950 	if (cbplist == NULL)
951 		return (ENOMEM);
952 
953 	if (copyin(aiocb, cbplist, ssize)) {
954 		error = EFAULT;
955 		goto done;
956 	}
957 
958 	found = NULL;
959 	/*
960 	 * we need to get the aio_cleanupq_mutex since we call
961 	 * aio_req_done().
962 	 */
963 	mutex_enter(&aiop->aio_cleanupq_mutex);
964 	mutex_enter(&aiop->aio_mutex);
965 	for (;;) {
966 		/* push requests on poll queue to done queue */
967 		if (aiop->aio_pollq) {
968 			mutex_exit(&aiop->aio_mutex);
969 			mutex_exit(&aiop->aio_cleanupq_mutex);
970 			aio_cleanup(0);
971 			mutex_enter(&aiop->aio_cleanupq_mutex);
972 			mutex_enter(&aiop->aio_mutex);
973 		}
974 		/* check for requests on done queue */
975 		if (aiop->aio_doneq) {
976 			if (model == DATAMODEL_NATIVE)
977 				ucbp = (aiocb_t **)cbplist;
978 #ifdef	_SYSCALL32_IMPL
979 			else
980 				ucbp32 = (caddr32_t *)cbplist;
981 #endif  /* _SYSCALL32_IMPL */
982 			for (i = 0; i < nent; i++) {
983 				if (model == DATAMODEL_NATIVE) {
984 					if ((cbp = *ucbp++) == NULL)
985 						continue;
986 					if (run_mode != AIO_LARGEFILE)
987 						reqp = aio_req_done(
988 						    &cbp->aio_resultp);
989 					else {
990 						cbp64 = (aiocb64_32_t *)cbp;
991 						reqp = aio_req_done(
992 						    &cbp64->aio_resultp);
993 					}
994 				}
995 #ifdef	_SYSCALL32_IMPL
996 				else {
997 					if (run_mode == AIO_32) {
998 						if ((cbp32 =
999 						    (aiocb32_t *)(uintptr_t)
1000 						    *ucbp32++) == NULL)
1001 							continue;
1002 						reqp = aio_req_done(
1003 						    &cbp32->aio_resultp);
1004 					} else if (run_mode == AIO_LARGEFILE) {
1005 						if ((cbp64 =
1006 						    (aiocb64_32_t *)(uintptr_t)
1007 						    *ucbp32++) == NULL)
1008 							continue;
1009 						reqp = aio_req_done(
1010 						    &cbp64->aio_resultp);
1011 					}
1012 
1013 				}
1014 #endif  /* _SYSCALL32_IMPL */
1015 				if (reqp) {
1016 					reqp->aio_req_next = found;
1017 					found = reqp;
1018 				}
1019 				if (aiop->aio_doneq == NULL)
1020 					break;
1021 			}
1022 			if (found)
1023 				break;
1024 		}
1025 		if (aiop->aio_notifycnt > 0) {
1026 			/*
1027 			 * nothing on the kernel's queue. the user
1028 			 * has notified the kernel that it has items
1029 			 * on a user-level queue.
1030 			 */
1031 			aiop->aio_notifycnt--;
1032 			*rval = 1;
1033 			error = 0;
1034 			break;
1035 		}
1036 		/* don't block if nothing is outstanding */
1037 		if (aiop->aio_outstanding == 0) {
1038 			error = EAGAIN;
1039 			break;
1040 		}
1041 		if (blocking) {
1042 			/*
1043 			 * drop the aio_cleanupq_mutex as we are
1044 			 * going to block.
1045 			 */
1046 			mutex_exit(&aiop->aio_cleanupq_mutex);
1047 			rv = cv_waituntil_sig(&aiop->aio_waitcv,
1048 			    &aiop->aio_mutex, rqtp, timecheck);
1049 			/*
1050 			 * we have to drop aio_mutex and
1051 			 * grab it in the right order.
1052 			 */
1053 			mutex_exit(&aiop->aio_mutex);
1054 			mutex_enter(&aiop->aio_cleanupq_mutex);
1055 			mutex_enter(&aiop->aio_mutex);
1056 			if (rv > 0)	/* check done queue again */
1057 				continue;
1058 			if (rv == 0)	/* interrupted by a signal */
1059 				error = EINTR;
1060 			else		/* timer expired */
1061 				error = ETIME;
1062 		} else {
1063 			error = EAGAIN;
1064 		}
1065 		break;
1066 	}
1067 	mutex_exit(&aiop->aio_mutex);
1068 	mutex_exit(&aiop->aio_cleanupq_mutex);
1069 	for (reqp = found; reqp != NULL; reqp = next) {
1070 		next = reqp->aio_req_next;
1071 		aphysio_unlock(reqp);
1072 		aio_copyout_result(reqp);
1073 		mutex_enter(&aiop->aio_mutex);
1074 		aio_req_free(aiop, reqp);
1075 		mutex_exit(&aiop->aio_mutex);
1076 	}
1077 done:
1078 	kmem_free(cbplist, ssize);
1079 	return (error);
1080 }
1081 
1082 /*
1083  * initialize aio by allocating an aio_t struct for this
1084  * process.
1085  */
1086 static int
aioinit(void)1087 aioinit(void)
1088 {
1089 	proc_t *p = curproc;
1090 	aio_t *aiop;
1091 	mutex_enter(&p->p_lock);
1092 	if ((aiop = p->p_aio) == NULL) {
1093 		aiop = aio_aiop_alloc();
1094 		p->p_aio = aiop;
1095 	}
1096 	mutex_exit(&p->p_lock);
1097 	if (aiop == NULL)
1098 		return (ENOMEM);
1099 	return (0);
1100 }
1101 
1102 /*
1103  * start a special thread that will cleanup after aio requests
1104  * that are preventing a segment from being unmapped. as_unmap()
1105  * blocks until all phsyio to this segment is completed. this
1106  * doesn't happen until all the pages in this segment are not
1107  * SOFTLOCKed. Some pages will be SOFTLOCKed when there are aio
1108  * requests still outstanding. this special thread will make sure
1109  * that these SOFTLOCKed pages will eventually be SOFTUNLOCKed.
1110  *
1111  * this function will return an error if the process has only
1112  * one LWP. the assumption is that the caller is a separate LWP
1113  * that remains blocked in the kernel for the life of this process.
1114  */
1115 static int
aiostart(void)1116 aiostart(void)
1117 {
1118 	proc_t *p = curproc;
1119 	aio_t *aiop;
1120 	int first, error = 0;
1121 
1122 	if (p->p_lwpcnt == 1)
1123 		return (EDEADLK);
1124 	mutex_enter(&p->p_lock);
1125 	if ((aiop = p->p_aio) == NULL)
1126 		error = EINVAL;
1127 	else {
1128 		first = aiop->aio_ok;
1129 		if (aiop->aio_ok == 0)
1130 			aiop->aio_ok = 1;
1131 	}
1132 	mutex_exit(&p->p_lock);
1133 	if (error == 0 && first == 0) {
1134 		return (aio_cleanup_thread(aiop));
1135 		/* should return only to exit */
1136 	}
1137 	return (error);
1138 }
1139 
1140 /*
1141  * Associate an aiocb with a port.
1142  * This function is used by aiorw() to associate a transaction with a port.
1143  * Allocate an event port structure (port_alloc_event()) and store the
1144  * delivered user pointer (portnfy_user) in the portkev_user field of the
1145  * port_kevent_t structure..
1146  * The aio_req_portkev pointer in the aio_req_t structure was added to identify
1147  * the port association.
1148  */
1149 
1150 static int
aio_req_assoc_port_rw(port_notify_t * pntfy,aiocb_t * cbp,aio_req_t * reqp,int event)1151 aio_req_assoc_port_rw(port_notify_t *pntfy, aiocb_t *cbp,
1152     aio_req_t *reqp, int event)
1153 {
1154 	port_kevent_t	*pkevp = NULL;
1155 	int		error;
1156 
1157 	error = port_alloc_event(pntfy->portnfy_port, PORT_ALLOC_DEFAULT,
1158 	    PORT_SOURCE_AIO, &pkevp);
1159 	if (error) {
1160 		if ((error == ENOMEM) || (error == EAGAIN))
1161 			error = EAGAIN;
1162 		else
1163 			error = EINVAL;
1164 	} else {
1165 		port_init_event(pkevp, (uintptr_t)cbp, pntfy->portnfy_user,
1166 		    aio_port_callback, reqp);
1167 		pkevp->portkev_events = event;
1168 		reqp->aio_req_portkev = pkevp;
1169 		reqp->aio_req_port = pntfy->portnfy_port;
1170 	}
1171 	return (error);
1172 }
1173 
1174 #ifdef _LP64
1175 
1176 /*
1177  * Asynchronous list IO. A chain of aiocb's are copied in
1178  * one at a time. If the aiocb is invalid, it is skipped.
1179  * For each aiocb, the appropriate driver entry point is
1180  * called. Optimize for the common case where the list
1181  * of requests is to the same file descriptor.
1182  *
1183  * One possible optimization is to define a new driver entry
1184  * point that supports a list of IO requests. Whether this
1185  * improves performance depends somewhat on the driver's
1186  * locking strategy. Processing a list could adversely impact
1187  * the driver's interrupt latency.
1188  */
1189 static int
alio(int mode_arg,aiocb_t ** aiocb_arg,int nent,struct sigevent * sigev)1190 alio(
1191 	int		mode_arg,
1192 	aiocb_t		**aiocb_arg,
1193 	int		nent,
1194 	struct sigevent	*sigev)
1195 {
1196 	file_t		*fp;
1197 	file_t		*prev_fp = NULL;
1198 	int		prev_mode = -1;
1199 	struct vnode	*vp;
1200 	aio_lio_t	*head;
1201 	aio_req_t	*reqp;
1202 	aio_t		*aiop;
1203 	caddr_t		cbplist;
1204 	aiocb_t		cb;
1205 	aiocb_t		*aiocb = &cb;
1206 	aiocb_t		*cbp;
1207 	aiocb_t		**ucbp;
1208 	struct sigevent sigevk;
1209 	sigqueue_t	*sqp;
1210 	int		(*aio_func)();
1211 	int		mode;
1212 	int		error = 0;
1213 	int		aio_errors = 0;
1214 	int		i;
1215 	size_t		ssize;
1216 	int		deadhead = 0;
1217 	int		aio_notsupported = 0;
1218 	int		lio_head_port;
1219 	int		aio_port;
1220 	int		aio_thread;
1221 	port_kevent_t	*pkevtp = NULL;
1222 	int		portused = 0;
1223 	port_notify_t	pnotify;
1224 	int		event;
1225 
1226 	aiop = curproc->p_aio;
1227 	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
1228 		return (EINVAL);
1229 
1230 	ssize = (sizeof (aiocb_t *) * nent);
1231 	cbplist = kmem_alloc(ssize, KM_SLEEP);
1232 	ucbp = (aiocb_t **)cbplist;
1233 
1234 	if (copyin(aiocb_arg, cbplist, ssize) ||
1235 	    (sigev && copyin(sigev, &sigevk, sizeof (struct sigevent)))) {
1236 		kmem_free(cbplist, ssize);
1237 		return (EFAULT);
1238 	}
1239 
1240 	/* Event Ports  */
1241 	if (sigev &&
1242 	    (sigevk.sigev_notify == SIGEV_THREAD ||
1243 	    sigevk.sigev_notify == SIGEV_PORT)) {
1244 		if (sigevk.sigev_notify == SIGEV_THREAD) {
1245 			pnotify.portnfy_port = sigevk.sigev_signo;
1246 			pnotify.portnfy_user = sigevk.sigev_value.sival_ptr;
1247 		} else if (copyin(sigevk.sigev_value.sival_ptr,
1248 		    &pnotify, sizeof (pnotify))) {
1249 			kmem_free(cbplist, ssize);
1250 			return (EFAULT);
1251 		}
1252 		error = port_alloc_event(pnotify.portnfy_port,
1253 		    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp);
1254 		if (error) {
1255 			if (error == ENOMEM || error == EAGAIN)
1256 				error = EAGAIN;
1257 			else
1258 				error = EINVAL;
1259 			kmem_free(cbplist, ssize);
1260 			return (error);
1261 		}
1262 		lio_head_port = pnotify.portnfy_port;
1263 		portused = 1;
1264 	}
1265 
1266 	/*
1267 	 * a list head should be allocated if notification is
1268 	 * enabled for this list.
1269 	 */
1270 	head = NULL;
1271 
1272 	if (mode_arg == LIO_WAIT || sigev) {
1273 		mutex_enter(&aiop->aio_mutex);
1274 		error = aio_lio_alloc(&head);
1275 		mutex_exit(&aiop->aio_mutex);
1276 		if (error)
1277 			goto done;
1278 		deadhead = 1;
1279 		head->lio_nent = nent;
1280 		head->lio_refcnt = nent;
1281 		head->lio_port = -1;
1282 		head->lio_portkev = NULL;
1283 		if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL &&
1284 		    sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) {
1285 			sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
1286 			if (sqp == NULL) {
1287 				error = EAGAIN;
1288 				goto done;
1289 			}
1290 			sqp->sq_func = NULL;
1291 			sqp->sq_next = NULL;
1292 			sqp->sq_info.si_code = SI_ASYNCIO;
1293 			sqp->sq_info.si_pid = curproc->p_pid;
1294 			sqp->sq_info.si_ctid = PRCTID(curproc);
1295 			sqp->sq_info.si_zoneid = getzoneid();
1296 			sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
1297 			sqp->sq_info.si_signo = sigevk.sigev_signo;
1298 			sqp->sq_info.si_value = sigevk.sigev_value;
1299 			head->lio_sigqp = sqp;
1300 		} else {
1301 			head->lio_sigqp = NULL;
1302 		}
1303 		if (pkevtp) {
1304 			/*
1305 			 * Prepare data to send when list of aiocb's
1306 			 * has completed.
1307 			 */
1308 			port_init_event(pkevtp, (uintptr_t)sigev,
1309 			    (void *)(uintptr_t)pnotify.portnfy_user,
1310 			    NULL, head);
1311 			pkevtp->portkev_events = AIOLIO;
1312 			head->lio_portkev = pkevtp;
1313 			head->lio_port = pnotify.portnfy_port;
1314 		}
1315 	}
1316 
1317 	for (i = 0; i < nent; i++, ucbp++) {
1318 
1319 		cbp = *ucbp;
1320 		/* skip entry if it can't be copied. */
1321 		if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb))) {
1322 			if (head) {
1323 				mutex_enter(&aiop->aio_mutex);
1324 				head->lio_nent--;
1325 				head->lio_refcnt--;
1326 				mutex_exit(&aiop->aio_mutex);
1327 			}
1328 			continue;
1329 		}
1330 
1331 		/* skip if opcode for aiocb is LIO_NOP */
1332 		mode = aiocb->aio_lio_opcode;
1333 		if (mode == LIO_NOP) {
1334 			cbp = NULL;
1335 			if (head) {
1336 				mutex_enter(&aiop->aio_mutex);
1337 				head->lio_nent--;
1338 				head->lio_refcnt--;
1339 				mutex_exit(&aiop->aio_mutex);
1340 			}
1341 			continue;
1342 		}
1343 
1344 		/* increment file descriptor's ref count. */
1345 		if ((fp = getf(aiocb->aio_fildes)) == NULL) {
1346 			lio_set_uerror(&cbp->aio_resultp, EBADF);
1347 			if (head) {
1348 				mutex_enter(&aiop->aio_mutex);
1349 				head->lio_nent--;
1350 				head->lio_refcnt--;
1351 				mutex_exit(&aiop->aio_mutex);
1352 			}
1353 			aio_errors++;
1354 			continue;
1355 		}
1356 
1357 		/*
1358 		 * check the permission of the partition
1359 		 */
1360 		if ((fp->f_flag & mode) == 0) {
1361 			releasef(aiocb->aio_fildes);
1362 			lio_set_uerror(&cbp->aio_resultp, EBADF);
1363 			if (head) {
1364 				mutex_enter(&aiop->aio_mutex);
1365 				head->lio_nent--;
1366 				head->lio_refcnt--;
1367 				mutex_exit(&aiop->aio_mutex);
1368 			}
1369 			aio_errors++;
1370 			continue;
1371 		}
1372 
1373 		/*
1374 		 * common case where requests are to the same fd
1375 		 * for the same r/w operation.
1376 		 * for UFS, need to set EBADFD
1377 		 */
1378 		vp = fp->f_vnode;
1379 		if (fp != prev_fp || mode != prev_mode) {
1380 			aio_func = check_vp(vp, mode);
1381 			if (aio_func == NULL) {
1382 				prev_fp = NULL;
1383 				releasef(aiocb->aio_fildes);
1384 				lio_set_uerror(&cbp->aio_resultp, EBADFD);
1385 				aio_notsupported++;
1386 				if (head) {
1387 					mutex_enter(&aiop->aio_mutex);
1388 					head->lio_nent--;
1389 					head->lio_refcnt--;
1390 					mutex_exit(&aiop->aio_mutex);
1391 				}
1392 				continue;
1393 			} else {
1394 				prev_fp = fp;
1395 				prev_mode = mode;
1396 			}
1397 		}
1398 
1399 		error = aio_req_setup(&reqp, aiop, aiocb,
1400 		    &cbp->aio_resultp, vp, 0);
1401 		if (error) {
1402 			releasef(aiocb->aio_fildes);
1403 			lio_set_uerror(&cbp->aio_resultp, error);
1404 			if (head) {
1405 				mutex_enter(&aiop->aio_mutex);
1406 				head->lio_nent--;
1407 				head->lio_refcnt--;
1408 				mutex_exit(&aiop->aio_mutex);
1409 			}
1410 			aio_errors++;
1411 			continue;
1412 		}
1413 
1414 		reqp->aio_req_lio = head;
1415 		deadhead = 0;
1416 
1417 		/*
1418 		 * Set the errno field now before sending the request to
1419 		 * the driver to avoid a race condition
1420 		 */
1421 		(void) suword32(&cbp->aio_resultp.aio_errno,
1422 		    EINPROGRESS);
1423 
1424 		reqp->aio_req_iocb.iocb = (caddr_t)cbp;
1425 
1426 		event = (mode == LIO_READ)? AIOAREAD : AIOAWRITE;
1427 		aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT);
1428 		aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD);
1429 		if (aio_port | aio_thread) {
1430 			port_kevent_t *lpkevp;
1431 			/*
1432 			 * Prepare data to send with each aiocb completed.
1433 			 */
1434 			if (aio_port) {
1435 				void *paddr =
1436 				    aiocb->aio_sigevent.sigev_value.sival_ptr;
1437 				if (copyin(paddr, &pnotify, sizeof (pnotify)))
1438 					error = EFAULT;
1439 			} else {	/* aio_thread */
1440 				pnotify.portnfy_port =
1441 				    aiocb->aio_sigevent.sigev_signo;
1442 				pnotify.portnfy_user =
1443 				    aiocb->aio_sigevent.sigev_value.sival_ptr;
1444 			}
1445 			if (error)
1446 				/* EMPTY */;
1447 			else if (pkevtp != NULL &&
1448 			    pnotify.portnfy_port == lio_head_port)
1449 				error = port_dup_event(pkevtp, &lpkevp,
1450 				    PORT_ALLOC_DEFAULT);
1451 			else
1452 				error = port_alloc_event(pnotify.portnfy_port,
1453 				    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO,
1454 				    &lpkevp);
1455 			if (error == 0) {
1456 				port_init_event(lpkevp, (uintptr_t)cbp,
1457 				    (void *)(uintptr_t)pnotify.portnfy_user,
1458 				    aio_port_callback, reqp);
1459 				lpkevp->portkev_events = event;
1460 				reqp->aio_req_portkev = lpkevp;
1461 				reqp->aio_req_port = pnotify.portnfy_port;
1462 			}
1463 		}
1464 
1465 		/*
1466 		 * send the request to driver.
1467 		 */
1468 		if (error == 0) {
1469 			if (aiocb->aio_nbytes == 0) {
1470 				clear_active_fd(aiocb->aio_fildes);
1471 				aio_zerolen(reqp);
1472 				continue;
1473 			}
1474 			error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
1475 			    CRED());
1476 		}
1477 
1478 		/*
1479 		 * the fd's ref count is not decremented until the IO has
1480 		 * completed unless there was an error.
1481 		 */
1482 		if (error) {
1483 			releasef(aiocb->aio_fildes);
1484 			lio_set_uerror(&cbp->aio_resultp, error);
1485 			if (head) {
1486 				mutex_enter(&aiop->aio_mutex);
1487 				head->lio_nent--;
1488 				head->lio_refcnt--;
1489 				mutex_exit(&aiop->aio_mutex);
1490 			}
1491 			if (error == ENOTSUP)
1492 				aio_notsupported++;
1493 			else
1494 				aio_errors++;
1495 			lio_set_error(reqp, portused);
1496 		} else {
1497 			clear_active_fd(aiocb->aio_fildes);
1498 		}
1499 	}
1500 
1501 	if (aio_notsupported) {
1502 		error = ENOTSUP;
1503 	} else if (aio_errors) {
1504 		/*
1505 		 * return EIO if any request failed
1506 		 */
1507 		error = EIO;
1508 	}
1509 
1510 	if (mode_arg == LIO_WAIT) {
1511 		mutex_enter(&aiop->aio_mutex);
1512 		while (head->lio_refcnt > 0) {
1513 			if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
1514 				mutex_exit(&aiop->aio_mutex);
1515 				error = EINTR;
1516 				goto done;
1517 			}
1518 		}
1519 		mutex_exit(&aiop->aio_mutex);
1520 		alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_64);
1521 	}
1522 
1523 done:
1524 	kmem_free(cbplist, ssize);
1525 	if (deadhead) {
1526 		if (head->lio_sigqp)
1527 			kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
1528 		if (head->lio_portkev)
1529 			port_free_event(head->lio_portkev);
1530 		kmem_free(head, sizeof (aio_lio_t));
1531 	}
1532 	return (error);
1533 }
1534 
1535 #endif /* _LP64 */
1536 
1537 /*
1538  * Asynchronous list IO.
1539  * If list I/O is called with LIO_WAIT it can still return
1540  * before all the I/O's are completed if a signal is caught
1541  * or if the list include UFS I/O requests. If this happens,
1542  * libaio will call aliowait() to wait for the I/O's to
1543  * complete
1544  */
1545 /*ARGSUSED*/
1546 static int
aliowait(int mode,void * aiocb,int nent,void * sigev,int run_mode)1547 aliowait(
1548 	int	mode,
1549 	void	*aiocb,
1550 	int	nent,
1551 	void	*sigev,
1552 	int	run_mode)
1553 {
1554 	aio_lio_t	*head;
1555 	aio_t		*aiop;
1556 	caddr_t		cbplist;
1557 	aiocb_t		*cbp, **ucbp;
1558 #ifdef	_SYSCALL32_IMPL
1559 	aiocb32_t	*cbp32;
1560 	caddr32_t	*ucbp32;
1561 	aiocb64_32_t	*cbp64;
1562 #endif
1563 	int		error = 0;
1564 	int		i;
1565 	size_t		ssize = 0;
1566 	model_t		model = get_udatamodel();
1567 
1568 	aiop = curproc->p_aio;
1569 	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
1570 		return (EINVAL);
1571 
1572 	if (model == DATAMODEL_NATIVE)
1573 		ssize = (sizeof (aiocb_t *) * nent);
1574 #ifdef	_SYSCALL32_IMPL
1575 	else
1576 		ssize = (sizeof (caddr32_t) * nent);
1577 #endif  /* _SYSCALL32_IMPL */
1578 
1579 	if (ssize == 0)
1580 		return (EINVAL);
1581 
1582 	cbplist = kmem_alloc(ssize, KM_SLEEP);
1583 
1584 	if (model == DATAMODEL_NATIVE)
1585 		ucbp = (aiocb_t **)cbplist;
1586 #ifdef	_SYSCALL32_IMPL
1587 	else
1588 		ucbp32 = (caddr32_t *)cbplist;
1589 #endif  /* _SYSCALL32_IMPL */
1590 
1591 	if (copyin(aiocb, cbplist, ssize)) {
1592 		error = EFAULT;
1593 		goto done;
1594 	}
1595 
1596 	/*
1597 	 * To find the list head, we go through the
1598 	 * list of aiocb structs, find the request
1599 	 * its for, then get the list head that reqp
1600 	 * points to
1601 	 */
1602 	head = NULL;
1603 
1604 	for (i = 0; i < nent; i++) {
1605 		if (model == DATAMODEL_NATIVE) {
1606 			/*
1607 			 * Since we are only checking for a NULL pointer
1608 			 * Following should work on both native data sizes
1609 			 * as well as for largefile aiocb.
1610 			 */
1611 			if ((cbp = *ucbp++) == NULL)
1612 				continue;
1613 			if (run_mode != AIO_LARGEFILE)
1614 				if (head = aio_list_get(&cbp->aio_resultp))
1615 					break;
1616 			else {
1617 				/*
1618 				 * This is a case when largefile call is
1619 				 * made on 32 bit kernel.
1620 				 * Treat each pointer as pointer to
1621 				 * aiocb64_32
1622 				 */
1623 				if (head = aio_list_get((aio_result_t *)
1624 				    &(((aiocb64_32_t *)cbp)->aio_resultp)))
1625 					break;
1626 			}
1627 		}
1628 #ifdef	_SYSCALL32_IMPL
1629 		else {
1630 			if (run_mode == AIO_LARGEFILE) {
1631 				if ((cbp64 = (aiocb64_32_t *)
1632 				    (uintptr_t)*ucbp32++) == NULL)
1633 					continue;
1634 				if (head = aio_list_get((aio_result_t *)
1635 				    &cbp64->aio_resultp))
1636 					break;
1637 			} else if (run_mode == AIO_32) {
1638 				if ((cbp32 = (aiocb32_t *)
1639 				    (uintptr_t)*ucbp32++) == NULL)
1640 					continue;
1641 				if (head = aio_list_get((aio_result_t *)
1642 				    &cbp32->aio_resultp))
1643 					break;
1644 			}
1645 		}
1646 #endif	/* _SYSCALL32_IMPL */
1647 	}
1648 
1649 	if (head == NULL) {
1650 		error = EINVAL;
1651 		goto done;
1652 	}
1653 
1654 	mutex_enter(&aiop->aio_mutex);
1655 	while (head->lio_refcnt > 0) {
1656 		if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
1657 			mutex_exit(&aiop->aio_mutex);
1658 			error = EINTR;
1659 			goto done;
1660 		}
1661 	}
1662 	mutex_exit(&aiop->aio_mutex);
1663 	alio_cleanup(aiop, (aiocb_t **)cbplist, nent, run_mode);
1664 done:
1665 	kmem_free(cbplist, ssize);
1666 	return (error);
1667 }
1668 
1669 aio_lio_t *
aio_list_get(aio_result_t * resultp)1670 aio_list_get(aio_result_t *resultp)
1671 {
1672 	aio_lio_t	*head = NULL;
1673 	aio_t		*aiop;
1674 	aio_req_t	**bucket;
1675 	aio_req_t	*reqp;
1676 	long		index;
1677 
1678 	aiop = curproc->p_aio;
1679 	if (aiop == NULL)
1680 		return (NULL);
1681 
1682 	if (resultp) {
1683 		index = AIO_HASH(resultp);
1684 		bucket = &aiop->aio_hash[index];
1685 		for (reqp = *bucket; reqp != NULL;
1686 		    reqp = reqp->aio_hash_next) {
1687 			if (reqp->aio_req_resultp == resultp) {
1688 				head = reqp->aio_req_lio;
1689 				return (head);
1690 			}
1691 		}
1692 	}
1693 	return (NULL);
1694 }
1695 
1696 
1697 static void
lio_set_uerror(void * resultp,int error)1698 lio_set_uerror(void *resultp, int error)
1699 {
1700 	/*
1701 	 * the resultp field is a pointer to where the
1702 	 * error should be written out to the user's
1703 	 * aiocb.
1704 	 *
1705 	 */
1706 	if (get_udatamodel() == DATAMODEL_NATIVE) {
1707 		(void) sulword(&((aio_result_t *)resultp)->aio_return,
1708 		    (ssize_t)-1);
1709 		(void) suword32(&((aio_result_t *)resultp)->aio_errno, error);
1710 	}
1711 #ifdef	_SYSCALL32_IMPL
1712 	else {
1713 		(void) suword32(&((aio_result32_t *)resultp)->aio_return,
1714 		    (uint_t)-1);
1715 		(void) suword32(&((aio_result32_t *)resultp)->aio_errno, error);
1716 	}
1717 #endif  /* _SYSCALL32_IMPL */
1718 }
1719 
1720 /*
1721  * do cleanup completion for all requests in list. memory for
1722  * each request is also freed.
1723  */
1724 static void
alio_cleanup(aio_t * aiop,aiocb_t ** cbp,int nent,int run_mode)1725 alio_cleanup(aio_t *aiop, aiocb_t **cbp, int nent, int run_mode)
1726 {
1727 	int i;
1728 	aio_req_t *reqp;
1729 	aio_result_t *resultp;
1730 	aiocb64_32_t *aiocb_64;
1731 
1732 	for (i = 0; i < nent; i++) {
1733 		if (get_udatamodel() == DATAMODEL_NATIVE) {
1734 			if (cbp[i] == NULL)
1735 				continue;
1736 			if (run_mode == AIO_LARGEFILE) {
1737 				aiocb_64 = (aiocb64_32_t *)cbp[i];
1738 				resultp = (aio_result_t *)
1739 				    &aiocb_64->aio_resultp;
1740 			} else
1741 				resultp = &cbp[i]->aio_resultp;
1742 		}
1743 #ifdef	_SYSCALL32_IMPL
1744 		else {
1745 			aiocb32_t *aiocb_32;
1746 			caddr32_t *cbp32;
1747 
1748 			cbp32 = (caddr32_t *)cbp;
1749 			if (cbp32[i] == 0)
1750 				continue;
1751 			if (run_mode == AIO_32) {
1752 				aiocb_32 = (aiocb32_t *)(uintptr_t)cbp32[i];
1753 				resultp = (aio_result_t *)&aiocb_32->
1754 				    aio_resultp;
1755 			} else if (run_mode == AIO_LARGEFILE) {
1756 				aiocb_64 = (aiocb64_32_t *)(uintptr_t)cbp32[i];
1757 				resultp = (aio_result_t *)&aiocb_64->
1758 				    aio_resultp;
1759 			}
1760 		}
1761 #endif  /* _SYSCALL32_IMPL */
1762 		/*
1763 		 * we need to get the aio_cleanupq_mutex since we call
1764 		 * aio_req_done().
1765 		 */
1766 		mutex_enter(&aiop->aio_cleanupq_mutex);
1767 		mutex_enter(&aiop->aio_mutex);
1768 		reqp = aio_req_done(resultp);
1769 		mutex_exit(&aiop->aio_mutex);
1770 		mutex_exit(&aiop->aio_cleanupq_mutex);
1771 		if (reqp != NULL) {
1772 			aphysio_unlock(reqp);
1773 			aio_copyout_result(reqp);
1774 			mutex_enter(&aiop->aio_mutex);
1775 			aio_req_free(aiop, reqp);
1776 			mutex_exit(&aiop->aio_mutex);
1777 		}
1778 	}
1779 }
1780 
1781 /*
1782  * Write out the results for an aio request that is done.
1783  */
1784 static int
aioerror(void * cb,int run_mode)1785 aioerror(void *cb, int run_mode)
1786 {
1787 	aio_result_t *resultp;
1788 	aio_t *aiop;
1789 	aio_req_t *reqp;
1790 	int retval;
1791 
1792 	aiop = curproc->p_aio;
1793 	if (aiop == NULL || cb == NULL)
1794 		return (EINVAL);
1795 
1796 	if (get_udatamodel() == DATAMODEL_NATIVE) {
1797 		if (run_mode == AIO_LARGEFILE)
1798 			resultp = (aio_result_t *)&((aiocb64_32_t *)cb)->
1799 			    aio_resultp;
1800 		else
1801 			resultp = &((aiocb_t *)cb)->aio_resultp;
1802 	}
1803 #ifdef	_SYSCALL32_IMPL
1804 	else {
1805 		if (run_mode == AIO_LARGEFILE)
1806 			resultp = (aio_result_t *)&((aiocb64_32_t *)cb)->
1807 			    aio_resultp;
1808 		else if (run_mode == AIO_32)
1809 			resultp = (aio_result_t *)&((aiocb32_t *)cb)->
1810 			    aio_resultp;
1811 	}
1812 #endif  /* _SYSCALL32_IMPL */
1813 	/*
1814 	 * we need to get the aio_cleanupq_mutex since we call
1815 	 * aio_req_find().
1816 	 */
1817 	mutex_enter(&aiop->aio_cleanupq_mutex);
1818 	mutex_enter(&aiop->aio_mutex);
1819 	retval = aio_req_find(resultp, &reqp);
1820 	mutex_exit(&aiop->aio_mutex);
1821 	mutex_exit(&aiop->aio_cleanupq_mutex);
1822 	if (retval == 0) {
1823 		aphysio_unlock(reqp);
1824 		aio_copyout_result(reqp);
1825 		mutex_enter(&aiop->aio_mutex);
1826 		aio_req_free(aiop, reqp);
1827 		mutex_exit(&aiop->aio_mutex);
1828 		return (0);
1829 	} else if (retval == 1)
1830 		return (EINPROGRESS);
1831 	else if (retval == 2)
1832 		return (EINVAL);
1833 	return (0);
1834 }
1835 
1836 /*
1837  *	aio_cancel - if no requests outstanding,
1838  *			return AIO_ALLDONE
1839  *			else
1840  *			return AIO_NOTCANCELED
1841  */
1842 static int
aio_cancel(int fildes,void * cb,long * rval,int run_mode)1843 aio_cancel(int fildes, void *cb, long *rval, int run_mode)
1844 {
1845 	aio_t *aiop;
1846 	void *resultp;
1847 	int index;
1848 	aio_req_t **bucket;
1849 	aio_req_t *ent;
1850 
1851 
1852 	/*
1853 	 * Verify valid file descriptor
1854 	 */
1855 	if ((getf(fildes)) == NULL) {
1856 		return (EBADF);
1857 	}
1858 	releasef(fildes);
1859 
1860 	aiop = curproc->p_aio;
1861 	if (aiop == NULL)
1862 		return (EINVAL);
1863 
1864 	if (aiop->aio_outstanding == 0) {
1865 		*rval = AIO_ALLDONE;
1866 		return (0);
1867 	}
1868 
1869 	mutex_enter(&aiop->aio_mutex);
1870 	if (cb != NULL) {
1871 		if (get_udatamodel() == DATAMODEL_NATIVE) {
1872 			if (run_mode == AIO_LARGEFILE)
1873 				resultp = (aio_result_t *)&((aiocb64_32_t *)cb)
1874 				    ->aio_resultp;
1875 			else
1876 				resultp = &((aiocb_t *)cb)->aio_resultp;
1877 		}
1878 #ifdef	_SYSCALL32_IMPL
1879 		else {
1880 			if (run_mode == AIO_LARGEFILE)
1881 				resultp = (aio_result_t *)&((aiocb64_32_t *)cb)
1882 				    ->aio_resultp;
1883 			else if (run_mode == AIO_32)
1884 				resultp = (aio_result_t *)&((aiocb32_t *)cb)
1885 				    ->aio_resultp;
1886 		}
1887 #endif  /* _SYSCALL32_IMPL */
1888 		index = AIO_HASH(resultp);
1889 		bucket = &aiop->aio_hash[index];
1890 		for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
1891 			if (ent->aio_req_resultp == resultp) {
1892 				if ((ent->aio_req_flags & AIO_PENDING) == 0) {
1893 					mutex_exit(&aiop->aio_mutex);
1894 					*rval = AIO_ALLDONE;
1895 					return (0);
1896 				}
1897 				mutex_exit(&aiop->aio_mutex);
1898 				*rval = AIO_NOTCANCELED;
1899 				return (0);
1900 			}
1901 		}
1902 		mutex_exit(&aiop->aio_mutex);
1903 		*rval = AIO_ALLDONE;
1904 		return (0);
1905 	}
1906 
1907 	for (index = 0; index < AIO_HASHSZ; index++) {
1908 		bucket = &aiop->aio_hash[index];
1909 		for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
1910 			if (ent->aio_req_fd == fildes) {
1911 				if ((ent->aio_req_flags & AIO_PENDING) != 0) {
1912 					mutex_exit(&aiop->aio_mutex);
1913 					*rval = AIO_NOTCANCELED;
1914 					return (0);
1915 				}
1916 			}
1917 		}
1918 	}
1919 	mutex_exit(&aiop->aio_mutex);
1920 	*rval = AIO_ALLDONE;
1921 	return (0);
1922 }
1923 
1924 /*
1925  * solaris version of asynchronous read and write
1926  */
1927 static int
arw(int opcode,int fdes,char * bufp,int bufsize,offset_t offset,aio_result_t * resultp,int mode)1928 arw(
1929 	int	opcode,
1930 	int	fdes,
1931 	char	*bufp,
1932 	int	bufsize,
1933 	offset_t	offset,
1934 	aio_result_t	*resultp,
1935 	int		mode)
1936 {
1937 	file_t		*fp;
1938 	int		error;
1939 	struct vnode	*vp;
1940 	aio_req_t	*reqp;
1941 	aio_t		*aiop;
1942 	int		(*aio_func)();
1943 #ifdef _LP64
1944 	aiocb_t		aiocb;
1945 #else
1946 	aiocb64_32_t	aiocb64;
1947 #endif
1948 
1949 	aiop = curproc->p_aio;
1950 	if (aiop == NULL)
1951 		return (EINVAL);
1952 
1953 	if ((fp = getf(fdes)) == NULL) {
1954 		return (EBADF);
1955 	}
1956 
1957 	/*
1958 	 * check the permission of the partition
1959 	 */
1960 	if ((fp->f_flag & mode) == 0) {
1961 		releasef(fdes);
1962 		return (EBADF);
1963 	}
1964 
1965 	vp = fp->f_vnode;
1966 	aio_func = check_vp(vp, mode);
1967 	if (aio_func == NULL) {
1968 		releasef(fdes);
1969 		return (EBADFD);
1970 	}
1971 #ifdef _LP64
1972 	aiocb.aio_fildes = fdes;
1973 	aiocb.aio_buf = bufp;
1974 	aiocb.aio_nbytes = bufsize;
1975 	aiocb.aio_offset = offset;
1976 	aiocb.aio_sigevent.sigev_notify = 0;
1977 	error = aio_req_setup(&reqp, aiop, &aiocb, resultp, vp, 1);
1978 #else
1979 	aiocb64.aio_fildes = fdes;
1980 	aiocb64.aio_buf = (caddr32_t)bufp;
1981 	aiocb64.aio_nbytes = bufsize;
1982 	aiocb64.aio_offset = offset;
1983 	aiocb64.aio_sigevent.sigev_notify = 0;
1984 	error = aio_req_setupLF(&reqp, aiop, &aiocb64, resultp, vp, 1);
1985 #endif
1986 	if (error) {
1987 		releasef(fdes);
1988 		return (error);
1989 	}
1990 
1991 	/*
1992 	 * enable polling on this request if the opcode has
1993 	 * the AIO poll bit set
1994 	 */
1995 	if (opcode & AIO_POLL_BIT)
1996 		reqp->aio_req_flags |= AIO_POLL;
1997 
1998 	if (bufsize == 0) {
1999 		clear_active_fd(fdes);
2000 		aio_zerolen(reqp);
2001 		return (0);
2002 	}
2003 	/*
2004 	 * send the request to driver.
2005 	 */
2006 	error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, CRED());
2007 	/*
2008 	 * the fd is stored in the aio_req_t by aio_req_setup(), and
2009 	 * is released by the aio_cleanup_thread() when the IO has
2010 	 * completed.
2011 	 */
2012 	if (error) {
2013 		releasef(fdes);
2014 		mutex_enter(&aiop->aio_mutex);
2015 		aio_req_free(aiop, reqp);
2016 		aiop->aio_pending--;
2017 		if (aiop->aio_flags & AIO_REQ_BLOCK)
2018 			cv_signal(&aiop->aio_cleanupcv);
2019 		mutex_exit(&aiop->aio_mutex);
2020 		return (error);
2021 	}
2022 	clear_active_fd(fdes);
2023 	return (0);
2024 }
2025 
2026 /*
2027  * posix version of asynchronous read and write
2028  */
2029 static int
aiorw(int opcode,void * aiocb_arg,int mode,int run_mode)2030 aiorw(
2031 	int		opcode,
2032 	void		*aiocb_arg,
2033 	int		mode,
2034 	int		run_mode)
2035 {
2036 #ifdef _SYSCALL32_IMPL
2037 	aiocb32_t	aiocb32;
2038 	struct	sigevent32 *sigev32;
2039 	port_notify32_t	pntfy32;
2040 #endif
2041 	aiocb64_32_t	aiocb64;
2042 	aiocb_t		aiocb;
2043 	file_t		*fp;
2044 	int		error, fd;
2045 	size_t		bufsize;
2046 	struct vnode	*vp;
2047 	aio_req_t	*reqp;
2048 	aio_t		*aiop;
2049 	int		(*aio_func)();
2050 	aio_result_t	*resultp;
2051 	struct	sigevent *sigev;
2052 	model_t		model;
2053 	int		aio_use_port = 0;
2054 	port_notify_t	pntfy;
2055 
2056 	model = get_udatamodel();
2057 	aiop = curproc->p_aio;
2058 	if (aiop == NULL)
2059 		return (EINVAL);
2060 
2061 	if (model == DATAMODEL_NATIVE) {
2062 		if (run_mode != AIO_LARGEFILE) {
2063 			if (copyin(aiocb_arg, &aiocb, sizeof (aiocb_t)))
2064 				return (EFAULT);
2065 			bufsize = aiocb.aio_nbytes;
2066 			resultp = &(((aiocb_t *)aiocb_arg)->aio_resultp);
2067 			if ((fp = getf(fd = aiocb.aio_fildes)) == NULL) {
2068 				return (EBADF);
2069 			}
2070 			sigev = &aiocb.aio_sigevent;
2071 		} else {
2072 			/*
2073 			 * We come here only when we make largefile
2074 			 * call on 32 bit kernel using 32 bit library.
2075 			 */
2076 			if (copyin(aiocb_arg, &aiocb64, sizeof (aiocb64_32_t)))
2077 				return (EFAULT);
2078 			bufsize = aiocb64.aio_nbytes;
2079 			resultp = (aio_result_t *)&(((aiocb64_32_t *)aiocb_arg)
2080 			    ->aio_resultp);
2081 			if ((fp = getf(fd = aiocb64.aio_fildes)) == NULL)
2082 				return (EBADF);
2083 			sigev = (struct sigevent *)&aiocb64.aio_sigevent;
2084 		}
2085 
2086 		if (sigev->sigev_notify == SIGEV_PORT) {
2087 			if (copyin((void *)sigev->sigev_value.sival_ptr,
2088 			    &pntfy, sizeof (port_notify_t))) {
2089 				releasef(fd);
2090 				return (EFAULT);
2091 			}
2092 			aio_use_port = 1;
2093 		} else if (sigev->sigev_notify == SIGEV_THREAD) {
2094 			pntfy.portnfy_port = aiocb.aio_sigevent.sigev_signo;
2095 			pntfy.portnfy_user =
2096 			    aiocb.aio_sigevent.sigev_value.sival_ptr;
2097 			aio_use_port = 1;
2098 		}
2099 	}
2100 #ifdef	_SYSCALL32_IMPL
2101 	else {
2102 		if (run_mode == AIO_32) {
2103 			/* 32 bit system call is being made on 64 bit kernel */
2104 			if (copyin(aiocb_arg, &aiocb32, sizeof (aiocb32_t)))
2105 				return (EFAULT);
2106 
2107 			bufsize = aiocb32.aio_nbytes;
2108 			aiocb_32ton(&aiocb32, &aiocb);
2109 			resultp = (aio_result_t *)&(((aiocb32_t *)aiocb_arg)->
2110 			    aio_resultp);
2111 			if ((fp = getf(fd = aiocb32.aio_fildes)) == NULL) {
2112 				return (EBADF);
2113 			}
2114 			sigev32 = &aiocb32.aio_sigevent;
2115 		} else if (run_mode == AIO_LARGEFILE) {
2116 			/*
2117 			 * We come here only when we make largefile
2118 			 * call on 64 bit kernel using 32 bit library.
2119 			 */
2120 			if (copyin(aiocb_arg, &aiocb64, sizeof (aiocb64_32_t)))
2121 				return (EFAULT);
2122 			bufsize = aiocb64.aio_nbytes;
2123 			aiocb_LFton(&aiocb64, &aiocb);
2124 			resultp = (aio_result_t *)&(((aiocb64_32_t *)aiocb_arg)
2125 			    ->aio_resultp);
2126 			if ((fp = getf(fd = aiocb64.aio_fildes)) == NULL)
2127 				return (EBADF);
2128 			sigev32 = &aiocb64.aio_sigevent;
2129 		}
2130 
2131 		if (sigev32->sigev_notify == SIGEV_PORT) {
2132 			if (copyin(
2133 			    (void *)(uintptr_t)sigev32->sigev_value.sival_ptr,
2134 			    &pntfy32, sizeof (port_notify32_t))) {
2135 				releasef(fd);
2136 				return (EFAULT);
2137 			}
2138 			pntfy.portnfy_port = pntfy32.portnfy_port;
2139 			pntfy.portnfy_user = (void *)(uintptr_t)
2140 			    pntfy32.portnfy_user;
2141 			aio_use_port = 1;
2142 		} else if (sigev32->sigev_notify == SIGEV_THREAD) {
2143 			pntfy.portnfy_port = sigev32->sigev_signo;
2144 			pntfy.portnfy_user = (void *)(uintptr_t)
2145 			    sigev32->sigev_value.sival_ptr;
2146 			aio_use_port = 1;
2147 		}
2148 	}
2149 #endif  /* _SYSCALL32_IMPL */
2150 
2151 	/*
2152 	 * check the permission of the partition
2153 	 */
2154 
2155 	if ((fp->f_flag & mode) == 0) {
2156 		releasef(fd);
2157 		return (EBADF);
2158 	}
2159 
2160 	vp = fp->f_vnode;
2161 	aio_func = check_vp(vp, mode);
2162 	if (aio_func == NULL) {
2163 		releasef(fd);
2164 		return (EBADFD);
2165 	}
2166 	if (run_mode == AIO_LARGEFILE)
2167 		error = aio_req_setupLF(&reqp, aiop, &aiocb64, resultp, vp, 0);
2168 	else
2169 		error = aio_req_setup(&reqp, aiop, &aiocb, resultp, vp, 0);
2170 
2171 	if (error) {
2172 		releasef(fd);
2173 		return (error);
2174 	}
2175 	/*
2176 	 * enable polling on this request if the opcode has
2177 	 * the AIO poll bit set
2178 	 */
2179 	if (opcode & AIO_POLL_BIT)
2180 		reqp->aio_req_flags |= AIO_POLL;
2181 
2182 	if (model == DATAMODEL_NATIVE)
2183 		reqp->aio_req_iocb.iocb = aiocb_arg;
2184 #ifdef  _SYSCALL32_IMPL
2185 	else
2186 		reqp->aio_req_iocb.iocb32 = (caddr32_t)(uintptr_t)aiocb_arg;
2187 #endif
2188 
2189 	if (aio_use_port) {
2190 		int event = (run_mode == AIO_LARGEFILE)?
2191 		    ((mode == FREAD)? AIOAREAD64 : AIOAWRITE64) :
2192 		    ((mode == FREAD)? AIOAREAD : AIOAWRITE);
2193 		error = aio_req_assoc_port_rw(&pntfy, aiocb_arg, reqp, event);
2194 	}
2195 
2196 	/*
2197 	 * send the request to driver.
2198 	 */
2199 	if (error == 0) {
2200 		if (bufsize == 0) {
2201 			clear_active_fd(fd);
2202 			aio_zerolen(reqp);
2203 			return (0);
2204 		}
2205 		error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, CRED());
2206 	}
2207 
2208 	/*
2209 	 * the fd is stored in the aio_req_t by aio_req_setup(), and
2210 	 * is released by the aio_cleanup_thread() when the IO has
2211 	 * completed.
2212 	 */
2213 	if (error) {
2214 		releasef(fd);
2215 		mutex_enter(&aiop->aio_mutex);
2216 		if (aio_use_port)
2217 			aio_deq(&aiop->aio_portpending, reqp);
2218 		aio_req_free(aiop, reqp);
2219 		aiop->aio_pending--;
2220 		if (aiop->aio_flags & AIO_REQ_BLOCK)
2221 			cv_signal(&aiop->aio_cleanupcv);
2222 		mutex_exit(&aiop->aio_mutex);
2223 		return (error);
2224 	}
2225 	clear_active_fd(fd);
2226 	return (0);
2227 }
2228 
2229 
2230 /*
2231  * set error for a list IO entry that failed.
2232  */
2233 static void
lio_set_error(aio_req_t * reqp,int portused)2234 lio_set_error(aio_req_t *reqp, int portused)
2235 {
2236 	aio_t *aiop = curproc->p_aio;
2237 
2238 	if (aiop == NULL)
2239 		return;
2240 
2241 	mutex_enter(&aiop->aio_mutex);
2242 	if (portused)
2243 		aio_deq(&aiop->aio_portpending, reqp);
2244 	aiop->aio_pending--;
2245 	/* request failed, AIO_PHYSIODONE set to aviod physio cleanup. */
2246 	reqp->aio_req_flags |= AIO_PHYSIODONE;
2247 	/*
2248 	 * Need to free the request now as its never
2249 	 * going to get on the done queue
2250 	 *
2251 	 * Note: aio_outstanding is decremented in
2252 	 *	 aio_req_free()
2253 	 */
2254 	aio_req_free(aiop, reqp);
2255 	if (aiop->aio_flags & AIO_REQ_BLOCK)
2256 		cv_signal(&aiop->aio_cleanupcv);
2257 	mutex_exit(&aiop->aio_mutex);
2258 }
2259 
2260 /*
2261  * check if a specified request is done, and remove it from
2262  * the done queue. otherwise remove anybody from the done queue
2263  * if NULL is specified.
2264  */
2265 static aio_req_t *
aio_req_done(void * resultp)2266 aio_req_done(void *resultp)
2267 {
2268 	aio_req_t **bucket;
2269 	aio_req_t *ent;
2270 	aio_t *aiop = curproc->p_aio;
2271 	long index;
2272 
2273 	ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex));
2274 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2275 
2276 	if (resultp) {
2277 		index = AIO_HASH(resultp);
2278 		bucket = &aiop->aio_hash[index];
2279 		for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
2280 			if (ent->aio_req_resultp == (aio_result_t *)resultp) {
2281 				if (ent->aio_req_flags & AIO_DONEQ) {
2282 					return (aio_req_remove(ent));
2283 				}
2284 				return (NULL);
2285 			}
2286 		}
2287 		/* no match, resultp is invalid */
2288 		return (NULL);
2289 	}
2290 	return (aio_req_remove(NULL));
2291 }
2292 
2293 /*
2294  * determine if a user-level resultp pointer is associated with an
2295  * active IO request. Zero is returned when the request is done,
2296  * and the request is removed from the done queue. Only when the
2297  * return value is zero, is the "reqp" pointer valid. One is returned
2298  * when the request is inprogress. Two is returned when the request
2299  * is invalid.
2300  */
2301 static int
aio_req_find(aio_result_t * resultp,aio_req_t ** reqp)2302 aio_req_find(aio_result_t *resultp, aio_req_t **reqp)
2303 {
2304 	aio_req_t **bucket;
2305 	aio_req_t *ent;
2306 	aio_t *aiop = curproc->p_aio;
2307 	long index;
2308 
2309 	ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex));
2310 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2311 
2312 	index = AIO_HASH(resultp);
2313 	bucket = &aiop->aio_hash[index];
2314 	for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
2315 		if (ent->aio_req_resultp == resultp) {
2316 			if (ent->aio_req_flags & AIO_DONEQ) {
2317 				*reqp = aio_req_remove(ent);
2318 				return (0);
2319 			}
2320 			return (1);
2321 		}
2322 	}
2323 	/* no match, resultp is invalid */
2324 	return (2);
2325 }
2326 
2327 /*
2328  * remove a request from the done queue.
2329  */
2330 static aio_req_t *
aio_req_remove(aio_req_t * reqp)2331 aio_req_remove(aio_req_t *reqp)
2332 {
2333 	aio_t *aiop = curproc->p_aio;
2334 
2335 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2336 
2337 	if (reqp != NULL) {
2338 		ASSERT(reqp->aio_req_flags & AIO_DONEQ);
2339 		if (reqp->aio_req_next == reqp) {
2340 			/* only one request on queue */
2341 			if (reqp ==  aiop->aio_doneq) {
2342 				aiop->aio_doneq = NULL;
2343 			} else {
2344 				ASSERT(reqp == aiop->aio_cleanupq);
2345 				aiop->aio_cleanupq = NULL;
2346 			}
2347 		} else {
2348 			reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev;
2349 			reqp->aio_req_prev->aio_req_next = reqp->aio_req_next;
2350 			/*
2351 			 * The request can be either on the aio_doneq or the
2352 			 * aio_cleanupq
2353 			 */
2354 			if (reqp == aiop->aio_doneq)
2355 				aiop->aio_doneq = reqp->aio_req_next;
2356 
2357 			if (reqp == aiop->aio_cleanupq)
2358 				aiop->aio_cleanupq = reqp->aio_req_next;
2359 		}
2360 		reqp->aio_req_flags &= ~AIO_DONEQ;
2361 		reqp->aio_req_next = NULL;
2362 		reqp->aio_req_prev = NULL;
2363 	} else if ((reqp = aiop->aio_doneq) != NULL) {
2364 		ASSERT(reqp->aio_req_flags & AIO_DONEQ);
2365 		if (reqp == reqp->aio_req_next) {
2366 			/* only one request on queue */
2367 			aiop->aio_doneq = NULL;
2368 		} else {
2369 			reqp->aio_req_prev->aio_req_next = reqp->aio_req_next;
2370 			reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev;
2371 			aiop->aio_doneq = reqp->aio_req_next;
2372 		}
2373 		reqp->aio_req_flags &= ~AIO_DONEQ;
2374 		reqp->aio_req_next = NULL;
2375 		reqp->aio_req_prev = NULL;
2376 	}
2377 	if (aiop->aio_doneq == NULL && (aiop->aio_flags & AIO_WAITN))
2378 		cv_broadcast(&aiop->aio_waitcv);
2379 	return (reqp);
2380 }
2381 
2382 static int
aio_req_setup(aio_req_t ** reqpp,aio_t * aiop,aiocb_t * arg,aio_result_t * resultp,vnode_t * vp,int old_solaris_req)2383 aio_req_setup(aio_req_t **reqpp, aio_t *aiop, aiocb_t *arg,
2384     aio_result_t *resultp, vnode_t *vp, int old_solaris_req)
2385 {
2386 	sigqueue_t	*sqp = NULL;
2387 	aio_req_t	*reqp;
2388 	struct uio	*uio;
2389 	struct sigevent *sigev;
2390 	int		error;
2391 
2392 	sigev = &arg->aio_sigevent;
2393 	if (sigev->sigev_notify == SIGEV_SIGNAL &&
2394 	    sigev->sigev_signo > 0 && sigev->sigev_signo < NSIG) {
2395 		sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
2396 		if (sqp == NULL)
2397 			return (EAGAIN);
2398 		sqp->sq_func = NULL;
2399 		sqp->sq_next = NULL;
2400 		sqp->sq_info.si_code = SI_ASYNCIO;
2401 		sqp->sq_info.si_pid = curproc->p_pid;
2402 		sqp->sq_info.si_ctid = PRCTID(curproc);
2403 		sqp->sq_info.si_zoneid = getzoneid();
2404 		sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
2405 		sqp->sq_info.si_signo = sigev->sigev_signo;
2406 		sqp->sq_info.si_value = sigev->sigev_value;
2407 	}
2408 
2409 	mutex_enter(&aiop->aio_mutex);
2410 
2411 	if (aiop->aio_flags & AIO_REQ_BLOCK) {
2412 		mutex_exit(&aiop->aio_mutex);
2413 		if (sqp)
2414 			kmem_free(sqp, sizeof (sigqueue_t));
2415 		return (EIO);
2416 	}
2417 	/*
2418 	 * get an aio_reqp from the free list or allocate one
2419 	 * from dynamic memory.
2420 	 */
2421 	if (error = aio_req_alloc(&reqp, resultp)) {
2422 		mutex_exit(&aiop->aio_mutex);
2423 		if (sqp)
2424 			kmem_free(sqp, sizeof (sigqueue_t));
2425 		return (error);
2426 	}
2427 	aiop->aio_pending++;
2428 	aiop->aio_outstanding++;
2429 	reqp->aio_req_flags = AIO_PENDING;
2430 	if (old_solaris_req) {
2431 		/* this is an old solaris aio request */
2432 		reqp->aio_req_flags |= AIO_SOLARIS;
2433 		aiop->aio_flags |= AIO_SOLARIS_REQ;
2434 	}
2435 	if (sigev->sigev_notify == SIGEV_THREAD ||
2436 	    sigev->sigev_notify == SIGEV_PORT)
2437 		aio_enq(&aiop->aio_portpending, reqp, 0);
2438 	mutex_exit(&aiop->aio_mutex);
2439 	/*
2440 	 * initialize aio request.
2441 	 */
2442 	reqp->aio_req_fd = arg->aio_fildes;
2443 	reqp->aio_req_sigqp = sqp;
2444 	reqp->aio_req_iocb.iocb = NULL;
2445 	reqp->aio_req_lio = NULL;
2446 	reqp->aio_req_buf.b_file = vp;
2447 	uio = reqp->aio_req.aio_uio;
2448 	uio->uio_iovcnt = 1;
2449 	uio->uio_iov->iov_base = (caddr_t)arg->aio_buf;
2450 	uio->uio_iov->iov_len = arg->aio_nbytes;
2451 	uio->uio_loffset = arg->aio_offset;
2452 	*reqpp = reqp;
2453 	return (0);
2454 }
2455 
2456 /*
2457  * Allocate p_aio struct.
2458  */
2459 static aio_t *
aio_aiop_alloc(void)2460 aio_aiop_alloc(void)
2461 {
2462 	aio_t	*aiop;
2463 
2464 	ASSERT(MUTEX_HELD(&curproc->p_lock));
2465 
2466 	aiop = kmem_zalloc(sizeof (struct aio), KM_NOSLEEP);
2467 	if (aiop) {
2468 		mutex_init(&aiop->aio_mutex, NULL, MUTEX_DEFAULT, NULL);
2469 		mutex_init(&aiop->aio_cleanupq_mutex, NULL, MUTEX_DEFAULT,
2470 		    NULL);
2471 		mutex_init(&aiop->aio_portq_mutex, NULL, MUTEX_DEFAULT, NULL);
2472 	}
2473 	return (aiop);
2474 }
2475 
2476 /*
2477  * Allocate an aio_req struct.
2478  */
2479 static int
aio_req_alloc(aio_req_t ** nreqp,aio_result_t * resultp)2480 aio_req_alloc(aio_req_t **nreqp, aio_result_t *resultp)
2481 {
2482 	aio_req_t *reqp;
2483 	aio_t *aiop = curproc->p_aio;
2484 
2485 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2486 
2487 	if ((reqp = aiop->aio_free) != NULL) {
2488 		aiop->aio_free = reqp->aio_req_next;
2489 		bzero(reqp, sizeof (*reqp));
2490 	} else {
2491 		/*
2492 		 * Check whether memory is getting tight.
2493 		 * This is a temporary mechanism to avoid memory
2494 		 * exhaustion by a single process until we come up
2495 		 * with a per process solution such as setrlimit().
2496 		 */
2497 		if (freemem < desfree)
2498 			return (EAGAIN);
2499 		reqp = kmem_zalloc(sizeof (struct aio_req_t), KM_NOSLEEP);
2500 		if (reqp == NULL)
2501 			return (EAGAIN);
2502 	}
2503 	reqp->aio_req.aio_uio = &reqp->aio_req_uio;
2504 	reqp->aio_req.aio_uio->uio_iov = &reqp->aio_req_iov;
2505 	reqp->aio_req.aio_private = reqp;
2506 	reqp->aio_req_buf.b_offset = -1;
2507 	reqp->aio_req_resultp = resultp;
2508 	if (aio_hash_insert(reqp, aiop)) {
2509 		reqp->aio_req_next = aiop->aio_free;
2510 		aiop->aio_free = reqp;
2511 		return (EBUSY);
2512 	}
2513 	*nreqp = reqp;
2514 	return (0);
2515 }
2516 
2517 /*
2518  * Allocate an aio_lio_t struct.
2519  */
2520 static int
aio_lio_alloc(aio_lio_t ** head)2521 aio_lio_alloc(aio_lio_t **head)
2522 {
2523 	aio_lio_t *liop;
2524 	aio_t *aiop = curproc->p_aio;
2525 
2526 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2527 
2528 	if ((liop = aiop->aio_lio_free) != NULL) {
2529 		aiop->aio_lio_free = liop->lio_next;
2530 	} else {
2531 		/*
2532 		 * Check whether memory is getting tight.
2533 		 * This is a temporary mechanism to avoid memory
2534 		 * exhaustion by a single process until we come up
2535 		 * with a per process solution such as setrlimit().
2536 		 */
2537 		if (freemem < desfree)
2538 			return (EAGAIN);
2539 
2540 		liop = kmem_zalloc(sizeof (aio_lio_t), KM_NOSLEEP);
2541 		if (liop == NULL)
2542 			return (EAGAIN);
2543 	}
2544 	*head = liop;
2545 	return (0);
2546 }
2547 
2548 /*
2549  * this is a special per-process thread that is only activated if
2550  * the process is unmapping a segment with outstanding aio. normally,
2551  * the process will have completed the aio before unmapping the
2552  * segment. If the process does unmap a segment with outstanding aio,
2553  * this special thread will guarentee that the locked pages due to
2554  * aphysio() are released, thereby permitting the segment to be
2555  * unmapped. In addition to this, the cleanup thread is woken up
2556  * during DR operations to release the locked pages.
2557  */
2558 
2559 static int
aio_cleanup_thread(aio_t * aiop)2560 aio_cleanup_thread(aio_t *aiop)
2561 {
2562 	proc_t *p = curproc;
2563 	struct as *as = p->p_as;
2564 	int poked = 0;
2565 	kcondvar_t *cvp;
2566 	int exit_flag = 0;
2567 	int rqclnup = 0;
2568 
2569 	sigfillset(&curthread->t_hold);
2570 	sigdiffset(&curthread->t_hold, &cantmask);
2571 	for (;;) {
2572 		/*
2573 		 * if a segment is being unmapped, and the current
2574 		 * process's done queue is not empty, then every request
2575 		 * on the doneq with locked resources should be forced
2576 		 * to release their locks. By moving the doneq request
2577 		 * to the cleanupq, aio_cleanup() will process the cleanupq,
2578 		 * and place requests back onto the doneq. All requests
2579 		 * processed by aio_cleanup() will have their physical
2580 		 * resources unlocked.
2581 		 */
2582 		mutex_enter(&aiop->aio_mutex);
2583 		if ((aiop->aio_flags & AIO_CLEANUP) == 0) {
2584 			aiop->aio_flags |= AIO_CLEANUP;
2585 			mutex_enter(&as->a_contents);
2586 			if (aiop->aio_rqclnup) {
2587 				aiop->aio_rqclnup = 0;
2588 				rqclnup = 1;
2589 			}
2590 			mutex_exit(&as->a_contents);
2591 			if (aiop->aio_doneq) {
2592 				aio_req_t *doneqhead = aiop->aio_doneq;
2593 				aiop->aio_doneq = NULL;
2594 				aio_cleanupq_concat(aiop, doneqhead, AIO_DONEQ);
2595 			}
2596 		}
2597 		mutex_exit(&aiop->aio_mutex);
2598 		aio_cleanup(AIO_CLEANUP_THREAD);
2599 		/*
2600 		 * thread should block on the cleanupcv while
2601 		 * AIO_CLEANUP is set.
2602 		 */
2603 		cvp = &aiop->aio_cleanupcv;
2604 		mutex_enter(&aiop->aio_mutex);
2605 
2606 		if (aiop->aio_pollq != NULL || aiop->aio_cleanupq != NULL ||
2607 		    aiop->aio_notifyq != NULL ||
2608 		    aiop->aio_portcleanupq != NULL) {
2609 			mutex_exit(&aiop->aio_mutex);
2610 			continue;
2611 		}
2612 		mutex_enter(&as->a_contents);
2613 
2614 		/*
2615 		 * AIO_CLEANUP determines when the cleanup thread
2616 		 * should be active. This flag is set when
2617 		 * the cleanup thread is awakened by as_unmap() or
2618 		 * due to DR operations.
2619 		 * The flag is cleared when the blocking as_unmap()
2620 		 * that originally awakened us is allowed to
2621 		 * complete. as_unmap() blocks when trying to
2622 		 * unmap a segment that has SOFTLOCKed pages. when
2623 		 * the segment's pages are all SOFTUNLOCKed,
2624 		 * as->a_flags & AS_UNMAPWAIT should be zero.
2625 		 *
2626 		 * In case of cleanup request by DR, the flag is cleared
2627 		 * once all the pending aio requests have been processed.
2628 		 *
2629 		 * The flag shouldn't be cleared right away if the
2630 		 * cleanup thread was interrupted because the process
2631 		 * is doing forkall(). This happens when cv_wait_sig()
2632 		 * returns zero, because it was awakened by a pokelwps().
2633 		 * If the process is not exiting, it must be doing forkall().
2634 		 */
2635 		if ((poked == 0) &&
2636 		    ((!rqclnup && (AS_ISUNMAPWAIT(as) == 0)) ||
2637 		    (aiop->aio_pending == 0))) {
2638 			aiop->aio_flags &= ~(AIO_CLEANUP | AIO_CLEANUP_PORT);
2639 			cvp = &as->a_cv;
2640 			rqclnup = 0;
2641 		}
2642 		mutex_exit(&aiop->aio_mutex);
2643 		if (poked) {
2644 			/*
2645 			 * If the process is exiting/killed, don't return
2646 			 * immediately without waiting for pending I/O's
2647 			 * and releasing the page locks.
2648 			 */
2649 			if (p->p_flag & (SEXITLWPS|SKILLED)) {
2650 				/*
2651 				 * If exit_flag is set, then it is
2652 				 * safe to exit because we have released
2653 				 * page locks of completed I/O's.
2654 				 */
2655 				if (exit_flag)
2656 					break;
2657 
2658 				mutex_exit(&as->a_contents);
2659 
2660 				/*
2661 				 * Wait for all the pending aio to complete.
2662 				 */
2663 				mutex_enter(&aiop->aio_mutex);
2664 				aiop->aio_flags |= AIO_REQ_BLOCK;
2665 				while (aiop->aio_pending != 0)
2666 					cv_wait(&aiop->aio_cleanupcv,
2667 					    &aiop->aio_mutex);
2668 				mutex_exit(&aiop->aio_mutex);
2669 				exit_flag = 1;
2670 				continue;
2671 			} else if (p->p_flag &
2672 			    (SHOLDFORK|SHOLDFORK1|SHOLDWATCH)) {
2673 				/*
2674 				 * hold LWP until it
2675 				 * is continued.
2676 				 */
2677 				mutex_exit(&as->a_contents);
2678 				mutex_enter(&p->p_lock);
2679 				stop(PR_SUSPENDED, SUSPEND_NORMAL);
2680 				mutex_exit(&p->p_lock);
2681 				poked = 0;
2682 				continue;
2683 			}
2684 		} else {
2685 			/*
2686 			 * When started this thread will sleep on as->a_cv.
2687 			 * as_unmap will awake this thread if the
2688 			 * segment has SOFTLOCKed pages (poked = 0).
2689 			 * 1. pokelwps() awakes this thread =>
2690 			 *    break the loop to check SEXITLWPS, SHOLDFORK, etc
2691 			 * 2. as_unmap awakes this thread =>
2692 			 *    to break the loop it is necessary that
2693 			 *    - AS_UNMAPWAIT is set (as_unmap is waiting for
2694 			 *	memory to be unlocked)
2695 			 *    - AIO_CLEANUP is not set
2696 			 *	(if AIO_CLEANUP is set we have to wait for
2697 			 *	pending requests. aio_done will send a signal
2698 			 *	for every request which completes to continue
2699 			 *	unmapping the corresponding address range)
2700 			 * 3. A cleanup request will wake this thread up, ex.
2701 			 *    by the DR operations. The aio_rqclnup flag will
2702 			 *    be set.
2703 			 */
2704 			while (poked == 0) {
2705 				/*
2706 				 * The clean up requests that came in
2707 				 * after we had just cleaned up, couldn't
2708 				 * be causing the unmap thread to block - as
2709 				 * unmap event happened first.
2710 				 * Let aio_done() wake us up if it sees a need.
2711 				 */
2712 				if (aiop->aio_rqclnup &&
2713 				    (aiop->aio_flags & AIO_CLEANUP) == 0)
2714 					break;
2715 				poked = !cv_wait_sig(cvp, &as->a_contents);
2716 				if (AS_ISUNMAPWAIT(as) == 0)
2717 					cv_signal(cvp);
2718 				if (aiop->aio_outstanding != 0)
2719 					break;
2720 			}
2721 		}
2722 		mutex_exit(&as->a_contents);
2723 	}
2724 exit:
2725 	mutex_exit(&as->a_contents);
2726 	ASSERT((curproc->p_flag & (SEXITLWPS|SKILLED)));
2727 	aston(curthread);	/* make thread do post_syscall */
2728 	return (0);
2729 }
2730 
2731 /*
2732  * save a reference to a user's outstanding aio in a hash list.
2733  */
2734 static int
aio_hash_insert(aio_req_t * aio_reqp,aio_t * aiop)2735 aio_hash_insert(
2736 	aio_req_t *aio_reqp,
2737 	aio_t *aiop)
2738 {
2739 	long index;
2740 	aio_result_t *resultp = aio_reqp->aio_req_resultp;
2741 	aio_req_t *current;
2742 	aio_req_t **nextp;
2743 
2744 	index = AIO_HASH(resultp);
2745 	nextp = &aiop->aio_hash[index];
2746 	while ((current = *nextp) != NULL) {
2747 		if (current->aio_req_resultp == resultp)
2748 			return (DUPLICATE);
2749 		nextp = &current->aio_hash_next;
2750 	}
2751 	*nextp = aio_reqp;
2752 	aio_reqp->aio_hash_next = NULL;
2753 	return (0);
2754 }
2755 
2756 static int
check_vp(struct vnode * vp,int mode)2757 (*check_vp(struct vnode *vp, int mode))(vnode_t *, struct aio_req *,
2758     cred_t *)
2759 {
2760 	struct snode *sp;
2761 	dev_t		dev;
2762 	struct cb_ops	*cb;
2763 	major_t		major;
2764 	int		(*aio_func)();
2765 
2766 	dev = vp->v_rdev;
2767 	major = getmajor(dev);
2768 
2769 	/*
2770 	 * return NULL for requests to files and STREAMs so
2771 	 * that libaio takes care of them.
2772 	 */
2773 	if (vp->v_type == VCHR) {
2774 		/* no stream device for kaio */
2775 		if (STREAMSTAB(major)) {
2776 			return (NULL);
2777 		}
2778 	} else {
2779 		return (NULL);
2780 	}
2781 
2782 	/*
2783 	 * Check old drivers which do not have async I/O entry points.
2784 	 */
2785 	if (devopsp[major]->devo_rev < 3)
2786 		return (NULL);
2787 
2788 	cb = devopsp[major]->devo_cb_ops;
2789 
2790 	if (cb->cb_rev < 1)
2791 		return (NULL);
2792 
2793 	/*
2794 	 * Check whether this device is a block device.
2795 	 * Kaio is not supported for devices like tty.
2796 	 */
2797 	if (cb->cb_strategy == nodev || cb->cb_strategy == NULL)
2798 		return (NULL);
2799 
2800 	/*
2801 	 * Clustering: If vnode is a PXFS vnode, then the device may be remote.
2802 	 * We cannot call the driver directly. Instead return the
2803 	 * PXFS functions.
2804 	 */
2805 
2806 	if (IS_PXFSVP(vp)) {
2807 		if (mode & FREAD)
2808 			return (clpxfs_aio_read);
2809 		else
2810 			return (clpxfs_aio_write);
2811 	}
2812 	if (mode & FREAD)
2813 		aio_func = (cb->cb_aread == nodev) ? NULL : driver_aio_read;
2814 	else
2815 		aio_func = (cb->cb_awrite == nodev) ? NULL : driver_aio_write;
2816 
2817 	/*
2818 	 * Do we need this ?
2819 	 * nodev returns ENXIO anyway.
2820 	 */
2821 	if (aio_func == nodev)
2822 		return (NULL);
2823 
2824 	sp = VTOS(vp);
2825 	smark(sp, SACC);
2826 	return (aio_func);
2827 }
2828 
2829 /*
2830  * Clustering: We want check_vp to return a function prototyped
2831  * correctly that will be common to both PXFS and regular case.
2832  * We define this intermediate function that will do the right
2833  * thing for driver cases.
2834  */
2835 
2836 static int
driver_aio_write(vnode_t * vp,struct aio_req * aio,cred_t * cred_p)2837 driver_aio_write(vnode_t *vp, struct aio_req *aio, cred_t *cred_p)
2838 {
2839 	dev_t dev;
2840 	struct cb_ops	*cb;
2841 
2842 	ASSERT(vp->v_type == VCHR);
2843 	ASSERT(!IS_PXFSVP(vp));
2844 	dev = VTOS(vp)->s_dev;
2845 	ASSERT(STREAMSTAB(getmajor(dev)) == NULL);
2846 
2847 	cb = devopsp[getmajor(dev)]->devo_cb_ops;
2848 
2849 	ASSERT(cb->cb_awrite != nodev);
2850 	return ((*cb->cb_awrite)(dev, aio, cred_p));
2851 }
2852 
2853 /*
2854  * Clustering: We want check_vp to return a function prototyped
2855  * correctly that will be common to both PXFS and regular case.
2856  * We define this intermediate function that will do the right
2857  * thing for driver cases.
2858  */
2859 
2860 static int
driver_aio_read(vnode_t * vp,struct aio_req * aio,cred_t * cred_p)2861 driver_aio_read(vnode_t *vp, struct aio_req *aio, cred_t *cred_p)
2862 {
2863 	dev_t dev;
2864 	struct cb_ops	*cb;
2865 
2866 	ASSERT(vp->v_type == VCHR);
2867 	ASSERT(!IS_PXFSVP(vp));
2868 	dev = VTOS(vp)->s_dev;
2869 	ASSERT(!STREAMSTAB(getmajor(dev)));
2870 
2871 	cb = devopsp[getmajor(dev)]->devo_cb_ops;
2872 
2873 	ASSERT(cb->cb_aread != nodev);
2874 	return ((*cb->cb_aread)(dev, aio, cred_p));
2875 }
2876 
2877 /*
2878  * This routine is called when a largefile call is made by a 32bit
2879  * process on a ILP32 or LP64 kernel. All 64bit processes are large
2880  * file by definition and will call alio() instead.
2881  */
2882 static int
alioLF(int mode_arg,void * aiocb_arg,int nent,void * sigev)2883 alioLF(
2884 	int		mode_arg,
2885 	void		*aiocb_arg,
2886 	int		nent,
2887 	void		*sigev)
2888 {
2889 	file_t		*fp;
2890 	file_t		*prev_fp = NULL;
2891 	int		prev_mode = -1;
2892 	struct vnode	*vp;
2893 	aio_lio_t	*head;
2894 	aio_req_t	*reqp;
2895 	aio_t		*aiop;
2896 	caddr_t		cbplist;
2897 	aiocb64_32_t	cb64;
2898 	aiocb64_32_t	*aiocb = &cb64;
2899 	aiocb64_32_t	*cbp;
2900 	caddr32_t	*ucbp;
2901 #ifdef _LP64
2902 	aiocb_t		aiocb_n;
2903 #endif
2904 	struct sigevent32	sigevk;
2905 	sigqueue_t	*sqp;
2906 	int		(*aio_func)();
2907 	int		mode;
2908 	int		error = 0;
2909 	int		aio_errors = 0;
2910 	int		i;
2911 	size_t		ssize;
2912 	int		deadhead = 0;
2913 	int		aio_notsupported = 0;
2914 	int		lio_head_port;
2915 	int		aio_port;
2916 	int		aio_thread;
2917 	port_kevent_t	*pkevtp = NULL;
2918 	int		portused = 0;
2919 	port_notify32_t	pnotify;
2920 	int		event;
2921 
2922 	aiop = curproc->p_aio;
2923 	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
2924 		return (EINVAL);
2925 
2926 	ASSERT(get_udatamodel() == DATAMODEL_ILP32);
2927 
2928 	ssize = (sizeof (caddr32_t) * nent);
2929 	cbplist = kmem_alloc(ssize, KM_SLEEP);
2930 	ucbp = (caddr32_t *)cbplist;
2931 
2932 	if (copyin(aiocb_arg, cbplist, ssize) ||
2933 	    (sigev && copyin(sigev, &sigevk, sizeof (sigevk)))) {
2934 		kmem_free(cbplist, ssize);
2935 		return (EFAULT);
2936 	}
2937 
2938 	/* Event Ports  */
2939 	if (sigev &&
2940 	    (sigevk.sigev_notify == SIGEV_THREAD ||
2941 	    sigevk.sigev_notify == SIGEV_PORT)) {
2942 		if (sigevk.sigev_notify == SIGEV_THREAD) {
2943 			pnotify.portnfy_port = sigevk.sigev_signo;
2944 			pnotify.portnfy_user = sigevk.sigev_value.sival_ptr;
2945 		} else if (copyin(
2946 		    (void *)(uintptr_t)sigevk.sigev_value.sival_ptr,
2947 		    &pnotify, sizeof (pnotify))) {
2948 			kmem_free(cbplist, ssize);
2949 			return (EFAULT);
2950 		}
2951 		error = port_alloc_event(pnotify.portnfy_port,
2952 		    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp);
2953 		if (error) {
2954 			if (error == ENOMEM || error == EAGAIN)
2955 				error = EAGAIN;
2956 			else
2957 				error = EINVAL;
2958 			kmem_free(cbplist, ssize);
2959 			return (error);
2960 		}
2961 		lio_head_port = pnotify.portnfy_port;
2962 		portused = 1;
2963 	}
2964 
2965 	/*
2966 	 * a list head should be allocated if notification is
2967 	 * enabled for this list.
2968 	 */
2969 	head = NULL;
2970 
2971 	if (mode_arg == LIO_WAIT || sigev) {
2972 		mutex_enter(&aiop->aio_mutex);
2973 		error = aio_lio_alloc(&head);
2974 		mutex_exit(&aiop->aio_mutex);
2975 		if (error)
2976 			goto done;
2977 		deadhead = 1;
2978 		head->lio_nent = nent;
2979 		head->lio_refcnt = nent;
2980 		head->lio_port = -1;
2981 		head->lio_portkev = NULL;
2982 		if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL &&
2983 		    sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) {
2984 			sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
2985 			if (sqp == NULL) {
2986 				error = EAGAIN;
2987 				goto done;
2988 			}
2989 			sqp->sq_func = NULL;
2990 			sqp->sq_next = NULL;
2991 			sqp->sq_info.si_code = SI_ASYNCIO;
2992 			sqp->sq_info.si_pid = curproc->p_pid;
2993 			sqp->sq_info.si_ctid = PRCTID(curproc);
2994 			sqp->sq_info.si_zoneid = getzoneid();
2995 			sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
2996 			sqp->sq_info.si_signo = sigevk.sigev_signo;
2997 			sqp->sq_info.si_value.sival_int =
2998 			    sigevk.sigev_value.sival_int;
2999 			head->lio_sigqp = sqp;
3000 		} else {
3001 			head->lio_sigqp = NULL;
3002 		}
3003 		if (pkevtp) {
3004 			/*
3005 			 * Prepare data to send when list of aiocb's
3006 			 * has completed.
3007 			 */
3008 			port_init_event(pkevtp, (uintptr_t)sigev,
3009 			    (void *)(uintptr_t)pnotify.portnfy_user,
3010 			    NULL, head);
3011 			pkevtp->portkev_events = AIOLIO64;
3012 			head->lio_portkev = pkevtp;
3013 			head->lio_port = pnotify.portnfy_port;
3014 		}
3015 	}
3016 
3017 	for (i = 0; i < nent; i++, ucbp++) {
3018 
3019 		cbp = (aiocb64_32_t *)(uintptr_t)*ucbp;
3020 		/* skip entry if it can't be copied. */
3021 		if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb))) {
3022 			if (head) {
3023 				mutex_enter(&aiop->aio_mutex);
3024 				head->lio_nent--;
3025 				head->lio_refcnt--;
3026 				mutex_exit(&aiop->aio_mutex);
3027 			}
3028 			continue;
3029 		}
3030 
3031 		/* skip if opcode for aiocb is LIO_NOP */
3032 		mode = aiocb->aio_lio_opcode;
3033 		if (mode == LIO_NOP) {
3034 			cbp = NULL;
3035 			if (head) {
3036 				mutex_enter(&aiop->aio_mutex);
3037 				head->lio_nent--;
3038 				head->lio_refcnt--;
3039 				mutex_exit(&aiop->aio_mutex);
3040 			}
3041 			continue;
3042 		}
3043 
3044 		/* increment file descriptor's ref count. */
3045 		if ((fp = getf(aiocb->aio_fildes)) == NULL) {
3046 			lio_set_uerror(&cbp->aio_resultp, EBADF);
3047 			if (head) {
3048 				mutex_enter(&aiop->aio_mutex);
3049 				head->lio_nent--;
3050 				head->lio_refcnt--;
3051 				mutex_exit(&aiop->aio_mutex);
3052 			}
3053 			aio_errors++;
3054 			continue;
3055 		}
3056 
3057 		/*
3058 		 * check the permission of the partition
3059 		 */
3060 		if ((fp->f_flag & mode) == 0) {
3061 			releasef(aiocb->aio_fildes);
3062 			lio_set_uerror(&cbp->aio_resultp, EBADF);
3063 			if (head) {
3064 				mutex_enter(&aiop->aio_mutex);
3065 				head->lio_nent--;
3066 				head->lio_refcnt--;
3067 				mutex_exit(&aiop->aio_mutex);
3068 			}
3069 			aio_errors++;
3070 			continue;
3071 		}
3072 
3073 		/*
3074 		 * common case where requests are to the same fd
3075 		 * for the same r/w operation
3076 		 * for UFS, need to set EBADFD
3077 		 */
3078 		vp = fp->f_vnode;
3079 		if (fp != prev_fp || mode != prev_mode) {
3080 			aio_func = check_vp(vp, mode);
3081 			if (aio_func == NULL) {
3082 				prev_fp = NULL;
3083 				releasef(aiocb->aio_fildes);
3084 				lio_set_uerror(&cbp->aio_resultp, EBADFD);
3085 				aio_notsupported++;
3086 				if (head) {
3087 					mutex_enter(&aiop->aio_mutex);
3088 					head->lio_nent--;
3089 					head->lio_refcnt--;
3090 					mutex_exit(&aiop->aio_mutex);
3091 				}
3092 				continue;
3093 			} else {
3094 				prev_fp = fp;
3095 				prev_mode = mode;
3096 			}
3097 		}
3098 
3099 #ifdef	_LP64
3100 		aiocb_LFton(aiocb, &aiocb_n);
3101 		error = aio_req_setup(&reqp, aiop, &aiocb_n,
3102 		    (aio_result_t *)&cbp->aio_resultp, vp, 0);
3103 #else
3104 		error = aio_req_setupLF(&reqp, aiop, aiocb,
3105 		    (aio_result_t *)&cbp->aio_resultp, vp, 0);
3106 #endif  /* _LP64 */
3107 		if (error) {
3108 			releasef(aiocb->aio_fildes);
3109 			lio_set_uerror(&cbp->aio_resultp, error);
3110 			if (head) {
3111 				mutex_enter(&aiop->aio_mutex);
3112 				head->lio_nent--;
3113 				head->lio_refcnt--;
3114 				mutex_exit(&aiop->aio_mutex);
3115 			}
3116 			aio_errors++;
3117 			continue;
3118 		}
3119 
3120 		reqp->aio_req_lio = head;
3121 		deadhead = 0;
3122 
3123 		/*
3124 		 * Set the errno field now before sending the request to
3125 		 * the driver to avoid a race condition
3126 		 */
3127 		(void) suword32(&cbp->aio_resultp.aio_errno,
3128 		    EINPROGRESS);
3129 
3130 		reqp->aio_req_iocb.iocb32 = *ucbp;
3131 
3132 		event = (mode == LIO_READ)? AIOAREAD64 : AIOAWRITE64;
3133 		aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT);
3134 		aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD);
3135 		if (aio_port | aio_thread) {
3136 			port_kevent_t *lpkevp;
3137 			/*
3138 			 * Prepare data to send with each aiocb completed.
3139 			 */
3140 			if (aio_port) {
3141 				void *paddr = (void *)(uintptr_t)
3142 				    aiocb->aio_sigevent.sigev_value.sival_ptr;
3143 				if (copyin(paddr, &pnotify, sizeof (pnotify)))
3144 					error = EFAULT;
3145 			} else {	/* aio_thread */
3146 				pnotify.portnfy_port =
3147 				    aiocb->aio_sigevent.sigev_signo;
3148 				pnotify.portnfy_user =
3149 				    aiocb->aio_sigevent.sigev_value.sival_ptr;
3150 			}
3151 			if (error)
3152 				/* EMPTY */;
3153 			else if (pkevtp != NULL &&
3154 			    pnotify.portnfy_port == lio_head_port)
3155 				error = port_dup_event(pkevtp, &lpkevp,
3156 				    PORT_ALLOC_DEFAULT);
3157 			else
3158 				error = port_alloc_event(pnotify.portnfy_port,
3159 				    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO,
3160 				    &lpkevp);
3161 			if (error == 0) {
3162 				port_init_event(lpkevp, (uintptr_t)*ucbp,
3163 				    (void *)(uintptr_t)pnotify.portnfy_user,
3164 				    aio_port_callback, reqp);
3165 				lpkevp->portkev_events = event;
3166 				reqp->aio_req_portkev = lpkevp;
3167 				reqp->aio_req_port = pnotify.portnfy_port;
3168 			}
3169 		}
3170 
3171 		/*
3172 		 * send the request to driver.
3173 		 */
3174 		if (error == 0) {
3175 			if (aiocb->aio_nbytes == 0) {
3176 				clear_active_fd(aiocb->aio_fildes);
3177 				aio_zerolen(reqp);
3178 				continue;
3179 			}
3180 			error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
3181 			    CRED());
3182 		}
3183 
3184 		/*
3185 		 * the fd's ref count is not decremented until the IO has
3186 		 * completed unless there was an error.
3187 		 */
3188 		if (error) {
3189 			releasef(aiocb->aio_fildes);
3190 			lio_set_uerror(&cbp->aio_resultp, error);
3191 			if (head) {
3192 				mutex_enter(&aiop->aio_mutex);
3193 				head->lio_nent--;
3194 				head->lio_refcnt--;
3195 				mutex_exit(&aiop->aio_mutex);
3196 			}
3197 			if (error == ENOTSUP)
3198 				aio_notsupported++;
3199 			else
3200 				aio_errors++;
3201 			lio_set_error(reqp, portused);
3202 		} else {
3203 			clear_active_fd(aiocb->aio_fildes);
3204 		}
3205 	}
3206 
3207 	if (aio_notsupported) {
3208 		error = ENOTSUP;
3209 	} else if (aio_errors) {
3210 		/*
3211 		 * return EIO if any request failed
3212 		 */
3213 		error = EIO;
3214 	}
3215 
3216 	if (mode_arg == LIO_WAIT) {
3217 		mutex_enter(&aiop->aio_mutex);
3218 		while (head->lio_refcnt > 0) {
3219 			if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
3220 				mutex_exit(&aiop->aio_mutex);
3221 				error = EINTR;
3222 				goto done;
3223 			}
3224 		}
3225 		mutex_exit(&aiop->aio_mutex);
3226 		alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_LARGEFILE);
3227 	}
3228 
3229 done:
3230 	kmem_free(cbplist, ssize);
3231 	if (deadhead) {
3232 		if (head->lio_sigqp)
3233 			kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
3234 		if (head->lio_portkev)
3235 			port_free_event(head->lio_portkev);
3236 		kmem_free(head, sizeof (aio_lio_t));
3237 	}
3238 	return (error);
3239 }
3240 
#ifdef  _SYSCALL32_IMPL
/*
 * Convert a 32-bit largefile control block (aiocb64_32_t) into the
 * native aiocb_t, field by field.  32-bit user pointers are widened
 * through uintptr_t; only the first element of aio__pad is copied.
 */
static void
aiocb_LFton(aiocb64_32_t *src, aiocb_t *dest)
{
	struct sigevent32	*sev32 = &src->aio_sigevent;
	struct sigevent		*sev = &dest->aio_sigevent;

	/* the transfer itself */
	dest->aio_fildes = src->aio_fildes;
	dest->aio_buf = (void *)(uintptr_t)src->aio_buf;
	dest->aio_nbytes = (size_t)src->aio_nbytes;
	dest->aio_offset = (off_t)src->aio_offset;
	dest->aio_reqprio = src->aio_reqprio;

	/* completion notification */
	sev->sigev_notify = sev32->sigev_notify;
	sev->sigev_signo = sev32->sigev_signo;

	/*
	 * See comment in sigqueue32() on handling of 32-bit
	 * sigvals in a 64-bit kernel.
	 */
	sev->sigev_value.sival_int = (int)sev32->sigev_value.sival_int;
	sev->sigev_notify_function = (void (*)(union sigval))
	    (uintptr_t)sev32->sigev_notify_function;
	sev->sigev_notify_attributes = (pthread_attr_t *)
	    (uintptr_t)sev32->sigev_notify_attributes;
	sev->__sigev_pad2 = sev32->__sigev_pad2;

	/* remaining bookkeeping fields */
	dest->aio_lio_opcode = src->aio_lio_opcode;
	dest->aio_state = src->aio_state;
	dest->aio__pad[0] = src->aio__pad[0];
}
#endif
3269 
3270 /*
3271  * This function is used only for largefile calls made by
3272  * 32 bit applications.
3273  */
3274 static int
aio_req_setupLF(aio_req_t ** reqpp,aio_t * aiop,aiocb64_32_t * arg,aio_result_t * resultp,vnode_t * vp,int old_solaris_req)3275 aio_req_setupLF(
3276 	aio_req_t	**reqpp,
3277 	aio_t		*aiop,
3278 	aiocb64_32_t	*arg,
3279 	aio_result_t	*resultp,
3280 	vnode_t		*vp,
3281 	int		old_solaris_req)
3282 {
3283 	sigqueue_t	*sqp = NULL;
3284 	aio_req_t	*reqp;
3285 	struct uio	*uio;
3286 	struct sigevent32 *sigev;
3287 	int		error;
3288 
3289 	sigev = &arg->aio_sigevent;
3290 	if (sigev->sigev_notify == SIGEV_SIGNAL &&
3291 	    sigev->sigev_signo > 0 && sigev->sigev_signo < NSIG) {
3292 		sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
3293 		if (sqp == NULL)
3294 			return (EAGAIN);
3295 		sqp->sq_func = NULL;
3296 		sqp->sq_next = NULL;
3297 		sqp->sq_info.si_code = SI_ASYNCIO;
3298 		sqp->sq_info.si_pid = curproc->p_pid;
3299 		sqp->sq_info.si_ctid = PRCTID(curproc);
3300 		sqp->sq_info.si_zoneid = getzoneid();
3301 		sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
3302 		sqp->sq_info.si_signo = sigev->sigev_signo;
3303 		sqp->sq_info.si_value.sival_int = sigev->sigev_value.sival_int;
3304 	}
3305 
3306 	mutex_enter(&aiop->aio_mutex);
3307 
3308 	if (aiop->aio_flags & AIO_REQ_BLOCK) {
3309 		mutex_exit(&aiop->aio_mutex);
3310 		if (sqp)
3311 			kmem_free(sqp, sizeof (sigqueue_t));
3312 		return (EIO);
3313 	}
3314 	/*
3315 	 * get an aio_reqp from the free list or allocate one
3316 	 * from dynamic memory.
3317 	 */
3318 	if (error = aio_req_alloc(&reqp, resultp)) {
3319 		mutex_exit(&aiop->aio_mutex);
3320 		if (sqp)
3321 			kmem_free(sqp, sizeof (sigqueue_t));
3322 		return (error);
3323 	}
3324 	aiop->aio_pending++;
3325 	aiop->aio_outstanding++;
3326 	reqp->aio_req_flags = AIO_PENDING;
3327 	if (old_solaris_req) {
3328 		/* this is an old solaris aio request */
3329 		reqp->aio_req_flags |= AIO_SOLARIS;
3330 		aiop->aio_flags |= AIO_SOLARIS_REQ;
3331 	}
3332 	if (sigev->sigev_notify == SIGEV_THREAD ||
3333 	    sigev->sigev_notify == SIGEV_PORT)
3334 		aio_enq(&aiop->aio_portpending, reqp, 0);
3335 	mutex_exit(&aiop->aio_mutex);
3336 	/*
3337 	 * initialize aio request.
3338 	 */
3339 	reqp->aio_req_fd = arg->aio_fildes;
3340 	reqp->aio_req_sigqp = sqp;
3341 	reqp->aio_req_iocb.iocb = NULL;
3342 	reqp->aio_req_lio = NULL;
3343 	reqp->aio_req_buf.b_file = vp;
3344 	uio = reqp->aio_req.aio_uio;
3345 	uio->uio_iovcnt = 1;
3346 	uio->uio_iov->iov_base = (caddr_t)(uintptr_t)arg->aio_buf;
3347 	uio->uio_iov->iov_len = arg->aio_nbytes;
3348 	uio->uio_loffset = arg->aio_offset;
3349 	*reqpp = reqp;
3350 	return (0);
3351 }
3352 
3353 /*
3354  * This routine is called when a non largefile call is made by a 32bit
3355  * process on an ILP32 or LP64 kernel.
3356  */
3357 static int
alio32(int mode_arg,void * aiocb_arg,int nent,void * sigev)3358 alio32(
3359 	int		mode_arg,
3360 	vo