xref: /illumos-gate/usr/src/lib/libc/port/aio/aio.c (revision 34b3058f)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include "synonyms.h"
#include "thr_uberdata.h"
#include "asyncio.h"
#include <atomic.h>
#include <sys/param.h>
#include <sys/file.h>
#include <sys/port.h>
static int _aio_hash_insert(aio_result_t *, aio_req_t *);
static aio_req_t *_aio_req_get(aio_worker_t *);
static void _aio_req_add(aio_req_t *, aio_worker_t **, int);
static void _aio_req_del(aio_worker_t *, aio_req_t *, int);
static void _aio_work_done(aio_worker_t *);
static void _aio_enq_doneq(aio_req_t *);

extern void _aio_lio_free(aio_lio_t *);

extern int __fdsync(int, int);
extern int _port_dispatch(int, int, int, int, uintptr_t, void *);

static int _aio_fsync_del(aio_worker_t *, aio_req_t *);
static void _aiodone(aio_req_t *, ssize_t, int);
static void _aio_cancel_work(aio_worker_t *, int, int *, int *);
static void _aio_finish_request(aio_worker_t *, ssize_t, int);

/*
 * switch for kernel async I/O
 */
int _kaio_ok = 0;		/* 0 = disabled, 1 = on, -1 = error */

/*
 * Key for thread-specific data
 */
pthread_key_t _aio_key;

/*
 * Array for determining whether or not a file supports kaio.
 * Initialized in _kaio_init().
 */
uint32_t *_kaio_supported = NULL;

/*
 * Workers for read/write requests
 * (__aio_mutex lock protects circular linked list of workers)
 */
aio_worker_t *__workers_rw;	/* circular list of AIO workers */
aio_worker_t *__nextworker_rw;	/* next worker in list of workers */
int __rw_workerscnt;		/* number of read/write workers */

/*
 * Worker for notification requests.
 */
aio_worker_t *__workers_no;	/* circular list of AIO workers */
aio_worker_t *__nextworker_no;	/* next worker in list of workers */
int __no_workerscnt;		/* number of notification workers */

aio_req_t *_aio_done_tail;		/* list of done requests */
aio_req_t *_aio_done_head;

mutex_t __aio_initlock = DEFAULTMUTEX;	/* makes aio initialization atomic */
cond_t __aio_initcv = DEFAULTCV;
int __aio_initbusy = 0;

mutex_t __aio_mutex = DEFAULTMUTEX;	/* protects counts, and linked lists */
cond_t _aio_iowait_cv = DEFAULTCV;	/* wait for userland I/Os */

pid_t __pid = (pid_t)-1;		/* initialize as invalid pid */
int _sigio_enabled = 0;			/* when set, send SIGIO signal */

aio_hash_t *_aio_hash;

aio_req_t *_aio_doneq;			/* doubly linked done queue list */

int _aio_donecnt = 0;
int _aio_waitncnt = 0;			/* # of requests for aio_waitn */
int _aio_doneq_cnt = 0;
int _aio_outstand_cnt = 0;		/* # of outstanding requests */
int _kaio_outstand_cnt = 0;		/* # of outstanding kaio requests */
int _aio_req_done_cnt = 0;		/* req. done but not in "done queue" */
int _aio_kernel_suspend = 0;		/* active kernel kaio calls */
int _aio_suscv_cnt = 0;			/* aio_suspend calls waiting on cv's */

int _max_workers = 256;			/* max number of workers permitted */
int _min_workers = 4;			/* min number of workers */
int _minworkload = 2;			/* min number of requests in queue */
int _aio_worker_cnt = 0;		/* number of workers to do requests */
int __uaio_ok = 0;			/* AIO has been enabled */
sigset_t _worker_set;			/* worker's signal mask */

int _aiowait_flag = 0;			/* when set, aiowait() is in progress */
int _aio_flags = 0;			/* see the defines in asyncio.h */

aio_worker_t *_kaiowp = NULL;		/* points to kaio cleanup thread */

int hz;					/* clock ticks per second */

static int
_kaio_supported_init(void)
{
	void *ptr;
	size_t size;

	if (_kaio_supported != NULL)	/* already initialized */
		return (0);

	size = MAX_KAIO_FDARRAY_SIZE * sizeof (uint32_t);
	ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANON, -1, (off_t)0);
	if (ptr == MAP_FAILED)
		return (-1);
	_kaio_supported = ptr;
	return (0);
}
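
/*
 * Illustrative note (not authoritative; the real macros are defined in
 * asyncio.h): _kaio_supported is used as a per-fd bitmap, so a lookup
 * is along the lines of
 *
 *	_kaio_supported[fd / 32] & (1U << (fd % 32))
 *
 * KAIO_SUPPORTED(), SET_KAIO_NOT_SUPPORTED() and CLEAR_KAIO_SUPPORTED()
 * encapsulate this indexing; the exact bit layout here is an assumption.
 */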

/*
 * The aio subsystem is initialized when the first AIO request is made.
 * Tunables are established, such as the maximum number of workers the
 * subsystem can create and the minimum number of workers permitted
 * before imposing some restrictions.  Also, some workers are created.
 */
int
__uaio_init(void)
{
	int ret = -1;
	int i;

	lmutex_lock(&__aio_initlock);
	while (__aio_initbusy)
		(void) _cond_wait(&__aio_initcv, &__aio_initlock);
	if (__uaio_ok) {	/* already initialized */
		lmutex_unlock(&__aio_initlock);
		return (0);
	}
	__aio_initbusy = 1;
	lmutex_unlock(&__aio_initlock);

	hz = (int)sysconf(_SC_CLK_TCK);
	__pid = getpid();

	setup_cancelsig(SIGAIOCANCEL);

	if (_kaio_supported_init() != 0)
		goto out;

	/*
	 * Allocate and initialize the hash table.
	 * Do this only once, even if __uaio_init() is called twice.
	 */
	if (_aio_hash == NULL) {
		/* LINTED pointer cast */
		_aio_hash = (aio_hash_t *)mmap(NULL,
		    HASHSZ * sizeof (aio_hash_t), PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANON, -1, (off_t)0);
		if ((void *)_aio_hash == MAP_FAILED) {
			_aio_hash = NULL;
			goto out;
		}
		for (i = 0; i < HASHSZ; i++)
			(void) mutex_init(&_aio_hash[i].hash_lock,
			    USYNC_THREAD, NULL);
	}

	/*
	 * Initialize worker's signal mask to only catch SIGAIOCANCEL.
	 */
	(void) sigfillset(&_worker_set);
	(void) sigdelset(&_worker_set, SIGAIOCANCEL);

	/*
	 * Create one worker to send asynchronous notifications.
	 * Do this only once, even if __uaio_init() is called twice.
	 */
	if (__no_workerscnt == 0 &&
	    (_aio_create_worker(NULL, AIONOTIFY) != 0)) {
		errno = EAGAIN;
		goto out;
	}

	/*
	 * Create the minimum number of read/write workers.
	 * Later, check whether at least one worker was created;
	 * lwp_create() calls could fail because of segkp exhaustion.
	 */
	for (i = 0; i < _min_workers; i++)
		(void) _aio_create_worker(NULL, AIOREAD);
	if (__rw_workerscnt == 0) {
		errno = EAGAIN;
		goto out;
	}

	ret = 0;
out:
	lmutex_lock(&__aio_initlock);
	if (ret == 0)
		__uaio_ok = 1;
	__aio_initbusy = 0;
	(void) cond_broadcast(&__aio_initcv);
	lmutex_unlock(&__aio_initlock);
	return (ret);
}

/*
 * Called from close() before actually performing the real _close().
 */
void
_aio_close(int fd)
{
	if (fd < 0)	/* avoid cancelling everything */
		return;
	/*
	 * Cancel all outstanding aio requests for this file descriptor.
	 */
	if (__uaio_ok)
		(void) aiocancel_all(fd);
	/*
	 * If we have allocated the bit array, clear the bit for this file.
	 * The next open may re-use this file descriptor and the new file
	 * may have different kaio() behaviour.
	 */
	if (_kaio_supported != NULL)
		CLEAR_KAIO_SUPPORTED(fd);
}

/*
 * The special kaio cleanup thread sits in a loop in the
 * kernel waiting for pending kaio requests to complete.
 */
void *
_kaio_cleanup_thread(void *arg)
{
	if (pthread_setspecific(_aio_key, arg) != 0)
		aio_panic("_kaio_cleanup_thread, pthread_setspecific()");
	(void) _kaio(AIOSTART);
	return (arg);
}

/*
 * initialize kaio.
 */
void
_kaio_init()
{
	int error;
	sigset_t oset;

	lmutex_lock(&__aio_initlock);
	while (__aio_initbusy)
		(void) _cond_wait(&__aio_initcv, &__aio_initlock);
	if (_kaio_ok) {		/* already initialized */
		lmutex_unlock(&__aio_initlock);
		return;
	}
	__aio_initbusy = 1;
	lmutex_unlock(&__aio_initlock);

	if (_kaio_supported_init() != 0)
		error = ENOMEM;
	else if ((_kaiowp = _aio_worker_alloc()) == NULL)
		error = ENOMEM;
	else if ((error = (int)_kaio(AIOINIT)) == 0) {
		(void) pthread_sigmask(SIG_SETMASK, &maskset, &oset);
		error = thr_create(NULL, AIOSTKSIZE, _kaio_cleanup_thread,
		    _kaiowp, THR_DAEMON, &_kaiowp->work_tid);
		(void) pthread_sigmask(SIG_SETMASK, &oset, NULL);
	}
	if (error && _kaiowp != NULL) {
		_aio_worker_free(_kaiowp);
		_kaiowp = NULL;
	}

	lmutex_lock(&__aio_initlock);
	if (error)
		_kaio_ok = -1;
	else
		_kaio_ok = 1;
	__aio_initbusy = 0;
	(void) cond_broadcast(&__aio_initcv);
	lmutex_unlock(&__aio_initlock);
}

int
aioread(int fd, caddr_t buf, int bufsz, off_t offset, int whence,
    aio_result_t *resultp)
{
	return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOREAD));
}

int
aiowrite(int fd, caddr_t buf, int bufsz, off_t offset, int whence,
    aio_result_t *resultp)
{
	return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOWRITE));
}
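
/*
 * Illustrative usage sketch (not part of this library): an application
 * queues a read with aioread() and reaps the completion with aiowait().
 * The names "fd" and "buf" are placeholders.
 *
 *	aio_result_t res;
 *	char buf[8192];
 *
 *	if (aioread(fd, buf, sizeof (buf), 0, SEEK_SET, &res) == 0) {
 *		aio_result_t *donep = aiowait(NULL);	(blocks until done)
 *		if (donep == &res && donep->aio_return != -1)
 *			.. donep->aio_return bytes were read ..
 *	}
 */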

#if !defined(_LP64)
int
aioread64(int fd, caddr_t buf, int bufsz, off64_t offset, int whence,
    aio_result_t *resultp)
{
	return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOAREAD64));
}

int
aiowrite64(int fd, caddr_t buf, int bufsz, off64_t offset, int whence,
    aio_result_t *resultp)
{
	return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOAWRITE64));
}
#endif	/* !defined(_LP64) */

int
_aiorw(int fd, caddr_t buf, int bufsz, offset_t offset, int whence,
    aio_result_t *resultp, int mode)
{
	aio_req_t *reqp;
	aio_args_t *ap;
	offset_t loffset;
	struct stat stat;
	int error = 0;
	int kerr;
	int umode;

	switch (whence) {

	case SEEK_SET:
		loffset = offset;
		break;
	case SEEK_CUR:
		if ((loffset = llseek(fd, 0, SEEK_CUR)) == -1)
			error = -1;
		else
			loffset += offset;
		break;
	case SEEK_END:
		if (fstat(fd, &stat) == -1)
			error = -1;
		else
			loffset = offset + stat.st_size;
		break;
	default:
		errno = EINVAL;
		error = -1;
	}

	if (error)
		return (error);

	/* initialize kaio */
	if (!_kaio_ok)
		_kaio_init();

	/*
	 * _aio_do_request() needs the original request code (mode) to be
	 * able to choose the appropriate 32/64-bit function.  All other
	 * functions only require the difference between READ and WRITE
	 * (umode).
	 */
	if (mode == AIOAREAD64 || mode == AIOAWRITE64)
		umode = mode - AIOAREAD64;
	else
		umode = mode;

	/*
	 * Try kernel aio first.
	 * If errno is ENOTSUP/EBADFD, fall back to the thread implementation.
	 */
	if (_kaio_ok > 0 && KAIO_SUPPORTED(fd)) {
		resultp->aio_errno = 0;
		sig_mutex_lock(&__aio_mutex);
		_kaio_outstand_cnt++;
		kerr = (int)_kaio(((resultp->aio_return == AIO_INPROGRESS) ?
		    (umode | AIO_POLL_BIT) : umode),
		    fd, buf, bufsz, loffset, resultp);
		if (kerr == 0) {
			sig_mutex_unlock(&__aio_mutex);
			return (0);
		}
		_kaio_outstand_cnt--;
		sig_mutex_unlock(&__aio_mutex);
		if (errno != ENOTSUP && errno != EBADFD)
			return (-1);
		if (errno == EBADFD)
			SET_KAIO_NOT_SUPPORTED(fd);
	}

	if (!__uaio_ok && __uaio_init() == -1)
		return (-1);

	if ((reqp = _aio_req_alloc()) == NULL) {
		errno = EAGAIN;
		return (-1);
	}

	/*
	 * _aio_do_request() checks reqp->req_op to differentiate
	 * between 32 and 64 bit access.
	 */
	reqp->req_op = mode;
	reqp->req_resultp = resultp;
	ap = &reqp->req_args;
	ap->fd = fd;
	ap->buf = buf;
	ap->bufsz = bufsz;
	ap->offset = loffset;

	if (_aio_hash_insert(resultp, reqp) != 0) {
		_aio_req_free(reqp);
		errno = EINVAL;
		return (-1);
	}
	/*
	 * _aio_req_add() only needs the difference between READ and
	 * WRITE to choose the right worker queue.
	 */
	_aio_req_add(reqp, &__nextworker_rw, umode);
	return (0);
}

int
aiocancel(aio_result_t *resultp)
{
	aio_req_t *reqp;
	aio_worker_t *aiowp;
	int ret;
	int done = 0;
	int canceled = 0;

	if (!__uaio_ok) {
		errno = EINVAL;
		return (-1);
	}

	sig_mutex_lock(&__aio_mutex);
	reqp = _aio_hash_find(resultp);
	if (reqp == NULL) {
		if (_aio_outstand_cnt == _aio_req_done_cnt)
			errno = EINVAL;
		else
			errno = EACCES;
		ret = -1;
	} else {
		aiowp = reqp->req_worker;
		sig_mutex_lock(&aiowp->work_qlock1);
		(void) _aio_cancel_req(aiowp, reqp, &canceled, &done);
		sig_mutex_unlock(&aiowp->work_qlock1);

		if (canceled) {
			ret = 0;
		} else {
			if (_aio_outstand_cnt == 0 ||
			    _aio_outstand_cnt == _aio_req_done_cnt)
				errno = EINVAL;
			else
				errno = EACCES;
			ret = -1;
		}
	}
	sig_mutex_unlock(&__aio_mutex);
	return (ret);
}
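
/*
 * Illustrative usage sketch (not part of this library), continuing the
 * aioread() example above: aiocancel(&res) returns 0 if the request was
 * cancelled, or -1 with errno set to EACCES when the request was found
 * but could no longer be cancelled (typically already in progress or
 * done), or EINVAL when there is no cancellable request outstanding.
 *
 *	if (aiocancel(&res) == -1 && errno == EACCES)
 *		.. too late; collect the result with aiowait() ..
 */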

/*
 * This must be async safe.
 */
aio_result_t *
aiowait(struct timeval *uwait)
{
	aio_result_t *uresultp;
	aio_result_t *kresultp;
	aio_result_t *resultp;
	int dontblock;
	int timedwait = 0;
	int kaio_errno = 0;
	struct timeval twait;
	struct timeval *wait = NULL;
	hrtime_t hrtend;
	hrtime_t hres;

	if (uwait) {
		/*
		 * Check for a valid specified wait time.
		 * If it is invalid, fail the call right away.
		 */
		if (uwait->tv_sec < 0 || uwait->tv_usec < 0 ||
		    uwait->tv_usec >= MICROSEC) {
			errno = EINVAL;
			return ((aio_result_t *)-1);
		}

		if (uwait->tv_sec > 0 || uwait->tv_usec > 0) {
			hrtend = gethrtime() +
			    (hrtime_t)uwait->tv_sec * NANOSEC +
			    (hrtime_t)uwait->tv_usec * (NANOSEC / MICROSEC);
			twait = *uwait;
			wait = &twait;
			timedwait++;
		} else {
			/* polling */
			sig_mutex_lock(&__aio_mutex);
			if (_kaio_outstand_cnt == 0) {
				kresultp = (aio_result_t *)-1;
			} else {
				kresultp = (aio_result_t *)_kaio(AIOWAIT,
				    (struct timeval *)-1, 1);
				if (kresultp != (aio_result_t *)-1 &&
				    kresultp != NULL &&
				    kresultp != (aio_result_t *)1) {
					_kaio_outstand_cnt--;
					sig_mutex_unlock(&__aio_mutex);
					return (kresultp);
				}
			}
			uresultp = _aio_req_done();
			sig_mutex_unlock(&__aio_mutex);
			if (uresultp != NULL &&
			    uresultp != (aio_result_t *)-1) {
				return (uresultp);
			}
			if (uresultp == (aio_result_t *)-1 &&
			    kresultp == (aio_result_t *)-1) {
				errno = EINVAL;
				return ((aio_result_t *)-1);
			} else {
				return (NULL);
			}
		}
	}

	for (;;) {
		sig_mutex_lock(&__aio_mutex);
		uresultp = _aio_req_done();
		if (uresultp != NULL && uresultp != (aio_result_t *)-1) {
			sig_mutex_unlock(&__aio_mutex);
			resultp = uresultp;
			break;
		}
		_aiowait_flag++;
		dontblock = (uresultp == (aio_result_t *)-1);
		if (dontblock && _kaio_outstand_cnt == 0) {
			kresultp = (aio_result_t *)-1;
			kaio_errno = EINVAL;
		} else {
			sig_mutex_unlock(&__aio_mutex);
			kresultp = (aio_result_t *)_kaio(AIOWAIT,
			    wait, dontblock);
			sig_mutex_lock(&__aio_mutex);
			kaio_errno = errno;
		}
		_aiowait_flag--;
		sig_mutex_unlock(&__aio_mutex);
		if (kresultp == (aio_result_t *)1) {
			/* aiowait() awakened by an aionotify() */
			continue;
		} else if (kresultp != NULL &&
		    kresultp != (aio_result_t *)-1) {
			resultp = kresultp;
			sig_mutex_lock(&__aio_mutex);
			_kaio_outstand_cnt--;
			sig_mutex_unlock(&__aio_mutex);
			break;
		} else if (kresultp == (aio_result_t *)-1 &&
		    kaio_errno == EINVAL &&
		    uresultp == (aio_result_t *)-1) {
			errno = kaio_errno;
			resultp = (aio_result_t *)-1;
			break;
		} else if (kresultp == (aio_result_t *)-1 &&
		    kaio_errno == EINTR) {
			errno = kaio_errno;
			resultp = (aio_result_t *)-1;
			break;
		} else if (timedwait) {
			hres = hrtend - gethrtime();
			if (hres <= 0) {
				/* time is up; return */
				resultp = NULL;
				break;
			} else {
				/*
				 * Some time left.  Round up the remaining time
				 * in nanoseconds to microsec.  Retry the call.
				 */
				hres += (NANOSEC / MICROSEC) - 1;
				wait->tv_sec = hres / NANOSEC;
				wait->tv_usec =
				    (hres % NANOSEC) / (NANOSEC / MICROSEC);
			}
		} else {
			ASSERT(kresultp == NULL && uresultp == NULL);
			resultp = NULL;
			continue;
		}
	}
	return (resultp);
}

/*
 * _aio_get_timedelta calculates the remaining time and stores the result
 * into timespec_t *wait.
 */

int
_aio_get_timedelta(timespec_t *end, timespec_t *wait)
{
	int	ret = 0;
	struct	timeval cur;
	timespec_t curtime;

	(void) gettimeofday(&cur, NULL);
	curtime.tv_sec = cur.tv_sec;
	curtime.tv_nsec = cur.tv_usec * 1000;   /* convert us to ns */

	if (end->tv_sec >= curtime.tv_sec) {
		wait->tv_sec = end->tv_sec - curtime.tv_sec;
		if (end->tv_nsec >= curtime.tv_nsec) {
			wait->tv_nsec = end->tv_nsec - curtime.tv_nsec;
			if (wait->tv_sec == 0 && wait->tv_nsec == 0)
				ret = -1;	/* timer expired */
		} else {
			if (end->tv_sec > curtime.tv_sec) {
				wait->tv_sec -= 1;
				wait->tv_nsec = NANOSEC -
				    (curtime.tv_nsec - end->tv_nsec);
			} else {
				ret = -1;	/* timer expired */
			}
		}
	} else {
		ret = -1;
	}
	return (ret);
}
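
/*
 * Worked example of the arithmetic above: with *end == { 5, 100000000 }
 * and a current time of { 4, 900000000 }, the else-branch borrows one
 * second and yields *wait == { 0, 200000000 }, i.e. 0.2 seconds left.
 */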

/*
 * If closing by file descriptor: we will simply cancel all the outstanding
 * aio's and return.  The aio's in question will have either noticed the
 * cancellation notice before, during, or after initiating I/O.
 */
int
aiocancel_all(int fd)
{
	aio_req_t *reqp;
	aio_req_t **reqpp;
	aio_worker_t *first;
	aio_worker_t *next;
	int canceled = 0;
	int done = 0;
	int cancelall = 0;

	sig_mutex_lock(&__aio_mutex);

	if (_aio_outstand_cnt == 0) {
		sig_mutex_unlock(&__aio_mutex);
		return (AIO_ALLDONE);
	}

	/*
	 * Cancel requests from the read/write workers' queues.
	 */
	first = __nextworker_rw;
	next = first;
	do {
		_aio_cancel_work(next, fd, &canceled, &done);
	} while ((next = next->work_forw) != first);

	/*
	 * finally, check if there are requests on the done queue that
	 * should be canceled.
	 */
	if (fd < 0)
		cancelall = 1;
	reqpp = &_aio_done_tail;
	while ((reqp = *reqpp) != NULL) {
		if (cancelall || reqp->req_args.fd == fd) {
			*reqpp = reqp->req_next;
			_aio_donecnt--;
			(void) _aio_hash_del(reqp->req_resultp);
			_aio_req_free(reqp);
		} else
			reqpp = &reqp->req_next;
	}
	if (cancelall) {
		ASSERT(_aio_donecnt == 0);
		_aio_done_head = NULL;
	}
	sig_mutex_unlock(&__aio_mutex);

	if (canceled && done == 0)
		return (AIO_CANCELED);
	else if (done && canceled == 0)
		return (AIO_ALLDONE);
	else if ((canceled + done == 0) && KAIO_SUPPORTED(fd))
		return ((int)_kaio(AIOCANCEL, fd, NULL));
	return (AIO_NOTCANCELED);
}
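
/*
 * Return-value summary for aiocancel_all() (mirrors the logic above):
 *	AIO_CANCELED	- at least one request cancelled, none found done
 *	AIO_ALLDONE	- nothing outstanding, or all requests already done
 *	AIO_NOTCANCELED	- a mix of cancelled and already-done requests
 * If neither cancelled nor done requests were found and the fd supports
 * kaio, the decision is delegated to _kaio(AIOCANCEL, fd, NULL);
 * otherwise AIO_NOTCANCELED is returned.
 */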

/*
 * Cancel requests from a given work queue.  If the file descriptor
 * parameter, fd, is non-negative, then only cancel those requests
 * in this queue that are to this file descriptor.  If the fd
 * parameter is -1, then cancel all requests.
 */
static void
_aio_cancel_work(aio_worker_t *aiowp, int fd, int *canceled, int *done)
{
	aio_req_t *reqp;

	sig_mutex_lock(&aiowp->work_qlock1);
	/*
	 * cancel queued requests first.
	 */
	reqp = aiowp->work_tail1;
	while (reqp != NULL) {
		if (fd < 0 || reqp->req_args.fd == fd) {
			if (_aio_cancel_req(aiowp, reqp, canceled, done)) {
				/*
				 * Caller's locks were dropped.
				 * reqp is invalid; start traversing
				 * the list from the beginning again.
				 */
				reqp = aiowp->work_tail1;
				continue;
			}
		}
		reqp = reqp->req_next;
	}
	/*
	 * Since the queued requests have been canceled, there can
	 * only be one in-progress request that should be canceled.
	 */
	if ((reqp = aiowp->work_req) != NULL &&
	    (fd < 0 || reqp->req_args.fd == fd))
		(void) _aio_cancel_req(aiowp, reqp, canceled, done);
	sig_mutex_unlock(&aiowp->work_qlock1);
}

/*
 * Cancel a request.  Return 1 if the caller's locks were temporarily
 * dropped, otherwise return 0.
 */
int
_aio_cancel_req(aio_worker_t *aiowp, aio_req_t *reqp, int *canceled, int *done)
{
	int ostate = reqp->req_state;

	ASSERT(MUTEX_HELD(&__aio_mutex));
	ASSERT(MUTEX_HELD(&aiowp->work_qlock1));
	if (ostate == AIO_REQ_CANCELED)
		return (0);
	if (ostate == AIO_REQ_DONE || ostate == AIO_REQ_DONEQ) {
		(*done)++;
		return (0);
	}
	if (reqp->req_op == AIOFSYNC && reqp != aiowp->work_req) {
		ASSERT(POSIX_AIO(reqp));
		/* Cancel the queued aio_fsync() request */
		if (!reqp->req_head->lio_canned) {
			reqp->req_head->lio_canned = 1;
			_aio_outstand_cnt--;
			(*canceled)++;
		}
		return (0);
	}
	reqp->req_state = AIO_REQ_CANCELED;
	_aio_req_del(aiowp, reqp, ostate);
	(void) _aio_hash_del(reqp->req_resultp);
	(*canceled)++;
	if (reqp == aiowp->work_req) {
		ASSERT(ostate == AIO_REQ_INPROGRESS);
		/*
		 * Set the result values now, before _aiodone() is called.
		 * We do this because the application can expect aio_return
		 * and aio_errno to be set to -1 and ECANCELED, respectively,
		 * immediately after a successful return from aiocancel()
		 * or aio_cancel().
		 */
		_aio_set_result(reqp, -1, ECANCELED);
		(void) thr_kill(aiowp->work_tid, SIGAIOCANCEL);
		return (0);
	}
	if (!POSIX_AIO(reqp)) {
		_aio_outstand_cnt--;
		_aio_set_result(reqp, -1, ECANCELED);
		return (0);
	}
	sig_mutex_unlock(&aiowp->work_qlock1);
	sig_mutex_unlock(&__aio_mutex);
	_aiodone(reqp, -1, ECANCELED);
	sig_mutex_lock(&__aio_mutex);
	sig_mutex_lock(&aiowp->work_qlock1);
	return (1);
}

int
_aio_create_worker(aio_req_t *reqp, int mode)
{
	aio_worker_t *aiowp, **workers, **nextworker;
	int *aio_workerscnt;
	void *(*func)(void *);
	sigset_t oset;
	int error;

	/*
	 * Put the new worker thread in the right queue.
	 */
	switch (mode) {
	case AIOREAD:
	case AIOWRITE:
	case AIOAREAD:
	case AIOAWRITE:
#if !defined(_LP64)
	case AIOAREAD64:
	case AIOAWRITE64:
#endif
		workers = &__workers_rw;
		nextworker = &__nextworker_rw;
		aio_workerscnt = &__rw_workerscnt;
		func = _aio_do_request;
		break;
	case AIONOTIFY:
		workers = &__workers_no;
		nextworker = &__nextworker_no;
		func = _aio_do_notify;
		aio_workerscnt = &__no_workerscnt;
		break;
	default:
		aio_panic("_aio_create_worker: invalid mode");
		break;
	}

	if ((aiowp = _aio_worker_alloc()) == NULL)
		return (-1);

	if (reqp) {
		reqp->req_state = AIO_REQ_QUEUED;
		reqp->req_worker = aiowp;
		aiowp->work_head1 = reqp;
		aiowp->work_tail1 = reqp;
		aiowp->work_next1 = reqp;
		aiowp->work_count1 = 1;
		aiowp->work_minload1 = 1;
	}

	(void) pthread_sigmask(SIG_SETMASK, &maskset, &oset);
	error = thr_create(NULL, AIOSTKSIZE, func, aiowp,
	    THR_DAEMON | THR_SUSPENDED, &aiowp->work_tid);
	(void) pthread_sigmask(SIG_SETMASK, &oset, NULL);
	if (error) {
		if (reqp) {
			reqp->req_state = 0;
			reqp->req_worker = NULL;
		}
		_aio_worker_free(aiowp);
		return (-1);
	}

	lmutex_lock(&__aio_mutex);
	(*aio_workerscnt)++;
	if (*workers == NULL) {
		aiowp->work_forw = aiowp;
		aiowp->work_backw = aiowp;
		*nextworker = aiowp;
		*workers = aiowp;
	} else {
		aiowp->work_backw = (*workers)->work_backw;
		aiowp->work_forw = (*workers);
		(*workers)->work_backw->work_forw = aiowp;
		(*workers)->work_backw = aiowp;
	}
	_aio_worker_cnt++;
	lmutex_unlock(&__aio_mutex);

	(void) thr_continue(aiowp->work_tid);

	return (0);
}
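
/*
 * Note on the list splice above (illustrative): the workers form a
 * circular doubly-linked ring, and a new worker C is inserted just
 * before the list head.  Starting from the ring A <-> B with
 * *workers == A:
 *
 *	before:	A.forw == B, B.forw == A, A.backw == B, B.backw == A
 *	after:	B.forw == C, C.forw == A, A.backw == C, C.backw == B
 */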

/*
 * This is the worker's main routine.
 * The task of this function is to execute all queued requests;
 * once the last pending request is executed this function will block
 * in _aio_idle().  A new incoming request must wake up this thread to
 * restart the work.
 * Every worker has its own work queue.  The queue lock is required
 * to synchronize the addition of new requests for this worker or
 * cancellation of pending/running requests.
 *
 * Cancellation scenarios:
 * The cancellation of a request is being done asynchronously using
 * _aio_cancel_req() from another thread context.
 * A queued request can be cancelled in different ways:
 * a) request is queued but not "in progress" or "done" (AIO_REQ_QUEUED):
 *	- lock the queue -> remove the request -> unlock the queue
 *	- this function/thread does not detect this cancellation process
 * b) request is in progress (AIO_REQ_INPROGRESS):
 *	- this function first allows the cancellation of the running
 *	  request with the flag "work_cancel_flg=1"
 * 		see _aio_req_get() -> _aio_cancel_on()
 *	  During this phase, it is allowed to interrupt the worker
 *	  thread running the request (this thread) using the SIGAIOCANCEL
 *	  signal.
 *	  Once this thread returns from the kernel (because the request
 *	  is just done), then it must disable a possible cancellation
 *	  and proceed to finish the request.  To disable the cancellation
 *	  this thread must use _aio_cancel_off() to set "work_cancel_flg=0".
 * c) request is already done (AIO_REQ_DONE || AIO_REQ_DONEQ):
 *	  same procedure as in a)
 *
 * To b)
 *	This thread uses sigsetjmp() to define the position in the code
 *	where it wishes to continue working in the case that a SIGAIOCANCEL
 *	signal is detected.
 *	Normally this thread should get the cancellation signal during the
 *	kernel phase (reading or writing).  In that case the signal handler
 *	aiosigcancelhndlr() is activated using the worker thread context,
 *	which again will use the siglongjmp() function to break the standard
 *	code flow and jump to the "sigsetjmp" position, provided that
 *	"work_cancel_flg" is set to "1".
 *	Because the "work_cancel_flg" is only manipulated by this worker
 *	thread and it can only run on one CPU at a given time, it is not
 *	necessary to protect that flag with the queue lock.
 *	Returning from the kernel (read or write system call) we must
 *	first disable the use of the SIGAIOCANCEL signal and accordingly
 *	the use of the siglongjmp() function to prevent a possible deadlock:
 *	- It can happen that this worker thread returns from the kernel and
 *	  blocks in "work_qlock1",
 *	- then a second thread cancels the apparently "in progress" request
 *	  and sends the SIGAIOCANCEL signal to the worker thread,
 *	- the worker thread gets assigned the "work_qlock1" and returns
 *	  from the kernel,
 *	- the kernel detects the pending signal and activates the signal
 *	  handler instead,
 *	- if the "work_cancel_flg" is still set then the signal handler
 *	  should use siglongjmp() to cancel the "in progress" request and
 *	  it would try to acquire the same work_qlock1 in _aio_req_get()
 *	  for a second time => deadlock.
 *	To avoid that situation we disable the cancellation of the request
 *	in progress BEFORE we try to acquire the work_qlock1.
 *	In that case the signal handler will not call siglongjmp() and the
 *	worker thread will continue running the standard code flow.
 *	Then this thread must check the AIO_REQ_CANCELED flag to emulate
 *	a possibly required siglongjmp(), freeing the work_qlock1 and
 *	avoiding a deadlock.
 */
void *
_aio_do_request(void *arglist)
{
	aio_worker_t *aiowp = (aio_worker_t *)arglist;
	ulwp_t *self = curthread;
	struct aio_args *arg;
	aio_req_t *reqp;		/* current AIO request */
	ssize_t retval;
	int error;

	if (pthread_setspecific(_aio_key, aiowp) != 0)
		aio_panic("_aio_do_request, pthread_setspecific()");
	(void) pthread_sigmask(SIG_SETMASK, &_worker_set, NULL);
	ASSERT(aiowp->work_req == NULL);

	/*
	 * We resume here when an operation is cancelled.
	 * On first entry, aiowp->work_req == NULL, so all
	 * we do is block SIGAIOCANCEL.
	 */
	(void) sigsetjmp(aiowp->work_jmp_buf, 0);
	ASSERT(self->ul_sigdefer == 0);

	sigoff(self);	/* block SIGAIOCANCEL */
	if (aiowp->work_req != NULL)
		_aio_finish_request(aiowp, -1, ECANCELED);

	for (;;) {
		/*
		 * Put completed requests on aio_done_list.  This has
		 * to be done as part of the main loop to ensure that
		 * we don't artificially starve any aiowait'ers.
		 */
		if (aiowp->work_done1)
			_aio_work_done(aiowp);

top:
		/* consume any deferred SIGAIOCANCEL signal here */
		sigon(self);
		sigoff(self);

		while ((reqp = _aio_req_get(aiowp)) == NULL) {
			if (_aio_idle(aiowp) != 0)
				goto top;
		}
		arg = &reqp->req_args;
		ASSERT(reqp->req_state == AIO_REQ_INPROGRESS ||
		    reqp->req_state == AIO_REQ_CANCELED);
		error = 0;

		switch (reqp->req_op) {
		case AIOREAD:
		case AIOAREAD:
			sigon(self);	/* unblock SIGAIOCANCEL */
			retval = pread(arg->fd, arg->buf,
			    arg->bufsz, arg->offset);
			if (retval == -1) {
				if (errno == ESPIPE) {
					retval = read(arg->fd,
					    arg->buf, arg->bufsz);
					if (retval == -1)
						error = errno;
				} else {
					error = errno;
				}
			}
			sigoff(self);	/* block SIGAIOCANCEL */
			break;
		case AIOWRITE:
		case AIOAWRITE:
			sigon(self);	/* unblock SIGAIOCANCEL */
			retval = pwrite(arg->fd, arg->buf,
			    arg->bufsz, arg->offset);
			if (retval == -1) {
				if (errno == ESPIPE) {
					retval = write(arg->fd,
					    arg->buf, arg->bufsz);
					if (retval == -1)
						error = errno;
				} else {
					error = errno;
				}
			}
			sigoff(self);	/* block SIGAIOCANCEL */
			break;
#if !defined(_LP64)
		case AIOAREAD64:
			sigon(self);	/* unblock SIGAIOCANCEL */
			retval = pread64(arg->fd, arg->buf,
			    arg->bufsz, arg->offset);
			if (retval == -1) {
				if (errno == ESPIPE) {
					retval = read(arg->fd,
					    arg->buf, arg->bufsz);
					if (retval == -1)
						error = errno;
				} else {
					error = errno;
				}
			}
			sigoff(self);	/* block SIGAIOCANCEL */
			break;
		case AIOAWRITE64:
			sigon(self);	/* unblock SIGAIOCANCEL */
			retval = pwrite64(arg->fd, arg->buf,
			    arg->bufsz, arg->offset);
			if (retval == -1) {
				if (errno == ESPIPE) {
					retval = write(arg->fd,
					    arg->buf, arg->bufsz);
					if (retval == -1)
						error = errno;
				} else {
					error = errno;
				}
			}
			sigoff(self);	/* block SIGAIOCANCEL */
			break;
#endif	/* !defined(_LP64) */
		case AIOFSYNC:
			if (_aio_fsync_del(aiowp, reqp))
				goto top;
			ASSERT(reqp->req_head == NULL);
			/*
			 * All writes for this fsync request are now
			 * acknowledged.  Now make these writes visible
			 * and put the final request into the hash table.
			 */
			if (reqp->req_state == AIO_REQ_CANCELED) {
				/* EMPTY */;
			} else if (arg->offset == O_SYNC) {
				if ((retval = __fdsync(arg->fd, FSYNC)) == -1)
					error = errno;
			} else {
				if ((retval = __fdsync(arg->fd, FDSYNC)) == -1)
					error = errno;
			}
			if (_aio_hash_insert(reqp->req_resultp, reqp) != 0)
				aio_panic("_aio_do_request(): AIOFSYNC: "
				    "request already in hash table");
			break;
		default:
			aio_panic("_aio_do_request, bad op");
		}

		_aio_finish_request(aiowp, retval, error);
	}
	/* NOTREACHED */
	return (NULL);
}

/*
 * Perform the tail processing for _aio_do_request().
 * The in-progress request may or may not have been cancelled.
 */
static void
_aio_finish_request(aio_worker_t *aiowp, ssize_t retval, int error)
{
	aio_req_t *reqp;

	sig_mutex_lock(&aiowp->work_qlock1);
	if ((reqp = aiowp->work_req) == NULL)
		sig_mutex_unlock(&aiowp->work_qlock1);
	else {
		aiowp->work_req = NULL;
		if (reqp->req_state == AIO_REQ_CANCELED) {
			retval = -1;
			error = ECANCELED;
		}
		if (!POSIX_AIO(reqp)) {
			int notify;
			sig_mutex_unlock(&aiowp->work_qlock1);
			sig_mutex_lock(&__aio_mutex);
			if (reqp->req_state == AIO_REQ_INPROGRESS)
				reqp->req_state = AIO_REQ_DONE;
			/*
			 * If it was canceled, this request will not be
			 * added to the done list.  Just free it.
			 */
			if (error == ECANCELED) {
				_aio_outstand_cnt--;
				_aio_req_free(reqp);
			} else {
				_aio_set_result(reqp, retval, error);
				_aio_req_done_cnt++;
			}
			/*
			 * Notify any thread that may have blocked
			 * because it saw an outstanding request.
			 */
			notify = 0;
			if (_aio_outstand_cnt == 0 && _aiowait_flag) {
				notify = 1;
			}
			sig_mutex_unlock(&__aio_mutex);
			if (notify) {
				(void) _kaio(AIONOTIFY);
			}
		} else {
			if (reqp->req_state == AIO_REQ_INPROGRESS)
				reqp->req_state = AIO_REQ_DONE;
			sig_mutex_unlock(&aiowp->work_qlock1);
			_aiodone(reqp, retval, error);
		}
	}
}
11797c478bd9Sstevel@tonic-gate 
118034709573Sraf void
118134709573Sraf _aio_req_mark_done(aio_req_t *reqp)
118234709573Sraf {
118334709573Sraf #if !defined(_LP64)
118434709573Sraf 	if (reqp->req_largefile)
118534709573Sraf 		((aiocb64_t *)reqp->req_aiocbp)->aio_state = USERAIO_DONE;
118634709573Sraf 	else
118734709573Sraf #endif
118834709573Sraf 		((aiocb_t *)reqp->req_aiocbp)->aio_state = USERAIO_DONE;
118934709573Sraf }
11907c478bd9Sstevel@tonic-gate 
119134709573Sraf /*
119234709573Sraf  * Sleep for 'ticks' clock ticks to give somebody else a chance to run,
119334709573Sraf  * hopefully to consume one of our queued signals.
119434709573Sraf  */
119534709573Sraf static void
119634709573Sraf _aio_delay(int ticks)
119734709573Sraf {
119834709573Sraf 	(void) usleep(ticks * (MICROSEC / hz));
119934709573Sraf }
12007c478bd9Sstevel@tonic-gate 
120134709573Sraf /*
120234709573Sraf  * Actually send the notifications.
120334709573Sraf  * We could block indefinitely here if the application
120434709573Sraf  * is not listening for the signal or port notifications.
120534709573Sraf  */
120634709573Sraf static void
120734709573Sraf send_notification(notif_param_t *npp)
120834709573Sraf {
1209f841f6adSraf 	extern int __sigqueue(pid_t pid, int signo,
1210*34b3058fSpraks 	    /* const union sigval */ void *value, int si_code, int block);
1211f841f6adSraf 
1212f841f6adSraf 	if (npp->np_signo)
1213f841f6adSraf 		(void) __sigqueue(__pid, npp->np_signo, npp->np_user,
1214f841f6adSraf 		    SI_ASYNCIO, 1);
1215f841f6adSraf 	else if (npp->np_port >= 0)
121634709573Sraf 		(void) _port_dispatch(npp->np_port, 0, PORT_SOURCE_AIO,
121734709573Sraf 		    npp->np_event, npp->np_object, npp->np_user);
1218f841f6adSraf 
1219f841f6adSraf 	if (npp->np_lio_signo)
1220f841f6adSraf 		(void) __sigqueue(__pid, npp->np_lio_signo, npp->np_lio_user,
1221f841f6adSraf 		    SI_ASYNCIO, 1);
1222f841f6adSraf 	else if (npp->np_lio_port >= 0)
122334709573Sraf 		(void) _port_dispatch(npp->np_lio_port, 0, PORT_SOURCE_AIO,
122434709573Sraf 		    npp->np_lio_event, npp->np_lio_object, npp->np_lio_user);
12257c478bd9Sstevel@tonic-gate }
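
/*
 * Editor's illustrative sketch (not part of libc): one way an
 * application might consume the SIGEV_SIGNAL notification queued
 * above via __sigqueue().  The handler name and the use of SIGRTMIN
 * are hypothetical; only the SA_SIGINFO / SI_ASYNCIO / si_value
 * semantics follow from the code above.
 */
#if 0	/* example only -- not compiled */
#include <signal.h>

static void
aio_sig_handler(int signo, siginfo_t *sip, void *ucp)
{
	if (sip->si_code == SI_ASYNCIO) {
		/* the sigev_value.sival_ptr passed to __sigqueue() above */
		void *user = sip->si_value.sival_ptr;
		/* hand "user" off to the application; do no real work here */
	}
}

static void
install_aio_handler(void)
{
	struct sigaction act;

	act.sa_sigaction = aio_sig_handler;
	(void) sigemptyset(&act.sa_mask);
	act.sa_flags = SA_SIGINFO;	/* deliver a siginfo_t to the handler */
	(void) sigaction(SIGRTMIN, &act, NULL);
}
#endif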
12267c478bd9Sstevel@tonic-gate 
12277c478bd9Sstevel@tonic-gate /*
122834709573Sraf  * Asynchronous notification worker.
12297c478bd9Sstevel@tonic-gate  */
12307c478bd9Sstevel@tonic-gate void *
123134709573Sraf _aio_do_notify(void *arg)
12327c478bd9Sstevel@tonic-gate {
12337c478bd9Sstevel@tonic-gate 	aio_worker_t *aiowp = (aio_worker_t *)arg;
123434709573Sraf 	aio_req_t *reqp;
12357c478bd9Sstevel@tonic-gate 
123634709573Sraf 	/*
123734709573Sraf 	 * This isn't really necessary.  All signals are blocked.
123834709573Sraf 	 */
123934709573Sraf 	if (pthread_setspecific(_aio_key, aiowp) != 0)
1240f841f6adSraf 		aio_panic("_aio_do_notify, pthread_setspecific()");
124134709573Sraf 
124234709573Sraf 	/*
124334709573Sraf 	 * Notifications are never cancelled.
124434709573Sraf 	 * All signals remain blocked, forever.
124534709573Sraf 	 */
12467c478bd9Sstevel@tonic-gate 	for (;;) {
1247f841f6adSraf 		while ((reqp = _aio_req_get(aiowp)) == NULL) {
1248f841f6adSraf 			if (_aio_idle(aiowp) != 0)
1249f841f6adSraf 				aio_panic("_aio_do_notify: _aio_idle() failed");
1250f841f6adSraf 		}
125134709573Sraf 		send_notification(&reqp->req_notify);
125234709573Sraf 		_aio_req_free(reqp);
12537c478bd9Sstevel@tonic-gate 	}
125434709573Sraf 
12557c478bd9Sstevel@tonic-gate 	/* NOTREACHED */
12567c478bd9Sstevel@tonic-gate 	return (NULL);
12577c478bd9Sstevel@tonic-gate }
12587c478bd9Sstevel@tonic-gate 
12597c478bd9Sstevel@tonic-gate /*
126034709573Sraf  * Do the completion semantics for a request that was either canceled
126134709573Sraf  * by _aio_cancel_req() or was completed by _aio_do_request().
12627c478bd9Sstevel@tonic-gate  */
126334709573Sraf static void
126434709573Sraf _aiodone(aio_req_t *reqp, ssize_t retval, int error)
12657c478bd9Sstevel@tonic-gate {
126634709573Sraf 	aio_result_t *resultp = reqp->req_resultp;
126734709573Sraf 	int notify = 0;
126834709573Sraf 	aio_lio_t *head;
126934709573Sraf 	int sigev_none;
127034709573Sraf 	int sigev_signal;
127134709573Sraf 	int sigev_thread;
127234709573Sraf 	int sigev_port;
127334709573Sraf 	notif_param_t np;
12747c478bd9Sstevel@tonic-gate 
127534709573Sraf 	/*
127634709573Sraf 	 * We call _aiodone() only for Posix I/O.
127734709573Sraf 	 */
127834709573Sraf 	ASSERT(POSIX_AIO(reqp));
127934709573Sraf 
128034709573Sraf 	sigev_none = 0;
128134709573Sraf 	sigev_signal = 0;
128234709573Sraf 	sigev_thread = 0;
128334709573Sraf 	sigev_port = 0;
128434709573Sraf 	np.np_signo = 0;
128534709573Sraf 	np.np_port = -1;
128634709573Sraf 	np.np_lio_signo = 0;
128734709573Sraf 	np.np_lio_port = -1;
128834709573Sraf 
128934709573Sraf 	switch (reqp->req_sigevent.sigev_notify) {
129034709573Sraf 	case SIGEV_NONE:
129134709573Sraf 		sigev_none = 1;
129234709573Sraf 		break;
129334709573Sraf 	case SIGEV_SIGNAL:
129434709573Sraf 		sigev_signal = 1;
129534709573Sraf 		break;
129634709573Sraf 	case SIGEV_THREAD:
129734709573Sraf 		sigev_thread = 1;
129834709573Sraf 		break;
129934709573Sraf 	case SIGEV_PORT:
130034709573Sraf 		sigev_port = 1;
130134709573Sraf 		break;
130234709573Sraf 	default:
1303f841f6adSraf 		aio_panic("_aiodone: improper sigev_notify");
130434709573Sraf 		break;
130534709573Sraf 	}
13067c478bd9Sstevel@tonic-gate 
130734709573Sraf 	/*
130834709573Sraf 	 * Figure out the notification parameters while holding __aio_mutex.
130934709573Sraf 	 * Actually perform the notifications after dropping __aio_mutex.
131034709573Sraf 	 * This allows us to sleep for a long time (if the notifications
131134709573Sraf 	 * incur delays) without impeding other async I/O operations.
131234709573Sraf 	 */
13137c478bd9Sstevel@tonic-gate 
131434709573Sraf 	sig_mutex_lock(&__aio_mutex);
131534709573Sraf 
131634709573Sraf 	if (sigev_signal) {
131734709573Sraf 		if ((np.np_signo = reqp->req_sigevent.sigev_signo) != 0)
131834709573Sraf 			notify = 1;
131934709573Sraf 		np.np_user = reqp->req_sigevent.sigev_value.sival_ptr;
132034709573Sraf 	} else if (sigev_thread | sigev_port) {
132134709573Sraf 		if ((np.np_port = reqp->req_sigevent.sigev_signo) >= 0)
132234709573Sraf 			notify = 1;
132334709573Sraf 		np.np_event = reqp->req_op;
132434709573Sraf 		if (np.np_event == AIOFSYNC && reqp->req_largefile)
132534709573Sraf 			np.np_event = AIOFSYNC64;
132634709573Sraf 		np.np_object = (uintptr_t)reqp->req_aiocbp;
132734709573Sraf 		np.np_user = reqp->req_sigevent.sigev_value.sival_ptr;
132834709573Sraf 	}
13297c478bd9Sstevel@tonic-gate 
133034709573Sraf 	if (resultp->aio_errno == EINPROGRESS)
133134709573Sraf 		_aio_set_result(reqp, retval, error);
13327c478bd9Sstevel@tonic-gate 
133334709573Sraf 	_aio_outstand_cnt--;
13347c478bd9Sstevel@tonic-gate 
133534709573Sraf 	head = reqp->req_head;
133634709573Sraf 	reqp->req_head = NULL;
13377c478bd9Sstevel@tonic-gate 
133834709573Sraf 	if (sigev_none) {
133934709573Sraf 		_aio_enq_doneq(reqp);
134034709573Sraf 		reqp = NULL;
134134709573Sraf 	} else {
134234709573Sraf 		(void) _aio_hash_del(resultp);
134334709573Sraf 		_aio_req_mark_done(reqp);
134434709573Sraf 	}
13457c478bd9Sstevel@tonic-gate 
134634709573Sraf 	_aio_waitn_wakeup();
13477c478bd9Sstevel@tonic-gate 
134834709573Sraf 	/*
134934709573Sraf 	 * __aio_waitn() sets AIO_WAIT_INPROGRESS and
135034709573Sraf 	 * __aio_suspend() increments "_aio_kernel_suspend"
135134709573Sraf 	 * when they are waiting in the kernel for completed I/Os.
135234709573Sraf 	 *
135334709573Sraf 	 * _kaio(AIONOTIFY) awakens the corresponding function
135434709573Sraf 	 * in the kernel; then the corresponding __aio_waitn() or
135534709573Sraf 	 * __aio_suspend() function could reap the recently
135634709573Sraf 	 * completed I/Os (_aiodone()).
135734709573Sraf 	 */
135834709573Sraf 	if ((_aio_flags & AIO_WAIT_INPROGRESS) || _aio_kernel_suspend > 0)
135934709573Sraf 		(void) _kaio(AIONOTIFY);
13607c478bd9Sstevel@tonic-gate 
136134709573Sraf 	sig_mutex_unlock(&__aio_mutex);
13627c478bd9Sstevel@tonic-gate 
136334709573Sraf 	if (head != NULL) {
13647c478bd9Sstevel@tonic-gate 		/*
136534709573Sraf 		 * If all the lio requests have completed,
136634709573Sraf 		 * prepare to notify the waiting thread.
13677c478bd9Sstevel@tonic-gate 		 */
136834709573Sraf 		sig_mutex_lock(&head->lio_mutex);
136934709573Sraf 		ASSERT(head->lio_refcnt == head->lio_nent);
137034709573Sraf 		if (head->lio_refcnt == 1) {
137134709573Sraf 			int waiting = 0;
137234709573Sraf 			if (head->lio_mode == LIO_WAIT) {
137334709573Sraf 				if ((waiting = head->lio_waiting) != 0)
137434709573Sraf 					(void) cond_signal(&head->lio_cond_cv);
137534709573Sraf 			} else if (head->lio_port < 0) { /* none or signal */
137634709573Sraf 				if ((np.np_lio_signo = head->lio_signo) != 0)
137734709573Sraf 					notify = 1;
137834709573Sraf 				np.np_lio_user = head->lio_sigval.sival_ptr;
137934709573Sraf 			} else {			/* thread or port */
138034709573Sraf 				notify = 1;
138134709573Sraf 				np.np_lio_port = head->lio_port;
138234709573Sraf 				np.np_lio_event = head->lio_event;
138334709573Sraf 				np.np_lio_object =
138434709573Sraf 				    (uintptr_t)head->lio_sigevent;
138534709573Sraf 				np.np_lio_user = head->lio_sigval.sival_ptr;
13867c478bd9Sstevel@tonic-gate 			}
138734709573Sraf 			head->lio_nent = head->lio_refcnt = 0;
138834709573Sraf 			sig_mutex_unlock(&head->lio_mutex);
138934709573Sraf 			if (waiting == 0)
139034709573Sraf 				_aio_lio_free(head);
139134709573Sraf 		} else {
139234709573Sraf 			head->lio_nent--;
139334709573Sraf 			head->lio_refcnt--;
139434709573Sraf 			sig_mutex_unlock(&head->lio_mutex);
13957c478bd9Sstevel@tonic-gate 		}
139634709573Sraf 	}
13977c478bd9Sstevel@tonic-gate 
139834709573Sraf 	/*
139934709573Sraf 	 * The request is completed; now perform the notifications.
140034709573Sraf 	 */
140134709573Sraf 	if (notify) {
140234709573Sraf 		if (reqp != NULL) {
14037c478bd9Sstevel@tonic-gate 			/*
140434709573Sraf 			 * We usually put the request on the notification
140534709573Sraf 			 * queue because we don't want to block and delay
140634709573Sraf 			 * other operations behind us in the work queue.
140734709573Sraf 			 * Also, we must never block on a cancel notification
140834709573Sraf 			 * because in that case we are being called from an
140934709573Sraf 			 * application thread, and blocking could deadlock if
141034709573Sraf 			 * no other thread is receiving notifications.
14117c478bd9Sstevel@tonic-gate 			 */
141234709573Sraf 			reqp->req_notify = np;
141334709573Sraf 			reqp->req_op = AIONOTIFY;
141434709573Sraf 			_aio_req_add(reqp, &__workers_no, AIONOTIFY);
141534709573Sraf 			reqp = NULL;
141634709573Sraf 		} else {
141734709573Sraf 			/*
141834709573Sraf 			 * We already put the request on the done queue,
141934709573Sraf 			 * so we can't queue it to the notification queue.
142034709573Sraf 			 * Just do the notification directly.
142134709573Sraf 			 */
142234709573Sraf 			send_notification(&np);
14237c478bd9Sstevel@tonic-gate 		}
14247c478bd9Sstevel@tonic-gate 	}
142534709573Sraf 
142634709573Sraf 	if (reqp != NULL)
142734709573Sraf 		_aio_req_free(reqp);
14287c478bd9Sstevel@tonic-gate }
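
/*
 * Editor's illustrative sketch (not part of libc): the lio_head
 * accounting above is what makes lio_listio(LIO_WAIT) block until the
 * whole list drains -- the last _aiodone() signals lio_cond_cv.  The
 * function name and buffer layout below are hypothetical.
 */
#if 0	/* example only -- not compiled */
#include <aio.h>

static int
read_pair(int fd, char *buf0, char *buf1, size_t len)
{
	aiocb_t cb0 = { 0 };
	aiocb_t cb1;
	aiocb_t *list[2] = { &cb0, &cb1 };

	cb0.aio_fildes = fd;
	cb0.aio_buf = buf0;
	cb0.aio_nbytes = len;
	cb0.aio_offset = 0;
	cb0.aio_lio_opcode = LIO_READ;

	cb1 = cb0;			/* same request, second half */
	cb1.aio_buf = buf1;
	cb1.aio_offset = (off_t)len;

	/* returns when both reads have gone through _aiodone() */
	return (lio_listio(LIO_WAIT, list, 2, NULL));
}
#endif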
14297c478bd9Sstevel@tonic-gate 
14307c478bd9Sstevel@tonic-gate /*
143134709573Sraf  * Delete fsync requests from the list head until there is
143234709573Sraf  * only one left.  Return 0 if the caller should proceed with
143334709573Sraf  * the fsync; otherwise dispose of the request and return non-zero.
14347c478bd9Sstevel@tonic-gate  */
14357c478bd9Sstevel@tonic-gate static int
1436c2575b5eSraf _aio_fsync_del(aio_worker_t *aiowp, aio_req_t *reqp)
14377c478bd9Sstevel@tonic-gate {
143834709573Sraf 	aio_lio_t *head = reqp->req_head;
143934709573Sraf 	int rval = 0;
144034709573Sraf 
1441c2575b5eSraf 	ASSERT(reqp == aiowp->work_req);
1442c2575b5eSraf 	sig_mutex_lock(&aiowp->work_qlock1);
144334709573Sraf 	sig_mutex_lock(&head->lio_mutex);
144434709573Sraf 	if (head->lio_refcnt > 1) {
144534709573Sraf 		head->lio_refcnt--;
144634709573Sraf 		head->lio_nent--;
1447c2575b5eSraf 		aiowp->work_req = NULL;
144834709573Sraf 		sig_mutex_unlock(&head->lio_mutex);
1449c2575b5eSraf 		sig_mutex_unlock(&aiowp->work_qlock1);
145034709573Sraf 		sig_mutex_lock(&__aio_mutex);
145134709573Sraf 		_aio_outstand_cnt--;
145234709573Sraf 		_aio_waitn_wakeup();
145334709573Sraf 		sig_mutex_unlock(&__aio_mutex);
145434709573Sraf 		_aio_req_free(reqp);
145534709573Sraf 		return (1);
14567c478bd9Sstevel@tonic-gate 	}
145734709573Sraf 	ASSERT(head->lio_nent == 1 && head->lio_refcnt == 1);
145834709573Sraf 	reqp->req_head = NULL;
145934709573Sraf 	if (head->lio_canned)
146034709573Sraf 		reqp->req_state = AIO_REQ_CANCELED;
146134709573Sraf 	if (head->lio_mode == LIO_DESTROY) {
1462c2575b5eSraf 		aiowp->work_req = NULL;
146334709573Sraf 		rval = 1;
146434709573Sraf 	}
1465c2575b5eSraf 	sig_mutex_unlock(&head->lio_mutex);
1466c2575b5eSraf 	sig_mutex_unlock(&aiowp->work_qlock1);
146734709573Sraf 	head->lio_refcnt--;
146834709573Sraf 	head->lio_nent--;
146934709573Sraf 	_aio_lio_free(head);
1470c2575b5eSraf 	if (rval != 0)
1471c2575b5eSraf 		_aio_req_free(reqp);
147234709573Sraf 	return (rval);
14737c478bd9Sstevel@tonic-gate }
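
/*
 * Editor's illustrative sketch (not part of libc): the list-head
 * accounting above serializes an application-level aio_fsync()
 * barrier behind the writes queued ahead of it.  The helper name is
 * hypothetical; aio_fsync(3C) itself is the documented interface.
 */
#if 0	/* example only -- not compiled */
#include <aio.h>
#include <fcntl.h>
#include <string.h>

static int
queue_fsync_barrier(int fd, aiocb_t *cbp)
{
	(void) memset(cbp, 0, sizeof (*cbp));
	cbp->aio_fildes = fd;
	/* SIGEV_NONE: poll with aio_error(cbp) until it leaves EINPROGRESS */
	cbp->aio_sigevent.sigev_notify = SIGEV_NONE;
	/* O_DSYNC selects the FDSYNC __fdsync() case; O_SYNC selects FSYNC */
	return (aio_fsync(O_DSYNC, cbp));
}
#endif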
14747c478bd9Sstevel@tonic-gate 
14757c478bd9Sstevel@tonic-gate /*
1476f841f6adSraf  * A worker is set idle when its work queue is empty.
1477f841f6adSraf  * The worker rechecks, under its queue lock, that it has
1478f841f6adSraf  * no more work and then sleeps waiting for more.
14797c478bd9Sstevel@tonic-gate  */
1480f841f6adSraf int
14817c478bd9Sstevel@tonic-gate _aio_idle(aio_worker_t *aiowp)
14827c478bd9Sstevel@tonic-gate {
148334709573Sraf 	int error = 0;
148434709573Sraf 
148534709573Sraf 	sig_mutex_lock(&aiowp->work_qlock1);
148634709573Sraf 	if (aiowp->work_count1 == 0) {
148734709573Sraf 		ASSERT(aiowp->work_minload1 == 0);
14887c478bd9Sstevel@tonic-gate 		aiowp->work_idleflg = 1;
14897c478bd9Sstevel@tonic-gate 		/*
149034709573Sraf 		 * A cancellation handler is not needed here.
149134709573Sraf 		 * AIO worker threads are never cancelled via pthread_cancel().
14927c478bd9Sstevel@tonic-gate 		 */
149334709573Sraf 		error = sig_cond_wait(&aiowp->work_idle_cv,
149434709573Sraf 		    &aiowp->work_qlock1);
149534709573Sraf 		/*
149634709573Sraf 		 * The idle flag is normally cleared before the worker is
149734709573Sraf 		 * awakened by _aio_req_add().  On EINTR, we clear it ourselves.
149834709573Sraf 		 */
149934709573Sraf 		if (error)
150034709573Sraf 			aiowp->work_idleflg = 0;
15017c478bd9Sstevel@tonic-gate 	}
150234709573Sraf 	sig_mutex_unlock(&aiowp->work_qlock1);
1503f841f6adSraf 	return (error);
15047c478bd9Sstevel@tonic-gate }
15057c478bd9Sstevel@tonic-gate 
15067c478bd9Sstevel@tonic-gate /*
15077c478bd9Sstevel@tonic-gate  * A worker's completed AIO requests are placed onto a global
150834709573Sraf  * done queue.  The application is sent a SIGIO signal only if
15097c478bd9Sstevel@tonic-gate  * the process has a handler enabled and is not waiting via
15107c478bd9Sstevel@tonic-gate  * aiowait().
15117c478bd9Sstevel@tonic-gate  */
15127c478bd9Sstevel@tonic-gate static void
151334709573Sraf _aio_work_done(aio_worker_t *aiowp)
15147c478bd9Sstevel@tonic-gate {
151534709573Sraf 	aio_req_t *reqp;
15167c478bd9Sstevel@tonic-gate 
151734709573Sraf 	sig_mutex_lock(&aiowp->work_qlock1);
151834709573Sraf 	reqp = aiowp->work_prev1;
151934709573Sraf 	reqp->req_next = NULL;
15207c478bd9Sstevel@tonic-gate 	aiowp->work_done1 = 0;
15217c478bd9Sstevel@tonic-gate 	aiowp->work_tail1 = aiowp->work_next1;
15227c478bd9Sstevel@tonic-gate 	if (aiowp->work_tail1 == NULL)
15237c478bd9Sstevel@tonic-gate 		aiowp->work_head1 = NULL;
15247c478bd9Sstevel@tonic-gate 	aiowp->work_prev1 = NULL;
152534709573Sraf 	sig_mutex_unlock(&aiowp->work_qlock1);
152634709573Sraf 	sig_mutex_lock(&__aio_mutex);
15277c478bd9Sstevel@tonic-gate 	_aio_donecnt++;
15287c478bd9Sstevel@tonic-gate 	_aio_outstand_cnt--;
15297c478bd9Sstevel@tonic-gate 	_aio_req_done_cnt--;
153034709573Sraf 	ASSERT(_aio_donecnt > 0 &&
153134709573Sraf 	    _aio_outstand_cnt >= 0 &&
153234709573Sraf 	    _aio_req_done_cnt >= 0);
153334709573Sraf 	ASSERT(reqp != NULL);
15347c478bd9Sstevel@tonic-gate 
15357c478bd9Sstevel@tonic-gate 	if (_aio_done_tail == NULL) {
153634709573Sraf 		_aio_done_head = _aio_done_tail = reqp;
15377c478bd9Sstevel@tonic-gate 	} else {
153834709573Sraf 		_aio_done_head->req_next = reqp;
153934709573Sraf 		_aio_done_head = reqp;
15407c478bd9Sstevel@tonic-gate 	}
15417c478bd9Sstevel@tonic-gate 
15427c478bd9Sstevel@tonic-gate 	if (_aiowait_flag) {
154334709573Sraf 		sig_mutex_unlock(&__aio_mutex);
15447c478bd9Sstevel@tonic-gate 		(void) _kaio(AIONOTIFY);
15457c478bd9Sstevel@tonic-gate 	} else {
154634709573Sraf 		sig_mutex_unlock(&__aio_mutex);
154734709573Sraf 		if (_sigio_enabled)
15487c478bd9Sstevel@tonic-gate 			(void) kill(__pid, SIGIO);
15497c478bd9Sstevel@tonic-gate 	}
15507c478bd9Sstevel@tonic-gate }
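
/*
 * Editor's illustrative sketch (not part of libc): the done queue and
 * SIGIO/AIONOTIFY logic above serve the classic Solaris aioread(3AIO)/
 * aiowait(3AIO) interface.  The function name below is hypothetical;
 * the calls themselves are the documented interfaces.
 */
#if 0	/* example only -- not compiled */
#include <sys/asynch.h>
#include <unistd.h>

static ssize_t
read_and_wait(int fd, char *buf, int len)
{
	aio_result_t result;
	aio_result_t *resultp;

	if (aioread(fd, buf, len, 0L, SEEK_SET, &result) == -1)
		return (-1);
	/* blocks until some request completes; see _aio_work_done() above */
	resultp = aiowait(NULL);
	if (resultp == NULL || resultp == (aio_result_t *)-1)
		return (-1);
	/* Solaris convention: aio_return changed from AIO_INPROGRESS */
	return (resultp->aio_return);
}
#endif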
15517c478bd9Sstevel@tonic-gate 
15527c478bd9Sstevel@tonic-gate /*
155334709573Sraf  * The done queue consists of AIO requests in either the AIO_REQ_DONE
155434709573Sraf  * or AIO_REQ_CANCELED state; cancelled requests are discarded.  Return
155534709573Sraf  * the address of a done aio_result_t, NULL if the queue is empty but
155634709573Sraf  * requests remain outstanding, or (aio_result_t *)-1 if nothing remains.
15577c478bd9Sstevel@tonic-gate  */
155834709573Sraf aio_result_t *
15597c478bd9Sstevel@tonic-gate _aio_req_done(void)
15607c478bd9Sstevel@tonic-gate {
156134709573Sraf 	aio_req_t *reqp;
15627c478bd9Sstevel@tonic-gate 	aio_result_t *resultp;
15637c478bd9Sstevel@tonic-gate 
15647c478bd9Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&__aio_mutex));
15657c478bd9Sstevel@tonic-gate 
156634709573Sraf 	if ((reqp = _aio_done_tail) != NULL) {
156734709573Sraf 		if ((_aio_done_tail = reqp->req_next) == NULL)
156834709573Sraf 			_aio_done_head = NULL;
15697c478bd9Sstevel@tonic-gate 		ASSERT(_aio_donecnt > 0);
15707c478bd9Sstevel@tonic-gate 		_aio_donecnt--;
157134709573Sraf 		(void) _aio_hash_del(reqp->req_resultp);
157234709573Sraf 		resultp = reqp->req_resultp;
157334709573Sraf 		ASSERT(reqp->req_state == AIO_REQ_DONE);
157434709573Sraf 		_aio_req_free(reqp);
15757c478bd9Sstevel@tonic-gate 		return (resultp);
15767c478bd9Sstevel@tonic-gate 	}
15777c478bd9Sstevel@tonic-gate 	/* is queue empty? */
157834709573Sraf 	if (reqp == NULL && _aio_outstand_cnt == 0) {
15797c478bd9Sstevel@tonic-gate 		return ((aio_result_t *)-1);
15807c478bd9Sstevel@tonic-gate 	}
15817c478bd9Sstevel@tonic-gate 	return (NULL);
15827c478bd9Sstevel@tonic-gate }
15837c478bd9Sstevel@tonic-gate 
15847c478bd9Sstevel@tonic-gate /*
158534709573Sraf  * Set the return and errno values for the application's use.
158634709573Sraf  *
158734709573Sraf  * For the Posix interfaces, we must set the return value first followed
158834709573Sraf  * by the errno value because the Posix interfaces allow for a change
158934709573Sraf  * in the errno value from EINPROGRESS to something else to signal
159034709573Sraf  * the completion of the asynchronous request.
159134709573Sraf  *
159234709573Sraf  * The opposite is true for the Solaris interfaces.  These allow for
159334709573Sraf  * a change in the return value from AIO_INPROGRESS to something else
159434709573Sraf  * to signal the completion of the asynchronous request.
15957c478bd9Sstevel@tonic-gate  */
15967c478bd9Sstevel@tonic-gate void
159734709573Sraf _aio_set_result(aio_req_t *reqp, ssize_t retval, int error)
15987c478bd9Sstevel@tonic-gate {
159934709573Sraf 	aio_result_t *resultp = reqp->req_resultp;
160034709573Sraf 
160134709573Sraf 	if (POSIX_AIO(reqp)) {
160234709573Sraf 		resultp->aio_return = retval;
160334709573Sraf 		membar_producer();
160434709573Sraf 		resultp->aio_errno = error;
160534709573Sraf 	} else {
160634709573Sraf 		resultp->aio_errno = error;
160734709573Sraf 		membar_producer();
160834709573Sraf 		resultp->aio_return = retval;
160934709573Sraf 	}
161034709573Sraf }
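
/*
 * Editor's illustrative sketch (not part of libc): a poller relying on
 * the store ordering established above for a Posix request.  Because
 * aio_return is stored before aio_errno (with a membar_producer()
 * between them), aio_errno acts as the guard word: once it leaves
 * EINPROGRESS, aio_return is already valid.  The real consumers are
 * aio_error(3C)/aio_return(3C); this helper is hypothetical.
 */
#if 0	/* example only -- not compiled */
static int
posix_result_is_ready(volatile aio_result_t *resultp, ssize_t *retvalp)
{
	if (resultp->aio_errno == EINPROGRESS)
		return (0);
	membar_consumer();	/* pairs with membar_producer() above */
	*retvalp = resultp->aio_return;
	return (1);
}
#endif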
161134709573Sraf 
161234709573Sraf /*
161334709573Sraf  * Add an AIO request onto the next work queue.
161434709573Sraf  * A circular list of workers is used to choose the next worker.
161534709573Sraf  */
161634709573Sraf void
161734709573Sraf _aio_req_add(aio_req_t *reqp, aio_worker_t **nextworker, int mode)
161834709573Sraf {
1619f841f6adSraf 	ulwp_t *self = curthread;
162034709573Sraf 	aio_worker_t *aiowp;
162134709573Sraf 	aio_worker_t *first;
162234709573Sraf 	int load_bal_flg = 1;
162334709573Sraf 	int found;
162434709573Sraf 
162534709573Sraf 	ASSERT(reqp->req_state != AIO_REQ_DONEQ);
162634709573Sraf 	reqp->req_next = NULL;
16277c478bd9Sstevel@tonic-gate 	/*
162834709573Sraf 	 * Try to acquire the next worker's work queue.  If it is locked,
16297c478bd9Sstevel@tonic-gate 	 * then search the list of workers until an unlocked queue is found,
16307c478bd9Sstevel@tonic-gate 	 * or until the list has been completely traversed, at which point
16317c478bd9Sstevel@tonic-gate 	 * another worker will be created.
16327c478bd9Sstevel@tonic-gate 	 */
1633f841f6adSraf 	sigoff(self);		/* defer SIGIO */
163434709573Sraf 	sig_mutex_lock(&__aio_mutex);
163534709573Sraf 	first = aiowp = *nextworker;
163634709573Sraf 	if (mode != AIONOTIFY)
16377c478bd9Sstevel@tonic-gate 		_aio_outstand_cnt++;
163834709573Sraf 	sig_mutex_unlock(&__aio_mutex);
163934709573Sraf 
16407c478bd9Sstevel@tonic-gate 	switch (mode) {
164134709573Sraf 	case AIOREAD:
164234709573Sraf 	case AIOWRITE:
164334709573Sraf 	case AIOAREAD:
164434709573Sraf 	case AIOAWRITE:
164534709573Sraf #if !defined(_LP64)
164634709573Sraf 	case AIOAREAD64:
164734709573Sraf 	case AIOAWRITE64:
16487c478bd9Sstevel@tonic-gate #endif
164934709573Sraf 		/* try to find an idle worker */
165034709573Sraf 		found = 0;
165134709573Sraf 		do {
165234709573Sraf 			if (sig_mutex_trylock(&aiowp->work_qlock1) == 0) {
165334709573Sraf 				if (aiowp->work_idleflg) {
165434709573Sraf 					found = 1;
16557c478bd9Sstevel@tonic-gate 					break;
16567c478bd9Sstevel@tonic-gate 				}
165734709573Sraf 				sig_mutex_unlock(&aiowp->work_qlock1);
16587c478bd9Sstevel@tonic-gate 			}
165934709573Sraf 		} while ((aiowp = aiowp->work_forw) != first);
16607c478bd9Sstevel@tonic-gate 
166134709573Sraf 		if (found) {
166234709573Sraf 			aiowp->work_minload1++;
166334709573Sraf 			break;
166434709573Sraf 		}
16657c478bd9Sstevel@tonic-gate 
166634709573Sraf 		/* try to acquire some worker's queue lock */
166734709573Sraf 		do {
166834709573Sraf 			if (sig_mutex_trylock(&aiowp->work_qlock1) == 0) {
166934709573Sraf 				found = 1;
167034709573Sraf 				break;
16717c478bd9Sstevel@tonic-gate 			}
167234709573Sraf 		} while ((aiowp = aiowp->work_forw) != first);
167334709573Sraf 
167434709573Sraf 		/*
167534709573Sraf 		 * Create more workers when the workers appear overloaded.
167634709573Sraf 		 * Either all the workers are busy draining their queues
167734709573Sraf 		 * or no worker's queue lock could be acquired.
167834709573Sraf 		 */
167934709573Sraf 		if (!found) {
168034709573Sraf 			if (_aio_worker_cnt < _max_workers) {
168134709573Sraf 				if (_aio_create_worker(reqp, mode))
1682f841f6adSraf 					aio_panic("_aio_req_add: add worker");
1683f841f6adSraf 				sigon(self);	/* reenable SIGIO */
16847c478bd9Sstevel@tonic-gate 				return;
16857c478bd9Sstevel@tonic-gate 			}
168634709573Sraf 
168734709573Sraf 			/*
168834709573Sraf 			 * No worker is available and we have already
168934709573Sraf 			 * created _max_workers; keep going through the
169034709573Sraf 			 * list slowly until we get a lock.
169134709573Sraf 			 */
169234709573Sraf 			while (sig_mutex_trylock(&aiowp->work_qlock1) != 0) {
169334709573Sraf 				/*
169434709573Sraf 				 * give someone else a chance
169534709573Sraf 				 */
169634709573Sraf 				_aio_delay(1);
169734709573Sraf 				aiowp = aiowp->work_forw;
169834709573Sraf 			}
169934709573Sraf 		}
170034709573Sraf 
170134709573Sraf 		ASSERT(MUTEX_HELD(&aiowp->work_qlock1));
170234709573Sraf 		if (_aio_worker_cnt < _max_workers &&
170334709573Sraf 		    aiowp->work_minload1 >= _minworkload) {
170434709573Sraf 			sig_mutex_unlock(&aiowp->work_qlock1);
170534709573Sraf 			sig_mutex_lock(&__aio_mutex);
170634709573Sraf 			*nextworker = aiowp->work_forw;
170734709573Sraf 			sig_mutex_unlock(&__aio_mutex);
170834709573Sraf 			if (_aio_create_worker(reqp, mode))
1709f841f6adSraf 				aio_panic("_aio_req_add: add worker");
1710f841f6adSraf 			sigon(self);	/* reenable SIGIO */
171134709573Sraf 			return;
171234709573Sraf 		}
171334709573Sraf 		aiowp->work_minload1++;
171434709573Sraf 		break;
171534709573Sraf 	case AIOFSYNC:
171634709573Sraf 	case AIONOTIFY:
171734709573Sraf 		load_bal_flg = 0;
171834709573Sraf 		sig_mutex_lock(&aiowp->work_qlock1);
171934709573Sraf 		break;
172034709573Sraf 	default:
1721f841f6adSraf 		aio_panic("_aio_req_add: invalid mode");
172234709573Sraf 		break;
17237c478bd9Sstevel@tonic-gate 	}
17247c478bd9Sstevel@tonic-gate 	/*
17257c478bd9Sstevel@tonic-gate 	 * Put request onto worker's work queue.
17267c478bd9Sstevel@tonic-gate 	 */
17277c478bd9Sstevel@tonic-gate 	if (aiowp->work_tail1 == NULL) {
172834709573Sraf 		ASSERT(aiowp->work_count1 == 0);
172934709573Sraf 		aiowp->work_tail1 = reqp;
173034709573Sraf 		aiowp->work_next1 = reqp;
17317c478bd9Sstevel@tonic-gate 	} else {
173234709573Sraf 		aiowp->work_head1->req_next = reqp;
17337c478bd9Sstevel@tonic-gate 		if (aiowp->work_next1 == NULL)
173434709573Sraf 			aiowp->work_next1 = reqp;
17357c478bd9Sstevel@tonic-gate 	}
173634709573Sraf 	reqp->req_state = AIO_REQ_QUEUED;
173734709573Sraf 	reqp->req_worker = aiowp;
173834709573Sraf 	aiowp->work_head1 = reqp;
17397c478bd9Sstevel@tonic-gate 	/*
17407c478bd9Sstevel@tonic-gate 	 * Awaken worker if it is not currently active.
17417c478bd9Sstevel@tonic-gate 	 */
174234709573Sraf 	if (aiowp->work_count1++ == 0 && aiowp->work_idleflg) {
17437c478bd9Sstevel@tonic-gate 		aiowp->work_idleflg = 0;
174434709573Sraf 		(void) cond_signal(&aiowp->work_idle_cv);
17457c478bd9Sstevel@tonic-gate 	}
174634709573Sraf 	sig_mutex_unlock(&aiowp->work_qlock1);
174734709573Sraf 
174834709573Sraf 	if (load_bal_flg) {
174934709573Sraf 		sig_mutex_lock(&__aio_mutex);
175034709573Sraf 		*nextworker = aiowp->work_forw;
175134709573Sraf 		sig_mutex_unlock(&__aio_mutex);
175234709573Sraf 	}
1753f841f6adSraf 	sigon(self);	/* reenable SIGIO */
17547c478bd9Sstevel@tonic-gate }
17557c478bd9Sstevel@tonic-gate 
17567c478bd9Sstevel@tonic-gate /*
175734709573Sraf  * Get an AIO request for a specified worker.
175834709573Sraf  * If the work queue is empty, return NULL.
17597c478bd9Sstevel@tonic-gate  */
17607c478bd9Sstevel@tonic-gate aio_req_t *
17617c478bd9Sstevel@tonic-gate _aio_req_get(aio_worker_t *aiowp)
17627c478bd9Sstevel@tonic-gate {
176334709573Sraf 	aio_req_t *reqp;
17647c478bd9Sstevel@tonic-gate 
176534709573Sraf 	sig_mutex_lock(&aiowp->work_qlock1);
176634709573Sraf 	if ((reqp = aiowp->work_next1) != NULL) {
17677c478bd9Sstevel@tonic-gate 		/*
176834709573Sraf 		 * Remove a POSIX request from the queue; the
17697c478bd9Sstevel@tonic-gate 		 * request queue is a singly linked list
177034709573Sraf 		 * with a previous pointer.  The request is
177134709573Sraf 		 * removed by updating the previous pointer.
17727c478bd9Sstevel@tonic-gate 		 *
177334709573Sraf 		 * Non-Posix requests are left on the queue
177434709573Sraf 		 * to eventually be placed on the done queue.
17757c478bd9Sstevel@tonic-gate 		 */
17767c478bd9Sstevel@tonic-gate 
177734709573Sraf 		if (POSIX_AIO(reqp)) {
17787c478bd9Sstevel@tonic-gate 			if (aiowp->work_prev1 == NULL) {
177934709573Sraf 				aiowp->work_tail1 = reqp->req_next;
17807c478bd9Sstevel@tonic-gate 				if (aiowp->work_tail1 == NULL)
17817c478bd9Sstevel@tonic-gate 					aiowp->work_head1 = NULL;
17827c478bd9Sstevel@tonic-gate 			} else {
178334709573Sraf 				aiowp->work_prev1->req_next = reqp->req_next;
178434709573Sraf 				if (aiowp->work_head1 == reqp)
178534709573Sraf 					aiowp->work_head1 = reqp->req_next;
17867c478bd9Sstevel@tonic-gate 			}
17877c478bd9Sstevel@tonic-gate 
17887c478bd9Sstevel@tonic-gate 		} else {
178934709573Sraf 			aiowp->work_prev1 = reqp;
17907c478bd9Sstevel@tonic-gate 			ASSERT(aiowp->work_done1 >= 0);
17917c478bd9Sstevel@tonic-gate 			aiowp->work_done1++;
17927c478bd9Sstevel@tonic-gate 		}
179334709573Sraf 		ASSERT(reqp != reqp->req_next);
179434709573Sraf 		aiowp->work_next1 = reqp->req_next;
179534709573Sraf 		ASSERT(aiowp->work_count1 >= 1);
179634709573Sraf 		aiowp->work_count1--;
179734709573Sraf 		switch (reqp->req_op) {
179834709573Sraf 		case AIOREAD:
179934709573Sraf 		case AIOWRITE:
180034709573Sraf 		case AIOAREAD:
180134709573Sraf 		case AIOAWRITE:
180234709573Sraf #if !defined(_LP64)
180334709573Sraf 		case AIOAREAD64:
180434709573Sraf 		case AIOAWRITE64:
18057c478bd9Sstevel@tonic-gate #endif
180634709573Sraf 			ASSERT(aiowp->work_minload1 > 0);
180734709573Sraf 			aiowp->work_minload1--;
180834709573Sraf 			break;
180934709573Sraf 		}
181034709573Sraf 		reqp->req_state = AIO_REQ_INPROGRESS;
18117c478bd9Sstevel@tonic-gate 	}
181234709573Sraf 	aiowp->work_req = reqp;
181334709573Sraf 	ASSERT(reqp != NULL || aiowp->work_count1 == 0);
181434709573Sraf 	sig_mutex_unlock(&aiowp->work_qlock1);
181534709573Sraf 	return (reqp);
18167c478bd9Sstevel@tonic-gate }
18177c478bd9Sstevel@tonic-gate 
18187c478bd9Sstevel@tonic-gate static void
181934709573Sraf _aio_req_del(aio_worker_t *aiowp, aio_req_t *reqp, int ostate)
18207c478bd9Sstevel@tonic-gate {
182134709573Sraf 	aio_req_t **last;
182234709573Sraf 	aio_req_t *lastrp;
182334709573Sraf 	aio_req_t *next;
18247c478bd9Sstevel@tonic-gate 
18257c478bd9Sstevel@tonic-gate 	ASSERT(aiowp != NULL);
18267c478bd9Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&aiowp->work_qlock1));
182734709573Sraf 	if (POSIX_AIO(reqp)) {
18287c478bd9Sstevel@tonic-gate 		if (ostate != AIO_REQ_QUEUED)
18297c478bd9Sstevel@tonic-gate 			return;
18307c478bd9Sstevel@tonic-gate 	}
18317c478bd9Sstevel@tonic-gate 	last = &aiowp->work_tail1;
18327c478bd9Sstevel@tonic-gate 	lastrp = aiowp->work_tail1;
18337c478bd9Sstevel@tonic-gate 	ASSERT(ostate == AIO_REQ_QUEUED || ostate == AIO_REQ_INPROGRESS);
18347c478bd9Sstevel@tonic-gate 	while ((next = *last) != NULL) {
183534709573Sraf 		if (next == reqp) {
18367c478bd9Sstevel@tonic-gate 			*last = next->req_next;
18377c478bd9Sstevel@tonic-gate 			if (aiowp->work_next1 == next)
18387c478bd9Sstevel@tonic-gate 				aiowp->work_next1 = next->req_next;
18397c478bd9Sstevel@tonic-gate 
18407c478bd9Sstevel@tonic-gate 			if ((next->req_next != NULL) ||
18417c478bd9Sstevel@tonic-gate 			    (aiowp->work_done1 == 0)) {
18427c478bd9Sstevel@tonic-gate 				if (aiowp->work_head1 == next)
18437c478bd9Sstevel@tonic-gate 					aiowp->work_head1 = next->req_next;
18447c478bd9Sstevel@tonic-gate 				if (aiowp->work_prev1 == next)
18457c478bd9Sstevel@tonic-gate 					aiowp->work_prev1 = next->req_next;
18467c478bd9Sstevel@tonic-gate 			} else {
18477c478bd9Sstevel@tonic-gate 				if (aiowp->work_head1 == next)
18487c478bd9Sstevel@tonic-gate 					aiowp->work_head1 = lastrp;
18497c478bd9Sstevel@tonic-gate 				if (aiowp->work_prev1 == next)
18507c478bd9Sstevel@tonic-gate 					aiowp->work_prev1 = lastrp;
18517c478bd9Sstevel@tonic-gate 			}
18527c478bd9Sstevel@tonic-gate 
18537c478bd9Sstevel@tonic-gate 			if (ostate == AIO_REQ_QUEUED) {
185434709573Sraf 				ASSERT(aiowp->work_count1 >= 1);
185534709573Sraf 				aiowp->work_count1--;
185634709573Sraf 				ASSERT(aiowp->work_minload1 >= 1);
185734709573Sraf 				aiowp->work_minload1--;
18587c478bd9Sstevel@tonic-gate 			} else {
18597c478bd9Sstevel@tonic-gate 				ASSERT(ostate == AIO_REQ_INPROGRESS &&
186034709573Sraf 				    !POSIX_AIO(reqp));
18617c478bd9Sstevel@tonic-gate 				aiowp->work_done1--;
18627c478bd9Sstevel@tonic-gate 			}
18637c478bd9Sstevel@tonic-gate 			return;
18647c478bd9Sstevel@tonic-gate 		}
18657c478bd9Sstevel@tonic-gate 		last = &next->req_next;
18667c478bd9Sstevel@tonic-gate 		lastrp = next;
18677c478bd9Sstevel@tonic-gate 	}
18687c478bd9Sstevel@tonic-gate 	/* NOTREACHED */
18697c478bd9Sstevel@tonic-gate }
18707c478bd9Sstevel@tonic-gate 
18717c478bd9Sstevel@tonic-gate static void
18727c478bd9Sstevel@tonic-gate _aio_enq_doneq(aio_req_t *reqp)
18737c478bd9Sstevel@tonic-gate {
18747c478bd9Sstevel@tonic-gate 	if (_aio_doneq == NULL) {
18757c478bd9Sstevel@tonic-gate 		_aio_doneq = reqp;
187634709573Sraf 		reqp->req_next = reqp->req_prev = reqp;
18777c478bd9Sstevel@tonic-gate 	} else {
18787c478bd9Sstevel@tonic-gate 		reqp->req_next = _aio_doneq;
18797c478bd9Sstevel@tonic-gate 		reqp->req_prev = _aio_doneq->req_prev;
188034709573Sraf 		_aio_doneq->req_prev->req_next = reqp;
18817c478bd9Sstevel@tonic-gate 		_aio_doneq->req_prev = reqp;
18827c478bd9Sstevel@tonic-gate 	}
18837c478bd9Sstevel@tonic-gate 	reqp->req_state = AIO_REQ_DONEQ;
18847c478bd9Sstevel@tonic-gate 	_aio_doneq_cnt++;
18857c478bd9Sstevel@tonic-gate }
18867c478bd9Sstevel@tonic-gate 
18877c478bd9Sstevel@tonic-gate /*
18887c478bd9Sstevel@tonic-gate  * The caller owns __aio_mutex.
18897c478bd9Sstevel@tonic-gate  */
18907c478bd9Sstevel@tonic-gate aio_req_t *
18917c478bd9Sstevel@tonic-gate _aio_req_remove(aio_req_t *reqp)
18927c478bd9Sstevel@tonic-gate {
18937c478bd9Sstevel@tonic-gate 	if (reqp && reqp->req_state != AIO_REQ_DONEQ)
18947c478bd9Sstevel@tonic-gate 		return (NULL);
18957c478bd9Sstevel@tonic-gate 
18967c478bd9Sstevel@tonic-gate 	if (reqp) {
18977c478bd9Sstevel@tonic-gate 		/* request in done queue */
189834709573Sraf 		if (_aio_doneq == reqp)
189934709573Sraf 			_aio_doneq = reqp->req_next;
190034709573Sraf 		if (_aio_doneq == reqp) {
19017c478bd9Sstevel@tonic-gate 			/* only one request on queue */
19027c478bd9Sstevel@tonic-gate 			_aio_doneq = NULL;
19037c478bd9Sstevel@tonic-gate 		} else {
190434709573Sraf 			aio_req_t *tmp = reqp->req_next;
190534709573Sraf 			reqp->req_prev->req_next = tmp;
190634709573Sraf 			tmp->req_prev = reqp->req_prev;
19077c478bd9Sstevel@tonic-gate 		}
190834709573Sraf 	} else if ((reqp = _aio_doneq) != NULL) {
190934709573Sraf 		if (reqp == reqp->req_next) {
19107c478bd9Sstevel@tonic-gate 			/* only one request on queue */
19117c478bd9Sstevel@tonic-gate 			_aio_doneq = NULL;
19127c478bd9Sstevel@tonic-gate 		} else {
191334709573Sraf 			reqp->req_prev->req_next = _aio_doneq = reqp->req_next;
191434709573Sraf 			_aio_doneq->req_prev = reqp->req_prev;
19157c478bd9Sstevel@tonic-gate 		}
191634709573Sraf 	}
191734709573Sraf 	if (reqp) {
19187c478bd9Sstevel@tonic-gate 		_aio_doneq_cnt--;
191934709573Sraf 		reqp->req_next = reqp->req_prev = reqp;
192034709573Sraf 		reqp->req_state = AIO_REQ_DONE;
19217c478bd9Sstevel@tonic-gate 	}
192234709573Sraf 	return (reqp);
19237c478bd9Sstevel@tonic-gate }
19247c478bd9Sstevel@tonic-gate 
19257c478bd9Sstevel@tonic-gate /*
192634709573Sraf  * An AIO request is identified by an aio_result_t pointer.  The library
192734709573Sraf  * maps this aio_result_t pointer to its internal representation using a
192834709573Sraf  * hash table.  This function adds an aio_result_t pointer to the hash table.
19297c478bd9Sstevel@tonic-gate  */
19307c478bd9Sstevel@tonic-gate static int
193134709573Sraf _aio_hash_insert(aio_result_t *resultp, aio_req_t *reqp)
19327c478bd9Sstevel@tonic-gate {
193334709573Sraf 	aio_hash_t *hashp;
193434709573Sraf 	aio_req_t **prev;
193534709573Sraf 	aio_req_t *next;
19367c478bd9Sstevel@tonic-gate 
193734709573Sraf 	hashp = _aio_hash + AIOHASH(resultp);
1938f841f6adSraf 	lmutex_lock(&hashp->hash_lock);
193934709573Sraf 	prev = &hashp->hash_ptr;
19407c478bd9Sstevel@tonic-gate 	while ((next = *prev) != NULL) {
19417c478bd9Sstevel@tonic-gate 		if (resultp == next->req_resultp) {
1942f841f6adSraf 			lmutex_unlock(&hashp->hash_lock);
194334709573Sraf 			return (-1);
19447c478bd9Sstevel@tonic-gate 		}
19457c478bd9Sstevel@tonic-gate 		prev = &next->req_link;
19467c478bd9Sstevel@tonic-gate 	}
194734709573Sraf 	*prev = reqp;
194834709573Sraf 	ASSERT(reqp->req_link == NULL);
1949f841f6adSraf 	lmutex_unlock(&hashp->hash_lock);
195034709573Sraf 	return (0);
19517c478bd9Sstevel@tonic-gate }
19527c478bd9Sstevel@tonic-gate 
19537c478bd9Sstevel@tonic-gate /*
195434709573Sraf  * Remove an entry from the hash table.
19557c478bd9Sstevel@tonic-gate  */
195634709573Sraf aio_req_t *
195734709573Sraf _aio_hash_del(aio_result_t *resultp)
19587c478bd9Sstevel@tonic-gate {
195934709573Sraf 	aio_hash_t *hashp;
196034709573Sraf 	aio_req_t **prev;
196134709573Sraf 	aio_req_t *next = NULL;
196234709573Sraf 
196334709573Sraf 	if (_aio_hash != NULL) {
196434709573Sraf 		hashp = _aio_hash + AIOHASH(resultp);
1965f841f6adSraf 		lmutex_lock(&hashp->hash_lock);
196634709573Sraf 		prev = &hashp->hash_ptr;
196734709573Sraf 		while ((next = *prev) != NULL) {
196834709573Sraf 			if (resultp == next->req_resultp) {
196934709573Sraf 				*prev = next->req_link;
197034709573Sraf 				next->req_link = NULL;
197134709573Sraf 				break;
197234709573Sraf 			}
197334709573Sraf 			prev = &next->req_link;
19747c478bd9Sstevel@tonic-gate 		}
1975f841f6adSraf 		lmutex_unlock(&hashp->hash_lock);
19767c478bd9Sstevel@tonic-gate 	}
197734709573Sraf 	return (next);
19787c478bd9Sstevel@tonic-gate }
19797c478bd9Sstevel@tonic-gate 
19807c478bd9Sstevel@tonic-gate /*
198134709573Sraf  * Find an entry in the hash table.
19827c478bd9Sstevel@tonic-gate  */
19837c478bd9Sstevel@tonic-gate aio_req_t *
198434709573Sraf _aio_hash_find(aio_result_t *resultp)
19857c478bd9Sstevel@tonic-gate {
198634709573Sraf 	aio_hash_t *hashp;
198734709573Sraf 	aio_req_t **prev;
198834709573Sraf 	aio_req_t *next = NULL;
198934709573Sraf 
199034709573Sraf 	if (_aio_hash != NULL) {
199134709573Sraf 		hashp = _aio_hash + AIOHASH(resultp);
1992f841f6adSraf 		lmutex_lock(&hashp->hash_lock);
199334709573Sraf 		prev = &hashp->hash_ptr;
199434709573Sraf 		while ((next = *prev) != NULL) {
199534709573Sraf 			if (resultp == next->req_resultp)
199634709573Sraf 				break;
199734709573Sraf 			prev = &next->req_link;
199834709573Sraf 		}
1999f841f6adSraf 		lmutex_unlock(&hashp->hash_lock);
20007c478bd9Sstevel@tonic-gate 	}
200134709573Sraf 	return (next);
20027c478bd9Sstevel@tonic-gate }
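
/*
 * Editor's illustrative sketch (not part of libc): how the mapping is
 * used.  An aiocb is identified by the address of its embedded
 * aio_result_t (see _aio_rw() below), which hashes to the library's
 * internal aio_req_t.  The wrapper name is hypothetical.
 */
#if 0	/* example only -- not compiled */
static aio_req_t *
lookup_request(aiocb_t *aiocbp)
{
	return (_aio_hash_find(&aiocbp->aio_resultp));
}
#endif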
20037c478bd9Sstevel@tonic-gate 
20047c478bd9Sstevel@tonic-gate /*
20057c478bd9Sstevel@tonic-gate  * AIO interface for POSIX
20067c478bd9Sstevel@tonic-gate  */
20077c478bd9Sstevel@tonic-gate int
200834709573Sraf _aio_rw(aiocb_t *aiocbp, aio_lio_t *lio_head, aio_worker_t **nextworker,
200934709573Sraf     int mode, int flg)
20107c478bd9Sstevel@tonic-gate {
201134709573Sraf 	aio_req_t *reqp;
201234709573Sraf 	aio_args_t *ap;
20137c478bd9Sstevel@tonic-gate 	int kerr;
20147c478bd9Sstevel@tonic-gate 
201534709573Sraf 	if (aiocbp == NULL) {
20167c478bd9Sstevel@tonic-gate 		errno = EINVAL;
20177c478bd9Sstevel@tonic-gate 		return (-1);
20187c478bd9Sstevel@tonic-gate 	}
20197c478bd9Sstevel@tonic-gate 
20207c478bd9Sstevel@tonic-gate 	/* initialize kaio */
20217c478bd9Sstevel@tonic-gate 	if (!_kaio_ok)
20227c478bd9Sstevel@tonic-gate 		_kaio_init();
20237c478bd9Sstevel@tonic-gate 
202434709573Sraf 	aiocbp->aio_state = NOCHECK;
20257c478bd9Sstevel@tonic-gate 
20267c478bd9Sstevel@tonic-gate 	/*
202734709573Sraf 	 * If we have been called because a list I/O
20287c478bd9Sstevel@tonic-gate 	 * kaio() failed, we don't want to repeat the
20297c478bd9Sstevel@tonic-gate 	 * system call.
20307c478bd9Sstevel@tonic-gate 	 */
20317c478bd9Sstevel@tonic-gate 
20327c478bd9Sstevel@tonic-gate 	if (flg & AIO_KAIO) {
20337c478bd9Sstevel@tonic-gate 		/*
20347c478bd9Sstevel@tonic-gate 		 * Try kernel aio first.
20357c478bd9Sstevel@tonic-gate 		 * If errno is ENOTSUP/EBADFD,
20367c478bd9Sstevel@tonic-gate 		 * fall back to the thread implementation.
20377c478bd9Sstevel@tonic-gate 		 */
203834709573Sraf 		if (_kaio_ok > 0 && KAIO_SUPPORTED(aiocbp->aio_fildes)) {
203934709573Sraf 			aiocbp->aio_resultp.aio_errno = EINPROGRESS;
204034709573Sraf 			aiocbp->aio_state = CHECK;
204134709573Sraf 			kerr = (int)_kaio(mode, aiocbp);
20427c478bd9Sstevel@tonic-gate 			if (kerr == 0)
20437c478bd9Sstevel@tonic-gate 				return (0);
204434709573Sraf 			if (errno != ENOTSUP && errno != EBADFD) {
204534709573Sraf 				aiocbp->aio_resultp.aio_errno = errno;
204634709573Sraf 				aiocbp->aio_resultp.aio_return = -1;
204734709573Sraf 				aiocbp->aio_state = NOCHECK;
20487c478bd9Sstevel@tonic-gate 				return (-1);
20497c478bd9Sstevel@tonic-gate 			}
20507c478bd9Sstevel@tonic-gate 			if (errno == EBADFD)
205134709573Sraf 				SET_KAIO_NOT_SUPPORTED(aiocbp->aio_fildes);
20527c478bd9Sstevel@tonic-gate 		}
20537c478bd9Sstevel@tonic-gate 	}
20547c478bd9Sstevel@tonic-gate 
205534709573Sraf 	aiocbp->aio_resultp.aio_errno = EINPROGRESS;
205634709573Sraf 	aiocbp->aio_state = USERAIO;
20577c478bd9Sstevel@tonic-gate 
205834709573Sraf 	if (!__uaio_ok && __uaio_init() == -1)
205934709573Sraf 		return (-1);
20607c478bd9Sstevel@tonic-gate 
206134709573Sraf 	if ((reqp = _aio_req_alloc()) == NULL) {
20627c478bd9Sstevel@tonic-gate 		errno = EAGAIN;
20637c478bd9Sstevel@tonic-gate 		return (-1);
20647c478bd9Sstevel@tonic-gate 	}
20657c478bd9Sstevel@tonic-gate 
20667c478bd9Sstevel@tonic-gate 	/*
206734709573Sraf 	 * If this is an LIO request, add the list head to the aio request.
20687c478bd9Sstevel@tonic-gate 	 */
206934709573Sraf 	reqp->req_head = lio_head;
207034709573Sraf 	reqp->req_type = AIO_POSIX_REQ;
207134709573Sraf 	reqp->req_op = mode;
207234709573Sraf 	reqp->req_largefile = 0;
207334709573Sraf 
207434709573Sraf 	if (aiocbp->aio_sigevent.sigev_notify == SIGEV_NONE) {
207534709573Sraf 		reqp->req_sigevent.sigev_notify = SIGEV_NONE;
207634709573Sraf 	} else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
207734709573Sraf 		reqp->req_sigevent.sigev_notify = SIGEV_SIGNAL;
207834709573Sraf 		reqp->req_sigevent.sigev_signo =
207934709573Sraf 		    aiocbp->aio_sigevent.sigev_signo;
208034709573Sraf 		reqp->req_sigevent.sigev_value.sival_ptr =
208134709573Sraf 		    aiocbp->aio_sigevent.sigev_value.sival_ptr;
208234709573Sraf 	} else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_PORT) {
208334709573Sraf 		port_notify_t *pn = aiocbp->aio_sigevent.sigev_value.sival_ptr;
208434709573Sraf 		reqp->req_sigevent.sigev_notify = SIGEV_PORT;
208534709573Sraf 		/*
208634709573Sraf 		 * Reuse the sigevent structure to contain the port number
208734709573Sraf 		 * and the user value.  Same for SIGEV_THREAD, below.
208834709573Sraf 		 */
208934709573Sraf 		reqp->req_sigevent.sigev_signo =
209034709573Sraf 		    pn->portnfy_port;
209134709573Sraf 		reqp->req_sigevent.sigev_value.sival_ptr =
209234709573Sraf 		    pn->portnfy_user;
209334709573Sraf 	} else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_THREAD) {
209434709573Sraf 		reqp->req_sigevent.sigev_notify = SIGEV_THREAD;
209534709573Sraf 		/*
209634709573Sraf 		 * The sigevent structure contains the port number
209734709573Sraf 		 * and the user value.  Same for SIGEV_PORT, above.
209834709573Sraf 		 */
209934709573Sraf 		reqp->req_sigevent.sigev_signo =
210034709573Sraf 		    aiocbp->aio_sigevent.sigev_signo;
210134709573Sraf 		reqp->req_sigevent.sigev_value.sival_ptr =
210234709573Sraf 		    aiocbp->aio_sigevent.sigev_value.sival_ptr;
21037c478bd9Sstevel@tonic-gate 	}
21047c478bd9Sstevel@tonic-gate 
210534709573Sraf 	reqp->req_resultp = &aiocbp->aio_resultp;
210634709573Sraf 	reqp->req_aiocbp = aiocbp;
210734709573Sraf 	ap = &reqp->req_args;
210834709573Sraf 	ap->fd = aiocbp->aio_fildes;
210934709573Sraf 	ap->buf = (caddr_t)aiocbp->aio_buf;
211034709573Sraf 	ap->bufsz = aiocbp->aio_nbytes;
211134709573Sraf 	ap->offset = aiocbp->aio_offset;
211234709573Sraf 
211334709573Sraf 	if ((flg & AIO_NO_DUPS) &&
211434709573Sraf 	    _aio_hash_insert(&aiocbp->aio_resultp, reqp) != 0) {
2115f841f6adSraf 		aio_panic("_aio_rw(): request already in hash table");
211634709573Sraf 		_aio_req_free(reqp);
21177c478bd9Sstevel@tonic-gate 		errno = EINVAL;
21187c478bd9Sstevel@tonic-gate 		return (-1);
21197c478bd9Sstevel@tonic-gate 	}
212034709573Sraf 	_aio_req_add(reqp, nextworker, mode);
212134709573Sraf 	return (0);
21227c478bd9Sstevel@tonic-gate }
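
/*
 * Editor's illustrative sketch (not part of libc): a SIGEV_PORT
 * submission matching the port_notify_t handling above.  The port
 * number and user cookie ride in the sigevent; completion arrives as
 * a PORT_SOURCE_AIO event via _port_dispatch().  The function name is
 * hypothetical; aio_read(3C) and port_get(3C) are the documented calls.
 */
#if 0	/* example only -- not compiled */
#include <aio.h>
#include <port.h>
#include <string.h>

static int
submit_port_read(int port, int fd, char *buf, size_t len, void *cookie)
{
	static aiocb_t cb;	/* must stay valid until completion */
	port_notify_t pn;
	port_event_t pe;

	pn.portnfy_port = port;
	pn.portnfy_user = cookie;

	(void) memset(&cb, 0, sizeof (cb));
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = len;
	cb.aio_sigevent.sigev_notify = SIGEV_PORT;
	cb.aio_sigevent.sigev_value.sival_ptr = &pn;

	if (aio_read(&cb) != 0)
		return (-1);
	if (port_get(port, &pe, NULL) != 0)	/* wait for completion */
		return (-1);
	/* pe.portev_user == cookie; pe.portev_object identifies &cb */
	return (0);
}
#endif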
21237c478bd9Sstevel@tonic-gate 
212434709573Sraf #if !defined(_LP64)
21257c478bd9Sstevel@tonic-gate /*
21267c478bd9Sstevel@tonic-gate  * 64-bit AIO interface for POSIX
21277c478bd9Sstevel@tonic-gate  */
21287c478bd9Sstevel@tonic-gate int
212934709573Sraf _aio_rw64(aiocb64_t *aiocbp, aio_lio_t *lio_head, aio_worker_t **nextworker,
213034709573Sraf     int mode, int flg)
21317c478bd9Sstevel@tonic-gate {
213234709573Sraf 	aio_req_t *reqp;
213334709573Sraf 	aio_args_t *ap;
21347c478bd9Sstevel@tonic-gate 	int kerr;
21357c478bd9Sstevel@tonic-gate 
213634709573Sraf 	if (aiocbp == NULL) {
21377c478bd9Sstevel@tonic-gate 		errno = EINVAL;
21387c478bd9Sstevel@tonic-gate 		return (-1);
21397c478bd9Sstevel@tonic-gate 	}
21407c478bd9Sstevel@tonic-gate 
21417c478bd9Sstevel@tonic-gate 	/* initialize kaio */
21427c478bd9Sstevel@tonic-gate 	if (!_kaio_ok)
21437c478bd9Sstevel@tonic-gate 		_kaio_init();
21447c478bd9Sstevel@tonic-gate 
214534709573Sraf 	aiocbp->aio_state = NOCHECK;
21467c478bd9Sstevel@tonic-gate 
21477c478bd9Sstevel@tonic-gate 	/*
214834709573Sraf 	 * If we have been called because a list I/O
21497c478bd9Sstevel@tonic-gate 	 * kaio() failed, we don't want to repeat the
21507c478bd9Sstevel@tonic-gate 	 * system call.
21517c478bd9Sstevel@tonic-gate 	 */
21527c478bd9Sstevel@tonic-gate 
21537c478bd9Sstevel@tonic-gate 	if (flg & AIO_KAIO) {
21547c478bd9Sstevel@tonic-gate 		/*
21557c478bd9Sstevel@tonic-gate 		 * Try kernel aio first.
21567c478bd9Sstevel@tonic-gate 		 * If errno is ENOTSUP/EBADFD,
21577c478bd9Sstevel@tonic-gate 		 * fall back to the thread implementation.
21587c478bd9Sstevel@tonic-gate 		 */
215934709573Sraf 		if (_kaio_ok > 0 && KAIO_SUPPORTED(aiocbp->aio_fildes)) {
216034709573Sraf 			aiocbp->aio_resultp.aio_errno = EINPROGRESS;
216134709573Sraf 			aiocbp->aio_state = CHECK;
216234709573Sraf 			kerr = (int)_kaio(mode, aiocbp);
21637c478bd9Sstevel@tonic-gate 			if (kerr == 0)
21647c478bd9Sstevel@tonic-gate 				return (0);
216534709573Sraf 			if (errno != ENOTSUP && errno != EBADFD) {
216634709573Sraf 				aiocbp->aio_resultp.aio_errno = errno;
216734709573Sraf 				aiocbp->aio_resultp.aio_return = -1;
216834709573Sraf 				aiocbp->aio_state = NOCHECK;
21697c478bd9Sstevel@tonic-gate 				return (-1);
21707c478bd9Sstevel@tonic-gate 			}
21717c478bd9Sstevel@tonic-gate 			if (errno == EBADFD)
217234709573Sraf 				SET_KAIO_NOT_SUPPORTED(aiocbp->aio_fildes);
21737c478bd9Sstevel@tonic-gate 		}
21747c478bd9Sstevel@tonic-gate 	}
21757c478bd9Sstevel@tonic-gate 
217634709573Sraf 	aiocbp->aio_resultp.aio_errno = EINPROGRESS;
217734709573Sraf 	aiocbp->aio_state = USERAIO;
21787c478bd9Sstevel@tonic-gate 
217934709573Sraf 	if (!__uaio_ok && __uaio_init() == -1)
218034709573Sraf 		return (-1);
21817c478bd9Sstevel@tonic-gate 
218234709573Sraf 	if ((reqp = _aio_req_alloc()) == NULL) {
21837c478bd9Sstevel@tonic-gate 		errno = EAGAIN;
21847c478bd9Sstevel@tonic-gate 		return (-1);
21857c478bd9Sstevel@tonic-gate 	}
21867c478bd9Sstevel@tonic-gate 
21877c478bd9Sstevel@tonic-gate 	/*
218834709573Sraf 	 * If this is an LIO request, add the list head to the aio request.
21897c478bd9Sstevel@tonic-gate 	 */
219034709573Sraf 	reqp->req_head = lio_head;
219134709573Sraf 	reqp->req_type = AIO_POSIX_REQ;
219234709573Sraf 	reqp->req_op = mode;
219334709573Sraf 	reqp->req_largefile = 1;
219434709573Sraf 
219534709573Sraf 	if (aiocbp->aio_sigevent.sigev_notify == SIGEV_NONE) {
219634709573Sraf 		reqp->req_sigevent.sigev_notify = SIGEV_NONE;
219734709573Sraf 	} else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
219834709573Sraf 		reqp->req_sigevent.sigev_notify = SIGEV_SIGNAL;
219934709573Sraf 		reqp->req_sigevent.sigev_signo =
220034709573Sraf 		    aiocbp->aio_sigevent.sigev_signo;
220134709573Sraf 		reqp->req_sigevent.sigev_value.sival_ptr =
220234709573Sraf 		    aiocbp->aio_sigevent.sigev_value.sival_ptr;
220334709573Sraf 	} else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_PORT) {
220434709573Sraf 		port_notify_t *pn = aiocbp->aio_sigevent.sigev_value.sival_ptr;
220534709573Sraf 		reqp->req_sigevent.sigev_notify = SIGEV_PORT;
220634709573Sraf 		reqp->req_sigevent.sigev_signo =
220734709573Sraf 		    pn->portnfy_port;
220834709573Sraf 		reqp->req_sigevent.sigev_value.sival_ptr =
220934709573Sraf 		    pn->portnfy_user;
221034709573Sraf 	} else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_THREAD) {
221134709573Sraf 		reqp->req_sigevent.sigev_notify = SIGEV_THREAD;
221234709573Sraf 		reqp->req_sigevent.sigev_signo =
221334709573Sraf 		    aiocbp->aio_sigevent.sigev_signo;
221434709573Sraf 		reqp->req_sigevent.sigev_value.sival_ptr =
221534709573Sraf 		    aiocbp->aio_sigevent.sigev_value.sival_ptr;
22167c478bd9Sstevel@tonic-gate 	}
22177c478bd9Sstevel@tonic-gate 
221834709573Sraf 	reqp->req_resultp = &aiocbp->aio_resultp;
221934709573Sraf 	reqp->req_aiocbp = aiocbp;
222034709573Sraf 	ap = &reqp->req_args;
222134709573Sraf 	ap->fd = aiocbp->aio_fildes;
222234709573Sraf 	ap->buf = (caddr_t)aiocbp->aio_buf;
222334709573Sraf 	ap->bufsz = aiocbp->aio_nbytes;
222434709573Sraf 	ap->offset = aiocbp->aio_offset;
222534709573Sraf 
222634709573Sraf 	if ((flg & AIO_NO_DUPS) &&
222734709573Sraf 	    _aio_hash_insert(&aiocbp->aio_resultp, reqp) != 0) {
2228f841f6adSraf 		aio_panic("_aio_rw64(): request already in hash table");
222934709573Sraf 		_aio_req_free(reqp);
22307c478bd9Sstevel@tonic-gate 		errno = EINVAL;
22317c478bd9Sstevel@tonic-gate 		return (-1);
22327c478bd9Sstevel@tonic-gate 	}
223334709573Sraf 	_aio_req_add(reqp, nextworker, mode);
223434709573Sraf 	return (0);
22357c478bd9Sstevel@tonic-gate }
223634709573Sraf #endif	/* !defined(_LP64) */