xref: /illumos-gate/usr/src/lib/libc/port/aio/aio.c (revision f841f6ad)
17c478bd9Sstevel@tonic-gate /*
27c478bd9Sstevel@tonic-gate  * CDDL HEADER START
37c478bd9Sstevel@tonic-gate  *
47c478bd9Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
534709573Sraf  * Common Development and Distribution License (the "License").
634709573Sraf  * You may not use this file except in compliance with the License.
77c478bd9Sstevel@tonic-gate  *
87c478bd9Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bd9Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
107c478bd9Sstevel@tonic-gate  * See the License for the specific language governing permissions
117c478bd9Sstevel@tonic-gate  * and limitations under the License.
127c478bd9Sstevel@tonic-gate  *
137c478bd9Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
147c478bd9Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bd9Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
167c478bd9Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
177c478bd9Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bd9Sstevel@tonic-gate  *
197c478bd9Sstevel@tonic-gate  * CDDL HEADER END
207c478bd9Sstevel@tonic-gate  */
2134709573Sraf 
227c478bd9Sstevel@tonic-gate /*
2334709573Sraf  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
247c478bd9Sstevel@tonic-gate  * Use is subject to license terms.
257c478bd9Sstevel@tonic-gate  */
267c478bd9Sstevel@tonic-gate 
277c478bd9Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
287c478bd9Sstevel@tonic-gate 
29*f841f6adSraf #include "synonyms.h"
30*f841f6adSraf #include "thr_uberdata.h"
31*f841f6adSraf #include "asyncio.h"
3234709573Sraf #include <atomic.h>
337c478bd9Sstevel@tonic-gate #include <sys/param.h>
347c478bd9Sstevel@tonic-gate #include <sys/file.h>
357c478bd9Sstevel@tonic-gate #include <sys/port.h>
367c478bd9Sstevel@tonic-gate 
377c478bd9Sstevel@tonic-gate static int _aio_hash_insert(aio_result_t *, aio_req_t *);
387c478bd9Sstevel@tonic-gate static aio_req_t *_aio_req_get(aio_worker_t *);
397c478bd9Sstevel@tonic-gate static void _aio_req_add(aio_req_t *, aio_worker_t **, int);
407c478bd9Sstevel@tonic-gate static void _aio_req_del(aio_worker_t *, aio_req_t *, int);
417c478bd9Sstevel@tonic-gate static void _aio_work_done(aio_worker_t *);
4234709573Sraf static void _aio_enq_doneq(aio_req_t *);
437c478bd9Sstevel@tonic-gate 
4434709573Sraf extern void _aio_lio_free(aio_lio_t *);
457c478bd9Sstevel@tonic-gate 
4634709573Sraf extern int __fdsync(int, int);
477c478bd9Sstevel@tonic-gate extern int _port_dispatch(int, int, int, int, uintptr_t, void *);
487c478bd9Sstevel@tonic-gate 
49c2575b5eSraf static int _aio_fsync_del(aio_worker_t *, aio_req_t *);
5034709573Sraf static void _aiodone(aio_req_t *, ssize_t, int);
517c478bd9Sstevel@tonic-gate static void _aio_cancel_work(aio_worker_t *, int, int *, int *);
5234709573Sraf static void _aio_finish_request(aio_worker_t *, ssize_t, int);
537c478bd9Sstevel@tonic-gate 
547c478bd9Sstevel@tonic-gate /*
557c478bd9Sstevel@tonic-gate  * switch for kernel async I/O
567c478bd9Sstevel@tonic-gate  */
5734709573Sraf int _kaio_ok = 0;		/* 0 = disabled, 1 = on, -1 = error */
587c478bd9Sstevel@tonic-gate 
597c478bd9Sstevel@tonic-gate /*
607c478bd9Sstevel@tonic-gate  * Key for thread-specific data
617c478bd9Sstevel@tonic-gate  */
6234709573Sraf pthread_key_t _aio_key;
637c478bd9Sstevel@tonic-gate 
647c478bd9Sstevel@tonic-gate /*
6534709573Sraf  * Array for determining whether or not a file supports kaio.
6634709573Sraf  * Initialized in _kaio_init().
677c478bd9Sstevel@tonic-gate  */
6834709573Sraf uint32_t *_kaio_supported = NULL;
697c478bd9Sstevel@tonic-gate 
707c478bd9Sstevel@tonic-gate /*
7134709573Sraf  *  workers for read/write requests
7234709573Sraf  * (__aio_mutex lock protects circular linked list of workers)
737c478bd9Sstevel@tonic-gate  */
7434709573Sraf aio_worker_t *__workers_rw;	/* circular list of AIO workers */
7534709573Sraf aio_worker_t *__nextworker_rw;	/* next worker in list of workers */
7634709573Sraf int __rw_workerscnt;		/* number of read/write workers */
777c478bd9Sstevel@tonic-gate 
787c478bd9Sstevel@tonic-gate /*
7934709573Sraf  * worker for notification requests.
807c478bd9Sstevel@tonic-gate  */
8134709573Sraf aio_worker_t *__workers_no;	/* circular list of AIO workers */
8234709573Sraf aio_worker_t *__nextworker_no;	/* next worker in list of workers */
8334709573Sraf int __no_workerscnt;		/* number of write workers */
847c478bd9Sstevel@tonic-gate 
8534709573Sraf aio_req_t *_aio_done_tail;		/* list of done requests */
8634709573Sraf aio_req_t *_aio_done_head;
877c478bd9Sstevel@tonic-gate 
887c478bd9Sstevel@tonic-gate mutex_t __aio_initlock = DEFAULTMUTEX;	/* makes aio initialization atomic */
89*f841f6adSraf cond_t __aio_initcv = DEFAULTCV;
90*f841f6adSraf int __aio_initbusy = 0;
91*f841f6adSraf 
927c478bd9Sstevel@tonic-gate mutex_t __aio_mutex = DEFAULTMUTEX;	/* protects counts, and linked lists */
937c478bd9Sstevel@tonic-gate cond_t _aio_iowait_cv = DEFAULTCV;	/* wait for userland I/Os */
947c478bd9Sstevel@tonic-gate 
957c478bd9Sstevel@tonic-gate pid_t __pid = (pid_t)-1;		/* initialize as invalid pid */
9634709573Sraf int _sigio_enabled = 0;			/* when set, send SIGIO signal */
977c478bd9Sstevel@tonic-gate 
9834709573Sraf aio_hash_t *_aio_hash;
997c478bd9Sstevel@tonic-gate 
10034709573Sraf aio_req_t *_aio_doneq;			/* double linked done queue list */
1017c478bd9Sstevel@tonic-gate 
1027c478bd9Sstevel@tonic-gate int _aio_donecnt = 0;
10334709573Sraf int _aio_waitncnt = 0;			/* # of requests for aio_waitn */
1047c478bd9Sstevel@tonic-gate int _aio_doneq_cnt = 0;
10534709573Sraf int _aio_outstand_cnt = 0;		/* # of outstanding requests */
10634709573Sraf int _kaio_outstand_cnt = 0;		/* # of outstanding kaio requests */
1077c478bd9Sstevel@tonic-gate int _aio_req_done_cnt = 0;		/* req. done but not in "done queue" */
1087c478bd9Sstevel@tonic-gate int _aio_kernel_suspend = 0;		/* active kernel kaio calls */
1097c478bd9Sstevel@tonic-gate int _aio_suscv_cnt = 0;			/* aio_suspend calls waiting on cv's */
1107c478bd9Sstevel@tonic-gate 
1117c478bd9Sstevel@tonic-gate int _max_workers = 256;			/* max number of workers permitted */
112*f841f6adSraf int _min_workers = 4;			/* min number of workers */
1137c478bd9Sstevel@tonic-gate int _minworkload = 2;			/* min number of request in q */
1147c478bd9Sstevel@tonic-gate int _aio_worker_cnt = 0;		/* number of workers to do requests */
1157c478bd9Sstevel@tonic-gate int __uaio_ok = 0;			/* AIO has been enabled */
1167c478bd9Sstevel@tonic-gate sigset_t _worker_set;			/* worker's signal mask */
1177c478bd9Sstevel@tonic-gate 
1187c478bd9Sstevel@tonic-gate int _aiowait_flag = 0;			/* when set, aiowait() is inprogress */
119*f841f6adSraf int _aio_flags = 0;			/* see asyncio.h defines for */
1207c478bd9Sstevel@tonic-gate 
121*f841f6adSraf aio_worker_t *_kaiowp = NULL;		/* points to kaio cleanup thread */
1227c478bd9Sstevel@tonic-gate 
12334709573Sraf int hz;					/* clock ticks per second */
1247c478bd9Sstevel@tonic-gate 
12534709573Sraf static int
12634709573Sraf _kaio_supported_init(void)
1277c478bd9Sstevel@tonic-gate {
12834709573Sraf 	void *ptr;
12934709573Sraf 	size_t size;
13034709573Sraf 
13134709573Sraf 	if (_kaio_supported != NULL)	/* already initialized */
13234709573Sraf 		return (0);
13334709573Sraf 
13434709573Sraf 	size = MAX_KAIO_FDARRAY_SIZE * sizeof (uint32_t);
13534709573Sraf 	ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
13634709573Sraf 	    MAP_PRIVATE | MAP_ANON, -1, (off_t)0);
13734709573Sraf 	if (ptr == MAP_FAILED)
13834709573Sraf 		return (-1);
13934709573Sraf 	_kaio_supported = ptr;
14034709573Sraf 	return (0);
1417c478bd9Sstevel@tonic-gate }
1427c478bd9Sstevel@tonic-gate 
1437c478bd9Sstevel@tonic-gate /*
144*f841f6adSraf  * The aio subsystem is initialized when an AIO request is made.
145*f841f6adSraf  * Constants are initialized like the max number of workers that
146*f841f6adSraf  * the subsystem can create, and the minimum number of workers
147*f841f6adSraf  * permitted before imposing some restrictions.  Also, some
148*f841f6adSraf  * workers are created.
1497c478bd9Sstevel@tonic-gate  */
1507c478bd9Sstevel@tonic-gate int
1517c478bd9Sstevel@tonic-gate __uaio_init(void)
1527c478bd9Sstevel@tonic-gate {
153*f841f6adSraf 	int ret = -1;
1547c478bd9Sstevel@tonic-gate 	int i;
1557c478bd9Sstevel@tonic-gate 
156*f841f6adSraf 	lmutex_lock(&__aio_initlock);
157*f841f6adSraf 	while (__aio_initbusy)
158*f841f6adSraf 		(void) _cond_wait(&__aio_initcv, &__aio_initlock);
15934709573Sraf 	if (__uaio_ok) {	/* already initialized */
160*f841f6adSraf 		lmutex_unlock(&__aio_initlock);
16134709573Sraf 		return (0);
16234709573Sraf 	}
163*f841f6adSraf 	__aio_initbusy = 1;
164*f841f6adSraf 	lmutex_unlock(&__aio_initlock);
1657c478bd9Sstevel@tonic-gate 
16634709573Sraf 	hz = (int)sysconf(_SC_CLK_TCK);
16734709573Sraf 	__pid = getpid();
1687c478bd9Sstevel@tonic-gate 
169*f841f6adSraf 	setup_cancelsig(SIGAIOCANCEL);
1707c478bd9Sstevel@tonic-gate 
17134709573Sraf 	if (_kaio_supported_init() != 0)
17234709573Sraf 		goto out;
1737c478bd9Sstevel@tonic-gate 
17434709573Sraf 	/*
17534709573Sraf 	 * Allocate and initialize the hash table.
17634709573Sraf 	 */
17734709573Sraf 	/* LINTED pointer cast */
17834709573Sraf 	_aio_hash = (aio_hash_t *)mmap(NULL,
17934709573Sraf 	    HASHSZ * sizeof (aio_hash_t), PROT_READ | PROT_WRITE,
18034709573Sraf 	    MAP_PRIVATE | MAP_ANON, -1, (off_t)0);
18134709573Sraf 	if ((void *)_aio_hash == MAP_FAILED) {
18234709573Sraf 		_aio_hash = NULL;
18334709573Sraf 		goto out;
1847c478bd9Sstevel@tonic-gate 	}
18534709573Sraf 	for (i = 0; i < HASHSZ; i++)
18634709573Sraf 		(void) mutex_init(&_aio_hash[i].hash_lock, USYNC_THREAD, NULL);
1877c478bd9Sstevel@tonic-gate 
18834709573Sraf 	/*
18934709573Sraf 	 * Initialize worker's signal mask to only catch SIGAIOCANCEL.
19034709573Sraf 	 */
19134709573Sraf 	(void) sigfillset(&_worker_set);
19234709573Sraf 	(void) sigdelset(&_worker_set, SIGAIOCANCEL);
19334709573Sraf 
19434709573Sraf 	/*
195*f841f6adSraf 	 * Create the minimum number of read/write workers.
19634709573Sraf 	 */
19734709573Sraf 	for (i = 0; i < _min_workers; i++)
19834709573Sraf 		(void) _aio_create_worker(NULL, AIOREAD);
19934709573Sraf 
20034709573Sraf 	/*
20134709573Sraf 	 * Create one worker to send asynchronous notifications.
20234709573Sraf 	 */
20334709573Sraf 	(void) _aio_create_worker(NULL, AIONOTIFY);
20434709573Sraf 
20534709573Sraf 	ret = 0;
20634709573Sraf out:
207*f841f6adSraf 	lmutex_lock(&__aio_initlock);
208*f841f6adSraf 	if (ret == 0)
209*f841f6adSraf 		__uaio_ok = 1;
210*f841f6adSraf 	__aio_initbusy = 0;
211*f841f6adSraf 	(void) cond_broadcast(&__aio_initcv);
212*f841f6adSraf 	lmutex_unlock(&__aio_initlock);
21334709573Sraf 	return (ret);
2147c478bd9Sstevel@tonic-gate }
2157c478bd9Sstevel@tonic-gate 
216*f841f6adSraf /*
217*f841f6adSraf  * Called from close() before actually performing the real _close().
218*f841f6adSraf  */
219*f841f6adSraf void
220*f841f6adSraf _aio_close(int fd)
221*f841f6adSraf {
222*f841f6adSraf 	if (fd < 0)	/* avoid cancelling everything */
223*f841f6adSraf 		return;
224*f841f6adSraf 	/*
225*f841f6adSraf 	 * Cancel all outstanding aio requests for this file descriptor.
226*f841f6adSraf 	 */
227*f841f6adSraf 	if (__uaio_ok)
228*f841f6adSraf 		(void) aiocancel_all(fd);
229*f841f6adSraf 	/*
230*f841f6adSraf 	 * If we have allocated the bit array, clear the bit for this file.
231*f841f6adSraf 	 * The next open may re-use this file descriptor and the new file
232*f841f6adSraf 	 * may have different kaio() behaviour.
233*f841f6adSraf 	 */
234*f841f6adSraf 	if (_kaio_supported != NULL)
235*f841f6adSraf 		CLEAR_KAIO_SUPPORTED(fd);
236*f841f6adSraf }
237*f841f6adSraf 
2387c478bd9Sstevel@tonic-gate /*
2397c478bd9Sstevel@tonic-gate  * special kaio cleanup thread sits in a loop in the
2407c478bd9Sstevel@tonic-gate  * kernel waiting for pending kaio requests to complete.
2417c478bd9Sstevel@tonic-gate  */
2427c478bd9Sstevel@tonic-gate void *
2437c478bd9Sstevel@tonic-gate _kaio_cleanup_thread(void *arg)
2447c478bd9Sstevel@tonic-gate {
24534709573Sraf 	if (pthread_setspecific(_aio_key, arg) != 0)
246*f841f6adSraf 		aio_panic("_kaio_cleanup_thread, pthread_setspecific()");
2477c478bd9Sstevel@tonic-gate 	(void) _kaio(AIOSTART);
2487c478bd9Sstevel@tonic-gate 	return (arg);
2497c478bd9Sstevel@tonic-gate }
2507c478bd9Sstevel@tonic-gate 
2517c478bd9Sstevel@tonic-gate /*
2527c478bd9Sstevel@tonic-gate  * initialize kaio.
2537c478bd9Sstevel@tonic-gate  */
2547c478bd9Sstevel@tonic-gate void
2557c478bd9Sstevel@tonic-gate _kaio_init()
2567c478bd9Sstevel@tonic-gate {
2577c478bd9Sstevel@tonic-gate 	int error;
25834709573Sraf 	sigset_t oset;
25934709573Sraf 
260*f841f6adSraf 	lmutex_lock(&__aio_initlock);
261*f841f6adSraf 	while (__aio_initbusy)
262*f841f6adSraf 		(void) _cond_wait(&__aio_initcv, &__aio_initlock);
263*f841f6adSraf 	if (_kaio_ok) {		/* already initialized */
264*f841f6adSraf 		lmutex_unlock(&__aio_initlock);
265*f841f6adSraf 		return;
266*f841f6adSraf 	}
267*f841f6adSraf 	__aio_initbusy = 1;
268*f841f6adSraf 	lmutex_unlock(&__aio_initlock);
269*f841f6adSraf 
27034709573Sraf 	if (_kaio_supported_init() != 0)
271*f841f6adSraf 		error = ENOMEM;
272*f841f6adSraf 	else if ((_kaiowp = _aio_worker_alloc()) == NULL)
273*f841f6adSraf 		error = ENOMEM;
274*f841f6adSraf 	else if ((error = (int)_kaio(AIOINIT)) == 0) {
275*f841f6adSraf 		(void) pthread_sigmask(SIG_SETMASK, &maskset, &oset);
276*f841f6adSraf 		error = thr_create(NULL, AIOSTKSIZE, _kaio_cleanup_thread,
277*f841f6adSraf 		    _kaiowp, THR_DAEMON, &_kaiowp->work_tid);
278*f841f6adSraf 		(void) pthread_sigmask(SIG_SETMASK, &oset, NULL);
279*f841f6adSraf 	}
280*f841f6adSraf 	if (error && _kaiowp != NULL) {
281*f841f6adSraf 		_aio_worker_free(_kaiowp);
282*f841f6adSraf 		_kaiowp = NULL;
2837c478bd9Sstevel@tonic-gate 	}
284*f841f6adSraf 
285*f841f6adSraf 	lmutex_lock(&__aio_initlock);
286*f841f6adSraf 	if (error)
287*f841f6adSraf 		_kaio_ok = -1;
288*f841f6adSraf 	else
289*f841f6adSraf 		_kaio_ok = 1;
290*f841f6adSraf 	__aio_initbusy = 0;
291*f841f6adSraf 	(void) cond_broadcast(&__aio_initcv);
292*f841f6adSraf 	lmutex_unlock(&__aio_initlock);
2937c478bd9Sstevel@tonic-gate }
2947c478bd9Sstevel@tonic-gate 
2957c478bd9Sstevel@tonic-gate int
2967c478bd9Sstevel@tonic-gate aioread(int fd, caddr_t buf, int bufsz, off_t offset, int whence,
2977c478bd9Sstevel@tonic-gate     aio_result_t *resultp)
2987c478bd9Sstevel@tonic-gate {
2997c478bd9Sstevel@tonic-gate 	return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOREAD));
3007c478bd9Sstevel@tonic-gate }
3017c478bd9Sstevel@tonic-gate 
3027c478bd9Sstevel@tonic-gate int
3037c478bd9Sstevel@tonic-gate aiowrite(int fd, caddr_t buf, int bufsz, off_t offset, int whence,
3047c478bd9Sstevel@tonic-gate     aio_result_t *resultp)
3057c478bd9Sstevel@tonic-gate {
3067c478bd9Sstevel@tonic-gate 	return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOWRITE));
3077c478bd9Sstevel@tonic-gate }
3087c478bd9Sstevel@tonic-gate 
#if !defined(_LP64)
/*
 * Large-file (off64_t offset) variant of aioread(), only built when
 * _LP64 is not defined.  The request is tagged AIOAREAD64.
 */
int
aioread64(int fd, caddr_t buf, int bufsz, off64_t offset, int whence,
    aio_result_t *resultp)
{
	int rv;

	rv = _aiorw(fd, buf, bufsz, offset, whence, resultp, AIOAREAD64);
	return (rv);
}

/*
 * Large-file (off64_t offset) variant of aiowrite(), only built when
 * _LP64 is not defined.  The request is tagged AIOAWRITE64.
 */
int
aiowrite64(int fd, caddr_t buf, int bufsz, off64_t offset, int whence,
    aio_result_t *resultp)
{
	int rv;

	rv = _aiorw(fd, buf, bufsz, offset, whence, resultp, AIOAWRITE64);
	return (rv);
}
#endif	/* !defined(_LP64) */
3247c478bd9Sstevel@tonic-gate 
3257c478bd9Sstevel@tonic-gate int
3267c478bd9Sstevel@tonic-gate _aiorw(int fd, caddr_t buf, int bufsz, offset_t offset, int whence,
3277c478bd9Sstevel@tonic-gate     aio_result_t *resultp, int mode)
3287c478bd9Sstevel@tonic-gate {
32934709573Sraf 	aio_req_t *reqp;
33034709573Sraf 	aio_args_t *ap;
33134709573Sraf 	offset_t loffset;
3327c478bd9Sstevel@tonic-gate 	struct stat stat;
33334709573Sraf 	int error = 0;
3347c478bd9Sstevel@tonic-gate 	int kerr;
3357c478bd9Sstevel@tonic-gate 	int umode;
3367c478bd9Sstevel@tonic-gate 
3377c478bd9Sstevel@tonic-gate 	switch (whence) {
3387c478bd9Sstevel@tonic-gate 
3397c478bd9Sstevel@tonic-gate 	case SEEK_SET:
3407c478bd9Sstevel@tonic-gate 		loffset = offset;
3417c478bd9Sstevel@tonic-gate 		break;
3427c478bd9Sstevel@tonic-gate 	case SEEK_CUR:
3437c478bd9Sstevel@tonic-gate 		if ((loffset = llseek(fd, 0, SEEK_CUR)) == -1)
34434709573Sraf 			error = -1;
3457c478bd9Sstevel@tonic-gate 		else
3467c478bd9Sstevel@tonic-gate 			loffset += offset;
3477c478bd9Sstevel@tonic-gate 		break;
3487c478bd9Sstevel@tonic-gate 	case SEEK_END:
3497c478bd9Sstevel@tonic-gate 		if (fstat(fd, &stat) == -1)
35034709573Sraf 			error = -1;
3517c478bd9Sstevel@tonic-gate 		else
3527c478bd9Sstevel@tonic-gate 			loffset = offset + stat.st_size;
3537c478bd9Sstevel@tonic-gate 		break;
3547c478bd9Sstevel@tonic-gate 	default:
3557c478bd9Sstevel@tonic-gate 		errno = EINVAL;
35634709573Sraf 		error = -1;
3577c478bd9Sstevel@tonic-gate 	}
3587c478bd9Sstevel@tonic-gate 
35934709573Sraf 	if (error)
36034709573Sraf 		return (error);
3617c478bd9Sstevel@tonic-gate 
3627c478bd9Sstevel@tonic-gate 	/* initialize kaio */
3637c478bd9Sstevel@tonic-gate 	if (!_kaio_ok)
3647c478bd9Sstevel@tonic-gate 		_kaio_init();
3657c478bd9Sstevel@tonic-gate 
3667c478bd9Sstevel@tonic-gate 	/*
3677c478bd9Sstevel@tonic-gate 	 * _aio_do_request() needs the original request code (mode) to be able
36834709573Sraf 	 * to choose the appropiate 32/64 bit function.  All other functions
3697c478bd9Sstevel@tonic-gate 	 * only require the difference between READ and WRITE (umode).
3707c478bd9Sstevel@tonic-gate 	 */
3717c478bd9Sstevel@tonic-gate 	if (mode == AIOAREAD64 || mode == AIOAWRITE64)
3727c478bd9Sstevel@tonic-gate 		umode = mode - AIOAREAD64;
3737c478bd9Sstevel@tonic-gate 	else
3747c478bd9Sstevel@tonic-gate 		umode = mode;
3757c478bd9Sstevel@tonic-gate 
3767c478bd9Sstevel@tonic-gate 	/*
3777c478bd9Sstevel@tonic-gate 	 * Try kernel aio first.
3787c478bd9Sstevel@tonic-gate 	 * If errno is ENOTSUP/EBADFD, fall back to the thread implementation.
3797c478bd9Sstevel@tonic-gate 	 */
38034709573Sraf 	if (_kaio_ok > 0 && KAIO_SUPPORTED(fd)) {
3817c478bd9Sstevel@tonic-gate 		resultp->aio_errno = 0;
38234709573Sraf 		sig_mutex_lock(&__aio_mutex);
38334709573Sraf 		_kaio_outstand_cnt++;
3847c478bd9Sstevel@tonic-gate 		kerr = (int)_kaio(((resultp->aio_return == AIO_INPROGRESS) ?
3857c478bd9Sstevel@tonic-gate 		    (umode | AIO_POLL_BIT) : umode),
3867c478bd9Sstevel@tonic-gate 		    fd, buf, bufsz, loffset, resultp);
38734709573Sraf 		if (kerr == 0) {
388b9868792Sraf 			sig_mutex_unlock(&__aio_mutex);
3897c478bd9Sstevel@tonic-gate 			return (0);
39034709573Sraf 		}
39134709573Sraf 		_kaio_outstand_cnt--;
39234709573Sraf 		sig_mutex_unlock(&__aio_mutex);
39334709573Sraf 		if (errno != ENOTSUP && errno != EBADFD)
3947c478bd9Sstevel@tonic-gate 			return (-1);
3957c478bd9Sstevel@tonic-gate 		if (errno == EBADFD)
3967c478bd9Sstevel@tonic-gate 			SET_KAIO_NOT_SUPPORTED(fd);
3977c478bd9Sstevel@tonic-gate 	}
3987c478bd9Sstevel@tonic-gate 
39934709573Sraf 	if (!__uaio_ok && __uaio_init() == -1)
40034709573Sraf 		return (-1);
40134709573Sraf 
40234709573Sraf 	if ((reqp = _aio_req_alloc()) == NULL) {
4037c478bd9Sstevel@tonic-gate 		errno = EAGAIN;
4047c478bd9Sstevel@tonic-gate 		return (-1);
4057c478bd9Sstevel@tonic-gate 	}
4067c478bd9Sstevel@tonic-gate 
4077c478bd9Sstevel@tonic-gate 	/*
40834709573Sraf 	 * _aio_do_request() checks reqp->req_op to differentiate
4097c478bd9Sstevel@tonic-gate 	 * between 32 and 64 bit access.
4107c478bd9Sstevel@tonic-gate 	 */
41134709573Sraf 	reqp->req_op = mode;
41234709573Sraf 	reqp->req_resultp = resultp;
41334709573Sraf 	ap = &reqp->req_args;
4147c478bd9Sstevel@tonic-gate 	ap->fd = fd;
4157c478bd9Sstevel@tonic-gate 	ap->buf = buf;
4167c478bd9Sstevel@tonic-gate 	ap->bufsz = bufsz;
4177c478bd9Sstevel@tonic-gate 	ap->offset = loffset;
4187c478bd9Sstevel@tonic-gate 
41934709573Sraf 	if (_aio_hash_insert(resultp, reqp) != 0) {
42034709573Sraf 		_aio_req_free(reqp);
4217c478bd9Sstevel@tonic-gate 		errno = EINVAL;
4227c478bd9Sstevel@tonic-gate 		return (-1);
4237c478bd9Sstevel@tonic-gate 	}
42434709573Sraf 	/*
42534709573Sraf 	 * _aio_req_add() only needs the difference between READ and
42634709573Sraf 	 * WRITE to choose the right worker queue.
42734709573Sraf 	 */
42834709573Sraf 	_aio_req_add(reqp, &__nextworker_rw, umode);
42934709573Sraf 	return (0);
4307c478bd9Sstevel@tonic-gate }
4317c478bd9Sstevel@tonic-gate 
4327c478bd9Sstevel@tonic-gate int
4337c478bd9Sstevel@tonic-gate aiocancel(aio_result_t *resultp)
4347c478bd9Sstevel@tonic-gate {
43534709573Sraf 	aio_req_t *reqp;
43634709573Sraf 	aio_worker_t *aiowp;
43734709573Sraf 	int ret;
43834709573Sraf 	int done = 0;
43934709573Sraf 	int canceled = 0;
4407c478bd9Sstevel@tonic-gate 
4417c478bd9Sstevel@tonic-gate 	if (!__uaio_ok) {
4427c478bd9Sstevel@tonic-gate 		errno = EINVAL;
4437c478bd9Sstevel@tonic-gate 		return (-1);
4447c478bd9Sstevel@tonic-gate 	}
4457c478bd9Sstevel@tonic-gate 
44634709573Sraf 	sig_mutex_lock(&__aio_mutex);
44734709573Sraf 	reqp = _aio_hash_find(resultp);
44834709573Sraf 	if (reqp == NULL) {
4497c478bd9Sstevel@tonic-gate 		if (_aio_outstand_cnt == _aio_req_done_cnt)
4507c478bd9Sstevel@tonic-gate 			errno = EINVAL;
4517c478bd9Sstevel@tonic-gate 		else
4527c478bd9Sstevel@tonic-gate 			errno = EACCES;
45334709573Sraf 		ret = -1;
4547c478bd9Sstevel@tonic-gate 	} else {
45534709573Sraf 		aiowp = reqp->req_worker;
45634709573Sraf 		sig_mutex_lock(&aiowp->work_qlock1);
45734709573Sraf 		(void) _aio_cancel_req(aiowp, reqp, &canceled, &done);
45834709573Sraf 		sig_mutex_unlock(&aiowp->work_qlock1);
4597c478bd9Sstevel@tonic-gate 
4607c478bd9Sstevel@tonic-gate 		if (canceled) {
46134709573Sraf 			ret = 0;
4627c478bd9Sstevel@tonic-gate 		} else {
46334709573Sraf 			if (_aio_outstand_cnt == 0 ||
46434709573Sraf 			    _aio_outstand_cnt == _aio_req_done_cnt)
46534709573Sraf 				errno = EINVAL;
46634709573Sraf 			else
46734709573Sraf 				errno = EACCES;
46834709573Sraf 			ret = -1;
4697c478bd9Sstevel@tonic-gate 		}
4707c478bd9Sstevel@tonic-gate 	}
47134709573Sraf 	sig_mutex_unlock(&__aio_mutex);
47234709573Sraf 	return (ret);
4737c478bd9Sstevel@tonic-gate }
4747c478bd9Sstevel@tonic-gate 
4757c478bd9Sstevel@tonic-gate /*
4767c478bd9Sstevel@tonic-gate  * This must be asynch safe
4777c478bd9Sstevel@tonic-gate  */
4787c478bd9Sstevel@tonic-gate aio_result_t *
4797c478bd9Sstevel@tonic-gate aiowait(struct timeval *uwait)
4807c478bd9Sstevel@tonic-gate {
48134709573Sraf 	aio_result_t *uresultp;
48234709573Sraf 	aio_result_t *kresultp;
48334709573Sraf 	aio_result_t *resultp;
4847c478bd9Sstevel@tonic-gate 	int dontblock;
4857c478bd9Sstevel@tonic-gate 	int timedwait = 0;
4867c478bd9Sstevel@tonic-gate 	int kaio_errno = 0;
48734709573Sraf 	struct timeval twait;
48834709573Sraf 	struct timeval *wait = NULL;
4897c478bd9Sstevel@tonic-gate 	hrtime_t hrtend;
4907c478bd9Sstevel@tonic-gate 	hrtime_t hres;
4917c478bd9Sstevel@tonic-gate 
4927c478bd9Sstevel@tonic-gate 	if (uwait) {
4937c478bd9Sstevel@tonic-gate 		/*
49434709573Sraf 		 * Check for a valid specified wait time.
49534709573Sraf 		 * If it is invalid, fail the call right away.
4967c478bd9Sstevel@tonic-gate 		 */
4977c478bd9Sstevel@tonic-gate 		if (uwait->tv_sec < 0 || uwait->tv_usec < 0 ||
4987c478bd9Sstevel@tonic-gate 		    uwait->tv_usec >= MICROSEC) {
4997c478bd9Sstevel@tonic-gate 			errno = EINVAL;
5007c478bd9Sstevel@tonic-gate 			return ((aio_result_t *)-1);
5017c478bd9Sstevel@tonic-gate 		}
5027c478bd9Sstevel@tonic-gate 
50334709573Sraf 		if (uwait->tv_sec > 0 || uwait->tv_usec > 0) {
5047c478bd9Sstevel@tonic-gate 			hrtend = gethrtime() +
5057c478bd9Sstevel@tonic-gate 				(hrtime_t)uwait->tv_sec * NANOSEC +
5067c478bd9Sstevel@tonic-gate 				(hrtime_t)uwait->tv_usec * (NANOSEC / MICROSEC);
5077c478bd9Sstevel@tonic-gate 			twait = *uwait;
5087c478bd9Sstevel@tonic-gate 			wait = &twait;
5097c478bd9Sstevel@tonic-gate 			timedwait++;
5107c478bd9Sstevel@tonic-gate 		} else {
5117c478bd9Sstevel@tonic-gate 			/* polling */
51234709573Sraf 			sig_mutex_lock(&__aio_mutex);
51334709573Sraf 			if (_kaio_outstand_cnt == 0) {
51434709573Sraf 				kresultp = (aio_result_t *)-1;
51534709573Sraf 			} else {
51634709573Sraf 				kresultp = (aio_result_t *)_kaio(AIOWAIT,
51734709573Sraf 				    (struct timeval *)-1, 1);
51834709573Sraf 				if (kresultp != (aio_result_t *)-1 &&
51934709573Sraf 				    kresultp != NULL &&
52034709573Sraf 				    kresultp != (aio_result_t *)1) {
52134709573Sraf 					_kaio_outstand_cnt--;
52234709573Sraf 					sig_mutex_unlock(&__aio_mutex);
52334709573Sraf 					return (kresultp);
52434709573Sraf 				}
52534709573Sraf 			}
5267c478bd9Sstevel@tonic-gate 			uresultp = _aio_req_done();
52734709573Sraf 			sig_mutex_unlock(&__aio_mutex);
52834709573Sraf 			if (uresultp != NULL &&
52934709573Sraf 			    uresultp != (aio_result_t *)-1) {
5307c478bd9Sstevel@tonic-gate 				return (uresultp);
5317c478bd9Sstevel@tonic-gate 			}
5327c478bd9Sstevel@tonic-gate 			if (uresultp == (aio_result_t *)-1 &&
5337c478bd9Sstevel@tonic-gate 			    kresultp == (aio_result_t *)-1) {
5347c478bd9Sstevel@tonic-gate 				errno = EINVAL;
5357c478bd9Sstevel@tonic-gate 				return ((aio_result_t *)-1);
53634709573Sraf 			} else {
5377c478bd9Sstevel@tonic-gate 				return (NULL);
53834709573Sraf 			}
5397c478bd9Sstevel@tonic-gate 		}
5407c478bd9Sstevel@tonic-gate 	}
5417c478bd9Sstevel@tonic-gate 
5427c478bd9Sstevel@tonic-gate 	for (;;) {
54334709573Sraf 		sig_mutex_lock(&__aio_mutex);
5447c478bd9Sstevel@tonic-gate 		uresultp = _aio_req_done();
5457c478bd9Sstevel@tonic-gate 		if (uresultp != NULL && uresultp != (aio_result_t *)-1) {
54634709573Sraf 			sig_mutex_unlock(&__aio_mutex);
5477c478bd9Sstevel@tonic-gate 			resultp = uresultp;
5487c478bd9Sstevel@tonic-gate 			break;
5497c478bd9Sstevel@tonic-gate 		}
5507c478bd9Sstevel@tonic-gate 		_aiowait_flag++;
5517c478bd9Sstevel@tonic-gate 		dontblock = (uresultp == (aio_result_t *)-1);
55234709573Sraf 		if (dontblock && _kaio_outstand_cnt == 0) {
55334709573Sraf 			kresultp = (aio_result_t *)-1;
55434709573Sraf 			kaio_errno = EINVAL;
55534709573Sraf 		} else {
55634709573Sraf 			sig_mutex_unlock(&__aio_mutex);
55734709573Sraf 			kresultp = (aio_result_t *)_kaio(AIOWAIT,
55834709573Sraf 			    wait, dontblock);
55934709573Sraf 			sig_mutex_lock(&__aio_mutex);
56034709573Sraf 			kaio_errno = errno;
56134709573Sraf 		}
5627c478bd9Sstevel@tonic-gate 		_aiowait_flag--;
56334709573Sraf 		sig_mutex_unlock(&__aio_mutex);
5647c478bd9Sstevel@tonic-gate 		if (kresultp == (aio_result_t *)1) {
5657c478bd9Sstevel@tonic-gate 			/* aiowait() awakened by an aionotify() */
5667c478bd9Sstevel@tonic-gate 			continue;
56734709573Sraf 		} else if (kresultp != NULL &&
56834709573Sraf 		    kresultp != (aio_result_t *)-1) {
5697c478bd9Sstevel@tonic-gate 			resultp = kresultp;
57034709573Sraf 			sig_mutex_lock(&__aio_mutex);
57134709573Sraf 			_kaio_outstand_cnt--;
57234709573Sraf 			sig_mutex_unlock(&__aio_mutex);
5737c478bd9Sstevel@tonic-gate 			break;
57434709573Sraf 		} else if (kresultp == (aio_result_t *)-1 &&
57534709573Sraf 		    kaio_errno == EINVAL &&
57634709573Sraf 		    uresultp == (aio_result_t *)-1) {
5777c478bd9Sstevel@tonic-gate 			errno = kaio_errno;
5787c478bd9Sstevel@tonic-gate 			resultp = (aio_result_t *)-1;
5797c478bd9Sstevel@tonic-gate 			break;
5807c478bd9Sstevel@tonic-gate 		} else if (kresultp == (aio_result_t *)-1 &&
5817c478bd9Sstevel@tonic-gate 		    kaio_errno == EINTR) {
5827c478bd9Sstevel@tonic-gate 			errno = kaio_errno;
5837c478bd9Sstevel@tonic-gate 			resultp = (aio_result_t *)-1;
5847c478bd9Sstevel@tonic-gate 			break;
5857c478bd9Sstevel@tonic-gate 		} else if (timedwait) {
5867c478bd9Sstevel@tonic-gate 			hres = hrtend - gethrtime();
5877c478bd9Sstevel@tonic-gate 			if (hres <= 0) {
58834709573Sraf 				/* time is up; return */
5897c478bd9Sstevel@tonic-gate 				resultp = NULL;
5907c478bd9Sstevel@tonic-gate 				break;
5917c478bd9Sstevel@tonic-gate 			} else {
5927c478bd9Sstevel@tonic-gate 				/*
59334709573Sraf 				 * Some time left.  Round up the remaining time
59434709573Sraf 				 * in nanoseconds to microsec.  Retry the call.
5957c478bd9Sstevel@tonic-gate 				 */
59634709573Sraf 				hres += (NANOSEC / MICROSEC) - 1;
5977c478bd9Sstevel@tonic-gate 				wait->tv_sec = hres / NANOSEC;
5987c478bd9Sstevel@tonic-gate 				wait->tv_usec =
5997c478bd9Sstevel@tonic-gate 					(hres % NANOSEC) / (NANOSEC / MICROSEC);
6007c478bd9Sstevel@tonic-gate 			}
6017c478bd9Sstevel@tonic-gate 		} else {
60234709573Sraf 			ASSERT(kresultp == NULL && uresultp == NULL);
6037c478bd9Sstevel@tonic-gate 			resultp = NULL;
6047c478bd9Sstevel@tonic-gate 			continue;
6057c478bd9Sstevel@tonic-gate 		}
6067c478bd9Sstevel@tonic-gate 	}
6077c478bd9Sstevel@tonic-gate 	return (resultp);
6087c478bd9Sstevel@tonic-gate }
6097c478bd9Sstevel@tonic-gate 
6107c478bd9Sstevel@tonic-gate /*
6117c478bd9Sstevel@tonic-gate  * _aio_get_timedelta calculates the remaining time and stores the result
61234709573Sraf  * into timespec_t *wait.
6137c478bd9Sstevel@tonic-gate  */
6147c478bd9Sstevel@tonic-gate 
6157c478bd9Sstevel@tonic-gate int
61634709573Sraf _aio_get_timedelta(timespec_t *end, timespec_t *wait)
6177c478bd9Sstevel@tonic-gate {
6187c478bd9Sstevel@tonic-gate 	int	ret = 0;
6197c478bd9Sstevel@tonic-gate 	struct	timeval cur;
62034709573Sraf 	timespec_t curtime;
6217c478bd9Sstevel@tonic-gate 
6227c478bd9Sstevel@tonic-gate 	(void) gettimeofday(&cur, NULL);
6237c478bd9Sstevel@tonic-gate 	curtime.tv_sec = cur.tv_sec;
6247c478bd9Sstevel@tonic-gate 	curtime.tv_nsec = cur.tv_usec * 1000;   /* convert us to ns */
6257c478bd9Sstevel@tonic-gate 
6267c478bd9Sstevel@tonic-gate 	if (end->tv_sec >= curtime.tv_sec) {
6277c478bd9Sstevel@tonic-gate 		wait->tv_sec = end->tv_sec - curtime.tv_sec;
6287c478bd9Sstevel@tonic-gate 		if (end->tv_nsec >= curtime.tv_nsec) {
6297c478bd9Sstevel@tonic-gate 			wait->tv_nsec = end->tv_nsec - curtime.tv_nsec;
6307c478bd9Sstevel@tonic-gate 			if (wait->tv_sec == 0 && wait->tv_nsec == 0)
6317c478bd9Sstevel@tonic-gate 				ret = -1;	/* timer expired */
6327c478bd9Sstevel@tonic-gate 		} else {
6337c478bd9Sstevel@tonic-gate 			if (end->tv_sec > curtime.tv_sec) {
6347c478bd9Sstevel@tonic-gate 				wait->tv_sec -= 1;
6357c478bd9Sstevel@tonic-gate 				wait->tv_nsec = NANOSEC -
6367c478bd9Sstevel@tonic-gate 				    (curtime.tv_nsec - end->tv_nsec);
6377c478bd9Sstevel@tonic-gate 			} else {
6387c478bd9Sstevel@tonic-gate 				ret = -1;	/* timer expired */
6397c478bd9Sstevel@tonic-gate 			}
6407c478bd9Sstevel@tonic-gate 		}
6417c478bd9Sstevel@tonic-gate 	} else {
6427c478bd9Sstevel@tonic-gate 		ret = -1;
6437c478bd9Sstevel@tonic-gate 	}
6447c478bd9Sstevel@tonic-gate 	return (ret);
6457c478bd9Sstevel@tonic-gate }
6467c478bd9Sstevel@tonic-gate 
6477c478bd9Sstevel@tonic-gate /*
6487c478bd9Sstevel@tonic-gate  * If closing by file descriptor: we will simply cancel all the outstanding
64934709573Sraf  * aio`s and return.  Those aio's in question will have either noticed the
6507c478bd9Sstevel@tonic-gate  * cancellation notice before, during, or after initiating io.
6517c478bd9Sstevel@tonic-gate  */
6527c478bd9Sstevel@tonic-gate int
6537c478bd9Sstevel@tonic-gate aiocancel_all(int fd)
6547c478bd9Sstevel@tonic-gate {
65534709573Sraf 	aio_req_t *reqp;
65634709573Sraf 	aio_req_t **reqpp;
65734709573Sraf 	aio_worker_t *first;
65834709573Sraf 	aio_worker_t *next;
6597c478bd9Sstevel@tonic-gate 	int canceled = 0;
6607c478bd9Sstevel@tonic-gate 	int done = 0;
6617c478bd9Sstevel@tonic-gate 	int cancelall = 0;
6627c478bd9Sstevel@tonic-gate 
66334709573Sraf 	sig_mutex_lock(&__aio_mutex);
6647c478bd9Sstevel@tonic-gate 
66534709573Sraf 	if (_aio_outstand_cnt == 0) {
66634709573Sraf 		sig_mutex_unlock(&__aio_mutex);
66734709573Sraf 		return (AIO_ALLDONE);
66834709573Sraf 	}
6697c478bd9Sstevel@tonic-gate 
6707c478bd9Sstevel@tonic-gate 	/*
67134709573Sraf 	 * Cancel requests from the read/write workers' queues.
6727c478bd9Sstevel@tonic-gate 	 */
67334709573Sraf 	first = __nextworker_rw;
6747c478bd9Sstevel@tonic-gate 	next = first;
6757c478bd9Sstevel@tonic-gate 	do {
6767c478bd9Sstevel@tonic-gate 		_aio_cancel_work(next, fd, &canceled, &done);
6777c478bd9Sstevel@tonic-gate 	} while ((next = next->work_forw) != first);
6787c478bd9Sstevel@tonic-gate 
6797c478bd9Sstevel@tonic-gate 	/*
6807c478bd9Sstevel@tonic-gate 	 * finally, check if there are requests on the done queue that
6817c478bd9Sstevel@tonic-gate 	 * should be canceled.
6827c478bd9Sstevel@tonic-gate 	 */
6837c478bd9Sstevel@tonic-gate 	if (fd < 0)
6847c478bd9Sstevel@tonic-gate 		cancelall = 1;
68534709573Sraf 	reqpp = &_aio_done_tail;
68634709573Sraf 	while ((reqp = *reqpp) != NULL) {
68734709573Sraf 		if (cancelall || reqp->req_args.fd == fd) {
68834709573Sraf 			*reqpp = reqp->req_next;
6897c478bd9Sstevel@tonic-gate 			_aio_donecnt--;
69034709573Sraf 			(void) _aio_hash_del(reqp->req_resultp);
69134709573Sraf 			_aio_req_free(reqp);
6927c478bd9Sstevel@tonic-gate 		} else
69334709573Sraf 			reqpp = &reqp->req_next;
6947c478bd9Sstevel@tonic-gate 	}
6957c478bd9Sstevel@tonic-gate 	if (cancelall) {
6967c478bd9Sstevel@tonic-gate 		ASSERT(_aio_donecnt == 0);
6977c478bd9Sstevel@tonic-gate 		_aio_done_head = NULL;
6987c478bd9Sstevel@tonic-gate 	}
69934709573Sraf 	sig_mutex_unlock(&__aio_mutex);
7007c478bd9Sstevel@tonic-gate 
7017c478bd9Sstevel@tonic-gate 	if (canceled && done == 0)
7027c478bd9Sstevel@tonic-gate 		return (AIO_CANCELED);
7037c478bd9Sstevel@tonic-gate 	else if (done && canceled == 0)
7047c478bd9Sstevel@tonic-gate 		return (AIO_ALLDONE);
7057c478bd9Sstevel@tonic-gate 	else if ((canceled + done == 0) && KAIO_SUPPORTED(fd))
7067c478bd9Sstevel@tonic-gate 		return ((int)_kaio(AIOCANCEL, fd, NULL));
7077c478bd9Sstevel@tonic-gate 	return (AIO_NOTCANCELED);
7087c478bd9Sstevel@tonic-gate }
7097c478bd9Sstevel@tonic-gate 
7107c478bd9Sstevel@tonic-gate /*
71134709573Sraf  * Cancel requests from a given work queue.  If the file descriptor
71234709573Sraf  * parameter, fd, is non-negative, then only cancel those requests
71334709573Sraf  * in this queue that are to this file descriptor.  If the fd
7147c478bd9Sstevel@tonic-gate  * parameter is -1, then cancel all requests.
7157c478bd9Sstevel@tonic-gate  */
7167c478bd9Sstevel@tonic-gate static void
7177c478bd9Sstevel@tonic-gate _aio_cancel_work(aio_worker_t *aiowp, int fd, int *canceled, int *done)
7187c478bd9Sstevel@tonic-gate {
71934709573Sraf 	aio_req_t *reqp;
7207c478bd9Sstevel@tonic-gate 
72134709573Sraf 	sig_mutex_lock(&aiowp->work_qlock1);
7227c478bd9Sstevel@tonic-gate 	/*
7237c478bd9Sstevel@tonic-gate 	 * cancel queued requests first.
7247c478bd9Sstevel@tonic-gate 	 */
72534709573Sraf 	reqp = aiowp->work_tail1;
72634709573Sraf 	while (reqp != NULL) {
72734709573Sraf 		if (fd < 0 || reqp->req_args.fd == fd) {
72834709573Sraf 			if (_aio_cancel_req(aiowp, reqp, canceled, done)) {
7297c478bd9Sstevel@tonic-gate 				/*
73034709573Sraf 				 * Callers locks were dropped.
73134709573Sraf 				 * reqp is invalid; start traversing
73234709573Sraf 				 * the list from the beginning again.
7337c478bd9Sstevel@tonic-gate 				 */
73434709573Sraf 				reqp = aiowp->work_tail1;
7357c478bd9Sstevel@tonic-gate 				continue;
7367c478bd9Sstevel@tonic-gate 			}
7377c478bd9Sstevel@tonic-gate 		}
73834709573Sraf 		reqp = reqp->req_next;
7397c478bd9Sstevel@tonic-gate 	}
7407c478bd9Sstevel@tonic-gate 	/*
74134709573Sraf 	 * Since the queued requests have been canceled, there can
74234709573Sraf 	 * only be one inprogress request that should be canceled.
7437c478bd9Sstevel@tonic-gate 	 */
74434709573Sraf 	if ((reqp = aiowp->work_req) != NULL &&
74534709573Sraf 	    (fd < 0 || reqp->req_args.fd == fd))
74634709573Sraf 		(void) _aio_cancel_req(aiowp, reqp, canceled, done);
74734709573Sraf 	sig_mutex_unlock(&aiowp->work_qlock1);
7487c478bd9Sstevel@tonic-gate }
7497c478bd9Sstevel@tonic-gate 
7507c478bd9Sstevel@tonic-gate /*
75134709573Sraf  * Cancel a request.  Return 1 if the callers locks were temporarily
7527c478bd9Sstevel@tonic-gate  * dropped, otherwise return 0.
7537c478bd9Sstevel@tonic-gate  */
7547c478bd9Sstevel@tonic-gate int
75534709573Sraf _aio_cancel_req(aio_worker_t *aiowp, aio_req_t *reqp, int *canceled, int *done)
7567c478bd9Sstevel@tonic-gate {
75734709573Sraf 	int ostate = reqp->req_state;
7587c478bd9Sstevel@tonic-gate 
7597c478bd9Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&__aio_mutex));
7607c478bd9Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&aiowp->work_qlock1));
76134709573Sraf 	if (ostate == AIO_REQ_CANCELED)
7627c478bd9Sstevel@tonic-gate 		return (0);
7637c478bd9Sstevel@tonic-gate 	if (ostate == AIO_REQ_DONE || ostate == AIO_REQ_DONEQ) {
7647c478bd9Sstevel@tonic-gate 		(*done)++;
7657c478bd9Sstevel@tonic-gate 		return (0);
7667c478bd9Sstevel@tonic-gate 	}
767c2575b5eSraf 	if (reqp->req_op == AIOFSYNC && reqp != aiowp->work_req) {
76834709573Sraf 		ASSERT(POSIX_AIO(reqp));
769c2575b5eSraf 		/* Cancel the queued aio_fsync() request */
77034709573Sraf 		if (!reqp->req_head->lio_canned) {
77134709573Sraf 			reqp->req_head->lio_canned = 1;
77234709573Sraf 			_aio_outstand_cnt--;
77334709573Sraf 			(*canceled)++;
77434709573Sraf 		}
7757c478bd9Sstevel@tonic-gate 		return (0);
7767c478bd9Sstevel@tonic-gate 	}
77734709573Sraf 	reqp->req_state = AIO_REQ_CANCELED;
77834709573Sraf 	_aio_req_del(aiowp, reqp, ostate);
77934709573Sraf 	(void) _aio_hash_del(reqp->req_resultp);
7807c478bd9Sstevel@tonic-gate 	(*canceled)++;
78134709573Sraf 	if (reqp == aiowp->work_req) {
78234709573Sraf 		ASSERT(ostate == AIO_REQ_INPROGRESS);
78334709573Sraf 		/*
78434709573Sraf 		 * Set the result values now, before _aiodone() is called.
78534709573Sraf 		 * We do this because the application can expect aio_return
78634709573Sraf 		 * and aio_errno to be set to -1 and ECANCELED, respectively,
78734709573Sraf 		 * immediately after a successful return from aiocancel()
78834709573Sraf 		 * or aio_cancel().
78934709573Sraf 		 */
79034709573Sraf 		_aio_set_result(reqp, -1, ECANCELED);
79134709573Sraf 		(void) thr_kill(aiowp->work_tid, SIGAIOCANCEL);
79234709573Sraf 		return (0);
79334709573Sraf 	}
79434709573Sraf 	if (!POSIX_AIO(reqp)) {
79534709573Sraf 		_aio_outstand_cnt--;
79634709573Sraf 		_aio_set_result(reqp, -1, ECANCELED);
79734709573Sraf 		return (0);
79834709573Sraf 	}
79934709573Sraf 	sig_mutex_unlock(&aiowp->work_qlock1);
80034709573Sraf 	sig_mutex_unlock(&__aio_mutex);
80134709573Sraf 	_aiodone(reqp, -1, ECANCELED);
80234709573Sraf 	sig_mutex_lock(&__aio_mutex);
80334709573Sraf 	sig_mutex_lock(&aiowp->work_qlock1);
8047c478bd9Sstevel@tonic-gate 	return (1);
8057c478bd9Sstevel@tonic-gate }
8067c478bd9Sstevel@tonic-gate 
807*f841f6adSraf int
808*f841f6adSraf _aio_create_worker(aio_req_t *reqp, int mode)
809*f841f6adSraf {
810*f841f6adSraf 	aio_worker_t *aiowp, **workers, **nextworker;
811*f841f6adSraf 	int *aio_workerscnt;
812*f841f6adSraf 	void *(*func)(void *);
813*f841f6adSraf 	sigset_t oset;
814*f841f6adSraf 	int error;
815*f841f6adSraf 
816*f841f6adSraf 	/*
817*f841f6adSraf 	 * Put the new worker thread in the right queue.
818*f841f6adSraf 	 */
819*f841f6adSraf 	switch (mode) {
820*f841f6adSraf 	case AIOREAD:
821*f841f6adSraf 	case AIOWRITE:
822*f841f6adSraf 	case AIOAREAD:
823*f841f6adSraf 	case AIOAWRITE:
824*f841f6adSraf #if !defined(_LP64)
825*f841f6adSraf 	case AIOAREAD64:
826*f841f6adSraf 	case AIOAWRITE64:
827*f841f6adSraf #endif
828*f841f6adSraf 		workers = &__workers_rw;
829*f841f6adSraf 		nextworker = &__nextworker_rw;
830*f841f6adSraf 		aio_workerscnt = &__rw_workerscnt;
831*f841f6adSraf 		func = _aio_do_request;
832*f841f6adSraf 		break;
833*f841f6adSraf 	case AIONOTIFY:
834*f841f6adSraf 		workers = &__workers_no;
835*f841f6adSraf 		nextworker = &__nextworker_no;
836*f841f6adSraf 		func = _aio_do_notify;
837*f841f6adSraf 		aio_workerscnt = &__no_workerscnt;
838*f841f6adSraf 		break;
839*f841f6adSraf 	default:
840*f841f6adSraf 		aio_panic("_aio_create_worker: invalid mode");
841*f841f6adSraf 		break;
842*f841f6adSraf 	}
843*f841f6adSraf 
844*f841f6adSraf 	if ((aiowp = _aio_worker_alloc()) == NULL)
845*f841f6adSraf 		return (-1);
846*f841f6adSraf 
847*f841f6adSraf 	if (reqp) {
848*f841f6adSraf 		reqp->req_state = AIO_REQ_QUEUED;
849*f841f6adSraf 		reqp->req_worker = aiowp;
850*f841f6adSraf 		aiowp->work_head1 = reqp;
851*f841f6adSraf 		aiowp->work_tail1 = reqp;
852*f841f6adSraf 		aiowp->work_next1 = reqp;
853*f841f6adSraf 		aiowp->work_count1 = 1;
854*f841f6adSraf 		aiowp->work_minload1 = 1;
855*f841f6adSraf 	}
856*f841f6adSraf 
857*f841f6adSraf 	(void) pthread_sigmask(SIG_SETMASK, &maskset, &oset);
858*f841f6adSraf 	error = thr_create(NULL, AIOSTKSIZE, func, aiowp,
859*f841f6adSraf 		THR_DAEMON | THR_SUSPENDED, &aiowp->work_tid);
860*f841f6adSraf 	(void) pthread_sigmask(SIG_SETMASK, &oset, NULL);
861*f841f6adSraf 	if (error) {
862*f841f6adSraf 		if (reqp) {
863*f841f6adSraf 			reqp->req_state = 0;
864*f841f6adSraf 			reqp->req_worker = NULL;
865*f841f6adSraf 		}
866*f841f6adSraf 		_aio_worker_free(aiowp);
867*f841f6adSraf 		return (-1);
868*f841f6adSraf 	}
869*f841f6adSraf 
870*f841f6adSraf 	lmutex_lock(&__aio_mutex);
871*f841f6adSraf 	(*aio_workerscnt)++;
872*f841f6adSraf 	if (*workers == NULL) {
873*f841f6adSraf 		aiowp->work_forw = aiowp;
874*f841f6adSraf 		aiowp->work_backw = aiowp;
875*f841f6adSraf 		*nextworker = aiowp;
876*f841f6adSraf 		*workers = aiowp;
877*f841f6adSraf 	} else {
878*f841f6adSraf 		aiowp->work_backw = (*workers)->work_backw;
879*f841f6adSraf 		aiowp->work_forw = (*workers);
880*f841f6adSraf 		(*workers)->work_backw->work_forw = aiowp;
881*f841f6adSraf 		(*workers)->work_backw = aiowp;
882*f841f6adSraf 	}
883*f841f6adSraf 	_aio_worker_cnt++;
884*f841f6adSraf 	lmutex_unlock(&__aio_mutex);
885*f841f6adSraf 
886*f841f6adSraf 	(void) thr_continue(aiowp->work_tid);
887*f841f6adSraf 
888*f841f6adSraf 	return (0);
889*f841f6adSraf }
890*f841f6adSraf 
8917c478bd9Sstevel@tonic-gate /*
8927c478bd9Sstevel@tonic-gate  * This is the worker's main routine.
8937c478bd9Sstevel@tonic-gate  * The task of this function is to execute all queued requests;
8947c478bd9Sstevel@tonic-gate  * once the last pending request is executed this function will block
89534709573Sraf  * in _aio_idle().  A new incoming request must wakeup this thread to
8967c478bd9Sstevel@tonic-gate  * restart the work.
89734709573Sraf  * Every worker has an own work queue.  The queue lock is required
8987c478bd9Sstevel@tonic-gate  * to synchronize the addition of new requests for this worker or
8997c478bd9Sstevel@tonic-gate  * cancellation of pending/running requests.
9007c478bd9Sstevel@tonic-gate  *
9017c478bd9Sstevel@tonic-gate  * Cancellation scenarios:
9027c478bd9Sstevel@tonic-gate  * The cancellation of a request is being done asynchronously using
9037c478bd9Sstevel@tonic-gate  * _aio_cancel_req() from another thread context.
9047c478bd9Sstevel@tonic-gate  * A queued request can be cancelled in different manners :
9057c478bd9Sstevel@tonic-gate  * a) request is queued but not "in progress" or "done" (AIO_REQ_QUEUED):
9067c478bd9Sstevel@tonic-gate  *	- lock the queue -> remove the request -> unlock the queue
9077c478bd9Sstevel@tonic-gate  *	- this function/thread does not detect this cancellation process
9087c478bd9Sstevel@tonic-gate  * b) request is in progress (AIO_REQ_INPROGRESS) :
9097c478bd9Sstevel@tonic-gate  *	- this function first allow the cancellation of the running
9107c478bd9Sstevel@tonic-gate  *	  request with the flag "work_cancel_flg=1"
9117c478bd9Sstevel@tonic-gate  * 		see _aio_req_get() -> _aio_cancel_on()
9127c478bd9Sstevel@tonic-gate  *	  During this phase, it is allowed to interrupt the worker
9137c478bd9Sstevel@tonic-gate  *	  thread running the request (this thread) using the SIGAIOCANCEL
9147c478bd9Sstevel@tonic-gate  *	  signal.
9157c478bd9Sstevel@tonic-gate  *	  Once this thread returns from the kernel (because the request
9167c478bd9Sstevel@tonic-gate  *	  is just done), then it must disable a possible cancellation
91734709573Sraf  *	  and proceed to finish the request.  To disable the cancellation
9187c478bd9Sstevel@tonic-gate  *	  this thread must use _aio_cancel_off() to set "work_cancel_flg=0".
9197c478bd9Sstevel@tonic-gate  * c) request is already done (AIO_REQ_DONE || AIO_REQ_DONEQ):
9207c478bd9Sstevel@tonic-gate  *	  same procedure as in a)
9217c478bd9Sstevel@tonic-gate  *
9227c478bd9Sstevel@tonic-gate  * To b)
9237c478bd9Sstevel@tonic-gate  *	This thread uses sigsetjmp() to define the position in the code, where
9247c478bd9Sstevel@tonic-gate  *	it wish to continue working in the case that a SIGAIOCANCEL signal
9257c478bd9Sstevel@tonic-gate  *	is detected.
9267c478bd9Sstevel@tonic-gate  *	Normally this thread should get the cancellation signal during the
92734709573Sraf  *	kernel phase (reading or writing).  In that case the signal handler
9287c478bd9Sstevel@tonic-gate  *	aiosigcancelhndlr() is activated using the worker thread context,
9297c478bd9Sstevel@tonic-gate  *	which again will use the siglongjmp() function to break the standard
9307c478bd9Sstevel@tonic-gate  *	code flow and jump to the "sigsetjmp" position, provided that
9317c478bd9Sstevel@tonic-gate  *	"work_cancel_flg" is set to "1".
9327c478bd9Sstevel@tonic-gate  *	Because the "work_cancel_flg" is only manipulated by this worker
9337c478bd9Sstevel@tonic-gate  *	thread and it can only run on one CPU at a given time, it is not
9347c478bd9Sstevel@tonic-gate  *	necessary to protect that flag with the queue lock.
9357c478bd9Sstevel@tonic-gate  *	Returning from the kernel (read or write system call) we must
9367c478bd9Sstevel@tonic-gate  *	first disable the use of the SIGAIOCANCEL signal and accordingly
9377c478bd9Sstevel@tonic-gate  *	the use of the siglongjmp() function to prevent a possible deadlock:
9387c478bd9Sstevel@tonic-gate  *	- It can happens that this worker thread returns from the kernel and
9397c478bd9Sstevel@tonic-gate  *	  blocks in "work_qlock1",
9407c478bd9Sstevel@tonic-gate  *	- then a second thread cancels the apparently "in progress" request
9417c478bd9Sstevel@tonic-gate  *	  and sends the SIGAIOCANCEL signal to the worker thread,
9427c478bd9Sstevel@tonic-gate  *	- the worker thread gets assigned the "work_qlock1" and will returns
9437c478bd9Sstevel@tonic-gate  *	  from the kernel,
9447c478bd9Sstevel@tonic-gate  *	- the kernel detects the pending signal and activates the signal
9457c478bd9Sstevel@tonic-gate  *	  handler instead,
9467c478bd9Sstevel@tonic-gate  *	- if the "work_cancel_flg" is still set then the signal handler
9477c478bd9Sstevel@tonic-gate  *	  should use siglongjmp() to cancel the "in progress" request and
9487c478bd9Sstevel@tonic-gate  *	  it would try to acquire the same work_qlock1 in _aio_req_get()
9497c478bd9Sstevel@tonic-gate  *	  for a second time => deadlock.
9507c478bd9Sstevel@tonic-gate  *	To avoid that situation we disable the cancellation of the request
9517c478bd9Sstevel@tonic-gate  *	in progress BEFORE we try to acquire the work_qlock1.
9527c478bd9Sstevel@tonic-gate  *	In that case the signal handler will not call siglongjmp() and the
9537c478bd9Sstevel@tonic-gate  *	worker thread will continue running the standard code flow.
9547c478bd9Sstevel@tonic-gate  *	Then this thread must check the AIO_REQ_CANCELED flag to emulate
9557c478bd9Sstevel@tonic-gate  *	an eventually required siglongjmp() freeing the work_qlock1 and
9567c478bd9Sstevel@tonic-gate  *	avoiding a deadlock.
9577c478bd9Sstevel@tonic-gate  */
9587c478bd9Sstevel@tonic-gate void *
9597c478bd9Sstevel@tonic-gate _aio_do_request(void *arglist)
9607c478bd9Sstevel@tonic-gate {
9617c478bd9Sstevel@tonic-gate 	aio_worker_t *aiowp = (aio_worker_t *)arglist;
962*f841f6adSraf 	ulwp_t *self = curthread;
9637c478bd9Sstevel@tonic-gate 	struct aio_args *arg;
96434709573Sraf 	aio_req_t *reqp;		/* current AIO request */
9657c478bd9Sstevel@tonic-gate 	ssize_t retval;
96634709573Sraf 	int error;
9677c478bd9Sstevel@tonic-gate 
96834709573Sraf 	if (pthread_setspecific(_aio_key, aiowp) != 0)
969*f841f6adSraf 		aio_panic("_aio_do_request, pthread_setspecific()");
97034709573Sraf 	(void) pthread_sigmask(SIG_SETMASK, &_worker_set, NULL);
97134709573Sraf 	ASSERT(aiowp->work_req == NULL);
9727c478bd9Sstevel@tonic-gate 
97334709573Sraf 	/*
97434709573Sraf 	 * We resume here when an operation is cancelled.
97534709573Sraf 	 * On first entry, aiowp->work_req == NULL, so all
97634709573Sraf 	 * we do is block SIGAIOCANCEL.
97734709573Sraf 	 */
97834709573Sraf 	(void) sigsetjmp(aiowp->work_jmp_buf, 0);
979*f841f6adSraf 	ASSERT(self->ul_sigdefer == 0);
9807c478bd9Sstevel@tonic-gate 
981*f841f6adSraf 	sigoff(self);	/* block SIGAIOCANCEL */
98234709573Sraf 	if (aiowp->work_req != NULL)
98334709573Sraf 		_aio_finish_request(aiowp, -1, ECANCELED);
9847c478bd9Sstevel@tonic-gate 
98534709573Sraf 	for (;;) {
9867c478bd9Sstevel@tonic-gate 		/*
98734709573Sraf 		 * Put completed requests on aio_done_list.  This has
9887c478bd9Sstevel@tonic-gate 		 * to be done as part of the main loop to ensure that
9897c478bd9Sstevel@tonic-gate 		 * we don't artificially starve any aiowait'ers.
9907c478bd9Sstevel@tonic-gate 		 */
9917c478bd9Sstevel@tonic-gate 		if (aiowp->work_done1)
9927c478bd9Sstevel@tonic-gate 			_aio_work_done(aiowp);
9937c478bd9Sstevel@tonic-gate 
99434709573Sraf top:
99534709573Sraf 		/* consume any deferred SIGAIOCANCEL signal here */
996*f841f6adSraf 		sigon(self);
997*f841f6adSraf 		sigoff(self);
99834709573Sraf 
999*f841f6adSraf 		while ((reqp = _aio_req_get(aiowp)) == NULL) {
1000*f841f6adSraf 			if (_aio_idle(aiowp) != 0)
1001*f841f6adSraf 				goto top;
1002*f841f6adSraf 		}
100334709573Sraf 		arg = &reqp->req_args;
100434709573Sraf 		ASSERT(reqp->req_state == AIO_REQ_INPROGRESS ||
100534709573Sraf 		    reqp->req_state == AIO_REQ_CANCELED);
100634709573Sraf 		error = 0;
100734709573Sraf 
100834709573Sraf 		switch (reqp->req_op) {
100934709573Sraf 		case AIOREAD:
101034709573Sraf 		case AIOAREAD:
1011*f841f6adSraf 			sigon(self);	/* unblock SIGAIOCANCEL */
101234709573Sraf 			retval = pread(arg->fd, arg->buf,
101334709573Sraf 			    arg->bufsz, arg->offset);
101434709573Sraf 			if (retval == -1) {
101534709573Sraf 				if (errno == ESPIPE) {
101634709573Sraf 					retval = read(arg->fd,
101734709573Sraf 					    arg->buf, arg->bufsz);
101834709573Sraf 					if (retval == -1)
101934709573Sraf 						error = errno;
102034709573Sraf 				} else {
102134709573Sraf 					error = errno;
10227c478bd9Sstevel@tonic-gate 				}
102334709573Sraf 			}
1024*f841f6adSraf 			sigoff(self);	/* block SIGAIOCANCEL */
102534709573Sraf 			break;
102634709573Sraf 		case AIOWRITE:
102734709573Sraf 		case AIOAWRITE:
1028*f841f6adSraf 			sigon(self);	/* unblock SIGAIOCANCEL */
102934709573Sraf 			retval = pwrite(arg->fd, arg->buf,
103034709573Sraf 			    arg->bufsz, arg->offset);
103134709573Sraf 			if (retval == -1) {
103234709573Sraf 				if (errno == ESPIPE) {
103334709573Sraf 					retval = write(arg->fd,
103434709573Sraf 					    arg->buf, arg->bufsz);
103534709573Sraf 					if (retval == -1)
103634709573Sraf 						error = errno;
103734709573Sraf 				} else {
103834709573Sraf 					error = errno;
10397c478bd9Sstevel@tonic-gate 				}
104034709573Sraf 			}
1041*f841f6adSraf 			sigoff(self);	/* block SIGAIOCANCEL */
104234709573Sraf 			break;
104334709573Sraf #if !defined(_LP64)
104434709573Sraf 		case AIOAREAD64:
1045*f841f6adSraf 			sigon(self);	/* unblock SIGAIOCANCEL */
104634709573Sraf 			retval = pread64(arg->fd, arg->buf,
104734709573Sraf 			    arg->bufsz, arg->offset);
104834709573Sraf 			if (retval == -1) {
104934709573Sraf 				if (errno == ESPIPE) {
105034709573Sraf 					retval = read(arg->fd,
105134709573Sraf 					    arg->buf, arg->bufsz);
105234709573Sraf 					if (retval == -1)
105334709573Sraf 						error = errno;
105434709573Sraf 				} else {
105534709573Sraf 					error = errno;
10567c478bd9Sstevel@tonic-gate 				}
105734709573Sraf 			}
1058*f841f6adSraf 			sigoff(self);	/* block SIGAIOCANCEL */
105934709573Sraf 			break;
106034709573Sraf 		case AIOAWRITE64:
1061*f841f6adSraf 			sigon(self);	/* unblock SIGAIOCANCEL */
106234709573Sraf 			retval = pwrite64(arg->fd, arg->buf,
106334709573Sraf 			    arg->bufsz, arg->offset);
106434709573Sraf 			if (retval == -1) {
106534709573Sraf 				if (errno == ESPIPE) {
106634709573Sraf 					retval = write(arg->fd,
106734709573Sraf 					    arg->buf, arg->bufsz);
106834709573Sraf 					if (retval == -1)
106934709573Sraf 						error = errno;
107034709573Sraf 				} else {
107134709573Sraf 					error = errno;
10727c478bd9Sstevel@tonic-gate 				}
107334709573Sraf 			}
1074*f841f6adSraf 			sigoff(self);	/* block SIGAIOCANCEL */
107534709573Sraf 			break;
107634709573Sraf #endif	/* !defined(_LP64) */
107734709573Sraf 		case AIOFSYNC:
1078c2575b5eSraf 			if (_aio_fsync_del(aiowp, reqp))
107934709573Sraf 				goto top;
108034709573Sraf 			ASSERT(reqp->req_head == NULL);
108134709573Sraf 			/*
108234709573Sraf 			 * All writes for this fsync request are now
108334709573Sraf 			 * acknowledged.  Now make these writes visible
108434709573Sraf 			 * and put the final request into the hash table.
108534709573Sraf 			 */
108634709573Sraf 			if (reqp->req_state == AIO_REQ_CANCELED) {
108734709573Sraf 				/* EMPTY */;
108834709573Sraf 			} else if (arg->offset == O_SYNC) {
108934709573Sraf 				if ((retval = __fdsync(arg->fd, FSYNC)) == -1)
109034709573Sraf 					error = errno;
109134709573Sraf 			} else {
109234709573Sraf 				if ((retval = __fdsync(arg->fd, FDSYNC)) == -1)
109334709573Sraf 					error = errno;
109434709573Sraf 			}
109534709573Sraf 			if (_aio_hash_insert(reqp->req_resultp, reqp) != 0)
1096*f841f6adSraf 				aio_panic("_aio_do_request(): AIOFSYNC: "
109734709573Sraf 				    "request already in hash table");
109834709573Sraf 			break;
109934709573Sraf 		default:
1100*f841f6adSraf 			aio_panic("_aio_do_request, bad op");
11017c478bd9Sstevel@tonic-gate 		}
11027c478bd9Sstevel@tonic-gate 
110334709573Sraf 		_aio_finish_request(aiowp, retval, error);
110434709573Sraf 	}
110534709573Sraf 	/* NOTREACHED */
110634709573Sraf 	return (NULL);
110734709573Sraf }
110834709573Sraf 
110934709573Sraf /*
111034709573Sraf  * Perform the tail processing for _aio_do_request().
111134709573Sraf  * The in-progress request may or may not have been cancelled.
111234709573Sraf  */
111334709573Sraf static void
111434709573Sraf _aio_finish_request(aio_worker_t *aiowp, ssize_t retval, int error)
111534709573Sraf {
111634709573Sraf 	aio_req_t *reqp;
111734709573Sraf 
111834709573Sraf 	sig_mutex_lock(&aiowp->work_qlock1);
111934709573Sraf 	if ((reqp = aiowp->work_req) == NULL)
112034709573Sraf 		sig_mutex_unlock(&aiowp->work_qlock1);
112134709573Sraf 	else {
112234709573Sraf 		aiowp->work_req = NULL;
112334709573Sraf 		if (reqp->req_state == AIO_REQ_CANCELED) {
112434709573Sraf 			retval = -1;
112534709573Sraf 			error = ECANCELED;
112634709573Sraf 		}
112734709573Sraf 		if (!POSIX_AIO(reqp)) {
112834709573Sraf 			sig_mutex_unlock(&aiowp->work_qlock1);
112934709573Sraf 			sig_mutex_lock(&__aio_mutex);
113034709573Sraf 			if (reqp->req_state == AIO_REQ_INPROGRESS)
113134709573Sraf 				reqp->req_state = AIO_REQ_DONE;
113234709573Sraf 			_aio_req_done_cnt++;
113334709573Sraf 			_aio_set_result(reqp, retval, error);
113434709573Sraf 			if (error == ECANCELED)
113534709573Sraf 				_aio_outstand_cnt--;
113634709573Sraf 			sig_mutex_unlock(&__aio_mutex);
113734709573Sraf 		} else {
113834709573Sraf 			if (reqp->req_state == AIO_REQ_INPROGRESS)
113934709573Sraf 				reqp->req_state = AIO_REQ_DONE;
114034709573Sraf 			sig_mutex_unlock(&aiowp->work_qlock1);
114134709573Sraf 			_aiodone(reqp, retval, error);
114234709573Sraf 		}
114334709573Sraf 	}
114434709573Sraf }
11457c478bd9Sstevel@tonic-gate 
114634709573Sraf void
114734709573Sraf _aio_req_mark_done(aio_req_t *reqp)
114834709573Sraf {
114934709573Sraf #if !defined(_LP64)
115034709573Sraf 	if (reqp->req_largefile)
115134709573Sraf 		((aiocb64_t *)reqp->req_aiocbp)->aio_state = USERAIO_DONE;
115234709573Sraf 	else
115334709573Sraf #endif
115434709573Sraf 		((aiocb_t *)reqp->req_aiocbp)->aio_state = USERAIO_DONE;
115534709573Sraf }
11567c478bd9Sstevel@tonic-gate 
115734709573Sraf /*
115834709573Sraf  * Sleep for 'ticks' clock ticks to give somebody else a chance to run,
115934709573Sraf  * hopefully to consume one of our queued signals.
116034709573Sraf  */
116134709573Sraf static void
116234709573Sraf _aio_delay(int ticks)
116334709573Sraf {
116434709573Sraf 	(void) usleep(ticks * (MICROSEC / hz));
116534709573Sraf }
11667c478bd9Sstevel@tonic-gate 
116734709573Sraf /*
116834709573Sraf  * Actually send the notifications.
116934709573Sraf  * We could block indefinitely here if the application
117034709573Sraf  * is not listening for the signal or port notifications.
117134709573Sraf  */
117234709573Sraf static void
117334709573Sraf send_notification(notif_param_t *npp)
117434709573Sraf {
1175*f841f6adSraf 	extern int __sigqueue(pid_t pid, int signo,
1176*f841f6adSraf 		/* const union sigval */ void *value, int si_code, int block);
1177*f841f6adSraf 
1178*f841f6adSraf 	if (npp->np_signo)
1179*f841f6adSraf 		(void) __sigqueue(__pid, npp->np_signo, npp->np_user,
1180*f841f6adSraf 		    SI_ASYNCIO, 1);
1181*f841f6adSraf 	else if (npp->np_port >= 0)
118234709573Sraf 		(void) _port_dispatch(npp->np_port, 0, PORT_SOURCE_AIO,
118334709573Sraf 		    npp->np_event, npp->np_object, npp->np_user);
1184*f841f6adSraf 
1185*f841f6adSraf 	if (npp->np_lio_signo)
1186*f841f6adSraf 		(void) __sigqueue(__pid, npp->np_lio_signo, npp->np_lio_user,
1187*f841f6adSraf 		    SI_ASYNCIO, 1);
1188*f841f6adSraf 	else if (npp->np_lio_port >= 0)
118934709573Sraf 		(void) _port_dispatch(npp->np_lio_port, 0, PORT_SOURCE_AIO,
119034709573Sraf 		    npp->np_lio_event, npp->np_lio_object, npp->np_lio_user);
11917c478bd9Sstevel@tonic-gate }
11927c478bd9Sstevel@tonic-gate 
11937c478bd9Sstevel@tonic-gate /*
119434709573Sraf  * Asynchronous notification worker.
11957c478bd9Sstevel@tonic-gate  */
11967c478bd9Sstevel@tonic-gate void *
119734709573Sraf _aio_do_notify(void *arg)
11987c478bd9Sstevel@tonic-gate {
11997c478bd9Sstevel@tonic-gate 	aio_worker_t *aiowp = (aio_worker_t *)arg;
120034709573Sraf 	aio_req_t *reqp;
12017c478bd9Sstevel@tonic-gate 
120234709573Sraf 	/*
120334709573Sraf 	 * This isn't really necessary.  All signals are blocked.
120434709573Sraf 	 */
120534709573Sraf 	if (pthread_setspecific(_aio_key, aiowp) != 0)
1206*f841f6adSraf 		aio_panic("_aio_do_notify, pthread_setspecific()");
120734709573Sraf 
120834709573Sraf 	/*
120934709573Sraf 	 * Notifications are never cancelled.
121034709573Sraf 	 * All signals remain blocked, forever.
121134709573Sraf 	 */
12127c478bd9Sstevel@tonic-gate 	for (;;) {
1213*f841f6adSraf 		while ((reqp = _aio_req_get(aiowp)) == NULL) {
1214*f841f6adSraf 			if (_aio_idle(aiowp) != 0)
1215*f841f6adSraf 				aio_panic("_aio_do_notify: _aio_idle() failed");
1216*f841f6adSraf 		}
121734709573Sraf 		send_notification(&reqp->req_notify);
121834709573Sraf 		_aio_req_free(reqp);
12197c478bd9Sstevel@tonic-gate 	}
122034709573Sraf 
12217c478bd9Sstevel@tonic-gate 	/* NOTREACHED */
12227c478bd9Sstevel@tonic-gate 	return (NULL);
12237c478bd9Sstevel@tonic-gate }
12247c478bd9Sstevel@tonic-gate 
12257c478bd9Sstevel@tonic-gate /*
122634709573Sraf  * Do the completion semantics for a request that was either canceled
122734709573Sraf  * by _aio_cancel_req() or was completed by _aio_do_request().
12287c478bd9Sstevel@tonic-gate  */
122934709573Sraf static void
123034709573Sraf _aiodone(aio_req_t *reqp, ssize_t retval, int error)
12317c478bd9Sstevel@tonic-gate {
123234709573Sraf 	aio_result_t *resultp = reqp->req_resultp;
123334709573Sraf 	int notify = 0;
123434709573Sraf 	aio_lio_t *head;
123534709573Sraf 	int sigev_none;
123634709573Sraf 	int sigev_signal;
123734709573Sraf 	int sigev_thread;
123834709573Sraf 	int sigev_port;
123934709573Sraf 	notif_param_t np;
12407c478bd9Sstevel@tonic-gate 
124134709573Sraf 	/*
124234709573Sraf 	 * We call _aiodone() only for Posix I/O.
124334709573Sraf 	 */
124434709573Sraf 	ASSERT(POSIX_AIO(reqp));
124534709573Sraf 
124634709573Sraf 	sigev_none = 0;
124734709573Sraf 	sigev_signal = 0;
124834709573Sraf 	sigev_thread = 0;
124934709573Sraf 	sigev_port = 0;
125034709573Sraf 	np.np_signo = 0;
125134709573Sraf 	np.np_port = -1;
125234709573Sraf 	np.np_lio_signo = 0;
125334709573Sraf 	np.np_lio_port = -1;
125434709573Sraf 
125534709573Sraf 	switch (reqp->req_sigevent.sigev_notify) {
125634709573Sraf 	case SIGEV_NONE:
125734709573Sraf 		sigev_none = 1;
125834709573Sraf 		break;
125934709573Sraf 	case SIGEV_SIGNAL:
126034709573Sraf 		sigev_signal = 1;
126134709573Sraf 		break;
126234709573Sraf 	case SIGEV_THREAD:
126334709573Sraf 		sigev_thread = 1;
126434709573Sraf 		break;
126534709573Sraf 	case SIGEV_PORT:
126634709573Sraf 		sigev_port = 1;
126734709573Sraf 		break;
126834709573Sraf 	default:
1269*f841f6adSraf 		aio_panic("_aiodone: improper sigev_notify");
127034709573Sraf 		break;
127134709573Sraf 	}
12727c478bd9Sstevel@tonic-gate 
127334709573Sraf 	/*
127434709573Sraf 	 * Figure out the notification parameters while holding __aio_mutex.
127534709573Sraf 	 * Actually perform the notifications after dropping __aio_mutex.
127634709573Sraf 	 * This allows us to sleep for a long time (if the notifications
127734709573Sraf 	 * incur delays) without impeding other async I/O operations.
127834709573Sraf 	 */
12797c478bd9Sstevel@tonic-gate 
128034709573Sraf 	sig_mutex_lock(&__aio_mutex);
128134709573Sraf 
128234709573Sraf 	if (sigev_signal) {
128334709573Sraf 		if ((np.np_signo = reqp->req_sigevent.sigev_signo) != 0)
128434709573Sraf 			notify = 1;
128534709573Sraf 		np.np_user = reqp->req_sigevent.sigev_value.sival_ptr;
128634709573Sraf 	} else if (sigev_thread | sigev_port) {
128734709573Sraf 		if ((np.np_port = reqp->req_sigevent.sigev_signo) >= 0)
128834709573Sraf 			notify = 1;
128934709573Sraf 		np.np_event = reqp->req_op;
129034709573Sraf 		if (np.np_event == AIOFSYNC && reqp->req_largefile)
129134709573Sraf 			np.np_event = AIOFSYNC64;
129234709573Sraf 		np.np_object = (uintptr_t)reqp->req_aiocbp;
129334709573Sraf 		np.np_user = reqp->req_sigevent.sigev_value.sival_ptr;
129434709573Sraf 	}
12957c478bd9Sstevel@tonic-gate 
129634709573Sraf 	if (resultp->aio_errno == EINPROGRESS)
129734709573Sraf 		_aio_set_result(reqp, retval, error);
12987c478bd9Sstevel@tonic-gate 
129934709573Sraf 	_aio_outstand_cnt--;
13007c478bd9Sstevel@tonic-gate 
130134709573Sraf 	head = reqp->req_head;
130234709573Sraf 	reqp->req_head = NULL;
13037c478bd9Sstevel@tonic-gate 
130434709573Sraf 	if (sigev_none) {
130534709573Sraf 		_aio_enq_doneq(reqp);
130634709573Sraf 		reqp = NULL;
130734709573Sraf 	} else {
130834709573Sraf 		(void) _aio_hash_del(resultp);
130934709573Sraf 		_aio_req_mark_done(reqp);
131034709573Sraf 	}
13117c478bd9Sstevel@tonic-gate 
131234709573Sraf 	_aio_waitn_wakeup();
13137c478bd9Sstevel@tonic-gate 
131434709573Sraf 	/*
131534709573Sraf 	 * __aio_waitn() sets AIO_WAIT_INPROGRESS and
131634709573Sraf 	 * __aio_suspend() increments "_aio_kernel_suspend"
131734709573Sraf 	 * when they are waiting in the kernel for completed I/Os.
131834709573Sraf 	 *
131934709573Sraf 	 * _kaio(AIONOTIFY) awakes the corresponding function
132034709573Sraf 	 * in the kernel; then the corresponding __aio_waitn() or
132134709573Sraf 	 * __aio_suspend() function could reap the recently
132234709573Sraf 	 * completed I/Os (_aiodone()).
132334709573Sraf 	 */
132434709573Sraf 	if ((_aio_flags & AIO_WAIT_INPROGRESS) || _aio_kernel_suspend > 0)
132534709573Sraf 		(void) _kaio(AIONOTIFY);
13267c478bd9Sstevel@tonic-gate 
132734709573Sraf 	sig_mutex_unlock(&__aio_mutex);
13287c478bd9Sstevel@tonic-gate 
132934709573Sraf 	if (head != NULL) {
13307c478bd9Sstevel@tonic-gate 		/*
133134709573Sraf 		 * If all the lio requests have completed,
133234709573Sraf 		 * prepare to notify the waiting thread.
13337c478bd9Sstevel@tonic-gate 		 */
133434709573Sraf 		sig_mutex_lock(&head->lio_mutex);
133534709573Sraf 		ASSERT(head->lio_refcnt == head->lio_nent);
133634709573Sraf 		if (head->lio_refcnt == 1) {
133734709573Sraf 			int waiting = 0;
133834709573Sraf 			if (head->lio_mode == LIO_WAIT) {
133934709573Sraf 				if ((waiting = head->lio_waiting) != 0)
134034709573Sraf 					(void) cond_signal(&head->lio_cond_cv);
134134709573Sraf 			} else if (head->lio_port < 0) { /* none or signal */
134234709573Sraf 				if ((np.np_lio_signo = head->lio_signo) != 0)
134334709573Sraf 					notify = 1;
134434709573Sraf 				np.np_lio_user = head->lio_sigval.sival_ptr;
134534709573Sraf 			} else {			/* thread or port */
134634709573Sraf 				notify = 1;
134734709573Sraf 				np.np_lio_port = head->lio_port;
134834709573Sraf 				np.np_lio_event = head->lio_event;
134934709573Sraf 				np.np_lio_object =
135034709573Sraf 				    (uintptr_t)head->lio_sigevent;
135134709573Sraf 				np.np_lio_user = head->lio_sigval.sival_ptr;
13527c478bd9Sstevel@tonic-gate 			}
135334709573Sraf 			head->lio_nent = head->lio_refcnt = 0;
135434709573Sraf 			sig_mutex_unlock(&head->lio_mutex);
135534709573Sraf 			if (waiting == 0)
135634709573Sraf 				_aio_lio_free(head);
135734709573Sraf 		} else {
135834709573Sraf 			head->lio_nent--;
135934709573Sraf 			head->lio_refcnt--;
136034709573Sraf 			sig_mutex_unlock(&head->lio_mutex);
13617c478bd9Sstevel@tonic-gate 		}
136234709573Sraf 	}
13637c478bd9Sstevel@tonic-gate 
136434709573Sraf 	/*
136534709573Sraf 	 * The request is completed; now perform the notifications.
136634709573Sraf 	 */
136734709573Sraf 	if (notify) {
136834709573Sraf 		if (reqp != NULL) {
13697c478bd9Sstevel@tonic-gate 			/*
137034709573Sraf 			 * We usually put the request on the notification
137134709573Sraf 			 * queue because we don't want to block and delay
137234709573Sraf 			 * other operations behind us in the work queue.
137334709573Sraf 			 * Also we must never block on a cancel notification
137434709573Sraf 			 * because we are being called from an application
137534709573Sraf 			 * thread in this case and that could lead to deadlock
137634709573Sraf 			 * if no other thread is receiving notifications.
13777c478bd9Sstevel@tonic-gate 			 */
137834709573Sraf 			reqp->req_notify = np;
137934709573Sraf 			reqp->req_op = AIONOTIFY;
138034709573Sraf 			_aio_req_add(reqp, &__workers_no, AIONOTIFY);
138134709573Sraf 			reqp = NULL;
138234709573Sraf 		} else {
138334709573Sraf 			/*
138434709573Sraf 			 * We already put the request on the done queue,
138534709573Sraf 			 * so we can't queue it to the notification queue.
138634709573Sraf 			 * Just do the notification directly.
138734709573Sraf 			 */
138834709573Sraf 			send_notification(&np);
13897c478bd9Sstevel@tonic-gate 		}
13907c478bd9Sstevel@tonic-gate 	}
139134709573Sraf 
139234709573Sraf 	if (reqp != NULL)
139334709573Sraf 		_aio_req_free(reqp);
13947c478bd9Sstevel@tonic-gate }
13957c478bd9Sstevel@tonic-gate 
13967c478bd9Sstevel@tonic-gate /*
139734709573Sraf  * Delete fsync requests from list head until there is
139834709573Sraf  * only one left.  Return 0 when there is only one,
139934709573Sraf  * otherwise return a non-zero value.
14007c478bd9Sstevel@tonic-gate  */
14017c478bd9Sstevel@tonic-gate static int
1402c2575b5eSraf _aio_fsync_del(aio_worker_t *aiowp, aio_req_t *reqp)
14037c478bd9Sstevel@tonic-gate {
140434709573Sraf 	aio_lio_t *head = reqp->req_head;
140534709573Sraf 	int rval = 0;
140634709573Sraf 
1407c2575b5eSraf 	ASSERT(reqp == aiowp->work_req);
1408c2575b5eSraf 	sig_mutex_lock(&aiowp->work_qlock1);
140934709573Sraf 	sig_mutex_lock(&head->lio_mutex);
141034709573Sraf 	if (head->lio_refcnt > 1) {
141134709573Sraf 		head->lio_refcnt--;
141234709573Sraf 		head->lio_nent--;
1413c2575b5eSraf 		aiowp->work_req = NULL;
141434709573Sraf 		sig_mutex_unlock(&head->lio_mutex);
1415c2575b5eSraf 		sig_mutex_unlock(&aiowp->work_qlock1);
141634709573Sraf 		sig_mutex_lock(&__aio_mutex);
141734709573Sraf 		_aio_outstand_cnt--;
141834709573Sraf 		_aio_waitn_wakeup();
141934709573Sraf 		sig_mutex_unlock(&__aio_mutex);
142034709573Sraf 		_aio_req_free(reqp);
142134709573Sraf 		return (1);
14227c478bd9Sstevel@tonic-gate 	}
142334709573Sraf 	ASSERT(head->lio_nent == 1 && head->lio_refcnt == 1);
142434709573Sraf 	reqp->req_head = NULL;
142534709573Sraf 	if (head->lio_canned)
142634709573Sraf 		reqp->req_state = AIO_REQ_CANCELED;
142734709573Sraf 	if (head->lio_mode == LIO_DESTROY) {
1428c2575b5eSraf 		aiowp->work_req = NULL;
142934709573Sraf 		rval = 1;
143034709573Sraf 	}
1431c2575b5eSraf 	sig_mutex_unlock(&head->lio_mutex);
1432c2575b5eSraf 	sig_mutex_unlock(&aiowp->work_qlock1);
143334709573Sraf 	head->lio_refcnt--;
143434709573Sraf 	head->lio_nent--;
143534709573Sraf 	_aio_lio_free(head);
1436c2575b5eSraf 	if (rval != 0)
1437c2575b5eSraf 		_aio_req_free(reqp);
143834709573Sraf 	return (rval);
14397c478bd9Sstevel@tonic-gate }
14407c478bd9Sstevel@tonic-gate 
14417c478bd9Sstevel@tonic-gate /*
1442*f841f6adSraf  * A worker is set idle when its work queue is empty.
1443*f841f6adSraf  * The worker checks again that it has no more work
1444*f841f6adSraf  * and then goes to sleep waiting for more work.
14457c478bd9Sstevel@tonic-gate  */
1446*f841f6adSraf int
14477c478bd9Sstevel@tonic-gate _aio_idle(aio_worker_t *aiowp)
14487c478bd9Sstevel@tonic-gate {
144934709573Sraf 	int error = 0;
145034709573Sraf 
145134709573Sraf 	sig_mutex_lock(&aiowp->work_qlock1);
145234709573Sraf 	if (aiowp->work_count1 == 0) {
145334709573Sraf 		ASSERT(aiowp->work_minload1 == 0);
14547c478bd9Sstevel@tonic-gate 		aiowp->work_idleflg = 1;
14557c478bd9Sstevel@tonic-gate 		/*
145634709573Sraf 		 * A cancellation handler is not needed here.
145734709573Sraf 		 * aio worker threads are never cancelled via pthread_cancel().
14587c478bd9Sstevel@tonic-gate 		 */
145934709573Sraf 		error = sig_cond_wait(&aiowp->work_idle_cv,
146034709573Sraf 		    &aiowp->work_qlock1);
146134709573Sraf 		/*
146234709573Sraf 		 * The idle flag is normally cleared before worker is awakened
146334709573Sraf 		 * by aio_req_add().  On error (EINTR), we clear it ourself.
146434709573Sraf 		 */
146534709573Sraf 		if (error)
146634709573Sraf 			aiowp->work_idleflg = 0;
14677c478bd9Sstevel@tonic-gate 	}
146834709573Sraf 	sig_mutex_unlock(&aiowp->work_qlock1);
1469*f841f6adSraf 	return (error);
14707c478bd9Sstevel@tonic-gate }
14717c478bd9Sstevel@tonic-gate 
14727c478bd9Sstevel@tonic-gate /*
14737c478bd9Sstevel@tonic-gate  * A worker's completed AIO requests are placed onto a global
147434709573Sraf  * done queue.  The application is only sent a SIGIO signal if
14757c478bd9Sstevel@tonic-gate  * the process has a handler enabled and it is not waiting via
14767c478bd9Sstevel@tonic-gate  * aiowait().
14777c478bd9Sstevel@tonic-gate  */
14787c478bd9Sstevel@tonic-gate static void
147934709573Sraf _aio_work_done(aio_worker_t *aiowp)
14807c478bd9Sstevel@tonic-gate {
148134709573Sraf 	aio_req_t *reqp;
14827c478bd9Sstevel@tonic-gate 
148334709573Sraf 	sig_mutex_lock(&aiowp->work_qlock1);
148434709573Sraf 	reqp = aiowp->work_prev1;
148534709573Sraf 	reqp->req_next = NULL;
14867c478bd9Sstevel@tonic-gate 	aiowp->work_done1 = 0;
14877c478bd9Sstevel@tonic-gate 	aiowp->work_tail1 = aiowp->work_next1;
14887c478bd9Sstevel@tonic-gate 	if (aiowp->work_tail1 == NULL)
14897c478bd9Sstevel@tonic-gate 		aiowp->work_head1 = NULL;
14907c478bd9Sstevel@tonic-gate 	aiowp->work_prev1 = NULL;
149134709573Sraf 	sig_mutex_unlock(&aiowp->work_qlock1);
149234709573Sraf 	sig_mutex_lock(&__aio_mutex);
14937c478bd9Sstevel@tonic-gate 	_aio_donecnt++;
14947c478bd9Sstevel@tonic-gate 	_aio_outstand_cnt--;
14957c478bd9Sstevel@tonic-gate 	_aio_req_done_cnt--;
149634709573Sraf 	ASSERT(_aio_donecnt > 0 &&
149734709573Sraf 	    _aio_outstand_cnt >= 0 &&
149834709573Sraf 	    _aio_req_done_cnt >= 0);
149934709573Sraf 	ASSERT(reqp != NULL);
15007c478bd9Sstevel@tonic-gate 
15017c478bd9Sstevel@tonic-gate 	if (_aio_done_tail == NULL) {
150234709573Sraf 		_aio_done_head = _aio_done_tail = reqp;
15037c478bd9Sstevel@tonic-gate 	} else {
150434709573Sraf 		_aio_done_head->req_next = reqp;
150534709573Sraf 		_aio_done_head = reqp;
15067c478bd9Sstevel@tonic-gate 	}
15077c478bd9Sstevel@tonic-gate 
15087c478bd9Sstevel@tonic-gate 	if (_aiowait_flag) {
150934709573Sraf 		sig_mutex_unlock(&__aio_mutex);
15107c478bd9Sstevel@tonic-gate 		(void) _kaio(AIONOTIFY);
15117c478bd9Sstevel@tonic-gate 	} else {
151234709573Sraf 		sig_mutex_unlock(&__aio_mutex);
151334709573Sraf 		if (_sigio_enabled)
15147c478bd9Sstevel@tonic-gate 			(void) kill(__pid, SIGIO);
15157c478bd9Sstevel@tonic-gate 	}
15167c478bd9Sstevel@tonic-gate }
15177c478bd9Sstevel@tonic-gate 
15187c478bd9Sstevel@tonic-gate /*
151934709573Sraf  * The done queue consists of AIO requests that are in either the
152034709573Sraf  * AIO_REQ_DONE or AIO_REQ_CANCELED state.  Requests that were cancelled
152134709573Sraf  * are discarded.  If the done queue is empty then NULL is returned.
152234709573Sraf  * Otherwise the address of a done aio_result_t is returned.
15237c478bd9Sstevel@tonic-gate  */
152434709573Sraf aio_result_t *
15257c478bd9Sstevel@tonic-gate _aio_req_done(void)
15267c478bd9Sstevel@tonic-gate {
152734709573Sraf 	aio_req_t *reqp;
15287c478bd9Sstevel@tonic-gate 	aio_result_t *resultp;
15297c478bd9Sstevel@tonic-gate 
15307c478bd9Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&__aio_mutex));
15317c478bd9Sstevel@tonic-gate 
153234709573Sraf 	if ((reqp = _aio_done_tail) != NULL) {
153334709573Sraf 		if ((_aio_done_tail = reqp->req_next) == NULL)
153434709573Sraf 			_aio_done_head = NULL;
15357c478bd9Sstevel@tonic-gate 		ASSERT(_aio_donecnt > 0);
15367c478bd9Sstevel@tonic-gate 		_aio_donecnt--;
153734709573Sraf 		(void) _aio_hash_del(reqp->req_resultp);
153834709573Sraf 		resultp = reqp->req_resultp;
153934709573Sraf 		ASSERT(reqp->req_state == AIO_REQ_DONE);
154034709573Sraf 		_aio_req_free(reqp);
15417c478bd9Sstevel@tonic-gate 		return (resultp);
15427c478bd9Sstevel@tonic-gate 	}
15437c478bd9Sstevel@tonic-gate 	/* is queue empty? */
154434709573Sraf 	if (reqp == NULL && _aio_outstand_cnt == 0) {
15457c478bd9Sstevel@tonic-gate 		return ((aio_result_t *)-1);
15467c478bd9Sstevel@tonic-gate 	}
15477c478bd9Sstevel@tonic-gate 	return (NULL);
15487c478bd9Sstevel@tonic-gate }
15497c478bd9Sstevel@tonic-gate 
15507c478bd9Sstevel@tonic-gate /*
155134709573Sraf  * Set the return and errno values for the application's use.
155234709573Sraf  *
155334709573Sraf  * For the Posix interfaces, we must set the return value first followed
155434709573Sraf  * by the errno value because the Posix interfaces allow for a change
155534709573Sraf  * in the errno value from EINPROGRESS to something else to signal
155634709573Sraf  * the completion of the asynchronous request.
155734709573Sraf  *
155834709573Sraf  * The opposite is true for the Solaris interfaces.  These allow for
155934709573Sraf  * a change in the return value from AIO_INPROGRESS to something else
156034709573Sraf  * to signal the completion of the asynchronous request.
15617c478bd9Sstevel@tonic-gate  */
15627c478bd9Sstevel@tonic-gate void
156334709573Sraf _aio_set_result(aio_req_t *reqp, ssize_t retval, int error)
15647c478bd9Sstevel@tonic-gate {
156534709573Sraf 	aio_result_t *resultp = reqp->req_resultp;
156634709573Sraf 
156734709573Sraf 	if (POSIX_AIO(reqp)) {
156834709573Sraf 		resultp->aio_return = retval;
156934709573Sraf 		membar_producer();
157034709573Sraf 		resultp->aio_errno = error;
157134709573Sraf 	} else {
157234709573Sraf 		resultp->aio_errno = error;
157334709573Sraf 		membar_producer();
157434709573Sraf 		resultp->aio_return = retval;
157534709573Sraf 	}
157634709573Sraf }
157734709573Sraf 
157834709573Sraf /*
157934709573Sraf  * Add an AIO request onto the next work queue.
158034709573Sraf  * A circular list of workers is used to choose the next worker.
158134709573Sraf  */
158234709573Sraf void
158334709573Sraf _aio_req_add(aio_req_t *reqp, aio_worker_t **nextworker, int mode)
158434709573Sraf {
1585*f841f6adSraf 	ulwp_t *self = curthread;
158634709573Sraf 	aio_worker_t *aiowp;
158734709573Sraf 	aio_worker_t *first;
158834709573Sraf 	int load_bal_flg = 1;
158934709573Sraf 	int found;
159034709573Sraf 
159134709573Sraf 	ASSERT(reqp->req_state != AIO_REQ_DONEQ);
159234709573Sraf 	reqp->req_next = NULL;
15937c478bd9Sstevel@tonic-gate 	/*
159434709573Sraf 	 * Try to acquire the next worker's work queue.  If it is locked,
15957c478bd9Sstevel@tonic-gate 	 * then search the list of workers until a queue is found unlocked,
15967c478bd9Sstevel@tonic-gate 	 * or until the list is completely traversed at which point another
15977c478bd9Sstevel@tonic-gate 	 * worker will be created.
15987c478bd9Sstevel@tonic-gate 	 */
1599*f841f6adSraf 	sigoff(self);		/* defer SIGIO */
160034709573Sraf 	sig_mutex_lock(&__aio_mutex);
160134709573Sraf 	first = aiowp = *nextworker;
160234709573Sraf 	if (mode != AIONOTIFY)
16037c478bd9Sstevel@tonic-gate 		_aio_outstand_cnt++;
160434709573Sraf 	sig_mutex_unlock(&__aio_mutex);
160534709573Sraf 
16067c478bd9Sstevel@tonic-gate 	switch (mode) {
160734709573Sraf 	case AIOREAD:
160834709573Sraf 	case AIOWRITE:
160934709573Sraf 	case AIOAREAD:
161034709573Sraf 	case AIOAWRITE:
161134709573Sraf #if !defined(_LP64)
161234709573Sraf 	case AIOAREAD64:
161334709573Sraf 	case AIOAWRITE64:
16147c478bd9Sstevel@tonic-gate #endif
161534709573Sraf 		/* try to find an idle worker */
161634709573Sraf 		found = 0;
161734709573Sraf 		do {
161834709573Sraf 			if (sig_mutex_trylock(&aiowp->work_qlock1) == 0) {
161934709573Sraf 				if (aiowp->work_idleflg) {
162034709573Sraf 					found = 1;
16217c478bd9Sstevel@tonic-gate 					break;
16227c478bd9Sstevel@tonic-gate 				}
162334709573Sraf 				sig_mutex_unlock(&aiowp->work_qlock1);
16247c478bd9Sstevel@tonic-gate 			}
162534709573Sraf 		} while ((aiowp = aiowp->work_forw) != first);
16267c478bd9Sstevel@tonic-gate 
162734709573Sraf 		if (found) {
162834709573Sraf 			aiowp->work_minload1++;
162934709573Sraf 			break;
163034709573Sraf 		}
16317c478bd9Sstevel@tonic-gate 
163234709573Sraf 		/* try to acquire some worker's queue lock */
163334709573Sraf 		do {
163434709573Sraf 			if (sig_mutex_trylock(&aiowp->work_qlock1) == 0) {
163534709573Sraf 				found = 1;
163634709573Sraf 				break;
16377c478bd9Sstevel@tonic-gate 			}
163834709573Sraf 		} while ((aiowp = aiowp->work_forw) != first);
163934709573Sraf 
164034709573Sraf 		/*
164134709573Sraf 		 * Create more workers when the workers appear overloaded.
164234709573Sraf 		 * Either all the workers are busy draining their queues
164334709573Sraf 		 * or no worker's queue lock could be acquired.
164434709573Sraf 		 */
164534709573Sraf 		if (!found) {
164634709573Sraf 			if (_aio_worker_cnt < _max_workers) {
164734709573Sraf 				if (_aio_create_worker(reqp, mode))
1648*f841f6adSraf 					aio_panic("_aio_req_add: add worker");
1649*f841f6adSraf 				sigon(self);	/* reenable SIGIO */
16507c478bd9Sstevel@tonic-gate 				return;
16517c478bd9Sstevel@tonic-gate 			}
165234709573Sraf 
165334709573Sraf 			/*
165434709573Sraf 			 * No worker available and we have created
165534709573Sraf 			 * _max_workers, keep going through the
165634709573Sraf 			 * list slowly until we get a lock
165734709573Sraf 			 */
165834709573Sraf 			while (sig_mutex_trylock(&aiowp->work_qlock1) != 0) {
165934709573Sraf 				/*
166034709573Sraf 				 * give someone else a chance
166134709573Sraf 				 */
166234709573Sraf 				_aio_delay(1);
166334709573Sraf 				aiowp = aiowp->work_forw;
166434709573Sraf 			}
166534709573Sraf 		}
166634709573Sraf 
166734709573Sraf 		ASSERT(MUTEX_HELD(&aiowp->work_qlock1));
166834709573Sraf 		if (_aio_worker_cnt < _max_workers &&
166934709573Sraf 		    aiowp->work_minload1 >= _minworkload) {
167034709573Sraf 			sig_mutex_unlock(&aiowp->work_qlock1);
167134709573Sraf 			sig_mutex_lock(&__aio_mutex);
167234709573Sraf 			*nextworker = aiowp->work_forw;
167334709573Sraf 			sig_mutex_unlock(&__aio_mutex);
167434709573Sraf 			if (_aio_create_worker(reqp, mode))
1675*f841f6adSraf 				aio_panic("aio_req_add: add worker");
1676*f841f6adSraf 			sigon(self);	/* reenable SIGIO */
167734709573Sraf 			return;
167834709573Sraf 		}
167934709573Sraf 		aiowp->work_minload1++;
168034709573Sraf 		break;
168134709573Sraf 	case AIOFSYNC:
168234709573Sraf 	case AIONOTIFY:
168334709573Sraf 		load_bal_flg = 0;
168434709573Sraf 		sig_mutex_lock(&aiowp->work_qlock1);
168534709573Sraf 		break;
168634709573Sraf 	default:
1687*f841f6adSraf 		aio_panic("_aio_req_add: invalid mode");
168834709573Sraf 		break;
16897c478bd9Sstevel@tonic-gate 	}
16907c478bd9Sstevel@tonic-gate 	/*
16917c478bd9Sstevel@tonic-gate 	 * Put request onto worker's work queue.
16927c478bd9Sstevel@tonic-gate 	 */
16937c478bd9Sstevel@tonic-gate 	if (aiowp->work_tail1 == NULL) {
169434709573Sraf 		ASSERT(aiowp->work_count1 == 0);
169534709573Sraf 		aiowp->work_tail1 = reqp;
169634709573Sraf 		aiowp->work_next1 = reqp;
16977c478bd9Sstevel@tonic-gate 	} else {
169834709573Sraf 		aiowp->work_head1->req_next = reqp;
16997c478bd9Sstevel@tonic-gate 		if (aiowp->work_next1 == NULL)
170034709573Sraf 			aiowp->work_next1 = reqp;
17017c478bd9Sstevel@tonic-gate 	}
170234709573Sraf 	reqp->req_state = AIO_REQ_QUEUED;
170334709573Sraf 	reqp->req_worker = aiowp;
170434709573Sraf 	aiowp->work_head1 = reqp;
17057c478bd9Sstevel@tonic-gate 	/*
17067c478bd9Sstevel@tonic-gate 	 * Awaken worker if it is not currently active.
17077c478bd9Sstevel@tonic-gate 	 */
170834709573Sraf 	if (aiowp->work_count1++ == 0 && aiowp->work_idleflg) {
17097c478bd9Sstevel@tonic-gate 		aiowp->work_idleflg = 0;
171034709573Sraf 		(void) cond_signal(&aiowp->work_idle_cv);
17117c478bd9Sstevel@tonic-gate 	}
171234709573Sraf 	sig_mutex_unlock(&aiowp->work_qlock1);
171334709573Sraf 
171434709573Sraf 	if (load_bal_flg) {
171534709573Sraf 		sig_mutex_lock(&__aio_mutex);
171634709573Sraf 		*nextworker = aiowp->work_forw;
171734709573Sraf 		sig_mutex_unlock(&__aio_mutex);
171834709573Sraf 	}
1719*f841f6adSraf 	sigon(self);	/* reenable SIGIO */
17207c478bd9Sstevel@tonic-gate }
17217c478bd9Sstevel@tonic-gate 
17227c478bd9Sstevel@tonic-gate /*
172334709573Sraf  * Get an AIO request for a specified worker.
172434709573Sraf  * If the work queue is empty, return NULL.
17257c478bd9Sstevel@tonic-gate  */
17267c478bd9Sstevel@tonic-gate aio_req_t *
17277c478bd9Sstevel@tonic-gate _aio_req_get(aio_worker_t *aiowp)
17287c478bd9Sstevel@tonic-gate {
172934709573Sraf 	aio_req_t *reqp;
17307c478bd9Sstevel@tonic-gate 
173134709573Sraf 	sig_mutex_lock(&aiowp->work_qlock1);
173234709573Sraf 	if ((reqp = aiowp->work_next1) != NULL) {
17337c478bd9Sstevel@tonic-gate 		/*
173434709573Sraf 		 * Remove a POSIX request from the queue; the
17357c478bd9Sstevel@tonic-gate 		 * request queue is a singularly linked list
173634709573Sraf 		 * with a previous pointer.  The request is
173734709573Sraf 		 * removed by updating the previous pointer.
17387c478bd9Sstevel@tonic-gate 		 *
173934709573Sraf 		 * Non-posix requests are left on the queue
174034709573Sraf 		 * to eventually be placed on the done queue.
17417c478bd9Sstevel@tonic-gate 		 */
17427c478bd9Sstevel@tonic-gate 
174334709573Sraf 		if (POSIX_AIO(reqp)) {
17447c478bd9Sstevel@tonic-gate 			if (aiowp->work_prev1 == NULL) {
174534709573Sraf 				aiowp->work_tail1 = reqp->req_next;
17467c478bd9Sstevel@tonic-gate 				if (aiowp->work_tail1 == NULL)
17477c478bd9Sstevel@tonic-gate 					aiowp->work_head1 = NULL;
17487c478bd9Sstevel@tonic-gate 			} else {
174934709573Sraf 				aiowp->work_prev1->req_next = reqp->req_next;
175034709573Sraf 				if (aiowp->work_head1 == reqp)
175134709573Sraf 					aiowp->work_head1 = reqp->req_next;
17527c478bd9Sstevel@tonic-gate 			}
17537c478bd9Sstevel@tonic-gate 
17547c478bd9Sstevel@tonic-gate 		} else {
175534709573Sraf 			aiowp->work_prev1 = reqp;
17567c478bd9Sstevel@tonic-gate 			ASSERT(aiowp->work_done1 >= 0);
17577c478bd9Sstevel@tonic-gate 			aiowp->work_done1++;
17587c478bd9Sstevel@tonic-gate 		}
175934709573Sraf 		ASSERT(reqp != reqp->req_next);
176034709573Sraf 		aiowp->work_next1 = reqp->req_next;
176134709573Sraf 		ASSERT(aiowp->work_count1 >= 1);
176234709573Sraf 		aiowp->work_count1--;
176334709573Sraf 		switch (reqp->req_op) {
176434709573Sraf 		case AIOREAD:
176534709573Sraf 		case AIOWRITE:
176634709573Sraf 		case AIOAREAD:
176734709573Sraf 		case AIOAWRITE:
176834709573Sraf #if !defined(_LP64)
176934709573Sraf 		case AIOAREAD64:
177034709573Sraf 		case AIOAWRITE64:
17717c478bd9Sstevel@tonic-gate #endif
177234709573Sraf 			ASSERT(aiowp->work_minload1 > 0);
177334709573Sraf 			aiowp->work_minload1--;
177434709573Sraf 			break;
177534709573Sraf 		}
177634709573Sraf 		reqp->req_state = AIO_REQ_INPROGRESS;
17777c478bd9Sstevel@tonic-gate 	}
177834709573Sraf 	aiowp->work_req = reqp;
177934709573Sraf 	ASSERT(reqp != NULL || aiowp->work_count1 == 0);
178034709573Sraf 	sig_mutex_unlock(&aiowp->work_qlock1);
178134709573Sraf 	return (reqp);
17827c478bd9Sstevel@tonic-gate }
17837c478bd9Sstevel@tonic-gate 
17847c478bd9Sstevel@tonic-gate static void
178534709573Sraf _aio_req_del(aio_worker_t *aiowp, aio_req_t *reqp, int ostate)
17867c478bd9Sstevel@tonic-gate {
178734709573Sraf 	aio_req_t **last;
178834709573Sraf 	aio_req_t *lastrp;
178934709573Sraf 	aio_req_t *next;
17907c478bd9Sstevel@tonic-gate 
17917c478bd9Sstevel@tonic-gate 	ASSERT(aiowp != NULL);
17927c478bd9Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&aiowp->work_qlock1));
179334709573Sraf 	if (POSIX_AIO(reqp)) {
17947c478bd9Sstevel@tonic-gate 		if (ostate != AIO_REQ_QUEUED)
17957c478bd9Sstevel@tonic-gate 			return;
17967c478bd9Sstevel@tonic-gate 	}
17977c478bd9Sstevel@tonic-gate 	last = &aiowp->work_tail1;
17987c478bd9Sstevel@tonic-gate 	lastrp = aiowp->work_tail1;
17997c478bd9Sstevel@tonic-gate 	ASSERT(ostate == AIO_REQ_QUEUED || ostate == AIO_REQ_INPROGRESS);
18007c478bd9Sstevel@tonic-gate 	while ((next = *last) != NULL) {
180134709573Sraf 		if (next == reqp) {
18027c478bd9Sstevel@tonic-gate 			*last = next->req_next;
18037c478bd9Sstevel@tonic-gate 			if (aiowp->work_next1 == next)
18047c478bd9Sstevel@tonic-gate 				aiowp->work_next1 = next->req_next;
18057c478bd9Sstevel@tonic-gate 
18067c478bd9Sstevel@tonic-gate 			if ((next->req_next != NULL) ||
18077c478bd9Sstevel@tonic-gate 			    (aiowp->work_done1 == 0)) {
18087c478bd9Sstevel@tonic-gate 				if (aiowp->work_head1 == next)
18097c478bd9Sstevel@tonic-gate 					aiowp->work_head1 = next->req_next;
18107c478bd9Sstevel@tonic-gate 				if (aiowp->work_prev1 == next)
18117c478bd9Sstevel@tonic-gate 					aiowp->work_prev1 = next->req_next;
18127c478bd9Sstevel@tonic-gate 			} else {
18137c478bd9Sstevel@tonic-gate 				if (aiowp->work_head1 == next)
18147c478bd9Sstevel@tonic-gate 					aiowp->work_head1 = lastrp;
18157c478bd9Sstevel@tonic-gate 				if (aiowp->work_prev1 == next)
18167c478bd9Sstevel@tonic-gate 					aiowp->work_prev1 = lastrp;
18177c478bd9Sstevel@tonic-gate 			}
18187c478bd9Sstevel@tonic-gate 
18197c478bd9Sstevel@tonic-gate 			if (ostate == AIO_REQ_QUEUED) {
182034709573Sraf 				ASSERT(aiowp->work_count1 >= 1);
182134709573Sraf 				aiowp->work_count1--;
182234709573Sraf 				ASSERT(aiowp->work_minload1 >= 1);
182334709573Sraf 				aiowp->work_minload1--;
18247c478bd9Sstevel@tonic-gate 			} else {
18257c478bd9Sstevel@tonic-gate 				ASSERT(ostate == AIO_REQ_INPROGRESS &&
182634709573Sraf 				    !POSIX_AIO(reqp));
18277c478bd9Sstevel@tonic-gate 				aiowp->work_done1--;
18287c478bd9Sstevel@tonic-gate 			}
18297c478bd9Sstevel@tonic-gate 			return;
18307c478bd9Sstevel@tonic-gate 		}
18317c478bd9Sstevel@tonic-gate 		last = &next->req_next;
18327c478bd9Sstevel@tonic-gate 		lastrp = next;
18337c478bd9Sstevel@tonic-gate 	}
18347c478bd9Sstevel@tonic-gate 	/* NOTREACHED */
18357c478bd9Sstevel@tonic-gate }
18367c478bd9Sstevel@tonic-gate 
18377c478bd9Sstevel@tonic-gate static void
18387c478bd9Sstevel@tonic-gate _aio_enq_doneq(aio_req_t *reqp)
18397c478bd9Sstevel@tonic-gate {
18407c478bd9Sstevel@tonic-gate 	if (_aio_doneq == NULL) {
18417c478bd9Sstevel@tonic-gate 		_aio_doneq = reqp;
184234709573Sraf 		reqp->req_next = reqp->req_prev = reqp;
18437c478bd9Sstevel@tonic-gate 	} else {
18447c478bd9Sstevel@tonic-gate 		reqp->req_next = _aio_doneq;
18457c478bd9Sstevel@tonic-gate 		reqp->req_prev = _aio_doneq->req_prev;
184634709573Sraf 		_aio_doneq->req_prev->req_next = reqp;
18477c478bd9Sstevel@tonic-gate 		_aio_doneq->req_prev = reqp;
18487c478bd9Sstevel@tonic-gate 	}
18497c478bd9Sstevel@tonic-gate 	reqp->req_state = AIO_REQ_DONEQ;
18507c478bd9Sstevel@tonic-gate 	_aio_doneq_cnt++;
18517c478bd9Sstevel@tonic-gate }
18527c478bd9Sstevel@tonic-gate 
18537c478bd9Sstevel@tonic-gate /*
18547c478bd9Sstevel@tonic-gate  * caller owns the _aio_mutex
18557c478bd9Sstevel@tonic-gate  */
18567c478bd9Sstevel@tonic-gate aio_req_t *
18577c478bd9Sstevel@tonic-gate _aio_req_remove(aio_req_t *reqp)
18587c478bd9Sstevel@tonic-gate {
18597c478bd9Sstevel@tonic-gate 	if (reqp && reqp->req_state != AIO_REQ_DONEQ)
18607c478bd9Sstevel@tonic-gate 		return (NULL);
18617c478bd9Sstevel@tonic-gate 
18627c478bd9Sstevel@tonic-gate 	if (reqp) {
18637c478bd9Sstevel@tonic-gate 		/* request in done queue */
186434709573Sraf 		if (_aio_doneq == reqp)
186534709573Sraf 			_aio_doneq = reqp->req_next;
186634709573Sraf 		if (_aio_doneq == reqp) {
18677c478bd9Sstevel@tonic-gate 			/* only one request on queue */
18687c478bd9Sstevel@tonic-gate 			_aio_doneq = NULL;
18697c478bd9Sstevel@tonic-gate 		} else {
187034709573Sraf 			aio_req_t *tmp = reqp->req_next;
187134709573Sraf 			reqp->req_prev->req_next = tmp;
187234709573Sraf 			tmp->req_prev = reqp->req_prev;
18737c478bd9Sstevel@tonic-gate 		}
187434709573Sraf 	} else if ((reqp = _aio_doneq) != NULL) {
187534709573Sraf 		if (reqp == reqp->req_next) {
18767c478bd9Sstevel@tonic-gate 			/* only one request on queue */
18777c478bd9Sstevel@tonic-gate 			_aio_doneq = NULL;
18787c478bd9Sstevel@tonic-gate 		} else {
187934709573Sraf 			reqp->req_prev->req_next = _aio_doneq = reqp->req_next;
188034709573Sraf 			_aio_doneq->req_prev = reqp->req_prev;
18817c478bd9Sstevel@tonic-gate 		}
188234709573Sraf 	}
188334709573Sraf 	if (reqp) {
18847c478bd9Sstevel@tonic-gate 		_aio_doneq_cnt--;
188534709573Sraf 		reqp->req_next = reqp->req_prev = reqp;
188634709573Sraf 		reqp->req_state = AIO_REQ_DONE;
18877c478bd9Sstevel@tonic-gate 	}
188834709573Sraf 	return (reqp);
18897c478bd9Sstevel@tonic-gate }
18907c478bd9Sstevel@tonic-gate 
18917c478bd9Sstevel@tonic-gate /*
189234709573Sraf  * An AIO request is identified by an aio_result_t pointer.  The library
189334709573Sraf  * maps this aio_result_t pointer to its internal representation using a
189434709573Sraf  * hash table.  This function adds an aio_result_t pointer to the hash table.
18957c478bd9Sstevel@tonic-gate  */
18967c478bd9Sstevel@tonic-gate static int
189734709573Sraf _aio_hash_insert(aio_result_t *resultp, aio_req_t *reqp)
18987c478bd9Sstevel@tonic-gate {
189934709573Sraf 	aio_hash_t *hashp;
190034709573Sraf 	aio_req_t **prev;
190134709573Sraf 	aio_req_t *next;
19027c478bd9Sstevel@tonic-gate 
190334709573Sraf 	hashp = _aio_hash + AIOHASH(resultp);
1904*f841f6adSraf 	lmutex_lock(&hashp->hash_lock);
190534709573Sraf 	prev = &hashp->hash_ptr;
19067c478bd9Sstevel@tonic-gate 	while ((next = *prev) != NULL) {
19077c478bd9Sstevel@tonic-gate 		if (resultp == next->req_resultp) {
1908*f841f6adSraf 			lmutex_unlock(&hashp->hash_lock);
190934709573Sraf 			return (-1);
19107c478bd9Sstevel@tonic-gate 		}
19117c478bd9Sstevel@tonic-gate 		prev = &next->req_link;
19127c478bd9Sstevel@tonic-gate 	}
191334709573Sraf 	*prev = reqp;
191434709573Sraf 	ASSERT(reqp->req_link == NULL);
1915*f841f6adSraf 	lmutex_unlock(&hashp->hash_lock);
191634709573Sraf 	return (0);
19177c478bd9Sstevel@tonic-gate }
19187c478bd9Sstevel@tonic-gate 
19197c478bd9Sstevel@tonic-gate /*
192034709573Sraf  * Remove an entry from the hash table.
19217c478bd9Sstevel@tonic-gate  */
192234709573Sraf aio_req_t *
192334709573Sraf _aio_hash_del(aio_result_t *resultp)
19247c478bd9Sstevel@tonic-gate {
192534709573Sraf 	aio_hash_t *hashp;
192634709573Sraf 	aio_req_t **prev;
192734709573Sraf 	aio_req_t *next = NULL;
192834709573Sraf 
192934709573Sraf 	if (_aio_hash != NULL) {
193034709573Sraf 		hashp = _aio_hash + AIOHASH(resultp);
1931*f841f6adSraf 		lmutex_lock(&hashp->hash_lock);
193234709573Sraf 		prev = &hashp->hash_ptr;
193334709573Sraf 		while ((next = *prev) != NULL) {
193434709573Sraf 			if (resultp == next->req_resultp) {
193534709573Sraf 				*prev = next->req_link;
193634709573Sraf 				next->req_link = NULL;
193734709573Sraf 				break;
193834709573Sraf 			}
193934709573Sraf 			prev = &next->req_link;
19407c478bd9Sstevel@tonic-gate 		}
1941*f841f6adSraf 		lmutex_unlock(&hashp->hash_lock);
19427c478bd9Sstevel@tonic-gate 	}
194334709573Sraf 	return (next);
19447c478bd9Sstevel@tonic-gate }
19457c478bd9Sstevel@tonic-gate 
19467c478bd9Sstevel@tonic-gate /*
194734709573Sraf  *  find an entry in the hash table
19487c478bd9Sstevel@tonic-gate  */
19497c478bd9Sstevel@tonic-gate aio_req_t *
195034709573Sraf _aio_hash_find(aio_result_t *resultp)
19517c478bd9Sstevel@tonic-gate {
195234709573Sraf 	aio_hash_t *hashp;
195334709573Sraf 	aio_req_t **prev;
195434709573Sraf 	aio_req_t *next = NULL;
195534709573Sraf 
195634709573Sraf 	if (_aio_hash != NULL) {
195734709573Sraf 		hashp = _aio_hash + AIOHASH(resultp);
1958*f841f6adSraf 		lmutex_lock(&hashp->hash_lock);
195934709573Sraf 		prev = &hashp->hash_ptr;
196034709573Sraf 		while ((next = *prev) != NULL) {
196134709573Sraf 			if (resultp == next->req_resultp)
196234709573Sraf 				break;
196334709573Sraf 			prev = &next->req_link;
196434709573Sraf 		}
1965*f841f6adSraf 		lmutex_unlock(&hashp->hash_lock);
19667c478bd9Sstevel@tonic-gate 	}
196734709573Sraf 	return (next);
19687c478bd9Sstevel@tonic-gate }
19697c478bd9Sstevel@tonic-gate 
19707c478bd9Sstevel@tonic-gate /*
19717c478bd9Sstevel@tonic-gate  * AIO interface for POSIX
19727c478bd9Sstevel@tonic-gate  */
19737c478bd9Sstevel@tonic-gate int
197434709573Sraf _aio_rw(aiocb_t *aiocbp, aio_lio_t *lio_head, aio_worker_t **nextworker,
197534709573Sraf     int mode, int flg)
19767c478bd9Sstevel@tonic-gate {
197734709573Sraf 	aio_req_t *reqp;
197834709573Sraf 	aio_args_t *ap;
19797c478bd9Sstevel@tonic-gate 	int kerr;
19807c478bd9Sstevel@tonic-gate 
198134709573Sraf 	if (aiocbp == NULL) {
19827c478bd9Sstevel@tonic-gate 		errno = EINVAL;
19837c478bd9Sstevel@tonic-gate 		return (-1);
19847c478bd9Sstevel@tonic-gate 	}
19857c478bd9Sstevel@tonic-gate 
19867c478bd9Sstevel@tonic-gate 	/* initialize kaio */
19877c478bd9Sstevel@tonic-gate 	if (!_kaio_ok)
19887c478bd9Sstevel@tonic-gate 		_kaio_init();
19897c478bd9Sstevel@tonic-gate 
199034709573Sraf 	aiocbp->aio_state = NOCHECK;
19917c478bd9Sstevel@tonic-gate 
19927c478bd9Sstevel@tonic-gate 	/*
199334709573Sraf 	 * If we have been called because a list I/O
19947c478bd9Sstevel@tonic-gate 	 * kaio() failed, we dont want to repeat the
19957c478bd9Sstevel@tonic-gate 	 * system call
19967c478bd9Sstevel@tonic-gate 	 */
19977c478bd9Sstevel@tonic-gate 
19987c478bd9Sstevel@tonic-gate 	if (flg & AIO_KAIO) {
19997c478bd9Sstevel@tonic-gate 		/*
20007c478bd9Sstevel@tonic-gate 		 * Try kernel aio first.
20017c478bd9Sstevel@tonic-gate 		 * If errno is ENOTSUP/EBADFD,
20027c478bd9Sstevel@tonic-gate 		 * fall back to the thread implementation.
20037c478bd9Sstevel@tonic-gate 		 */
200434709573Sraf 		if (_kaio_ok > 0 && KAIO_SUPPORTED(aiocbp->aio_fildes)) {
200534709573Sraf 			aiocbp->aio_resultp.aio_errno = EINPROGRESS;
200634709573Sraf 			aiocbp->aio_state = CHECK;
200734709573Sraf 			kerr = (int)_kaio(mode, aiocbp);
20087c478bd9Sstevel@tonic-gate 			if (kerr == 0)
20097c478bd9Sstevel@tonic-gate 				return (0);
201034709573Sraf 			if (errno != ENOTSUP && errno != EBADFD) {
201134709573Sraf 				aiocbp->aio_resultp.aio_errno = errno;
201234709573Sraf 				aiocbp->aio_resultp.aio_return = -1;
201334709573Sraf 				aiocbp->aio_state = NOCHECK;
20147c478bd9Sstevel@tonic-gate 				return (-1);
20157c478bd9Sstevel@tonic-gate 			}
20167c478bd9Sstevel@tonic-gate 			if (errno == EBADFD)
201734709573Sraf 				SET_KAIO_NOT_SUPPORTED(aiocbp->aio_fildes);
20187c478bd9Sstevel@tonic-gate 		}
20197c478bd9Sstevel@tonic-gate 	}
20207c478bd9Sstevel@tonic-gate 
202134709573Sraf 	aiocbp->aio_resultp.aio_errno = EINPROGRESS;
202234709573Sraf 	aiocbp->aio_state = USERAIO;
20237c478bd9Sstevel@tonic-gate 
202434709573Sraf 	if (!__uaio_ok && __uaio_init() == -1)
202534709573Sraf 		return (-1);
20267c478bd9Sstevel@tonic-gate 
202734709573Sraf 	if ((reqp = _aio_req_alloc()) == NULL) {
20287c478bd9Sstevel@tonic-gate 		errno = EAGAIN;
20297c478bd9Sstevel@tonic-gate 		return (-1);
20307c478bd9Sstevel@tonic-gate 	}
20317c478bd9Sstevel@tonic-gate 
20327c478bd9Sstevel@tonic-gate 	/*
203334709573Sraf 	 * If an LIO request, add the list head to the aio request
20347c478bd9Sstevel@tonic-gate 	 */
203534709573Sraf 	reqp->req_head = lio_head;
203634709573Sraf 	reqp->req_type = AIO_POSIX_REQ;
203734709573Sraf 	reqp->req_op = mode;
203834709573Sraf 	reqp->req_largefile = 0;
203934709573Sraf 
204034709573Sraf 	if (aiocbp->aio_sigevent.sigev_notify == SIGEV_NONE) {
204134709573Sraf 		reqp->req_sigevent.sigev_notify = SIGEV_NONE;
204234709573Sraf 	} else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
204334709573Sraf 		reqp->req_sigevent.sigev_notify = SIGEV_SIGNAL;
204434709573Sraf 		reqp->req_sigevent.sigev_signo =
204534709573Sraf 		    aiocbp->aio_sigevent.sigev_signo;
204634709573Sraf 		reqp->req_sigevent.sigev_value.sival_ptr =
204734709573Sraf 		    aiocbp->aio_sigevent.sigev_value.sival_ptr;
204834709573Sraf 	} else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_PORT) {
204934709573Sraf 		port_notify_t *pn = aiocbp->aio_sigevent.sigev_value.sival_ptr;
205034709573Sraf 		reqp->req_sigevent.sigev_notify = SIGEV_PORT;
205134709573Sraf 		/*
205234709573Sraf 		 * Reuse the sigevent structure to contain the port number
205334709573Sraf 		 * and the user value.  Same for SIGEV_THREAD, below.
205434709573Sraf 		 */
205534709573Sraf 		reqp->req_sigevent.sigev_signo =
205634709573Sraf 		    pn->portnfy_port;
205734709573Sraf 		reqp->req_sigevent.sigev_value.sival_ptr =
205834709573Sraf 		    pn->portnfy_user;
205934709573Sraf 	} else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_THREAD) {
206034709573Sraf 		reqp->req_sigevent.sigev_notify = SIGEV_THREAD;
206134709573Sraf 		/*
206234709573Sraf 		 * The sigevent structure contains the port number
206334709573Sraf 		 * and the user value.  Same for SIGEV_PORT, above.
206434709573Sraf 		 */
206534709573Sraf 		reqp->req_sigevent.sigev_signo =
206634709573Sraf 		    aiocbp->aio_sigevent.sigev_signo;
206734709573Sraf 		reqp->req_sigevent.sigev_value.sival_ptr =
206834709573Sraf 		    aiocbp->aio_sigevent.sigev_value.sival_ptr;
20697c478bd9Sstevel@tonic-gate 	}
20707c478bd9Sstevel@tonic-gate 
207134709573Sraf 	reqp->req_resultp = &aiocbp->aio_resultp;
207234709573Sraf 	reqp->req_aiocbp = aiocbp;
207334709573Sraf 	ap = &reqp->req_args;
207434709573Sraf 	ap->fd = aiocbp->aio_fildes;
207534709573Sraf 	ap->buf = (caddr_t)aiocbp->aio_buf;
207634709573Sraf 	ap->bufsz = aiocbp->aio_nbytes;
207734709573Sraf 	ap->offset = aiocbp->aio_offset;
207834709573Sraf 
207934709573Sraf 	if ((flg & AIO_NO_DUPS) &&
208034709573Sraf 	    _aio_hash_insert(&aiocbp->aio_resultp, reqp) != 0) {
2081*f841f6adSraf 		aio_panic("_aio_rw(): request already in hash table");
208234709573Sraf 		_aio_req_free(reqp);
20837c478bd9Sstevel@tonic-gate 		errno = EINVAL;
20847c478bd9Sstevel@tonic-gate 		return (-1);
20857c478bd9Sstevel@tonic-gate 	}
208634709573Sraf 	_aio_req_add(reqp, nextworker, mode);
208734709573Sraf 	return (0);
20887c478bd9Sstevel@tonic-gate }
20897c478bd9Sstevel@tonic-gate 
209034709573Sraf #if !defined(_LP64)
20917c478bd9Sstevel@tonic-gate /*
20927c478bd9Sstevel@tonic-gate  * 64-bit AIO interface for POSIX
20937c478bd9Sstevel@tonic-gate  */
20947c478bd9Sstevel@tonic-gate int
209534709573Sraf _aio_rw64(aiocb64_t *aiocbp, aio_lio_t *lio_head, aio_worker_t **nextworker,
209634709573Sraf     int mode, int flg)
20977c478bd9Sstevel@tonic-gate {
209834709573Sraf 	aio_req_t *reqp;
209934709573Sraf 	aio_args_t *ap;
21007c478bd9Sstevel@tonic-gate 	int kerr;
21017c478bd9Sstevel@tonic-gate 
210234709573Sraf 	if (aiocbp == NULL) {
21037c478bd9Sstevel@tonic-gate 		errno = EINVAL;
21047c478bd9Sstevel@tonic-gate 		return (-1);
21057c478bd9Sstevel@tonic-gate 	}
21067c478bd9Sstevel@tonic-gate 
21077c478bd9Sstevel@tonic-gate 	/* initialize kaio */
21087c478bd9Sstevel@tonic-gate 	if (!_kaio_ok)
21097c478bd9Sstevel@tonic-gate 		_kaio_init();
21107c478bd9Sstevel@tonic-gate 
211134709573Sraf 	aiocbp->aio_state = NOCHECK;
21127c478bd9Sstevel@tonic-gate 
21137c478bd9Sstevel@tonic-gate 	/*
211434709573Sraf 	 * If we have been called because a list I/O
21157c478bd9Sstevel@tonic-gate 	 * kaio() failed, we dont want to repeat the
21167c478bd9Sstevel@tonic-gate 	 * system call
21177c478bd9Sstevel@tonic-gate 	 */
21187c478bd9Sstevel@tonic-gate 
21197c478bd9Sstevel@tonic-gate 	if (flg & AIO_KAIO) {
21207c478bd9Sstevel@tonic-gate 		/*
21217c478bd9Sstevel@tonic-gate 		 * Try kernel aio first.
21227c478bd9Sstevel@tonic-gate 		 * If errno is ENOTSUP/EBADFD,
21237c478bd9Sstevel@tonic-gate 		 * fall back to the thread implementation.
21247c478bd9Sstevel@tonic-gate 		 */
212534709573Sraf 		if (_kaio_ok > 0 && KAIO_SUPPORTED(aiocbp->aio_fildes)) {
212634709573Sraf 			aiocbp->aio_resultp.aio_errno = EINPROGRESS;
212734709573Sraf 			aiocbp->aio_state = CHECK;
212834709573Sraf 			kerr = (int)_kaio(mode, aiocbp);
21297c478bd9Sstevel@tonic-gate 			if (kerr == 0)
21307c478bd9Sstevel@tonic-gate 				return (0);
213134709573Sraf 			if (errno != ENOTSUP && errno != EBADFD) {
213234709573Sraf 				aiocbp->aio_resultp.aio_errno = errno;
213334709573Sraf 				aiocbp->aio_resultp.aio_return = -1;
213434709573Sraf 				aiocbp->aio_state = NOCHECK;
21357c478bd9Sstevel@tonic-gate 				return (-1);
21367c478bd9Sstevel@tonic-gate 			}
21377c478bd9Sstevel@tonic-gate 			if (errno == EBADFD)
213834709573Sraf 				SET_KAIO_NOT_SUPPORTED(aiocbp->aio_fildes);
21397c478bd9Sstevel@tonic-gate 		}
21407c478bd9Sstevel@tonic-gate 	}
21417c478bd9Sstevel@tonic-gate 
214234709573Sraf 	aiocbp->aio_resultp.aio_errno = EINPROGRESS;
214334709573Sraf 	aiocbp->aio_state = USERAIO;
21447c478bd9Sstevel@tonic-gate 
214534709573Sraf 	if (!__uaio_ok && __uaio_init() == -1)
214634709573Sraf 		return (-1);
21477c478bd9Sstevel@tonic-gate 
214834709573Sraf 	if ((reqp = _aio_req_alloc()) == NULL) {
21497c478bd9Sstevel@tonic-gate 		errno = EAGAIN;
21507c478bd9Sstevel@tonic-gate 		return (-1);
21517c478bd9Sstevel@tonic-gate 	}
21527c478bd9Sstevel@tonic-gate 
21537c478bd9Sstevel@tonic-gate 	/*
215434709573Sraf 	 * If an LIO request, add the list head to the aio request
21557c478bd9Sstevel@tonic-gate 	 */
215634709573Sraf 	reqp->req_head = lio_head;
215734709573Sraf 	reqp->req_type = AIO_POSIX_REQ;
215834709573Sraf 	reqp->req_op = mode;
215934709573Sraf 	reqp->req_largefile = 1;
216034709573Sraf 
216134709573Sraf 	if (aiocbp->aio_sigevent.sigev_notify == SIGEV_NONE) {
216234709573Sraf 		reqp->req_sigevent.sigev_notify = SIGEV_NONE;
216334709573Sraf 	} else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
216434709573Sraf 		reqp->req_sigevent.sigev_notify = SIGEV_SIGNAL;
216534709573Sraf 		reqp->req_sigevent.sigev_signo =
216634709573Sraf 		    aiocbp->aio_sigevent.sigev_signo;
216734709573Sraf 		reqp->req_sigevent.sigev_value.sival_ptr =
216834709573Sraf 		    aiocbp->aio_sigevent.sigev_value.sival_ptr;
216934709573Sraf 	} else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_PORT) {
217034709573Sraf 		port_notify_t *pn = aiocbp->aio_sigevent.sigev_value.sival_ptr;
217134709573Sraf 		reqp->req_sigevent.sigev_notify = SIGEV_PORT;
217234709573Sraf 		reqp->req_sigevent.sigev_signo =
217334709573Sraf 		    pn->portnfy_port;
217434709573Sraf 		reqp->req_sigevent.sigev_value.sival_ptr =
217534709573Sraf 		    pn->portnfy_user;
217634709573Sraf 	} else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_THREAD) {
217734709573Sraf 		reqp->req_sigevent.sigev_notify = SIGEV_THREAD;
217834709573Sraf 		reqp->req_sigevent.sigev_signo =
217934709573Sraf 		    aiocbp->aio_sigevent.sigev_signo;
218034709573Sraf 		reqp->req_sigevent.sigev_value.sival_ptr =
218134709573Sraf 		    aiocbp->aio_sigevent.sigev_value.sival_ptr;
21827c478bd9Sstevel@tonic-gate 	}
21837c478bd9Sstevel@tonic-gate 
218434709573Sraf 	reqp->req_resultp = &aiocbp->aio_resultp;
218534709573Sraf 	reqp->req_aiocbp = aiocbp;
218634709573Sraf 	ap = &reqp->req_args;
218734709573Sraf 	ap->fd = aiocbp->aio_fildes;
218834709573Sraf 	ap->buf = (caddr_t)aiocbp->aio_buf;
218934709573Sraf 	ap->bufsz = aiocbp->aio_nbytes;
219034709573Sraf 	ap->offset = aiocbp->aio_offset;
219134709573Sraf 
219234709573Sraf 	if ((flg & AIO_NO_DUPS) &&
219334709573Sraf 	    _aio_hash_insert(&aiocbp->aio_resultp, reqp) != 0) {
2194*f841f6adSraf 		aio_panic("_aio_rw64(): request already in hash table");
219534709573Sraf 		_aio_req_free(reqp);
21967c478bd9Sstevel@tonic-gate 		errno = EINVAL;
21977c478bd9Sstevel@tonic-gate 		return (-1);
21987c478bd9Sstevel@tonic-gate 	}
219934709573Sraf 	_aio_req_add(reqp, nextworker, mode);
220034709573Sraf 	return (0);
22017c478bd9Sstevel@tonic-gate }
220234709573Sraf #endif	/* !defined(_LP64) */
2203