xref: /illumos-gate/usr/src/lib/libc/port/aio/aio.c (revision bced1f33)
17c478bd9Sstevel@tonic-gate /*
27c478bd9Sstevel@tonic-gate  * CDDL HEADER START
37c478bd9Sstevel@tonic-gate  *
47c478bd9Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
534709573Sraf  * Common Development and Distribution License (the "License").
634709573Sraf  * You may not use this file except in compliance with the License.
77c478bd9Sstevel@tonic-gate  *
87c478bd9Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bd9Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
107c478bd9Sstevel@tonic-gate  * See the License for the specific language governing permissions
117c478bd9Sstevel@tonic-gate  * and limitations under the License.
127c478bd9Sstevel@tonic-gate  *
137c478bd9Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
147c478bd9Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bd9Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
167c478bd9Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
177c478bd9Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bd9Sstevel@tonic-gate  *
197c478bd9Sstevel@tonic-gate  * CDDL HEADER END
207c478bd9Sstevel@tonic-gate  */
2134709573Sraf 
227c478bd9Sstevel@tonic-gate /*
23a574db85Sraf  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
247c478bd9Sstevel@tonic-gate  * Use is subject to license terms.
257c478bd9Sstevel@tonic-gate  */
267c478bd9Sstevel@tonic-gate 
277c478bd9Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
287c478bd9Sstevel@tonic-gate 
297257d1b4Sraf #include "lint.h"
30f841f6adSraf #include "thr_uberdata.h"
31f841f6adSraf #include "asyncio.h"
3234709573Sraf #include <atomic.h>
337c478bd9Sstevel@tonic-gate #include <sys/param.h>
347c478bd9Sstevel@tonic-gate #include <sys/file.h>
357c478bd9Sstevel@tonic-gate #include <sys/port.h>
367c478bd9Sstevel@tonic-gate 
377c478bd9Sstevel@tonic-gate static int _aio_hash_insert(aio_result_t *, aio_req_t *);
387c478bd9Sstevel@tonic-gate static aio_req_t *_aio_req_get(aio_worker_t *);
397c478bd9Sstevel@tonic-gate static void _aio_req_add(aio_req_t *, aio_worker_t **, int);
407c478bd9Sstevel@tonic-gate static void _aio_req_del(aio_worker_t *, aio_req_t *, int);
417c478bd9Sstevel@tonic-gate static void _aio_work_done(aio_worker_t *);
4234709573Sraf static void _aio_enq_doneq(aio_req_t *);
437c478bd9Sstevel@tonic-gate 
4434709573Sraf extern void _aio_lio_free(aio_lio_t *);
457c478bd9Sstevel@tonic-gate 
4634709573Sraf extern int __fdsync(int, int);
474d86dd30Sraf extern int __fcntl(int, int, ...);
487c478bd9Sstevel@tonic-gate extern int _port_dispatch(int, int, int, int, uintptr_t, void *);
497c478bd9Sstevel@tonic-gate 
50c2575b5eSraf static int _aio_fsync_del(aio_worker_t *, aio_req_t *);
5134709573Sraf static void _aiodone(aio_req_t *, ssize_t, int);
527c478bd9Sstevel@tonic-gate static void _aio_cancel_work(aio_worker_t *, int, int *, int *);
5334709573Sraf static void _aio_finish_request(aio_worker_t *, ssize_t, int);
547c478bd9Sstevel@tonic-gate 
557c478bd9Sstevel@tonic-gate /*
567c478bd9Sstevel@tonic-gate  * switch for kernel async I/O
 * _kaio_init() leaves this at 1 on success and -1 on failure; any
 * nonzero value makes later _kaio_init() calls return immediately.
577c478bd9Sstevel@tonic-gate  */
5834709573Sraf int _kaio_ok = 0;		/* 0 = not yet initialized, 1 = on, -1 = init failed */
597c478bd9Sstevel@tonic-gate 
607c478bd9Sstevel@tonic-gate /*
617c478bd9Sstevel@tonic-gate  * Key for thread-specific data
627c478bd9Sstevel@tonic-gate  */
6334709573Sraf pthread_key_t _aio_key;
647c478bd9Sstevel@tonic-gate 
657c478bd9Sstevel@tonic-gate /*
6634709573Sraf  * Array for determining whether or not a file supports kaio.
6734709573Sraf  * Allocated by _kaio_supported_init(), which is called from both
 * _kaio_init() and __uaio_init().
687c478bd9Sstevel@tonic-gate  */
6934709573Sraf uint32_t *_kaio_supported = NULL;
707c478bd9Sstevel@tonic-gate 
717c478bd9Sstevel@tonic-gate /*
7234709573Sraf  *  workers for read/write requests
7334709573Sraf  * (__aio_mutex lock protects circular linked list of workers)
747c478bd9Sstevel@tonic-gate  */
7534709573Sraf aio_worker_t *__workers_rw;	/* circular list of AIO workers */
7634709573Sraf aio_worker_t *__nextworker_rw;	/* next worker in list of workers */
7734709573Sraf int __rw_workerscnt;		/* number of read/write workers */
787c478bd9Sstevel@tonic-gate 
797c478bd9Sstevel@tonic-gate /*
8034709573Sraf  * worker for notification requests.
817c478bd9Sstevel@tonic-gate  */
8234709573Sraf aio_worker_t *__workers_no;	/* circular list of AIO workers */
8334709573Sraf aio_worker_t *__nextworker_no;	/* next worker in list of workers */
8434709573Sraf int __no_workerscnt;		/* number of notification workers */
857c478bd9Sstevel@tonic-gate 
8634709573Sraf aio_req_t *_aio_done_tail;		/* list of done requests */
8734709573Sraf aio_req_t *_aio_done_head;
887c478bd9Sstevel@tonic-gate 
897c478bd9Sstevel@tonic-gate mutex_t __aio_initlock = DEFAULTMUTEX;	/* makes aio initialization atomic */
90f841f6adSraf cond_t __aio_initcv = DEFAULTCV;	/* signalled when __aio_initbusy clears */
91f841f6adSraf int __aio_initbusy = 0;		/* nonzero while an init routine is running */
92f841f6adSraf 
937c478bd9Sstevel@tonic-gate mutex_t __aio_mutex = DEFAULTMUTEX;	/* protects counts, and linked lists */
947c478bd9Sstevel@tonic-gate cond_t _aio_iowait_cv = DEFAULTCV;	/* wait for userland I/Os */
957c478bd9Sstevel@tonic-gate 
967c478bd9Sstevel@tonic-gate pid_t __pid = (pid_t)-1;		/* initialize as invalid pid */
9734709573Sraf int _sigio_enabled = 0;			/* when set, send SIGIO signal */
987c478bd9Sstevel@tonic-gate 
9934709573Sraf aio_hash_t *_aio_hash;			/* HASHSZ buckets, mapped by __uaio_init() */
1007c478bd9Sstevel@tonic-gate 
10134709573Sraf aio_req_t *_aio_doneq;			/* double linked done queue list */
1027c478bd9Sstevel@tonic-gate 
1037c478bd9Sstevel@tonic-gate int _aio_donecnt = 0;
10434709573Sraf int _aio_waitncnt = 0;			/* # of requests for aio_waitn */
1057c478bd9Sstevel@tonic-gate int _aio_doneq_cnt = 0;
10634709573Sraf int _aio_outstand_cnt = 0;		/* # of outstanding requests */
10734709573Sraf int _kaio_outstand_cnt = 0;		/* # of outstanding kaio requests */
1087c478bd9Sstevel@tonic-gate int _aio_req_done_cnt = 0;		/* req. done but not in "done queue" */
1097c478bd9Sstevel@tonic-gate int _aio_kernel_suspend = 0;		/* active kernel kaio calls */
1107c478bd9Sstevel@tonic-gate int _aio_suscv_cnt = 0;			/* aio_suspend calls waiting on cv's */
1117c478bd9Sstevel@tonic-gate 
1127c478bd9Sstevel@tonic-gate int _max_workers = 256;			/* max number of workers permitted */
113f841f6adSraf int _min_workers = 4;			/* min number of workers */
1147c478bd9Sstevel@tonic-gate int _minworkload = 2;			/* min number of request in q */
1157c478bd9Sstevel@tonic-gate int _aio_worker_cnt = 0;		/* number of workers to do requests */
1167c478bd9Sstevel@tonic-gate int __uaio_ok = 0;			/* AIO has been enabled */
1177c478bd9Sstevel@tonic-gate sigset_t _worker_set;			/* worker's signal mask */
1187c478bd9Sstevel@tonic-gate 
1197c478bd9Sstevel@tonic-gate int _aiowait_flag = 0;			/* when set, aiowait() is inprogress */
120f841f6adSraf int _aio_flags = 0;			/* flag bits; see the defines in asyncio.h (NOTE(review): original comment was truncated) */
1217c478bd9Sstevel@tonic-gate 
122f841f6adSraf aio_worker_t *_kaiowp = NULL;		/* points to kaio cleanup thread */
1237c478bd9Sstevel@tonic-gate 
12434709573Sraf int hz;					/* clock ticks per second */
1257c478bd9Sstevel@tonic-gate 
12634709573Sraf static int
_kaio_supported_init(void)12734709573Sraf _kaio_supported_init(void)
1287c478bd9Sstevel@tonic-gate {
12934709573Sraf 	void *ptr;
13034709573Sraf 	size_t size;
13134709573Sraf 
13234709573Sraf 	if (_kaio_supported != NULL)	/* already initialized */
13334709573Sraf 		return (0);
13434709573Sraf 
13534709573Sraf 	size = MAX_KAIO_FDARRAY_SIZE * sizeof (uint32_t);
13634709573Sraf 	ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
13734709573Sraf 	    MAP_PRIVATE | MAP_ANON, -1, (off_t)0);
13834709573Sraf 	if (ptr == MAP_FAILED)
13934709573Sraf 		return (-1);
14034709573Sraf 	_kaio_supported = ptr;
14134709573Sraf 	return (0);
1427c478bd9Sstevel@tonic-gate }
1437c478bd9Sstevel@tonic-gate 
1447c478bd9Sstevel@tonic-gate /*
145f841f6adSraf  * The aio subsystem is initialized when an AIO request is made.
146f841f6adSraf  * Constants are initialized like the max number of workers that
147f841f6adSraf  * the subsystem can create, and the minimum number of workers
148f841f6adSraf  * permitted before imposing some restrictions.  Also, some
149f841f6adSraf  * workers are created.
1507c478bd9Sstevel@tonic-gate  */
1517c478bd9Sstevel@tonic-gate int
__uaio_init(void)1527c478bd9Sstevel@tonic-gate __uaio_init(void)
1537c478bd9Sstevel@tonic-gate {
154f841f6adSraf 	int ret = -1;
1557c478bd9Sstevel@tonic-gate 	int i;
156a574db85Sraf 	int cancel_state;
1577c478bd9Sstevel@tonic-gate 
158f841f6adSraf 	lmutex_lock(&__aio_initlock);
159a574db85Sraf 	(void) pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cancel_state);
160f841f6adSraf 	while (__aio_initbusy)
161a574db85Sraf 		(void) cond_wait(&__aio_initcv, &__aio_initlock);
162a574db85Sraf 	(void) pthread_setcancelstate(cancel_state, NULL);
16334709573Sraf 	if (__uaio_ok) {	/* already initialized */
164f841f6adSraf 		lmutex_unlock(&__aio_initlock);
16534709573Sraf 		return (0);
16634709573Sraf 	}
167f841f6adSraf 	__aio_initbusy = 1;
168f841f6adSraf 	lmutex_unlock(&__aio_initlock);
1697c478bd9Sstevel@tonic-gate 
17034709573Sraf 	hz = (int)sysconf(_SC_CLK_TCK);
17134709573Sraf 	__pid = getpid();
1727c478bd9Sstevel@tonic-gate 
173f841f6adSraf 	setup_cancelsig(SIGAIOCANCEL);
1747c478bd9Sstevel@tonic-gate 
17534709573Sraf 	if (_kaio_supported_init() != 0)
17634709573Sraf 		goto out;
1777c478bd9Sstevel@tonic-gate 
17834709573Sraf 	/*
17934709573Sraf 	 * Allocate and initialize the hash table.
180f7499066Ssp 	 * Do this only once, even if __uaio_init() is called twice.
18134709573Sraf 	 */
182f7499066Ssp 	if (_aio_hash == NULL) {
183f7499066Ssp 		/* LINTED pointer cast */
184f7499066Ssp 		_aio_hash = (aio_hash_t *)mmap(NULL,
185f7499066Ssp 		    HASHSZ * sizeof (aio_hash_t), PROT_READ | PROT_WRITE,
186f7499066Ssp 		    MAP_PRIVATE | MAP_ANON, -1, (off_t)0);
187f7499066Ssp 		if ((void *)_aio_hash == MAP_FAILED) {
188f7499066Ssp 			_aio_hash = NULL;
189f7499066Ssp 			goto out;
190f7499066Ssp 		}
191f7499066Ssp 		for (i = 0; i < HASHSZ; i++)
192f7499066Ssp 			(void) mutex_init(&_aio_hash[i].hash_lock,
193f7499066Ssp 			    USYNC_THREAD, NULL);
1947c478bd9Sstevel@tonic-gate 	}
1957c478bd9Sstevel@tonic-gate 
19634709573Sraf 	/*
19734709573Sraf 	 * Initialize worker's signal mask to only catch SIGAIOCANCEL.
19834709573Sraf 	 */
19934709573Sraf 	(void) sigfillset(&_worker_set);
20034709573Sraf 	(void) sigdelset(&_worker_set, SIGAIOCANCEL);
20134709573Sraf 
20234709573Sraf 	/*
203f7499066Ssp 	 * Create one worker to send asynchronous notifications.
204f7499066Ssp 	 * Do this only once, even if __uaio_init() is called twice.
20534709573Sraf 	 */
206f7499066Ssp 	if (__no_workerscnt == 0 &&
207f7499066Ssp 	    (_aio_create_worker(NULL, AIONOTIFY) != 0)) {
208f7499066Ssp 		errno = EAGAIN;
209f7499066Ssp 		goto out;
210f7499066Ssp 	}
21134709573Sraf 
21234709573Sraf 	/*
213f7499066Ssp 	 * Create the minimum number of read/write workers.
214f7499066Ssp 	 * And later check whether atleast one worker is created;
215f7499066Ssp 	 * lwp_create() calls could fail because of segkp exhaustion.
21634709573Sraf 	 */
217f7499066Ssp 	for (i = 0; i < _min_workers; i++)
218f7499066Ssp 		(void) _aio_create_worker(NULL, AIOREAD);
219f7499066Ssp 	if (__rw_workerscnt == 0) {
220f7499066Ssp 		errno = EAGAIN;
221f7499066Ssp 		goto out;
222f7499066Ssp 	}
22334709573Sraf 
22434709573Sraf 	ret = 0;
22534709573Sraf out:
226f841f6adSraf 	lmutex_lock(&__aio_initlock);
227f841f6adSraf 	if (ret == 0)
228f841f6adSraf 		__uaio_ok = 1;
229f841f6adSraf 	__aio_initbusy = 0;
230f841f6adSraf 	(void) cond_broadcast(&__aio_initcv);
231f841f6adSraf 	lmutex_unlock(&__aio_initlock);
23234709573Sraf 	return (ret);
2337c478bd9Sstevel@tonic-gate }
2347c478bd9Sstevel@tonic-gate 
235f841f6adSraf /*
236f841f6adSraf  * Called from close() before actually performing the real _close().
237f841f6adSraf  */
238f841f6adSraf void
_aio_close(int fd)239f841f6adSraf _aio_close(int fd)
240f841f6adSraf {
241f841f6adSraf 	if (fd < 0)	/* avoid cancelling everything */
242f841f6adSraf 		return;
243f841f6adSraf 	/*
244f841f6adSraf 	 * Cancel all outstanding aio requests for this file descriptor.
245f841f6adSraf 	 */
246f841f6adSraf 	if (__uaio_ok)
247f841f6adSraf 		(void) aiocancel_all(fd);
248f841f6adSraf 	/*
249f841f6adSraf 	 * If we have allocated the bit array, clear the bit for this file.
250f841f6adSraf 	 * The next open may re-use this file descriptor and the new file
251f841f6adSraf 	 * may have different kaio() behaviour.
252f841f6adSraf 	 */
253f841f6adSraf 	if (_kaio_supported != NULL)
254f841f6adSraf 		CLEAR_KAIO_SUPPORTED(fd);
255f841f6adSraf }
256f841f6adSraf 
2577c478bd9Sstevel@tonic-gate /*
2587c478bd9Sstevel@tonic-gate  * special kaio cleanup thread sits in a loop in the
2597c478bd9Sstevel@tonic-gate  * kernel waiting for pending kaio requests to complete.
2607c478bd9Sstevel@tonic-gate  */
2617c478bd9Sstevel@tonic-gate void *
_kaio_cleanup_thread(void * arg)2627c478bd9Sstevel@tonic-gate _kaio_cleanup_thread(void *arg)
2637c478bd9Sstevel@tonic-gate {
26434709573Sraf 	if (pthread_setspecific(_aio_key, arg) != 0)
265f841f6adSraf 		aio_panic("_kaio_cleanup_thread, pthread_setspecific()");
2667c478bd9Sstevel@tonic-gate 	(void) _kaio(AIOSTART);
2677c478bd9Sstevel@tonic-gate 	return (arg);
2687c478bd9Sstevel@tonic-gate }
2697c478bd9Sstevel@tonic-gate 
2707c478bd9Sstevel@tonic-gate /*
2717c478bd9Sstevel@tonic-gate  * initialize kaio.
2727c478bd9Sstevel@tonic-gate  */
2737c478bd9Sstevel@tonic-gate void
_kaio_init()2747c478bd9Sstevel@tonic-gate _kaio_init()
2757c478bd9Sstevel@tonic-gate {
2767c478bd9Sstevel@tonic-gate 	int error;
27734709573Sraf 	sigset_t oset;
278a574db85Sraf 	int cancel_state;
27934709573Sraf 
280f841f6adSraf 	lmutex_lock(&__aio_initlock);
281a574db85Sraf 	(void) pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cancel_state);
282f841f6adSraf 	while (__aio_initbusy)
283a574db85Sraf 		(void) cond_wait(&__aio_initcv, &__aio_initlock);
284a574db85Sraf 	(void) pthread_setcancelstate(cancel_state, NULL);
285f841f6adSraf 	if (_kaio_ok) {		/* already initialized */
286f841f6adSraf 		lmutex_unlock(&__aio_initlock);
287f841f6adSraf 		return;
288f841f6adSraf 	}
289f841f6adSraf 	__aio_initbusy = 1;
290f841f6adSraf 	lmutex_unlock(&__aio_initlock);
291f841f6adSraf 
29234709573Sraf 	if (_kaio_supported_init() != 0)
293f841f6adSraf 		error = ENOMEM;
294f841f6adSraf 	else if ((_kaiowp = _aio_worker_alloc()) == NULL)
295f841f6adSraf 		error = ENOMEM;
296f841f6adSraf 	else if ((error = (int)_kaio(AIOINIT)) == 0) {
297f841f6adSraf 		(void) pthread_sigmask(SIG_SETMASK, &maskset, &oset);
298f841f6adSraf 		error = thr_create(NULL, AIOSTKSIZE, _kaio_cleanup_thread,
299f841f6adSraf 		    _kaiowp, THR_DAEMON, &_kaiowp->work_tid);
300f841f6adSraf 		(void) pthread_sigmask(SIG_SETMASK, &oset, NULL);
301f841f6adSraf 	}
302f841f6adSraf 	if (error && _kaiowp != NULL) {
303f841f6adSraf 		_aio_worker_free(_kaiowp);
304f841f6adSraf 		_kaiowp = NULL;
3057c478bd9Sstevel@tonic-gate 	}
306f841f6adSraf 
307f841f6adSraf 	lmutex_lock(&__aio_initlock);
308f841f6adSraf 	if (error)
309f841f6adSraf 		_kaio_ok = -1;
310f841f6adSraf 	else
311f841f6adSraf 		_kaio_ok = 1;
312f841f6adSraf 	__aio_initbusy = 0;
313f841f6adSraf 	(void) cond_broadcast(&__aio_initcv);
314f841f6adSraf 	lmutex_unlock(&__aio_initlock);
3157c478bd9Sstevel@tonic-gate }
3167c478bd9Sstevel@tonic-gate 
3177c478bd9Sstevel@tonic-gate int
aioread(int fd,caddr_t buf,int bufsz,off_t offset,int whence,aio_result_t * resultp)3187c478bd9Sstevel@tonic-gate aioread(int fd, caddr_t buf, int bufsz, off_t offset, int whence,
3197c478bd9Sstevel@tonic-gate     aio_result_t *resultp)
3207c478bd9Sstevel@tonic-gate {
3217c478bd9Sstevel@tonic-gate 	return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOREAD));
3227c478bd9Sstevel@tonic-gate }
3237c478bd9Sstevel@tonic-gate 
3247c478bd9Sstevel@tonic-gate int
aiowrite(int fd,caddr_t buf,int bufsz,off_t offset,int whence,aio_result_t * resultp)3257c478bd9Sstevel@tonic-gate aiowrite(int fd, caddr_t buf, int bufsz, off_t offset, int whence,
3267c478bd9Sstevel@tonic-gate     aio_result_t *resultp)
3277c478bd9Sstevel@tonic-gate {
3287c478bd9Sstevel@tonic-gate 	return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOWRITE));
3297c478bd9Sstevel@tonic-gate }
3307c478bd9Sstevel@tonic-gate 
33134709573Sraf #if !defined(_LP64)
3327c478bd9Sstevel@tonic-gate int
aioread64(int fd,caddr_t buf,int bufsz,off64_t offset,int whence,aio_result_t * resultp)3337c478bd9Sstevel@tonic-gate aioread64(int fd, caddr_t buf, int bufsz, off64_t offset, int whence,
3347c478bd9Sstevel@tonic-gate     aio_result_t *resultp)
3357c478bd9Sstevel@tonic-gate {
3367c478bd9Sstevel@tonic-gate 	return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOAREAD64));
3377c478bd9Sstevel@tonic-gate }
3387c478bd9Sstevel@tonic-gate 
3397c478bd9Sstevel@tonic-gate int
aiowrite64(int fd,caddr_t buf,int bufsz,off64_t offset,int whence,aio_result_t * resultp)3407c478bd9Sstevel@tonic-gate aiowrite64(int fd, caddr_t buf, int bufsz, off64_t offset, int whence,
3417c478bd9Sstevel@tonic-gate     aio_result_t *resultp)
3427c478bd9Sstevel@tonic-gate {
3437c478bd9Sstevel@tonic-gate 	return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOAWRITE64));
3447c478bd9Sstevel@tonic-gate }
34534709573Sraf #endif	/* !defined(_LP64) */
3467c478bd9Sstevel@tonic-gate 
3477c478bd9Sstevel@tonic-gate int
_aiorw(int fd,caddr_t buf,int bufsz,offset_t offset,int whence,aio_result_t * resultp,int mode)3487c478bd9Sstevel@tonic-gate _aiorw(int fd, caddr_t buf, int bufsz, offset_t offset, int whence,
3497c478bd9Sstevel@tonic-gate     aio_result_t *resultp, int mode)
3507c478bd9Sstevel@tonic-gate {
35134709573Sraf 	aio_req_t *reqp;
35234709573Sraf 	aio_args_t *ap;
35334709573Sraf 	offset_t loffset;
354967072a1Spraks 	struct stat64 stat64;
35534709573Sraf 	int error = 0;
3567c478bd9Sstevel@tonic-gate 	int kerr;
3577c478bd9Sstevel@tonic-gate 	int umode;
3587c478bd9Sstevel@tonic-gate 
3597c478bd9Sstevel@tonic-gate 	switch (whence) {
3607c478bd9Sstevel@tonic-gate 
3617c478bd9Sstevel@tonic-gate 	case SEEK_SET:
3627c478bd9Sstevel@tonic-gate 		loffset = offset;
3637c478bd9Sstevel@tonic-gate 		break;
3647c478bd9Sstevel@tonic-gate 	case SEEK_CUR:
3657c478bd9Sstevel@tonic-gate 		if ((loffset = llseek(fd, 0, SEEK_CUR)) == -1)
36634709573Sraf 			error = -1;
3677c478bd9Sstevel@tonic-gate 		else
3687c478bd9Sstevel@tonic-gate 			loffset += offset;
3697c478bd9Sstevel@tonic-gate 		break;
3707c478bd9Sstevel@tonic-gate 	case SEEK_END:
371967072a1Spraks 		if (fstat64(fd, &stat64) == -1)
37234709573Sraf 			error = -1;
3737c478bd9Sstevel@tonic-gate 		else
374967072a1Spraks 			loffset = offset + stat64.st_size;
3757c478bd9Sstevel@tonic-gate 		break;
3767c478bd9Sstevel@tonic-gate 	default:
3777c478bd9Sstevel@tonic-gate 		errno = EINVAL;
37834709573Sraf 		error = -1;
3797c478bd9Sstevel@tonic-gate 	}
3807c478bd9Sstevel@tonic-gate 
38134709573Sraf 	if (error)
38234709573Sraf 		return (error);
3837c478bd9Sstevel@tonic-gate 
3847c478bd9Sstevel@tonic-gate 	/* initialize kaio */
3857c478bd9Sstevel@tonic-gate 	if (!_kaio_ok)
3867c478bd9Sstevel@tonic-gate 		_kaio_init();
3877c478bd9Sstevel@tonic-gate 
3887c478bd9Sstevel@tonic-gate 	/*
3897c478bd9Sstevel@tonic-gate 	 * _aio_do_request() needs the original request code (mode) to be able
39034709573Sraf 	 * to choose the appropiate 32/64 bit function.  All other functions
3917c478bd9Sstevel@tonic-gate 	 * only require the difference between READ and WRITE (umode).
3927c478bd9Sstevel@tonic-gate 	 */
3937c478bd9Sstevel@tonic-gate 	if (mode == AIOAREAD64 || mode == AIOAWRITE64)
3947c478bd9Sstevel@tonic-gate 		umode = mode - AIOAREAD64;
3957c478bd9Sstevel@tonic-gate 	else
3967c478bd9Sstevel@tonic-gate 		umode = mode;
3977c478bd9Sstevel@tonic-gate 
3987c478bd9Sstevel@tonic-gate 	/*
3997c478bd9Sstevel@tonic-gate 	 * Try kernel aio first.
4007c478bd9Sstevel@tonic-gate 	 * If errno is ENOTSUP/EBADFD, fall back to the thread implementation.
4017c478bd9Sstevel@tonic-gate 	 */
40234709573Sraf 	if (_kaio_ok > 0 && KAIO_SUPPORTED(fd)) {
4037c478bd9Sstevel@tonic-gate 		resultp->aio_errno = 0;
40434709573Sraf 		sig_mutex_lock(&__aio_mutex);
40534709573Sraf 		_kaio_outstand_cnt++;
406967072a1Spraks 		sig_mutex_unlock(&__aio_mutex);
4077c478bd9Sstevel@tonic-gate 		kerr = (int)_kaio(((resultp->aio_return == AIO_INPROGRESS) ?
4087c478bd9Sstevel@tonic-gate 		    (umode | AIO_POLL_BIT) : umode),
4097c478bd9Sstevel@tonic-gate 		    fd, buf, bufsz, loffset, resultp);
41034709573Sraf 		if (kerr == 0) {
4117c478bd9Sstevel@tonic-gate 			return (0);
41234709573Sraf 		}
413967072a1Spraks 		sig_mutex_lock(&__aio_mutex);
41434709573Sraf 		_kaio_outstand_cnt--;
41534709573Sraf 		sig_mutex_unlock(&__aio_mutex);
41634709573Sraf 		if (errno != ENOTSUP && errno != EBADFD)
4177c478bd9Sstevel@tonic-gate 			return (-1);
4187c478bd9Sstevel@tonic-gate 		if (errno == EBADFD)
4197c478bd9Sstevel@tonic-gate 			SET_KAIO_NOT_SUPPORTED(fd);
4207c478bd9Sstevel@tonic-gate 	}
4217c478bd9Sstevel@tonic-gate 
42234709573Sraf 	if (!__uaio_ok && __uaio_init() == -1)
42334709573Sraf 		return (-1);
42434709573Sraf 
42534709573Sraf 	if ((reqp = _aio_req_alloc()) == NULL) {
4267c478bd9Sstevel@tonic-gate 		errno = EAGAIN;
4277c478bd9Sstevel@tonic-gate 		return (-1);
4287c478bd9Sstevel@tonic-gate 	}
4297c478bd9Sstevel@tonic-gate 
4307c478bd9Sstevel@tonic-gate 	/*
43134709573Sraf 	 * _aio_do_request() checks reqp->req_op to differentiate
4327c478bd9Sstevel@tonic-gate 	 * between 32 and 64 bit access.
4337c478bd9Sstevel@tonic-gate 	 */
43434709573Sraf 	reqp->req_op = mode;
43534709573Sraf 	reqp->req_resultp = resultp;
43634709573Sraf 	ap = &reqp->req_args;
4377c478bd9Sstevel@tonic-gate 	ap->fd = fd;
4387c478bd9Sstevel@tonic-gate 	ap->buf = buf;
4397c478bd9Sstevel@tonic-gate 	ap->bufsz = bufsz;
4407c478bd9Sstevel@tonic-gate 	ap->offset = loffset;
4417c478bd9Sstevel@tonic-gate 
44234709573Sraf 	if (_aio_hash_insert(resultp, reqp) != 0) {
44334709573Sraf 		_aio_req_free(reqp);
4447c478bd9Sstevel@tonic-gate 		errno = EINVAL;
4457c478bd9Sstevel@tonic-gate 		return (-1);
4467c478bd9Sstevel@tonic-gate 	}
44734709573Sraf 	/*
44834709573Sraf 	 * _aio_req_add() only needs the difference between READ and
44934709573Sraf 	 * WRITE to choose the right worker queue.
45034709573Sraf 	 */
45134709573Sraf 	_aio_req_add(reqp, &__nextworker_rw, umode);
45234709573Sraf 	return (0);
4537c478bd9Sstevel@tonic-gate }
4547c478bd9Sstevel@tonic-gate 
4557c478bd9Sstevel@tonic-gate int
aiocancel(aio_result_t * resultp)4567c478bd9Sstevel@tonic-gate aiocancel(aio_result_t *resultp)
4577c478bd9Sstevel@tonic-gate {
45834709573Sraf 	aio_req_t *reqp;
45934709573Sraf 	aio_worker_t *aiowp;
46034709573Sraf 	int ret;
46134709573Sraf 	int done = 0;
46234709573Sraf 	int canceled = 0;
4637c478bd9Sstevel@tonic-gate 
4647c478bd9Sstevel@tonic-gate 	if (!__uaio_ok) {
4657c478bd9Sstevel@tonic-gate 		errno = EINVAL;
4667c478bd9Sstevel@tonic-gate 		return (-1);
4677c478bd9Sstevel@tonic-gate 	}
4687c478bd9Sstevel@tonic-gate 
46934709573Sraf 	sig_mutex_lock(&__aio_mutex);
47034709573Sraf 	reqp = _aio_hash_find(resultp);
47134709573Sraf 	if (reqp == NULL) {
4727c478bd9Sstevel@tonic-gate 		if (_aio_outstand_cnt == _aio_req_done_cnt)
4737c478bd9Sstevel@tonic-gate 			errno = EINVAL;
4747c478bd9Sstevel@tonic-gate 		else
4757c478bd9Sstevel@tonic-gate 			errno = EACCES;
47634709573Sraf 		ret = -1;
4777c478bd9Sstevel@tonic-gate 	} else {
47834709573Sraf 		aiowp = reqp->req_worker;
47934709573Sraf 		sig_mutex_lock(&aiowp->work_qlock1);
48034709573Sraf 		(void) _aio_cancel_req(aiowp, reqp, &canceled, &done);
48134709573Sraf 		sig_mutex_unlock(&aiowp->work_qlock1);
4827c478bd9Sstevel@tonic-gate 
4837c478bd9Sstevel@tonic-gate 		if (canceled) {
48434709573Sraf 			ret = 0;
4857c478bd9Sstevel@tonic-gate 		} else {
48634709573Sraf 			if (_aio_outstand_cnt == 0 ||
48734709573Sraf 			    _aio_outstand_cnt == _aio_req_done_cnt)
48834709573Sraf 				errno = EINVAL;
48934709573Sraf 			else
49034709573Sraf 				errno = EACCES;
49134709573Sraf 			ret = -1;
4927c478bd9Sstevel@tonic-gate 		}
4937c478bd9Sstevel@tonic-gate 	}
49434709573Sraf 	sig_mutex_unlock(&__aio_mutex);
49534709573Sraf 	return (ret);
4967c478bd9Sstevel@tonic-gate }
4977c478bd9Sstevel@tonic-gate 
498a574db85Sraf /* ARGSUSED */
499a574db85Sraf static void
_aiowait_cleanup(void * arg)500a574db85Sraf _aiowait_cleanup(void *arg)
501a574db85Sraf {
502a574db85Sraf 	sig_mutex_lock(&__aio_mutex);
503a574db85Sraf 	_aiowait_flag--;
504a574db85Sraf 	sig_mutex_unlock(&__aio_mutex);
505a574db85Sraf }
506a574db85Sraf 
5077c478bd9Sstevel@tonic-gate /*
508a574db85Sraf  * This must be asynch safe and cancel safe
5097c478bd9Sstevel@tonic-gate  */
5107c478bd9Sstevel@tonic-gate aio_result_t *
aiowait(struct timeval * uwait)5117c478bd9Sstevel@tonic-gate aiowait(struct timeval *uwait)
5127c478bd9Sstevel@tonic-gate {
51334709573Sraf 	aio_result_t *uresultp;
51434709573Sraf 	aio_result_t *kresultp;
51534709573Sraf 	aio_result_t *resultp;
5167c478bd9Sstevel@tonic-gate 	int dontblock;
5177c478bd9Sstevel@tonic-gate 	int timedwait = 0;
5187c478bd9Sstevel@tonic-gate 	int kaio_errno = 0;
51934709573Sraf 	struct timeval twait;
52034709573Sraf 	struct timeval *wait = NULL;
5217c478bd9Sstevel@tonic-gate 	hrtime_t hrtend;
5227c478bd9Sstevel@tonic-gate 	hrtime_t hres;
5237c478bd9Sstevel@tonic-gate 
5247c478bd9Sstevel@tonic-gate 	if (uwait) {
5257c478bd9Sstevel@tonic-gate 		/*
52634709573Sraf 		 * Check for a valid specified wait time.
52734709573Sraf 		 * If it is invalid, fail the call right away.
5287c478bd9Sstevel@tonic-gate 		 */
5297c478bd9Sstevel@tonic-gate 		if (uwait->tv_sec < 0 || uwait->tv_usec < 0 ||
5307c478bd9Sstevel@tonic-gate 		    uwait->tv_usec >= MICROSEC) {
5317c478bd9Sstevel@tonic-gate 			errno = EINVAL;
5327c478bd9Sstevel@tonic-gate 			return ((aio_result_t *)-1);
5337c478bd9Sstevel@tonic-gate 		}
5347c478bd9Sstevel@tonic-gate 
53534709573Sraf 		if (uwait->tv_sec > 0 || uwait->tv_usec > 0) {
5367c478bd9Sstevel@tonic-gate 			hrtend = gethrtime() +
53734b3058fSpraks 			    (hrtime_t)uwait->tv_sec * NANOSEC +
53834b3058fSpraks 			    (hrtime_t)uwait->tv_usec * (NANOSEC / MICROSEC);
5397c478bd9Sstevel@tonic-gate 			twait = *uwait;
5407c478bd9Sstevel@tonic-gate 			wait = &twait;
5417c478bd9Sstevel@tonic-gate 			timedwait++;
5427c478bd9Sstevel@tonic-gate 		} else {
5437c478bd9Sstevel@tonic-gate 			/* polling */
54434709573Sraf 			sig_mutex_lock(&__aio_mutex);
54534709573Sraf 			if (_kaio_outstand_cnt == 0) {
54634709573Sraf 				kresultp = (aio_result_t *)-1;
54734709573Sraf 			} else {
54834709573Sraf 				kresultp = (aio_result_t *)_kaio(AIOWAIT,
54934709573Sraf 				    (struct timeval *)-1, 1);
55034709573Sraf 				if (kresultp != (aio_result_t *)-1 &&
55134709573Sraf 				    kresultp != NULL &&
55234709573Sraf 				    kresultp != (aio_result_t *)1) {
55334709573Sraf 					_kaio_outstand_cnt--;
55434709573Sraf 					sig_mutex_unlock(&__aio_mutex);
55534709573Sraf 					return (kresultp);
55634709573Sraf 				}
55734709573Sraf 			}
5587c478bd9Sstevel@tonic-gate 			uresultp = _aio_req_done();
55934709573Sraf 			sig_mutex_unlock(&__aio_mutex);
56034709573Sraf 			if (uresultp != NULL &&
56134709573Sraf 			    uresultp != (aio_result_t *)-1) {
5627c478bd9Sstevel@tonic-gate 				return (uresultp);
5637c478bd9Sstevel@tonic-gate 			}
5647c478bd9Sstevel@tonic-gate 			if (uresultp == (aio_result_t *)-1 &&
5657c478bd9Sstevel@tonic-gate 			    kresultp == (aio_result_t *)-1) {
5667c478bd9Sstevel@tonic-gate 				errno = EINVAL;
5677c478bd9Sstevel@tonic-gate 				return ((aio_result_t *)-1);
56834709573Sraf 			} else {
5697c478bd9Sstevel@tonic-gate 				return (NULL);
57034709573Sraf 			}
5717c478bd9Sstevel@tonic-gate 		}
5727c478bd9Sstevel@tonic-gate 	}
5737c478bd9Sstevel@tonic-gate 
5747c478bd9Sstevel@tonic-gate 	for (;;) {
57534709573Sraf 		sig_mutex_lock(&__aio_mutex);
5767c478bd9Sstevel@tonic-gate 		uresultp = _aio_req_done();
5777c478bd9Sstevel@tonic-gate 		if (uresultp != NULL && uresultp != (aio_result_t *)-1) {
57834709573Sraf 			sig_mutex_unlock(&__aio_mutex);
5797c478bd9Sstevel@tonic-gate 			resultp = uresultp;
5807c478bd9Sstevel@tonic-gate 			break;
5817c478bd9Sstevel@tonic-gate 		}
5827c478bd9Sstevel@tonic-gate 		_aiowait_flag++;
5837c478bd9Sstevel@tonic-gate 		dontblock = (uresultp == (aio_result_t *)-1);
58434709573Sraf 		if (dontblock && _kaio_outstand_cnt == 0) {
58534709573Sraf 			kresultp = (aio_result_t *)-1;
58634709573Sraf 			kaio_errno = EINVAL;
58734709573Sraf 		} else {
58834709573Sraf 			sig_mutex_unlock(&__aio_mutex);
589a574db85Sraf 			pthread_cleanup_push(_aiowait_cleanup, NULL);
590a574db85Sraf 			_cancel_prologue();
59134709573Sraf 			kresultp = (aio_result_t *)_kaio(AIOWAIT,
59234709573Sraf 			    wait, dontblock);
593a574db85Sraf 			_cancel_epilogue();
594a574db85Sraf 			pthread_cleanup_pop(0);
59534709573Sraf 			sig_mutex_lock(&__aio_mutex);
59634709573Sraf 			kaio_errno = errno;
59734709573Sraf 		}
5987c478bd9Sstevel@tonic-gate 		_aiowait_flag--;
59934709573Sraf 		sig_mutex_unlock(&__aio_mutex);
6007c478bd9Sstevel@tonic-gate 		if (kresultp == (aio_result_t *)1) {
6017c478bd9Sstevel@tonic-gate 			/* aiowait() awakened by an aionotify() */
6027c478bd9Sstevel@tonic-gate 			continue;
60334709573Sraf 		} else if (kresultp != NULL &&
60434709573Sraf 		    kresultp != (aio_result_t *)-1) {
6057c478bd9Sstevel@tonic-gate 			resultp = kresultp;
60634709573Sraf 			sig_mutex_lock(&__aio_mutex);
60734709573Sraf 			_kaio_outstand_cnt--;
60834709573Sraf 			sig_mutex_unlock(&__aio_mutex);
6097c478bd9Sstevel@tonic-gate 			break;
61034709573Sraf 		} else if (kresultp == (aio_result_t *)-1 &&
61134709573Sraf 		    kaio_errno == EINVAL &&
61234709573Sraf 		    uresultp == (aio_result_t *)-1) {
6137c478bd9Sstevel@tonic-gate 			errno = kaio_errno;
6147c478bd9Sstevel@tonic-gate 			resultp = (aio_result_t *)-1;
6157c478bd9Sstevel@tonic-gate 			break;
6167c478bd9Sstevel@tonic-gate 		} else if (kresultp == (aio_result_t *)-1 &&
6177c478bd9Sstevel@tonic-gate 		    kaio_errno == EINTR) {
6187c478bd9Sstevel@tonic-gate 			errno = kaio_errno;
6197c478bd9Sstevel@tonic-gate 			resultp = (aio_result_t *)-1;
6207c478bd9Sstevel@tonic-gate 			break;
6217c478bd9Sstevel@tonic-gate 		} else if (timedwait) {
6227c478bd9Sstevel@tonic-gate 			hres = hrtend - gethrtime();
6237c478bd9Sstevel@tonic-gate 			if (hres <= 0) {
62434709573Sraf 				/* time is up; return */
6257c478bd9Sstevel@tonic-gate 				resultp = NULL;
6267c478bd9Sstevel@tonic-gate 				break;
6277c478bd9Sstevel@tonic-gate 			} else {
6287c478bd9Sstevel@tonic-gate 				/*
62934709573Sraf 				 * Some time left.  Round up the remaining time
63034709573Sraf 				 * in nanoseconds to microsec.  Retry the call.
6317c478bd9Sstevel@tonic-gate 				 */
63234709573Sraf 				hres += (NANOSEC / MICROSEC) - 1;
6337c478bd9Sstevel@tonic-gate 				wait->tv_sec = hres / NANOSEC;
6347c478bd9Sstevel@tonic-gate 				wait->tv_usec =
63534b3058fSpraks 				    (hres % NANOSEC) / (NANOSEC / MICROSEC);
6367c478bd9Sstevel@tonic-gate 			}
6377c478bd9Sstevel@tonic-gate 		} else {
63834709573Sraf 			ASSERT(kresultp == NULL && uresultp == NULL);
6397c478bd9Sstevel@tonic-gate 			resultp = NULL;
6407c478bd9Sstevel@tonic-gate 			continue;
6417c478bd9Sstevel@tonic-gate 		}
6427c478bd9Sstevel@tonic-gate 	}
6437c478bd9S