port/aio/aio.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include "libaio.h"
#include <atomic.h>
#include <sys/param.h>
#include <sys/file.h>
#include <sys/port.h>

static int _aio_hash_insert(aio_result_t *, aio_req_t *);
static aio_req_t *_aio_req_get(aio_worker_t *);
static void _aio_req_add(aio_req_t *, aio_worker_t **, int);
static void _aio_req_del(aio_worker_t *, aio_req_t *, int);
static void _aio_work_done(aio_worker_t *);
aio_req_t *_aio_req_remove(aio_req_t *);
static void _aio_enq_doneq(aio_req_t *);

extern void _aio_lio_free(aio_lio_t *);

extern int __fdsync(int, int);
extern int _port_dispatch(int, int, int, int, uintptr_t, void *);

static int _aio_fsync_del(aio_worker_t *, aio_req_t *);
static void _aiodone(aio_req_t *, ssize_t, int);
static void _aio_cancel_work(aio_worker_t *, int, int *, int *);
static void _aio_finish_request(aio_worker_t *, ssize_t, int);

/*
 * Switch for kernel async I/O.
 * Stays 0 until _kaio_init() has run; _kaio_init() then sets it to
 * 1 (kaio usable) or -1 (kaio initialization failed).
 */
int _kaio_ok = 0;		/* 0 = disabled, 1 = on, -1 = error */

/*
 * Key for thread-specific data.
 * Each worker thread stores its own aio_worker_t pointer under this key.
 */
pthread_key_t _aio_key;

/*
 * Array for determining whether or not a file supports kaio.
 * Allocated by _kaio_supported_init(), which is called from both
 * _kaio_init() and __uaio_init().
 */
uint32_t *_kaio_supported = NULL;

/*
 * Workers for read/write requests.
 * (__aio_mutex lock protects circular linked list of workers)
 */
aio_worker_t *__workers_rw;	/* circular list of AIO workers */
aio_worker_t *__nextworker_rw;	/* next worker in list of workers */
int __rw_workerscnt;		/* number of read/write workers */

/*
 * Workers for notification requests.
 */
aio_worker_t *__workers_no;	/* circular list of AIO workers */
aio_worker_t *__nextworker_no;	/* next worker in list of workers */
int __no_workerscnt;		/* number of notification workers */

/*
 * Singly linked list of done requests (linked through req_next).
 * aiocancel_all() walks the list starting at _aio_done_tail.
 */
aio_req_t *_aio_done_tail;		/* list of done requests */
aio_req_t *_aio_done_head;

mutex_t __aio_initlock = DEFAULTMUTEX;	/* makes aio initialization atomic */
mutex_t __aio_mutex = DEFAULTMUTEX;	/* protects counts, and linked lists */
cond_t _aio_iowait_cv = DEFAULTCV;	/* wait for userland I/Os */

pid_t __pid = (pid_t)-1;		/* initialize as invalid pid */
int _sigio_enabled = 0;			/* when set, send SIGIO signal */

aio_hash_t *_aio_hash;			/* HASHSZ buckets, see __uaio_init() */

aio_req_t *_aio_doneq;			/* double linked done queue list */

int _aio_donecnt = 0;			/* # of requests on the done list */
int _aio_waitncnt = 0;			/* # of requests for aio_waitn */
int _aio_doneq_cnt = 0;
int _aio_outstand_cnt = 0;		/* # of outstanding requests */
int _kaio_outstand_cnt = 0;		/* # of outstanding kaio requests */
int _aio_req_done_cnt = 0;		/* req. done but not in "done queue" */
int _aio_kernel_suspend = 0;		/* active kernel kaio calls */
int _aio_suscv_cnt = 0;			/* aio_suspend calls waiting on cv's */

int _max_workers = 256;			/* max number of workers permitted */
int _min_workers = 8;			/* min number of workers */
int _minworkload = 2;			/* min number of request in q */
int _aio_worker_cnt = 0;		/* number of workers to do requests */
int __uaio_ok = 0;			/* AIO has been enabled */
sigset_t _worker_set;			/* worker's signal mask */
sigset_t _full_set;			/* all signals (sigfillset()) */

int _aiowait_flag = 0;			/* when set, aiowait() is inprogress */
int _aio_flags = 0;			/* flag bits, e.g. AIO_WAIT_INPROGRESS */
					/* (see the defines in libaio.h) */

aio_worker_t *_kaiowp;			/* points to kaio cleanup thread */

int hz;					/* clock ticks per second */

/*
 * Map the zero-filled array behind _kaio_supported, unless it already
 * exists.  Returns 0 when the array is available and -1 when the
 * anonymous mapping could not be created.
 */
static int
_kaio_supported_init(void)
{
	size_t nbytes;
	void *map;

	if (_kaio_supported == NULL) {
		nbytes = MAX_KAIO_FDARRAY_SIZE * sizeof (uint32_t);
		map = mmap(NULL, nbytes, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANON, -1, (off_t)0);
		if (map == MAP_FAILED)
			return (-1);
		_kaio_supported = map;
	}
	/* either freshly mapped or already initialized */
	return (0);
}

/*
 * libaio is initialized when an AIO request is made.  Important
 * constants are initialized like the max number of workers that
 * libaio can create, and the minimum number of workers permitted before
 * imposing some restrictions.  Also, some workers are created.
 */
int
__uaio_init(void)
{
	int i;
	int ret;

	sig_mutex_lock(&__aio_initlock);
	if (__uaio_ok) {	/* already initialized */
		sig_mutex_unlock(&__aio_initlock);
		return (0);
	}

	ret = -1;

	hz = (int)sysconf(_SC_CLK_TCK);
	__pid = getpid();

	init_signals();

	if (_kaio_supported_init() != 0)
		goto out;

	/*
	 * Allocate and initialize the hash table.
	 */
	/* LINTED pointer cast */
	_aio_hash = (aio_hash_t *)mmap(NULL,
	    HASHSZ * sizeof (aio_hash_t), PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANON, -1, (off_t)0);
	if ((void *)_aio_hash == MAP_FAILED) {
		_aio_hash = NULL;
		goto out;
	}
	for (i = 0; i < HASHSZ; i++)
		(void) mutex_init(&_aio_hash[i].hash_lock, USYNC_THREAD, NULL);

	/*
	 * Initialize worker's signal mask to only catch SIGAIOCANCEL.
	 */
	(void) sigfillset(&_full_set);
	(void) sigfillset(&_worker_set);
	(void) sigdelset(&_worker_set, SIGAIOCANCEL);

	/*
	 * Create the minimum number of workers.
	 */
	for (i = 0; i < _min_workers; i++)
		(void) _aio_create_worker(NULL, AIOREAD);

	/*
	 * Create one worker to send asynchronous notifications.
	 */
	(void) _aio_create_worker(NULL, AIONOTIFY);

	__uaio_ok = 1;
	ret = 0;

out:
	sig_mutex_unlock(&__aio_initlock);
	return (ret);
}

/*
 * Start routine of the special kaio cleanup thread.  It records its
 * worker structure (arg) as thread-specific data and then enters the
 * kernel via _kaio(AIOSTART), where it sits in a loop waiting for
 * pending kaio requests to complete.  Returns arg if the kernel call
 * ever comes back.
 */
void *
_kaio_cleanup_thread(void *arg)
{
	int rc;

	rc = pthread_setspecific(_aio_key, arg);
	if (rc != 0)
		_aiopanic("_kaio_cleanup_thread, pthread_setspecific()");

	(void) _kaio(AIOSTART);

	return (arg);
}

/*
 * Initialize kaio.
 *
 * Under __aio_initlock: make sure the _kaio_supported array exists,
 * then (only if _kaio_ok is still 0) allocate the cleanup worker,
 * issue _kaio(AIOINIT) and start the cleanup thread with all signals
 * blocked.  On return _kaio_ok is 1 on success or -1 on any failure.
 */
void
_kaio_init()
{
	sigset_t allsigs;
	sigset_t saved;
	int err;

	sig_mutex_lock(&__aio_initlock);

	if (_kaio_supported_init() != 0)
		_kaio_ok = -1;

	/* somebody else already decided, or the array allocation failed */
	if (_kaio_ok != 0) {
		sig_mutex_unlock(&__aio_initlock);
		return;
	}

	_kaiowp = _aio_worker_alloc();
	if (_kaiowp == NULL) {
		err = ENOMEM;
	} else {
		err = (int)_kaio(AIOINIT);
		if (err == 0) {
			/*
			 * Create the thread with every signal blocked,
			 * then restore the caller's mask.
			 */
			(void) sigfillset(&allsigs);
			(void) pthread_sigmask(SIG_SETMASK, &allsigs, &saved);
			err = thr_create(NULL, AIOSTKSIZE,
			    _kaio_cleanup_thread, _kaiowp,
			    THR_DAEMON, &_kaiowp->work_tid);
			(void) pthread_sigmask(SIG_SETMASK, &saved, NULL);
		}
		if (err != 0) {
			_aio_worker_free(_kaiowp);
			_kaiowp = NULL;
		}
	}

	_kaio_ok = (err != 0) ? -1 : 1;
	sig_mutex_unlock(&__aio_initlock);
}

/*
 * Queue an asynchronous read; all the work is done by _aiorw()
 * with the AIOREAD request code.
 */
int
aioread(int fd, caddr_t buf, int bufsz, off_t offset, int whence,
    aio_result_t *resultp)
{
	const int op = AIOREAD;

	return (_aiorw(fd, buf, bufsz, offset, whence, resultp, op));
}

/*
 * Queue an asynchronous write; all the work is done by _aiorw()
 * with the AIOWRITE request code.
 */
int
aiowrite(int fd, caddr_t buf, int bufsz, off_t offset, int whence,
    aio_result_t *resultp)
{
	const int op = AIOWRITE;

	return (_aiorw(fd, buf, bufsz, offset, whence, resultp, op));
}

#if !defined(_LP64)
/*
 * Large-file flavour of aioread() (32-bit builds only): forwards to
 * _aiorw() with the AIOAREAD64 request code.
 */
int
aioread64(int fd, caddr_t buf, int bufsz, off64_t offset, int whence,
    aio_result_t *resultp)
{
	const int op = AIOAREAD64;

	return (_aiorw(fd, buf, bufsz, offset, whence, resultp, op));
}

/*
 * Large-file flavour of aiowrite() (32-bit builds only): forwards to
 * _aiorw() with the AIOAWRITE64 request code.
 */
int
aiowrite64(int fd, caddr_t buf, int bufsz, off64_t offset, int whence,
    aio_result_t *resultp)
{
	const int op = AIOAWRITE64;

	return (_aiorw(fd, buf, bufsz, offset, whence, resultp, op));
}
#endif	/* !defined(_LP64) */

/*
 * Common back end of aioread(), aiowrite() and their 64-bit variants.
 *
 *	fd, buf, bufsz	file descriptor, buffer and transfer size
 *	offset, whence	file position, interpreted like lseek(2)
 *	resultp		caller's result structure for this request
 *	mode		AIOREAD, AIOWRITE, AIOAREAD64 or AIOAWRITE64
 *
 * The absolute file offset is computed first.  The request is then
 * handed to kernel aio if that is enabled and not known to be
 * unsupported for fd; otherwise (or if the kernel answers ENOTSUP or
 * EBADFD) it is queued to a user-level worker thread.
 *
 * Returns 0 if the request was accepted, -1 with errno set otherwise.
 */
int
_aiorw(int fd, caddr_t buf, int bufsz, offset_t offset, int whence,
    aio_result_t *resultp, int mode)
{
	aio_req_t *reqp;
	aio_args_t *ap;
	offset_t loffset;
	struct stat64 sbuf;
	int error = 0;
	int kerr;
	int umode;

	switch (whence) {

	case SEEK_SET:
		loffset = offset;
		break;
	case SEEK_CUR:
		if ((loffset = llseek(fd, 0, SEEK_CUR)) == -1)
			error = -1;
		else
			loffset += offset;
		break;
	case SEEK_END:
		/*
		 * Use the large-file interface: plain fstat() fails with
		 * EOVERFLOW in a 32-bit process when the file size does not
		 * fit in off_t, which would break aioread64()/aiowrite64().
		 */
		if (fstat64(fd, &sbuf) == -1)
			error = -1;
		else
			loffset = offset + sbuf.st_size;
		break;
	default:
		errno = EINVAL;
		error = -1;
		break;
	}

	if (error)
		return (error);

	/* initialize kaio */
	if (!_kaio_ok)
		_kaio_init();

	/*
	 * _aio_do_request() needs the original request code (mode) to be able
	 * to choose the appropriate 32/64 bit function.  All other functions
	 * only require the difference between READ and WRITE (umode).
	 */
	if (mode == AIOAREAD64 || mode == AIOAWRITE64)
		umode = mode - AIOAREAD64;
	else
		umode = mode;

	/*
	 * Try kernel aio first.
	 * If errno is ENOTSUP/EBADFD, fall back to the thread implementation.
	 */
	if (_kaio_ok > 0 && KAIO_SUPPORTED(fd)) {
		resultp->aio_errno = 0;
		sig_mutex_lock(&__aio_mutex);
		/* count the request before the call; undone on failure */
		_kaio_outstand_cnt++;
		kerr = (int)_kaio(((resultp->aio_return == AIO_INPROGRESS) ?
		    (umode | AIO_POLL_BIT) : umode),
		    fd, buf, bufsz, loffset, resultp);
		if (kerr == 0) {
			sig_mutex_unlock(&__aio_mutex);
			return (0);
		}
		_kaio_outstand_cnt--;
		sig_mutex_unlock(&__aio_mutex);
		if (errno != ENOTSUP && errno != EBADFD)
			return (-1);
		/* remember that kaio does not work for this descriptor */
		if (errno == EBADFD)
			SET_KAIO_NOT_SUPPORTED(fd);
	}

	if (!__uaio_ok && __uaio_init() == -1)
		return (-1);

	if ((reqp = _aio_req_alloc()) == NULL) {
		errno = EAGAIN;
		return (-1);
	}

	/*
	 * _aio_do_request() checks reqp->req_op to differentiate
	 * between 32 and 64 bit access.
	 */
	reqp->req_op = mode;
	reqp->req_resultp = resultp;
	ap = &reqp->req_args;
	ap->fd = fd;
	ap->buf = buf;
	ap->bufsz = bufsz;
	ap->offset = loffset;

	/* a non-zero return means the request could not be entered */
	if (_aio_hash_insert(resultp, reqp) != 0) {
		_aio_req_free(reqp);
		errno = EINVAL;
		return (-1);
	}
	/*
	 * _aio_req_add() only needs the difference between READ and
	 * WRITE to choose the right worker queue.
	 */
	_aio_req_add(reqp, &__nextworker_rw, umode);
	return (0);
}

/*
 * Cancel the request identified by resultp.
 *
 * Returns 0 if the request was canceled.  Otherwise returns -1 with
 * errno set to EINVAL (user-level AIO not initialized, or the
 * outstanding/done counters show nothing left to cancel) or to
 * EACCES (requests are outstanding but this one was not canceled).
 */
int
aiocancel(aio_result_t *resultp)
{
	aio_req_t *req;
	aio_worker_t *worker;
	int rv = -1;
	int ndone = 0;
	int ncanceled = 0;

	if (!__uaio_ok) {
		errno = EINVAL;
		return (-1);
	}

	sig_mutex_lock(&__aio_mutex);
	req = _aio_hash_find(resultp);
	if (req != NULL) {
		/* known request: try to cancel it on its worker's queue */
		worker = req->req_worker;
		sig_mutex_lock(&worker->work_qlock1);
		(void) _aio_cancel_req(worker, req, &ncanceled, &ndone);
		sig_mutex_unlock(&worker->work_qlock1);

		if (ncanceled) {
			rv = 0;
		} else if (_aio_outstand_cnt == 0 ||
		    _aio_outstand_cnt == _aio_req_done_cnt) {
			errno = EINVAL;
		} else {
			errno = EACCES;
		}
	} else if (_aio_outstand_cnt == _aio_req_done_cnt) {
		/* unknown request and nothing pending */
		errno = EINVAL;
	} else {
		/* unknown request, but other requests are pending */
		errno = EACCES;
	}
	sig_mutex_unlock(&__aio_mutex);
	return (rv);
}

/*
 * Wait for an asynchronous request (user-level or kaio) to complete.
 *
 *	uwait == NULL		block until a request completes
 *	uwait == {0, 0}		poll: never block
 *	otherwise		wait at most the given time
 *
 * Returns the aio_result_t pointer of a completed request, NULL when
 * the poll found nothing or the timeout expired, or (aio_result_t *)-1
 * with errno set (EINVAL for a bad timeout or when both _aio_req_done()
 * and kaio report -1; EINTR when the kernel wait was interrupted).
 *
 * This must be asynch safe
 */
aio_result_t *
aiowait(struct timeval *uwait)
{
	aio_result_t *uresultp;
	aio_result_t *kresultp;
	aio_result_t *resultp;
	int dontblock;
	int timedwait = 0;
	int kaio_errno = 0;
	struct timeval twait;
	struct timeval *wait = NULL;
	hrtime_t hrtend;
	hrtime_t hres;

	if (uwait) {
		/*
		 * Check for a valid specified wait time.
		 * If it is invalid, fail the call right away.
		 */
		if (uwait->tv_sec < 0 || uwait->tv_usec < 0 ||
		    uwait->tv_usec >= MICROSEC) {
			errno = EINVAL;
			return ((aio_result_t *)-1);
		}

		if (uwait->tv_sec > 0 || uwait->tv_usec > 0) {
			/*
			 * Timed wait: remember the absolute deadline and
			 * work on a private copy of the caller's timeval,
			 * which is updated on every retry below.
			 */
			hrtend = gethrtime() +
				(hrtime_t)uwait->tv_sec * NANOSEC +
				(hrtime_t)uwait->tv_usec * (NANOSEC / MICROSEC);
			twait = *uwait;
			wait = &twait;
			timedwait++;
		} else {
			/* polling */
			sig_mutex_lock(&__aio_mutex);
			if (_kaio_outstand_cnt == 0) {
				kresultp = (aio_result_t *)-1;
			} else {
				kresultp = (aio_result_t *)_kaio(AIOWAIT,
				    (struct timeval *)-1, 1);
				/* anything but -1, NULL or 1 is a result */
				if (kresultp != (aio_result_t *)-1 &&
				    kresultp != NULL &&
				    kresultp != (aio_result_t *)1) {
					_kaio_outstand_cnt--;
					sig_mutex_unlock(&__aio_mutex);
					return (kresultp);
				}
			}
			uresultp = _aio_req_done();
			sig_mutex_unlock(&__aio_mutex);
			if (uresultp != NULL &&
			    uresultp != (aio_result_t *)-1) {
				return (uresultp);
			}
			/* EINVAL only if both sides reported -1 */
			if (uresultp == (aio_result_t *)-1 &&
			    kresultp == (aio_result_t *)-1) {
				errno = EINVAL;
				return ((aio_result_t *)-1);
			} else {
				return (NULL);
			}
		}
	}

	for (;;) {
		sig_mutex_lock(&__aio_mutex);
		uresultp = _aio_req_done();
		if (uresultp != NULL && uresultp != (aio_result_t *)-1) {
			sig_mutex_unlock(&__aio_mutex);
			resultp = uresultp;
			break;
		}
		_aiowait_flag++;
		/*
		 * If _aio_req_done() returned -1, only ask the kernel
		 * without blocking.
		 * NOTE(review): -1 presumably means that no user-level
		 * request is outstanding - confirm in _aio_req_done().
		 */
		dontblock = (uresultp == (aio_result_t *)-1);
		if (dontblock && _kaio_outstand_cnt == 0) {
			kresultp = (aio_result_t *)-1;
			kaio_errno = EINVAL;
		} else {
			/* __aio_mutex is dropped around the kernel wait */
			sig_mutex_unlock(&__aio_mutex);
			kresultp = (aio_result_t *)_kaio(AIOWAIT,
			    wait, dontblock);
			sig_mutex_lock(&__aio_mutex);
			kaio_errno = errno;
		}
		_aiowait_flag--;
		sig_mutex_unlock(&__aio_mutex);
		if (kresultp == (aio_result_t *)1) {
			/* aiowait() awakened by an aionotify() */
			continue;
		} else if (kresultp != NULL &&
		    kresultp != (aio_result_t *)-1) {
			/* a kaio request completed */
			resultp = kresultp;
			sig_mutex_lock(&__aio_mutex);
			_kaio_outstand_cnt--;
			sig_mutex_unlock(&__aio_mutex);
			break;
		} else if (kresultp == (aio_result_t *)-1 &&
		    kaio_errno == EINVAL &&
		    uresultp == (aio_result_t *)-1) {
			/* both sides reported -1 */
			errno = kaio_errno;
			resultp = (aio_result_t *)-1;
			break;
		} else if (kresultp == (aio_result_t *)-1 &&
		    kaio_errno == EINTR) {
			/* the kernel wait was interrupted */
			errno = kaio_errno;
			resultp = (aio_result_t *)-1;
			break;
		} else if (timedwait) {
			hres = hrtend - gethrtime();
			if (hres <= 0) {
				/* time is up; return */
				resultp = NULL;
				break;
			} else {
				/*
				 * Some time left.  Round up the remaining time
				 * in nanoseconds to microsec.  Retry the call.
				 */
				hres += (NANOSEC / MICROSEC) - 1;
				wait->tv_sec = hres / NANOSEC;
				wait->tv_usec =
					(hres % NANOSEC) / (NANOSEC / MICROSEC);
			}
		} else {
			/* untimed wait that found nothing: try again */
			ASSERT(kresultp == NULL && uresultp == NULL);
			resultp = NULL;
			continue;
		}
	}
	return (resultp);
}

/*
 * _aio_get_timedelta() computes the time remaining until the absolute
 * time *end (relative to gettimeofday()) and stores it in *wait.
 *
 * Returns 0 if time remains and -1 if the timer has expired.  In the
 * expired cases *wait is not guaranteed to hold a meaningful value.
 */

int
_aio_get_timedelta(timespec_t *end, timespec_t *wait)
{
	struct timeval tv;
	timespec_t now;

	(void) gettimeofday(&tv, NULL);
	now.tv_sec = tv.tv_sec;
	now.tv_nsec = tv.tv_usec * 1000;	/* convert us to ns */

	/* deadline lies in an earlier second: expired */
	if (end->tv_sec < now.tv_sec)
		return (-1);

	wait->tv_sec = end->tv_sec - now.tv_sec;

	if (end->tv_nsec >= now.tv_nsec) {
		wait->tv_nsec = end->tv_nsec - now.tv_nsec;
		/* exactly now counts as expired */
		if (wait->tv_sec == 0 && wait->tv_nsec == 0)
			return (-1);
		return (0);
	}

	/* nanoseconds must borrow; impossible within the same second */
	if (end->tv_sec == now.tv_sec)
		return (-1);	/* timer expired */

	wait->tv_sec -= 1;
	wait->tv_nsec = NANOSEC - (now.tv_nsec - end->tv_nsec);
	return (0);
}

/*
 * If closing by file descriptor: we will simply cancel all the outstanding
 * aio's and return.  Those aio's in question will have either noticed the
 * cancellation notice before, during, or after initiating io.
 *
 * A negative fd cancels the requests of every file descriptor.
 *
 * Returns AIO_ALLDONE, AIO_CANCELED or AIO_NOTCANCELED; if no
 * user-level request was touched and kaio may be handling fd, the
 * result of _kaio(AIOCANCEL) is returned instead.
 */
int
aiocancel_all(int fd)
{
	aio_req_t *reqp;
	aio_req_t **reqpp;
	aio_req_t *last;	/* last request kept on the done list */
	aio_worker_t *first;
	aio_worker_t *next;
	int canceled = 0;
	int done = 0;
	int cancelall = 0;

	sig_mutex_lock(&__aio_mutex);

	if (_aio_outstand_cnt == 0) {
		sig_mutex_unlock(&__aio_mutex);
		return (AIO_ALLDONE);
	}

	/*
	 * Cancel requests from the read/write workers' queues.
	 */
	first = __nextworker_rw;
	next = first;
	do {
		_aio_cancel_work(next, fd, &canceled, &done);
	} while ((next = next->work_forw) != first);

	/*
	 * finally, check if there are requests on the done queue that
	 * should be canceled.
	 */
	if (fd < 0)
		cancelall = 1;
	reqpp = &_aio_done_tail;
	last = NULL;
	while ((reqp = *reqpp) != NULL) {
		if (cancelall || reqp->req_args.fd == fd) {
			*reqpp = reqp->req_next;
			/*
			 * Never leave _aio_done_head pointing at a request
			 * that is about to be freed: fall back to the last
			 * request that stays on the list (NULL if none).
			 * NOTE(review): assumes _aio_done_head is the end
			 * of the list walked here - confirm against the
			 * code that appends to the done list.
			 */
			if (_aio_done_head == reqp)
				_aio_done_head = last;
			_aio_donecnt--;
			(void) _aio_hash_del(reqp->req_resultp);
			_aio_req_free(reqp);
		} else {
			reqpp = &reqp->req_next;
			last = reqp;
		}
	}
	if (cancelall) {
		ASSERT(_aio_donecnt == 0);
		_aio_done_head = NULL;
	}
	sig_mutex_unlock(&__aio_mutex);

	if (canceled && done == 0)
		return (AIO_CANCELED);
	else if (done && canceled == 0)
		return (AIO_ALLDONE);
	else if ((canceled + done == 0) && KAIO_SUPPORTED(fd))
		return ((int)_kaio(AIOCANCEL, fd, NULL));
	return (AIO_NOTCANCELED);
}

/*
 * Cancel requests from a given work queue.  If the file descriptor
 * parameter, fd, is non-negative, then only cancel those requests
 * in this queue that are to this file descriptor.  If the fd
 * parameter is negative, then cancel all requests.
 *
 * The counters *canceled and *done are updated by _aio_cancel_req().
 */
static void
_aio_cancel_work(aio_worker_t *aiowp, int fd, int *canceled, int *done)
{
	aio_req_t *cur;
	aio_req_t *active;
	int anyfd = (fd < 0);

	sig_mutex_lock(&aiowp->work_qlock1);

	/*
	 * Queued requests go first.  Whenever _aio_cancel_req() reports
	 * that the caller's locks were dropped, the current request is
	 * no longer valid and the walk restarts at the queue's tail.
	 */
	for (cur = aiowp->work_tail1; cur != NULL; ) {
		if ((anyfd || cur->req_args.fd == fd) &&
		    _aio_cancel_req(aiowp, cur, canceled, done))
			cur = aiowp->work_tail1;
		else
			cur = cur->req_next;
	}

	/*
	 * Since the queued requests have been canceled, there can
	 * only be one inprogress request that should be canceled.
	 */
	active = aiowp->work_req;
	if (active != NULL && (anyfd || active->req_args.fd == fd))
		(void) _aio_cancel_req(aiowp, active, canceled, done);

	sig_mutex_unlock(&aiowp->work_qlock1);
}

/*
 * Cancel a request.  Return 1 if the callers locks were temporarily
 * dropped, otherwise return 0.
 *
 * The caller must hold both __aio_mutex and aiowp->work_qlock1.
 * (*canceled) is incremented when the request gets canceled here,
 * (*done) when the request turns out to be complete already.
 */
int
_aio_cancel_req(aio_worker_t *aiowp, aio_req_t *reqp, int *canceled, int *done)
{
	int ostate = reqp->req_state;

	ASSERT(MUTEX_HELD(&__aio_mutex));
	ASSERT(MUTEX_HELD(&aiowp->work_qlock1));
	/* already canceled: nothing to do and nothing to count */
	if (ostate == AIO_REQ_CANCELED)
		return (0);
	/* too late to cancel: only report it as done */
	if (ostate == AIO_REQ_DONE || ostate == AIO_REQ_DONEQ) {
		(*done)++;
		return (0);
	}
	if (reqp->req_op == AIOFSYNC && reqp != aiowp->work_req) {
		ASSERT(POSIX_AIO(reqp));
		/* Cancel the queued aio_fsync() request */
		/* (mark its list head only; counted once per head) */
		if (!reqp->req_head->lio_canned) {
			reqp->req_head->lio_canned = 1;
			_aio_outstand_cnt--;
			(*canceled)++;
		}
		return (0);
	}
	reqp->req_state = AIO_REQ_CANCELED;
	_aio_req_del(aiowp, reqp, ostate);
	(void) _aio_hash_del(reqp->req_resultp);
	(*canceled)++;
	if (reqp == aiowp->work_req) {
		ASSERT(ostate == AIO_REQ_INPROGRESS);
		/*
		 * Set the result values now, before _aiodone() is called.
		 * We do this because the application can expect aio_return
		 * and aio_errno to be set to -1 and ECANCELED, respectively,
		 * immediately after a successful return from aiocancel()
		 * or aio_cancel().
		 */
		_aio_set_result(reqp, -1, ECANCELED);
		/* interrupt the worker that is executing the request */
		(void) thr_kill(aiowp->work_tid, SIGAIOCANCEL);
		return (0);
	}
	if (!POSIX_AIO(reqp)) {
		/* queued Solaris-style request: finish it right here */
		_aio_outstand_cnt--;
		_aio_set_result(reqp, -1, ECANCELED);
		return (0);
	}
	/*
	 * Queued POSIX request: _aiodone() acquires __aio_mutex itself,
	 * so both locks are dropped around the call and retaken in the
	 * original order afterwards.
	 */
	sig_mutex_unlock(&aiowp->work_qlock1);
	sig_mutex_unlock(&__aio_mutex);
	_aiodone(reqp, -1, ECANCELED);
	sig_mutex_lock(&__aio_mutex);
	sig_mutex_lock(&aiowp->work_qlock1);
	return (1);
}

/*
 * This is the worker's main routine.
 * The task of this function is to execute all queued requests;
 * once the last pending request is executed this function will block
 * in _aio_idle().  A new incoming request must wakeup this thread to
 * restart the work.
 * Every worker has an own work queue.  The queue lock is required
 * to synchronize the addition of new requests for this worker or
 * cancellation of pending/running requests.
 *
 * Cancellation scenarios:
 * The cancellation of a request is being done asynchronously using
 * _aio_cancel_req() from another thread context.
 * A queued request can be cancelled in different manners :
 * a) request is queued but not "in progress" or "done" (AIO_REQ_QUEUED):
 *	- lock the queue -> remove the request -> unlock the queue
 *	- this function/thread does not detect this cancellation process
 * b) request is in progress (AIO_REQ_INPROGRESS) :
 *	- this function first allow the cancellation of the running
 *	  request with the flag "work_cancel_flg=1"
 * 		see _aio_req_get() -> _aio_cancel_on()
 *	  During this phase, it is allowed to interrupt the worker
 *	  thread running the request (this thread) using the SIGAIOCANCEL
 *	  signal.
 *	  Once this thread returns from the kernel (because the request
 *	  is just done), then it must disable a possible cancellation
 *	  and proceed to finish the request.  To disable the cancellation
 *	  this thread must use _aio_cancel_off() to set "work_cancel_flg=0".
 * c) request is already done (AIO_REQ_DONE || AIO_REQ_DONEQ):
 *	  same procedure as in a)
 *
 * To b)
 *	This thread uses sigsetjmp() to define the position in the code where
 *	it wishes to continue working in the case that a SIGAIOCANCEL signal
 *	is detected.
 *	Normally this thread should get the cancellation signal during the
 *	kernel phase (reading or writing).  In that case the signal handler
 *	aiosigcancelhndlr() is activated using the worker thread context,
 *	which again will use the siglongjmp() function to break the standard
 *	code flow and jump to the "sigsetjmp" position, provided that
 *	"work_cancel_flg" is set to "1".
 *	Because the "work_cancel_flg" is only manipulated by this worker
 *	thread and it can only run on one CPU at a given time, it is not
 *	necessary to protect that flag with the queue lock.
 *	Returning from the kernel (read or write system call) we must
 *	first disable the use of the SIGAIOCANCEL signal and accordingly
 *	the use of the siglongjmp() function to prevent a possible deadlock:
 *	- It can happen that this worker thread returns from the kernel and
 *	  blocks in "work_qlock1",
 *	- then a second thread cancels the apparently "in progress" request
 *	  and sends the SIGAIOCANCEL signal to the worker thread,
 *	- the worker thread gets assigned the "work_qlock1" and returns
 *	  from the kernel,
 *	- the kernel detects the pending signal and activates the signal
 *	  handler instead,
 *	- if the "work_cancel_flg" is still set then the signal handler
 *	  should use siglongjmp() to cancel the "in progress" request and
 *	  it would try to acquire the same work_qlock1 in _aio_req_get()
 *	  for a second time => deadlock.
 *	To avoid that situation we disable the cancellation of the request
 *	in progress BEFORE we try to acquire the work_qlock1.
 *	In that case the signal handler will not call siglongjmp() and the
 *	worker thread will continue running the standard code flow.
 *	Then this thread must check the AIO_REQ_CANCELED flag to emulate
 *	an eventually required siglongjmp() freeing the work_qlock1 and
 *	avoiding a deadlock.
 */
/*
 * Start routine of a read/write worker thread; arglist is the worker's
 * aio_worker_t.  Never returns: requests are taken from the worker's
 * queue, executed, and completed through _aio_finish_request().
 */
void *
_aio_do_request(void *arglist)
{
	aio_worker_t *aiowp = (aio_worker_t *)arglist;
	struct aio_args *arg;
	aio_req_t *reqp;		/* current AIO request */
	ssize_t retval;
	int error;

	if (pthread_setspecific(_aio_key, aiowp) != 0)
		_aiopanic("_aio_do_request, pthread_setspecific()");
	(void) pthread_sigmask(SIG_SETMASK, &_worker_set, NULL);
	ASSERT(aiowp->work_req == NULL);

	/*
	 * We resume here when an operation is cancelled.
	 * On first entry, aiowp->work_req == NULL, so all
	 * we do is block SIGAIOCANCEL.
	 */
	(void) sigsetjmp(aiowp->work_jmp_buf, 0);

	_sigoff();	/* block SIGAIOCANCEL */
	if (aiowp->work_req != NULL)
		_aio_finish_request(aiowp, -1, ECANCELED);

	for (;;) {
		/*
		 * Put completed requests on aio_done_list.  This has
		 * to be done as part of the main loop to ensure that
		 * we don't artificially starve any aiowait'ers.
		 */
		if (aiowp->work_done1)
			_aio_work_done(aiowp);

top:
		/* consume any deferred SIGAIOCANCEL signal here */
		_sigon();
		_sigoff();

		while ((reqp = _aio_req_get(aiowp)) == NULL)
			_aio_idle(aiowp);
		arg = &reqp->req_args;
		ASSERT(reqp->req_state == AIO_REQ_INPROGRESS ||
		    reqp->req_state == AIO_REQ_CANCELED);
		error = 0;
		/*
		 * Give retval a defined value: the cancelled AIOFSYNC
		 * path below performs no I/O and would otherwise pass an
		 * indeterminate value to _aio_finish_request().
		 */
		retval = 0;

		switch (reqp->req_op) {
		case AIOREAD:
		case AIOAREAD:
			_sigon();	/* unblock SIGAIOCANCEL */
			retval = pread(arg->fd, arg->buf,
			    arg->bufsz, arg->offset);
			if (retval == -1) {
				/* not seekable: retry at current position */
				if (errno == ESPIPE) {
					retval = read(arg->fd,
					    arg->buf, arg->bufsz);
					if (retval == -1)
						error = errno;
				} else {
					error = errno;
				}
			}
			_sigoff();	/* block SIGAIOCANCEL */
			break;
		case AIOWRITE:
		case AIOAWRITE:
			_sigon();	/* unblock SIGAIOCANCEL */
			retval = pwrite(arg->fd, arg->buf,
			    arg->bufsz, arg->offset);
			if (retval == -1) {
				/* not seekable: retry at current position */
				if (errno == ESPIPE) {
					retval = write(arg->fd,
					    arg->buf, arg->bufsz);
					if (retval == -1)
						error = errno;
				} else {
					error = errno;
				}
			}
			_sigoff();	/* block SIGAIOCANCEL */
			break;
#if !defined(_LP64)
		case AIOAREAD64:
			_sigon();	/* unblock SIGAIOCANCEL */
			retval = pread64(arg->fd, arg->buf,
			    arg->bufsz, arg->offset);
			if (retval == -1) {
				/* not seekable: retry at current position */
				if (errno == ESPIPE) {
					retval = read(arg->fd,
					    arg->buf, arg->bufsz);
					if (retval == -1)
						error = errno;
				} else {
					error = errno;
				}
			}
			_sigoff();	/* block SIGAIOCANCEL */
			break;
		case AIOAWRITE64:
			_sigon();	/* unblock SIGAIOCANCEL */
			retval = pwrite64(arg->fd, arg->buf,
			    arg->bufsz, arg->offset);
			if (retval == -1) {
				/* not seekable: retry at current position */
				if (errno == ESPIPE) {
					retval = write(arg->fd,
					    arg->buf, arg->bufsz);
					if (retval == -1)
						error = errno;
				} else {
					error = errno;
				}
			}
			_sigoff();	/* block SIGAIOCANCEL */
			break;
#endif	/* !defined(_LP64) */
		case AIOFSYNC:
			/*
			 * A non-zero return means this request was consumed
			 * by _aio_fsync_del(); go fetch the next one.
			 */
			if (_aio_fsync_del(aiowp, reqp))
				goto top;
			ASSERT(reqp->req_head == NULL);
			/*
			 * All writes for this fsync request are now
			 * acknowledged.  Now make these writes visible
			 * and put the final request into the hash table.
			 */
			if (reqp->req_state == AIO_REQ_CANCELED) {
				/* EMPTY */;
			} else if (arg->offset == O_SYNC) {
				if ((retval = __fdsync(arg->fd, FSYNC)) == -1)
					error = errno;
			} else {
				if ((retval = __fdsync(arg->fd, FDSYNC)) == -1)
					error = errno;
			}
			if (_aio_hash_insert(reqp->req_resultp, reqp) != 0)
				_aiopanic("_aio_do_request(): AIOFSYNC: "
				    "request already in hash table");
			break;
		default:
			_aiopanic("_aio_do_request, bad op");
		}

		_aio_finish_request(aiowp, retval, error);
	}
	/* NOTREACHED */
	return (NULL);
}

/*
 * Perform the tail processing for _aio_do_request().
 * The in-progress request may or may not have been cancelled.
 *
 * If the worker has no in-progress request, nothing happens.  A
 * cancelled request is always completed with -1/ECANCELED, whatever
 * retval and error the caller passed in.
 */
static void
_aio_finish_request(aio_worker_t *aiowp, ssize_t retval, int error)
{
	aio_req_t *cur;

	sig_mutex_lock(&aiowp->work_qlock1);
	cur = aiowp->work_req;
	if (cur == NULL) {
		sig_mutex_unlock(&aiowp->work_qlock1);
		return;
	}

	/* detach the request from the worker */
	aiowp->work_req = NULL;
	if (cur->req_state == AIO_REQ_CANCELED) {
		retval = -1;
		error = ECANCELED;
	}

	if (POSIX_AIO(cur)) {
		/* POSIX request: _aiodone() does the completion work */
		if (cur->req_state == AIO_REQ_INPROGRESS)
			cur->req_state = AIO_REQ_DONE;
		sig_mutex_unlock(&aiowp->work_qlock1);
		_aiodone(cur, retval, error);
		return;
	}

	/* Solaris request: post the result under __aio_mutex */
	sig_mutex_unlock(&aiowp->work_qlock1);
	sig_mutex_lock(&__aio_mutex);
	if (cur->req_state == AIO_REQ_INPROGRESS)
		cur->req_state = AIO_REQ_DONE;
	_aio_req_done_cnt++;
	_aio_set_result(cur, retval, error);
	if (error == ECANCELED)
		_aio_outstand_cnt--;
	sig_mutex_unlock(&__aio_mutex);
}

/*
 * Store USERAIO_DONE in the aio_state field of the control block that
 * belongs to reqp.  In 32-bit builds a large-file request carries an
 * aiocb64_t, every other request an aiocb_t.
 */
void
_aio_req_mark_done(aio_req_t *reqp)
{
	aiocb_t *cbp;
#if !defined(_LP64)
	aiocb64_t *cbp64;

	if (reqp->req_largefile) {
		cbp64 = (aiocb64_t *)reqp->req_aiocbp;
		cbp64->aio_state = USERAIO_DONE;
		return;
	}
#endif
	cbp = (aiocb_t *)reqp->req_aiocbp;
	cbp->aio_state = USERAIO_DONE;
}

/*
 * Sleep for 'ticks' clock ticks to give somebody else a chance to run,
 * hopefully to consume one of our queued signals.
 * The length of one tick is derived from the global hz.
 */
static void
_aio_delay(int ticks)
{
	int usec_per_tick = MICROSEC / hz;

	(void) usleep(ticks * usec_per_tick);
}

/*
 * Actually send the notifications.
 * We could block indefinitely here if the application
 * is not listening for the signal or port notifications.
 *
 * npp describes up to two notifications: one for the request itself
 * (np_signo or np_port) and one for its list-I/O head (np_lio_signo
 * or np_lio_port).  For each of the two, a non-zero signal number takes
 * precedence over a port.  A failing __sigqueue() is retried after a
 * delay that grows by one tick per attempt, up to ten ticks.
 */
static void
send_notification(notif_param_t *npp)
{
	int delay;

	if (npp->np_signo != 0) {
		for (delay = 0; __sigqueue(__pid, npp->np_signo,
		    npp->np_user, SI_ASYNCIO) == -1; ) {
			ASSERT(errno == EAGAIN);
			if (delay < 10)
				delay++;
			_aio_delay(delay);
		}
	} else if (npp->np_port >= 0) {
		(void) _port_dispatch(npp->np_port, 0, PORT_SOURCE_AIO,
		    npp->np_event, npp->np_object, npp->np_user);
	}

	if (npp->np_lio_signo != 0) {
		for (delay = 0; __sigqueue(__pid, npp->np_lio_signo,
		    npp->np_lio_user, SI_ASYNCIO) == -1; ) {
			ASSERT(errno == EAGAIN);
			if (delay < 10)
				delay++;
			_aio_delay(delay);
		}
	} else if (npp->np_lio_port >= 0) {
		(void) _port_dispatch(npp->np_lio_port, 0, PORT_SOURCE_AIO,
		    npp->np_lio_event, npp->np_lio_object, npp->np_lio_user);
	}
}

/*
 * Asynchronous notification worker.
 * Start routine of the notification thread; arg is its aio_worker_t.
 * Never returns: each queued request has its notification sent and
 * is then freed.
 */
void *
_aio_do_notify(void *arg)
{
	aio_worker_t *self = (aio_worker_t *)arg;
	aio_req_t *req;

	/*
	 * This isn't really necessary.  All signals are blocked.
	 */
	if (pthread_setspecific(_aio_key, self) != 0)
		_aiopanic("_aio_do_notify, pthread_setspecific()");

	/*
	 * Notifications are never cancelled.
	 * All signals remain blocked, forever.
	 */

	for (;;) {
		req = _aio_req_get(self);
		if (req == NULL) {
			/* nothing queued: sleep until more work arrives */
			_aio_idle(self);
			continue;
		}
		send_notification(&req->req_notify);
		_aio_req_free(req);
	}

	/* NOTREACHED */
	return (NULL);
}

/*
 * Do the completion semantics for a request that was either canceled
 * by _aio_cancel_req() or was completed by _aio_do_request().
 *
 *	reqp	the POSIX request being completed
 *	retval	return value to post, unless a result was posted already
 *	error	errno value to post, unless a result was posted already
 *
 * The request ends up either on the done queue (SIGEV_NONE), on the
 * notification worker's queue (a notification is due), or freed.
 */
static void
_aiodone(aio_req_t *reqp, ssize_t retval, int error)
{
	aio_result_t *resultp = reqp->req_resultp;
	int notify = 0;
	aio_lio_t *head;
	int sigev_none;
	int sigev_signal;
	int sigev_thread;
	int sigev_port;
	notif_param_t np;

	/*
	 * We call _aiodone() only for Posix I/O.
	 */
	ASSERT(POSIX_AIO(reqp));

	sigev_none = 0;
	sigev_signal = 0;
	sigev_thread = 0;
	sigev_port = 0;
	/* start out with "no signal, no port" for request and list head */
	np.np_signo = 0;
	np.np_port = -1;
	np.np_lio_signo = 0;
	np.np_lio_port = -1;

	switch (reqp->req_sigevent.sigev_notify) {
	case SIGEV_NONE:
		sigev_none = 1;
		break;
	case SIGEV_SIGNAL:
		sigev_signal = 1;
		break;
	case SIGEV_THREAD:
		sigev_thread = 1;
		break;
	case SIGEV_PORT:
		sigev_port = 1;
		break;
	default:
		_aiopanic("_aiodone: improper sigev_notify");
		break;
	}

	/*
	 * Figure out the notification parameters while holding __aio_mutex.
	 * Actually perform the notifications after dropping __aio_mutex.
	 * This allows us to sleep for a long time (if the notifications
	 * incur delays) without impeding other async I/O operations.
	 */

	sig_mutex_lock(&__aio_mutex);

	if (sigev_signal) {
		if ((np.np_signo = reqp->req_sigevent.sigev_signo) != 0)
			notify = 1;
		np.np_user = reqp->req_sigevent.sigev_value.sival_ptr;
	} else if (sigev_thread | sigev_port) {
		/* for thread/port notification sigev_signo holds the port */
		if ((np.np_port = reqp->req_sigevent.sigev_signo) >= 0)
			notify = 1;
		np.np_event = reqp->req_op;
		if (np.np_event == AIOFSYNC && reqp->req_largefile)
			np.np_event = AIOFSYNC64;
		np.np_object = (uintptr_t)reqp->req_aiocbp;
		np.np_user = reqp->req_sigevent.sigev_value.sival_ptr;
	}

	/* do not overwrite a result that was posted already */
	if (resultp->aio_errno == EINPROGRESS)
		_aio_set_result(reqp, retval, error);

	_aio_outstand_cnt--;

	/* detach the request from its list-I/O head, if it has one */
	head = reqp->req_head;
	reqp->req_head = NULL;

	if (sigev_none) {
		/* the done queue owns the request from here on */
		_aio_enq_doneq(reqp);
		reqp = NULL;
	} else {
		(void) _aio_hash_del(resultp);
		_aio_req_mark_done(reqp);
	}

	_aio_waitn_wakeup();

	/*
	 * __aio_waitn() sets AIO_WAIT_INPROGRESS and
	 * __aio_suspend() increments "_aio_kernel_suspend"
	 * when they are waiting in the kernel for completed I/Os.
	 *
	 * _kaio(AIONOTIFY) awakes the corresponding function
	 * in the kernel; then the corresponding __aio_waitn() or
	 * __aio_suspend() function could reap the recently
	 * completed I/Os (_aiodone()).
	 */
	if ((_aio_flags & AIO_WAIT_INPROGRESS) || _aio_kernel_suspend > 0)
		(void) _kaio(AIONOTIFY);

	sig_mutex_unlock(&__aio_mutex);

	if (head != NULL) {
		/*
		 * If all the lio requests have completed,
		 * prepare to notify the waiting thread.
		 */
		sig_mutex_lock(&head->lio_mutex);
		ASSERT(head->lio_refcnt == head->lio_nent);
		if (head->lio_refcnt == 1) {
			/* this was the last request of the list */
			int waiting = 0;
			if (head->lio_mode == LIO_WAIT) {
				if ((waiting = head->lio_waiting) != 0)
					(void) cond_signal(&head->lio_cond_cv);
			} else if (head->lio_port < 0) { /* none or signal */
				if ((np.np_lio_signo = head->lio_signo) != 0)
					notify = 1;
				np.np_lio_user = head->lio_sigval.sival_ptr;
			} else {			/* thread or port */
				notify = 1;
				np.np_lio_port = head->lio_port;
				np.np_lio_event = head->lio_event;
				np.np_lio_object =
				    (uintptr_t)head->lio_sigevent;
				np.np_lio_user = head->lio_sigval.sival_ptr;
			}
			head->lio_nent = head->lio_refcnt = 0;
			sig_mutex_unlock(&head->lio_mutex);
			/* the head is not freed here if a waiter exists */
			if (waiting == 0)
				_aio_lio_free(head);
		} else {
			head->lio_nent--;
			head->lio_refcnt--;
			sig_mutex_unlock(&head->lio_mutex);
		}
	}

	/*
	 * The request is completed; now perform the notifications.
	 */
	if (notify) {
		if (reqp != NULL) {
			/*
			 * We usually put the request on the notification
			 * queue because we don't want to block and delay
			 * other operations behind us in the work queue.
			 * Also we must never block on a cancel notification
			 * because we are being called from an application
			 * thread in this case and that could lead to deadlock
			 * if no other thread is receiving notifications.
			 */
			reqp->req_notify = np;
			reqp->req_op = AIONOTIFY;
			_aio_req_add(reqp, &__workers_no, AIONOTIFY);
			reqp = NULL;
		} else {
			/*
			 * We already put the request on the done queue,
			 * so we can't queue it to the notification queue.
			 * Just do the notification directly.
			 */
			send_notification(&np);
		}
	}

	/* neither on the done queue nor queued for notification */
	if (reqp != NULL)
		_aio_req_free(reqp);
}

/*
 * Delete fsync requests from list head until there is
 * only one left.  Return 0 when there is only one,
 * otherwise return a non-zero value.
 *
 * reqp must be the in-progress request of worker aiowp.  Whenever a
 * non-zero value is returned, reqp has been freed and aiowp->work_req
 * cleared, so the caller must not touch the request again.
 */
static int
_aio_fsync_del(aio_worker_t *aiowp, aio_req_t *reqp)
{
	aio_lio_t *head = reqp->req_head;
	int rval = 0;

	ASSERT(reqp == aiowp->work_req);
	sig_mutex_lock(&aiowp->work_qlock1);
	sig_mutex_lock(&head->lio_mutex);
	if (head->lio_refcnt > 1) {
		/* not the last one: drop this request and its reference */
		head->lio_refcnt--;
		head->lio_nent--;
		aiowp->work_req = NULL;
		sig_mutex_unlock(&head->lio_mutex);
		sig_mutex_unlock(&aiowp->work_qlock1);
		sig_mutex_lock(&__aio_mutex);
		_aio_outstand_cnt--;
		_aio_waitn_wakeup();
		sig_mutex_unlock(&__aio_mutex);
		_aio_req_free(reqp);
		return (1);
	}
	/* last request of this list head */
	ASSERT(head->lio_nent == 1 && head->lio_refcnt == 1);
	reqp->req_head = NULL;
	/* pass a cancellation recorded on the head on to the request */
	if (head->lio_canned)
		reqp->req_state = AIO_REQ_CANCELED;
	if (head->lio_mode == LIO_DESTROY) {
		aiowp->work_req = NULL;
		rval = 1;
	}
	sig_mutex_unlock(&head->lio_mutex);
	sig_mutex_unlock(&aiowp->work_qlock1);
	/* this was the only reference left; release the head */
	head->lio_refcnt--;
	head->lio_nent--;
	_aio_lio_free(head);
	if (rval != 0)
		_aio_req_free(reqp);
	return (rval);
}

/*
 * Put a worker to sleep until more work shows up.
 *
 * The queue count is re-checked under the queue lock; if requests
 * have been queued in the meantime the worker returns immediately.
 */
void
_aio_idle(aio_worker_t *aiowp)
{
	sig_mutex_lock(&aiowp->work_qlock1);
	if (aiowp->work_count1 != 0) {
		/* something is queued; do not go to sleep */
		sig_mutex_unlock(&aiowp->work_qlock1);
		return;
	}

	ASSERT(aiowp->work_minload1 == 0);
	aiowp->work_idleflg = 1;
	/*
	 * No cancellation handler is needed around the wait:
	 * aio worker threads are never cancelled via pthread_cancel().
	 *
	 * _aio_req_add() normally clears the idle flag before it wakes
	 * the worker.  If the wait fails instead (EINTR), nobody has
	 * cleared it, so do that here.
	 */
	if (sig_cond_wait(&aiowp->work_idle_cv, &aiowp->work_qlock1) != 0)
		aiowp->work_idleflg = 0;
	sig_mutex_unlock(&aiowp->work_qlock1);
}

/*
 * Move the worker's completed request (work_prev1) from its private
 * work queue to the end of the global done queue.
 *
 * Afterwards, if _aiowait_flag is set, the waiter is notified through
 * _kaio(AIONOTIFY).  Otherwise the process is sent SIGIO, but only
 * when _sigio_enabled is set.
 */
static void
_aio_work_done(aio_worker_t *aiowp)
{
	aio_req_t *donep;
	int waiter;

	/* Detach the completed request and reset the worker's done state. */
	sig_mutex_lock(&aiowp->work_qlock1);
	donep = aiowp->work_prev1;
	donep->req_next = NULL;
	aiowp->work_prev1 = NULL;
	aiowp->work_done1 = 0;
	if ((aiowp->work_tail1 = aiowp->work_next1) == NULL)
		aiowp->work_head1 = NULL;
	sig_mutex_unlock(&aiowp->work_qlock1);

	/* Account for the request and append it to the done queue. */
	sig_mutex_lock(&__aio_mutex);
	_aio_donecnt++;
	_aio_outstand_cnt--;
	_aio_req_done_cnt--;
	ASSERT(_aio_donecnt > 0 &&
	    _aio_outstand_cnt >= 0 &&
	    _aio_req_done_cnt >= 0);
	ASSERT(donep != NULL);

	if (_aio_done_tail != NULL) {
		_aio_done_head->req_next = donep;
		_aio_done_head = donep;
	} else {
		_aio_done_head = _aio_done_tail = donep;
	}
	waiter = (_aiowait_flag != 0);
	sig_mutex_unlock(&__aio_mutex);

	/* Notifications are issued without holding __aio_mutex. */
	if (waiter)
		(void) _kaio(AIONOTIFY);
	else if (_sigio_enabled)
		(void) kill(__pid, SIGIO);
}

/*
 * Take the oldest request off the global done queue.
 *
 * The caller must hold __aio_mutex.  The request is removed from the
 * hash table and freed, and the address of its aio_result_t is returned.
 *
 * When the done queue is empty the return value is
 * (aio_result_t *)-1 if no requests are outstanding at all,
 * and NULL if some are still outstanding.
 */
aio_result_t *
_aio_req_done(void)
{
	aio_req_t *donep;
	aio_result_t *resultp;

	ASSERT(MUTEX_HELD(&__aio_mutex));

	donep = _aio_done_tail;
	if (donep == NULL) {
		/* nothing completed; tell the caller whether to keep waiting */
		if (_aio_outstand_cnt == 0)
			return ((aio_result_t *)-1);
		return (NULL);
	}

	_aio_done_tail = donep->req_next;
	if (_aio_done_tail == NULL)
		_aio_done_head = NULL;
	ASSERT(_aio_donecnt > 0);
	_aio_donecnt--;

	resultp = donep->req_resultp;
	(void) _aio_hash_del(resultp);
	ASSERT(donep->req_state == AIO_REQ_DONE);
	_aio_req_free(donep);
	return (resultp);
}

/*
 * Publish a request's return value and error to the application.
 *
 * The field the application watches for completion is written last,
 * after a producer barrier, so the other field is already in place
 * when the change becomes visible:
 *
 *	Solaris interfaces watch aio_return change from AIO_INPROGRESS,
 *	so aio_errno is stored first and aio_return second.
 *
 *	POSIX interfaces watch aio_errno change from EINPROGRESS,
 *	so aio_return is stored first and aio_errno second.
 */
void
_aio_set_result(aio_req_t *reqp, ssize_t retval, int error)
{
	aio_result_t *resultp = reqp->req_resultp;

	if (!POSIX_AIO(reqp)) {
		/* Solaris ordering */
		resultp->aio_errno = error;
		membar_producer();
		resultp->aio_return = retval;
		return;
	}

	/* POSIX ordering */
	resultp->aio_return = retval;
	membar_producer();
	resultp->aio_errno = error;
}

/*
 * Add an AIO request onto the next work queue.
 * A circular list of workers is used to choose the next worker.
 *
 *	reqp		the request to queue
 *	nextworker	address of the pointer to the worker at which the
 *			search starts; read and updated under __aio_mutex
 *	mode		operation code; selects the queueing policy
 *
 * Read/write modes are load balanced: an idle worker is preferred,
 * then any worker whose queue lock can be taken without blocking.
 * The request is handed to _aio_create_worker() instead of being
 * queued here when no lock could be taken, or when the chosen worker
 * already has _minworkload requests, provided fewer than _max_workers
 * workers exist.
 *
 * AIOFSYNC and AIONOTIFY requests always go to the worker that
 * *nextworker currently points at, and *nextworker is not advanced.
 *
 * _aio_outstand_cnt is incremented for every mode except AIONOTIFY.
 * SIGIO is deferred for the duration of the call.
 */
void
_aio_req_add(aio_req_t *reqp, aio_worker_t **nextworker, int mode)
{
	aio_worker_t *aiowp;
	aio_worker_t *first;
	int load_bal_flg = 1;	/* advance *nextworker after queueing */
	int found;

	ASSERT(reqp->req_state != AIO_REQ_DONEQ);
	reqp->req_next = NULL;
	/*
	 * Try to acquire the next worker's work queue.  If it is locked,
	 * then search the list of workers until a queue is found unlocked,
	 * or until the list is completely traversed at which point another
	 * worker will be created.
	 */
	_sigoff();		/* defer SIGIO */
	sig_mutex_lock(&__aio_mutex);
	first = aiowp = *nextworker;
	if (mode != AIONOTIFY)
		_aio_outstand_cnt++;
	sig_mutex_unlock(&__aio_mutex);

	switch (mode) {
	case AIOREAD:
	case AIOWRITE:
	case AIOAREAD:
	case AIOAWRITE:
#if !defined(_LP64)
	case AIOAREAD64:
	case AIOAWRITE64:
#endif
		/*
		 * try to find an idle worker; on success we leave the
		 * loop still holding that worker's queue lock
		 */
		found = 0;
		do {
			if (sig_mutex_trylock(&aiowp->work_qlock1) == 0) {
				if (aiowp->work_idleflg) {
					found = 1;
					break;
				}
				sig_mutex_unlock(&aiowp->work_qlock1);
			}
		} while ((aiowp = aiowp->work_forw) != first);

		if (found) {
			aiowp->work_minload1++;
			break;
		}

		/*
		 * try to acquire some worker's queue lock; the first pass
		 * went all the way around, so aiowp is back at 'first'
		 */
		do {
			if (sig_mutex_trylock(&aiowp->work_qlock1) == 0) {
				found = 1;
				break;
			}
		} while ((aiowp = aiowp->work_forw) != first);

		/*
		 * Create more workers when the workers appear overloaded.
		 * Either all the workers are busy draining their queues
		 * or no worker's queue lock could be acquired.
		 */
		if (!found) {
			if (_aio_worker_cnt < _max_workers) {
				/* the new worker is given reqp directly */
				if (_aio_create_worker(reqp, mode))
					_aiopanic("_aio_req_add: add worker");
				_sigon();	/* reenable SIGIO */
				return;
			}

			/*
			 * No worker available and we have created
			 * _max_workers, keep going through the
			 * list slowly until we get a lock
			 */
			while (sig_mutex_trylock(&aiowp->work_qlock1) != 0) {
				/*
				 * give someone else a chance
				 */
				_aio_delay(1);
				aiowp = aiowp->work_forw;
			}
		}

		ASSERT(MUTEX_HELD(&aiowp->work_qlock1));
		/*
		 * The chosen worker is busy and already has enough queued
		 * read/write requests: release it, step *nextworker past it
		 * and start a new worker for this request instead.
		 *
		 * NOTE(review): the panic string below lacks the leading
		 * underscore used by the other messages in this function.
		 */
		if (_aio_worker_cnt < _max_workers &&
		    aiowp->work_minload1 >= _minworkload) {
			sig_mutex_unlock(&aiowp->work_qlock1);
			sig_mutex_lock(&__aio_mutex);
			*nextworker = aiowp->work_forw;
			sig_mutex_unlock(&__aio_mutex);
			if (_aio_create_worker(reqp, mode))
				_aiopanic("aio_req_add: add worker");
			_sigon();	/* reenable SIGIO */
			return;
		}
		aiowp->work_minload1++;
		break;
	case AIOFSYNC:
	case AIONOTIFY:
		/* no load balancing: block on the current worker's lock */
		load_bal_flg = 0;
		sig_mutex_lock(&aiowp->work_qlock1);
		break;
	default:
		_aiopanic("_aio_req_add: invalid mode");
		break;
	}
	/*
	 * Put request onto worker's work queue.
	 * New requests are linked in at work_head1; work_next1 is the
	 * next request the worker will fetch and is set here when there
	 * is none pending.
	 */
	if (aiowp->work_tail1 == NULL) {
		ASSERT(aiowp->work_count1 == 0);
		aiowp->work_tail1 = reqp;
		aiowp->work_next1 = reqp;
	} else {
		aiowp->work_head1->req_next = reqp;
		if (aiowp->work_next1 == NULL)
			aiowp->work_next1 = reqp;
	}
	reqp->req_state = AIO_REQ_QUEUED;
	reqp->req_worker = aiowp;
	aiowp->work_head1 = reqp;
	/*
	 * Awaken worker if it is not currently active.
	 * The idle flag is cleared here, before the signal, so the
	 * worker does not have to clear it after a normal wakeup.
	 */
	if (aiowp->work_count1++ == 0 && aiowp->work_idleflg) {
		aiowp->work_idleflg = 0;
		(void) cond_signal(&aiowp->work_idle_cv);
	}
	sig_mutex_unlock(&aiowp->work_qlock1);

	if (load_bal_flg) {
		/* start the next search at the following worker */
		sig_mutex_lock(&__aio_mutex);
		*nextworker = aiowp->work_forw;
		sig_mutex_unlock(&__aio_mutex);
	}
	_sigon();	/* reenable SIGIO */
}

/*
 * Fetch the next pending request from a worker's queue and mark it
 * AIO_REQ_INPROGRESS.  Returns NULL when nothing is pending.
 * The result (request or NULL) is also recorded in aiowp->work_req.
 */
aio_req_t *
_aio_req_get(aio_worker_t *aiowp)
{
	aio_req_t *reqp;
	aio_req_t *succ;

	sig_mutex_lock(&aiowp->work_qlock1);
	reqp = aiowp->work_next1;
	if (reqp != NULL) {
		succ = reqp->req_next;

		/*
		 * The queue is a singly linked list with a "previous"
		 * pointer.  A non-POSIX request stays on the list (so it
		 * can later be moved to the done queue) and becomes the
		 * new previous element.  A POSIX request is unlinked,
		 * either from behind the previous element or, if there is
		 * none, from the front of the list.
		 */
		if (!POSIX_AIO(reqp)) {
			aiowp->work_prev1 = reqp;
			ASSERT(aiowp->work_done1 >= 0);
			aiowp->work_done1++;
		} else if (aiowp->work_prev1 != NULL) {
			aiowp->work_prev1->req_next = succ;
			if (aiowp->work_head1 == reqp)
				aiowp->work_head1 = succ;
		} else {
			aiowp->work_tail1 = succ;
			if (succ == NULL)
				aiowp->work_head1 = NULL;
		}

		ASSERT(reqp != succ);
		aiowp->work_next1 = succ;
		ASSERT(aiowp->work_count1 >= 1);
		aiowp->work_count1--;

		/* only read/write requests count toward the load figure */
		switch (reqp->req_op) {
		case AIOREAD:
		case AIOWRITE:
		case AIOAREAD:
		case AIOAWRITE:
#if !defined(_LP64)
		case AIOAREAD64:
		case AIOAWRITE64:
#endif
			ASSERT(aiowp->work_minload1 > 0);
			aiowp->work_minload1--;
			break;
		default:
			break;
		}
		reqp->req_state = AIO_REQ_INPROGRESS;
	}
	aiowp->work_req = reqp;
	ASSERT(reqp != NULL || aiowp->work_count1 == 0);
	sig_mutex_unlock(&aiowp->work_qlock1);
	return (reqp);
}

/*
 * Unlink a request from a worker's queue.
 *
 * The caller holds aiowp->work_qlock1.  ostate is the state the request
 * was in: AIO_REQ_QUEUED or AIO_REQ_INPROGRESS.  A POSIX request that
 * is not in the queued state is left alone.
 */
static void
_aio_req_del(aio_worker_t *aiowp, aio_req_t *reqp, int ostate)
{
	aio_req_t **linkp;	/* link that points at the current element */
	aio_req_t *predp;	/* element before the current one */
	aio_req_t *curp;
	aio_req_t *replp;

	ASSERT(aiowp != NULL);
	ASSERT(MUTEX_HELD(&aiowp->work_qlock1));
	if (POSIX_AIO(reqp) && ostate != AIO_REQ_QUEUED)
		return;

	linkp = &aiowp->work_tail1;
	predp = aiowp->work_tail1;
	ASSERT(ostate == AIO_REQ_QUEUED || ostate == AIO_REQ_INPROGRESS);

	for (; (curp = *linkp) != NULL;
	    linkp = &curp->req_next, predp = curp) {
		if (curp != reqp)
			continue;

		*linkp = curp->req_next;
		if (aiowp->work_next1 == curp)
			aiowp->work_next1 = curp->req_next;

		/*
		 * head/prev pointers that referenced the removed element
		 * move on to its successor, except when it was the last
		 * element and completed requests are still on the list;
		 * then they fall back to its predecessor.
		 */
		if (curp->req_next != NULL || aiowp->work_done1 == 0)
			replp = curp->req_next;
		else
			replp = predp;
		if (aiowp->work_head1 == curp)
			aiowp->work_head1 = replp;
		if (aiowp->work_prev1 == curp)
			aiowp->work_prev1 = replp;

		if (ostate != AIO_REQ_QUEUED) {
			ASSERT(ostate == AIO_REQ_INPROGRESS &&
			    !POSIX_AIO(reqp));
			aiowp->work_done1--;
		} else {
			ASSERT(aiowp->work_count1 >= 1);
			aiowp->work_count1--;
			ASSERT(aiowp->work_minload1 >= 1);
			aiowp->work_minload1--;
		}
		return;
	}
	/* NOTREACHED */
}

/*
 * Append a request to the circular, doubly linked done queue
 * (_aio_doneq points at the first element) and mark it AIO_REQ_DONEQ.
 */
static void
_aio_enq_doneq(aio_req_t *reqp)
{
	aio_req_t *first = _aio_doneq;

	if (first != NULL) {
		/* insert just before the first element, i.e. at the end */
		aio_req_t *last = first->req_prev;

		reqp->req_next = first;
		reqp->req_prev = last;
		last->req_next = reqp;
		first->req_prev = reqp;
	} else {
		/* empty queue: the request is linked to itself */
		_aio_doneq = reqp;
		reqp->req_next = reqp->req_prev = reqp;
	}
	reqp->req_state = AIO_REQ_DONEQ;
	_aio_doneq_cnt++;
}

/*
 * Remove a request from the done queue; caller owns the _aio_mutex.
 *
 * With reqp == NULL the first request on the queue is removed.
 * Otherwise reqp itself is removed, provided it is in the
 * AIO_REQ_DONEQ state.
 *
 * Returns the removed request, now self-linked and in the AIO_REQ_DONE
 * state, or NULL if the queue is empty or reqp is not on it.
 */
aio_req_t *
_aio_req_remove(aio_req_t *reqp)
{
	aio_req_t *succ;

	if (reqp == NULL) {
		/* no particular request asked for: take the first one */
		if ((reqp = _aio_doneq) == NULL)
			return (NULL);
		if (reqp->req_next == reqp) {
			/* only one request on queue */
			_aio_doneq = NULL;
		} else {
			succ = reqp->req_next;
			_aio_doneq = succ;
			reqp->req_prev->req_next = succ;
			succ->req_prev = reqp->req_prev;
		}
	} else {
		if (reqp->req_state != AIO_REQ_DONEQ)
			return (NULL);

		/* request in done queue */
		if (_aio_doneq == reqp)
			_aio_doneq = reqp->req_next;
		if (_aio_doneq == reqp) {
			/* only one request on queue */
			_aio_doneq = NULL;
		} else {
			succ = reqp->req_next;
			reqp->req_prev->req_next = succ;
			succ->req_prev = reqp->req_prev;
		}
	}

	_aio_doneq_cnt--;
	reqp->req_next = reqp->req_prev = reqp;
	reqp->req_state = AIO_REQ_DONE;
	return (reqp);
}

/*
 * An AIO request is identified by an aio_result_t pointer, which the
 * library maps to its internal request structure through a hash table.
 *
 * Append reqp to the hash chain selected by resultp.
 * Returns 0 on success, or -1 (leaving the table unchanged) when a
 * request with the same aio_result_t pointer is already present.
 */
static int
_aio_hash_insert(aio_result_t *resultp, aio_req_t *reqp)
{
	aio_hash_t *bucket = &_aio_hash[AIOHASH(resultp)];
	aio_req_t **linkp;
	aio_req_t *cur;
	int rval = 0;

	sig_mutex_lock(&bucket->hash_lock);
	for (linkp = &bucket->hash_ptr; (cur = *linkp) != NULL;
	    linkp = &cur->req_link) {
		if (cur->req_resultp == resultp) {
			rval = -1;	/* duplicate */
			break;
		}
	}
	if (rval == 0) {
		/* linkp now addresses the terminating link of the chain */
		*linkp = reqp;
		ASSERT(reqp->req_link == NULL);
	}
	sig_mutex_unlock(&bucket->hash_lock);
	return (rval);
}

/*
 * Remove the request identified by resultp from the hash table.
 * Returns the unlinked request, or NULL if there is no hash table
 * or no matching entry.
 */
aio_req_t *
_aio_hash_del(aio_result_t *resultp)
{
	aio_hash_t *bucket;
	aio_req_t **linkp;
	aio_req_t *cur;

	if (_aio_hash == NULL)
		return (NULL);

	bucket = &_aio_hash[AIOHASH(resultp)];
	sig_mutex_lock(&bucket->hash_lock);
	linkp = &bucket->hash_ptr;
	while ((cur = *linkp) != NULL && cur->req_resultp != resultp)
		linkp = &cur->req_link;
	if (cur != NULL) {
		/* splice the entry out of its chain */
		*linkp = cur->req_link;
		cur->req_link = NULL;
	}
	sig_mutex_unlock(&bucket->hash_lock);
	return (cur);
}

/*
 * Look up the request identified by resultp in the hash table.
 * Returns the request, or NULL if there is no hash table or no
 * matching entry.  The entry stays in the table; the chain lock
 * is released before returning.
 */
aio_req_t *
_aio_hash_find(aio_result_t *resultp)
{
	aio_hash_t *bucket;
	aio_req_t *cur;

	if (_aio_hash == NULL)
		return (NULL);

	bucket = &_aio_hash[AIOHASH(resultp)];
	sig_mutex_lock(&bucket->hash_lock);
	for (cur = bucket->hash_ptr; cur != NULL; cur = cur->req_link) {
		if (cur->req_resultp == resultp)
			break;
	}
	sig_mutex_unlock(&bucket->hash_lock);
	return (cur);
}

/*
 * AIO interface for POSIX
 *
 * Submit the read or write described by aiocbp.
 *
 *	lio_head	list I/O head to attach to the request (stored as
 *			given)
 *	nextworker	worker list position handed to _aio_req_add()
 *	mode		operation code, passed to _kaio() and _aio_req_add()
 *	flg		AIO_KAIO: try kernel aio first
 *			AIO_NO_DUPS: enter the request in the hash table
 *
 * Returns 0 when the request was accepted by the kernel or queued to a
 * worker, and -1 on failure.  errno is EINVAL for a NULL aiocbp and
 * EAGAIN when no request structure could be allocated; after a kernel
 * aio failure it is whatever _kaio() left behind.
 */
int
_aio_rw(aiocb_t *aiocbp, aio_lio_t *lio_head, aio_worker_t **nextworker,
    int mode, int flg)
{
	aio_req_t *reqp;
	aio_args_t *ap;
	port_notify_t *pn;
	int err;

	if (aiocbp == NULL) {
		errno = EINVAL;
		return (-1);
	}

	/* initialize kaio */
	if (_kaio_ok == 0)
		_kaio_init();

	aiocbp->aio_state = NOCHECK;

	/*
	 * AIO_KAIO is left clear when we are called because a list I/O
	 * kaio() already failed; the system call is not repeated then.
	 */
	if (flg & AIO_KAIO) {
		/*
		 * Try kernel aio first.  Only ENOTSUP and EBADFD let us
		 * fall back to the thread implementation; any other
		 * error is reported to the caller.
		 */
		if (_kaio_ok > 0 && KAIO_SUPPORTED(aiocbp->aio_fildes)) {
			aiocbp->aio_resultp.aio_errno = EINPROGRESS;
			aiocbp->aio_state = CHECK;
			if ((int)_kaio(mode, aiocbp) == 0)
				return (0);
			err = errno;
			if (err == EBADFD) {
				/* remember not to try kaio on this fd */
				SET_KAIO_NOT_SUPPORTED(aiocbp->aio_fildes);
			} else if (err != ENOTSUP) {
				aiocbp->aio_resultp.aio_errno = err;
				aiocbp->aio_resultp.aio_return = -1;
				aiocbp->aio_state = NOCHECK;
				return (-1);
			}
		}
	}

	/* From here on the request is served by the worker threads. */
	aiocbp->aio_resultp.aio_errno = EINPROGRESS;
	aiocbp->aio_state = USERAIO;

	if (!__uaio_ok && __uaio_init() == -1)
		return (-1);

	reqp = _aio_req_alloc();
	if (reqp == NULL) {
		errno = EAGAIN;
		return (-1);
	}

	/*
	 * If an LIO request, add the list head to the aio request
	 */
	reqp->req_head = lio_head;
	reqp->req_type = AIO_POSIX_REQ;
	reqp->req_op = mode;
	reqp->req_largefile = 0;

	/* Capture the notification the caller asked for. */
	switch (aiocbp->aio_sigevent.sigev_notify) {
	case SIGEV_NONE:
		reqp->req_sigevent.sigev_notify = SIGEV_NONE;
		break;
	case SIGEV_SIGNAL:
		reqp->req_sigevent.sigev_notify = SIGEV_SIGNAL;
		reqp->req_sigevent.sigev_signo =
		    aiocbp->aio_sigevent.sigev_signo;
		reqp->req_sigevent.sigev_value.sival_ptr =
		    aiocbp->aio_sigevent.sigev_value.sival_ptr;
		break;
	case SIGEV_PORT:
		/*
		 * The request's sigevent structure is reused to hold the
		 * port number (in sigev_signo) and the user value, both
		 * taken from the caller's port_notify_t.
		 */
		pn = aiocbp->aio_sigevent.sigev_value.sival_ptr;
		reqp->req_sigevent.sigev_notify = SIGEV_PORT;
		reqp->req_sigevent.sigev_signo = pn->portnfy_port;
		reqp->req_sigevent.sigev_value.sival_ptr = pn->portnfy_user;
		break;
	case SIGEV_THREAD:
		/*
		 * Here the caller's sigevent structure is expected to
		 * carry the port number and user value already, in the
		 * same fields used for SIGEV_PORT above.
		 */
		reqp->req_sigevent.sigev_notify = SIGEV_THREAD;
		reqp->req_sigevent.sigev_signo =
		    aiocbp->aio_sigevent.sigev_signo;
		reqp->req_sigevent.sigev_value.sival_ptr =
		    aiocbp->aio_sigevent.sigev_value.sival_ptr;
		break;
	default:
		break;
	}

	reqp->req_resultp = &aiocbp->aio_resultp;
	reqp->req_aiocbp = aiocbp;

	ap = &reqp->req_args;
	ap->fd = aiocbp->aio_fildes;
	ap->buf = (caddr_t)aiocbp->aio_buf;
	ap->bufsz = aiocbp->aio_nbytes;
	ap->offset = aiocbp->aio_offset;

	if ((flg & AIO_NO_DUPS) != 0 &&
	    _aio_hash_insert(&aiocbp->aio_resultp, reqp) != 0) {
		_aiopanic("_aio_rw(): request already in hash table");
		_aio_req_free(reqp);
		errno = EINVAL;
		return (-1);
	}

	_aio_req_add(reqp, nextworker, mode);
	return (0);
}

#if !defined(_LP64)
/*
 * 64-bit AIO interface for POSIX
 *
 * Large-file counterpart of _aio_rw(): submits the read or write
 * described by an aiocb64_t and marks the request as a large-file
 * request.
 *
 *	lio_head	list I/O head to attach to the request (stored as
 *			given)
 *	nextworker	worker list position handed to _aio_req_add()
 *	mode		operation code, passed to _kaio() and _aio_req_add()
 *	flg		AIO_KAIO: try kernel aio first
 *			AIO_NO_DUPS: enter the request in the hash table
 *
 * Returns 0 when the request was accepted by the kernel or queued to a
 * worker, and -1 on failure.  errno is EINVAL for a NULL aiocbp and
 * EAGAIN when no request structure could be allocated; after a kernel
 * aio failure it is whatever _kaio() left behind.
 */
int
_aio_rw64(aiocb64_t *aiocbp, aio_lio_t *lio_head, aio_worker_t **nextworker,
    int mode, int flg)
{
	aio_req_t *reqp;
	aio_args_t *ap;
	port_notify_t *pn;
	int err;

	if (aiocbp == NULL) {
		errno = EINVAL;
		return (-1);
	}

	/* initialize kaio */
	if (_kaio_ok == 0)
		_kaio_init();

	aiocbp->aio_state = NOCHECK;

	/*
	 * AIO_KAIO is left clear when we are called because a list I/O
	 * kaio() already failed; the system call is not repeated then.
	 */
	if (flg & AIO_KAIO) {
		/*
		 * Try kernel aio first.  Only ENOTSUP and EBADFD let us
		 * fall back to the thread implementation; any other
		 * error is reported to the caller.
		 */
		if (_kaio_ok > 0 && KAIO_SUPPORTED(aiocbp->aio_fildes)) {
			aiocbp->aio_resultp.aio_errno = EINPROGRESS;
			aiocbp->aio_state = CHECK;
			if ((int)_kaio(mode, aiocbp) == 0)
				return (0);
			err = errno;
			if (err == EBADFD) {
				/* remember not to try kaio on this fd */
				SET_KAIO_NOT_SUPPORTED(aiocbp->aio_fildes);
			} else if (err != ENOTSUP) {
				aiocbp->aio_resultp.aio_errno = err;
				aiocbp->aio_resultp.aio_return = -1;
				aiocbp->aio_state = NOCHECK;
				return (-1);
			}
		}
	}

	/* From here on the request is served by the worker threads. */
	aiocbp->aio_resultp.aio_errno = EINPROGRESS;
	aiocbp->aio_state = USERAIO;

	if (!__uaio_ok && __uaio_init() == -1)
		return (-1);

	reqp = _aio_req_alloc();
	if (reqp == NULL) {
		errno = EAGAIN;
		return (-1);
	}

	/*
	 * If an LIO request, add the list head to the aio request
	 */
	reqp->req_head = lio_head;
	reqp->req_type = AIO_POSIX_REQ;
	reqp->req_op = mode;
	reqp->req_largefile = 1;

	/* Capture the notification the caller asked for. */
	switch (aiocbp->aio_sigevent.sigev_notify) {
	case SIGEV_NONE:
		reqp->req_sigevent.sigev_notify = SIGEV_NONE;
		break;
	case SIGEV_SIGNAL:
		reqp->req_sigevent.sigev_notify = SIGEV_SIGNAL;
		reqp->req_sigevent.sigev_signo =
		    aiocbp->aio_sigevent.sigev_signo;
		reqp->req_sigevent.sigev_value.sival_ptr =
		    aiocbp->aio_sigevent.sigev_value.sival_ptr;
		break;
	case SIGEV_PORT:
		/*
		 * The request's sigevent structure is reused to hold the
		 * port number (in sigev_signo) and the user value, both
		 * taken from the caller's port_notify_t.
		 */
		pn = aiocbp->aio_sigevent.sigev_value.sival_ptr;
		reqp->req_sigevent.sigev_notify = SIGEV_PORT;
		reqp->req_sigevent.sigev_signo = pn->portnfy_port;
		reqp->req_sigevent.sigev_value.sival_ptr = pn->portnfy_user;
		break;
	case SIGEV_THREAD:
		/*
		 * The caller's signo and value fields are copied
		 * unchanged, as for SIGEV_SIGNAL.
		 */
		reqp->req_sigevent.sigev_notify = SIGEV_THREAD;
		reqp->req_sigevent.sigev_signo =
		    aiocbp->aio_sigevent.sigev_signo;
		reqp->req_sigevent.sigev_value.sival_ptr =
		    aiocbp->aio_sigevent.sigev_value.sival_ptr;
		break;
	default:
		break;
	}

	reqp->req_resultp = &aiocbp->aio_resultp;
	reqp->req_aiocbp = aiocbp;

	ap = &reqp->req_args;
	ap->fd = aiocbp->aio_fildes;
	ap->buf = (caddr_t)aiocbp->aio_buf;
	ap->bufsz = aiocbp->aio_nbytes;
	ap->offset = aiocbp->aio_offset;

	if ((flg & AIO_NO_DUPS) != 0 &&
	    _aio_hash_insert(&aiocbp->aio_resultp, reqp) != 0) {
		_aiopanic("_aio_rw64(): request already in hash table");
		_aio_req_free(reqp);
		errno = EINVAL;
		return (-1);
	}

	_aio_req_add(reqp, nextworker, mode);
	return (0);
}
#endif	/* !defined(_LP64) */