17c478bd9Sstevel@tonic-gate /* 27c478bd9Sstevel@tonic-gate * CDDL HEADER START 37c478bd9Sstevel@tonic-gate * 47c478bd9Sstevel@tonic-gate * The contents of this file are subject to the terms of the 534709573Sraf * Common Development and Distribution License (the "License"). 634709573Sraf * You may not use this file except in compliance with the License. 77c478bd9Sstevel@tonic-gate * 87c478bd9Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 97c478bd9Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing. 107c478bd9Sstevel@tonic-gate * See the License for the specific language governing permissions 117c478bd9Sstevel@tonic-gate * and limitations under the License. 127c478bd9Sstevel@tonic-gate * 137c478bd9Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each 147c478bd9Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 157c478bd9Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the 167c478bd9Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying 177c478bd9Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner] 187c478bd9Sstevel@tonic-gate * 197c478bd9Sstevel@tonic-gate * CDDL HEADER END 207c478bd9Sstevel@tonic-gate */ 2134709573Sraf 227c478bd9Sstevel@tonic-gate /* 23*34b3058fSpraks * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 247c478bd9Sstevel@tonic-gate * Use is subject to license terms. 
257c478bd9Sstevel@tonic-gate */ 267c478bd9Sstevel@tonic-gate 277c478bd9Sstevel@tonic-gate #pragma ident "%Z%%M% %I% %E% SMI" 287c478bd9Sstevel@tonic-gate 29f841f6adSraf #include "synonyms.h" 30f841f6adSraf #include "thr_uberdata.h" 31f841f6adSraf #include "asyncio.h" 3234709573Sraf #include <atomic.h> 337c478bd9Sstevel@tonic-gate #include <sys/param.h> 347c478bd9Sstevel@tonic-gate #include <sys/file.h> 357c478bd9Sstevel@tonic-gate #include <sys/port.h> 367c478bd9Sstevel@tonic-gate 377c478bd9Sstevel@tonic-gate static int _aio_hash_insert(aio_result_t *, aio_req_t *); 387c478bd9Sstevel@tonic-gate static aio_req_t *_aio_req_get(aio_worker_t *); 397c478bd9Sstevel@tonic-gate static void _aio_req_add(aio_req_t *, aio_worker_t **, int); 407c478bd9Sstevel@tonic-gate static void _aio_req_del(aio_worker_t *, aio_req_t *, int); 417c478bd9Sstevel@tonic-gate static void _aio_work_done(aio_worker_t *); 4234709573Sraf static void _aio_enq_doneq(aio_req_t *); 437c478bd9Sstevel@tonic-gate 4434709573Sraf extern void _aio_lio_free(aio_lio_t *); 457c478bd9Sstevel@tonic-gate 4634709573Sraf extern int __fdsync(int, int); 477c478bd9Sstevel@tonic-gate extern int _port_dispatch(int, int, int, int, uintptr_t, void *); 487c478bd9Sstevel@tonic-gate 49c2575b5eSraf static int _aio_fsync_del(aio_worker_t *, aio_req_t *); 5034709573Sraf static void _aiodone(aio_req_t *, ssize_t, int); 517c478bd9Sstevel@tonic-gate static void _aio_cancel_work(aio_worker_t *, int, int *, int *); 5234709573Sraf static void _aio_finish_request(aio_worker_t *, ssize_t, int); 537c478bd9Sstevel@tonic-gate 547c478bd9Sstevel@tonic-gate /* 557c478bd9Sstevel@tonic-gate * switch for kernel async I/O 567c478bd9Sstevel@tonic-gate */ 5734709573Sraf int _kaio_ok = 0; /* 0 = disabled, 1 = on, -1 = error */ 587c478bd9Sstevel@tonic-gate 597c478bd9Sstevel@tonic-gate /* 607c478bd9Sstevel@tonic-gate * Key for thread-specific data 617c478bd9Sstevel@tonic-gate */ 6234709573Sraf pthread_key_t _aio_key; 637c478bd9Sstevel@tonic-gate 
647c478bd9Sstevel@tonic-gate /* 6534709573Sraf * Array for determining whether or not a file supports kaio. 6634709573Sraf * Initialized in _kaio_init(). 677c478bd9Sstevel@tonic-gate */ 6834709573Sraf uint32_t *_kaio_supported = NULL; 697c478bd9Sstevel@tonic-gate 707c478bd9Sstevel@tonic-gate /* 7134709573Sraf * workers for read/write requests 7234709573Sraf * (__aio_mutex lock protects circular linked list of workers) 737c478bd9Sstevel@tonic-gate */ 7434709573Sraf aio_worker_t *__workers_rw; /* circular list of AIO workers */ 7534709573Sraf aio_worker_t *__nextworker_rw; /* next worker in list of workers */ 7634709573Sraf int __rw_workerscnt; /* number of read/write workers */ 777c478bd9Sstevel@tonic-gate 787c478bd9Sstevel@tonic-gate /* 7934709573Sraf * worker for notification requests. 807c478bd9Sstevel@tonic-gate */ 8134709573Sraf aio_worker_t *__workers_no; /* circular list of AIO workers */ 8234709573Sraf aio_worker_t *__nextworker_no; /* next worker in list of workers */ 8334709573Sraf int __no_workerscnt; /* number of write workers */ 847c478bd9Sstevel@tonic-gate 8534709573Sraf aio_req_t *_aio_done_tail; /* list of done requests */ 8634709573Sraf aio_req_t *_aio_done_head; 877c478bd9Sstevel@tonic-gate 887c478bd9Sstevel@tonic-gate mutex_t __aio_initlock = DEFAULTMUTEX; /* makes aio initialization atomic */ 89f841f6adSraf cond_t __aio_initcv = DEFAULTCV; 90f841f6adSraf int __aio_initbusy = 0; 91f841f6adSraf 927c478bd9Sstevel@tonic-gate mutex_t __aio_mutex = DEFAULTMUTEX; /* protects counts, and linked lists */ 937c478bd9Sstevel@tonic-gate cond_t _aio_iowait_cv = DEFAULTCV; /* wait for userland I/Os */ 947c478bd9Sstevel@tonic-gate 957c478bd9Sstevel@tonic-gate pid_t __pid = (pid_t)-1; /* initialize as invalid pid */ 9634709573Sraf int _sigio_enabled = 0; /* when set, send SIGIO signal */ 977c478bd9Sstevel@tonic-gate 9834709573Sraf aio_hash_t *_aio_hash; 997c478bd9Sstevel@tonic-gate 10034709573Sraf aio_req_t *_aio_doneq; /* double linked done queue list */ 
1017c478bd9Sstevel@tonic-gate 1027c478bd9Sstevel@tonic-gate int _aio_donecnt = 0; 10334709573Sraf int _aio_waitncnt = 0; /* # of requests for aio_waitn */ 1047c478bd9Sstevel@tonic-gate int _aio_doneq_cnt = 0; 10534709573Sraf int _aio_outstand_cnt = 0; /* # of outstanding requests */ 10634709573Sraf int _kaio_outstand_cnt = 0; /* # of outstanding kaio requests */ 1077c478bd9Sstevel@tonic-gate int _aio_req_done_cnt = 0; /* req. done but not in "done queue" */ 1087c478bd9Sstevel@tonic-gate int _aio_kernel_suspend = 0; /* active kernel kaio calls */ 1097c478bd9Sstevel@tonic-gate int _aio_suscv_cnt = 0; /* aio_suspend calls waiting on cv's */ 1107c478bd9Sstevel@tonic-gate 1117c478bd9Sstevel@tonic-gate int _max_workers = 256; /* max number of workers permitted */ 112f841f6adSraf int _min_workers = 4; /* min number of workers */ 1137c478bd9Sstevel@tonic-gate int _minworkload = 2; /* min number of request in q */ 1147c478bd9Sstevel@tonic-gate int _aio_worker_cnt = 0; /* number of workers to do requests */ 1157c478bd9Sstevel@tonic-gate int __uaio_ok = 0; /* AIO has been enabled */ 1167c478bd9Sstevel@tonic-gate sigset_t _worker_set; /* worker's signal mask */ 1177c478bd9Sstevel@tonic-gate 1187c478bd9Sstevel@tonic-gate int _aiowait_flag = 0; /* when set, aiowait() is inprogress */ 119f841f6adSraf int _aio_flags = 0; /* see asyncio.h defines for */ 1207c478bd9Sstevel@tonic-gate 121f841f6adSraf aio_worker_t *_kaiowp = NULL; /* points to kaio cleanup thread */ 1227c478bd9Sstevel@tonic-gate 12334709573Sraf int hz; /* clock ticks per second */ 1247c478bd9Sstevel@tonic-gate 12534709573Sraf static int 12634709573Sraf _kaio_supported_init(void) 1277c478bd9Sstevel@tonic-gate { 12834709573Sraf void *ptr; 12934709573Sraf size_t size; 13034709573Sraf 13134709573Sraf if (_kaio_supported != NULL) /* already initialized */ 13234709573Sraf return (0); 13334709573Sraf 13434709573Sraf size = MAX_KAIO_FDARRAY_SIZE * sizeof (uint32_t); 13534709573Sraf ptr = mmap(NULL, size, PROT_READ | 
PROT_WRITE, 13634709573Sraf MAP_PRIVATE | MAP_ANON, -1, (off_t)0); 13734709573Sraf if (ptr == MAP_FAILED) 13834709573Sraf return (-1); 13934709573Sraf _kaio_supported = ptr; 14034709573Sraf return (0); 1417c478bd9Sstevel@tonic-gate } 1427c478bd9Sstevel@tonic-gate 1437c478bd9Sstevel@tonic-gate /* 144f841f6adSraf * The aio subsystem is initialized when an AIO request is made. 145f841f6adSraf * Constants are initialized like the max number of workers that 146f841f6adSraf * the subsystem can create, and the minimum number of workers 147f841f6adSraf * permitted before imposing some restrictions. Also, some 148f841f6adSraf * workers are created. 1497c478bd9Sstevel@tonic-gate */ 1507c478bd9Sstevel@tonic-gate int 1517c478bd9Sstevel@tonic-gate __uaio_init(void) 1527c478bd9Sstevel@tonic-gate { 153f841f6adSraf int ret = -1; 1547c478bd9Sstevel@tonic-gate int i; 1557c478bd9Sstevel@tonic-gate 156f841f6adSraf lmutex_lock(&__aio_initlock); 157f841f6adSraf while (__aio_initbusy) 158f841f6adSraf (void) _cond_wait(&__aio_initcv, &__aio_initlock); 15934709573Sraf if (__uaio_ok) { /* already initialized */ 160f841f6adSraf lmutex_unlock(&__aio_initlock); 16134709573Sraf return (0); 16234709573Sraf } 163f841f6adSraf __aio_initbusy = 1; 164f841f6adSraf lmutex_unlock(&__aio_initlock); 1657c478bd9Sstevel@tonic-gate 16634709573Sraf hz = (int)sysconf(_SC_CLK_TCK); 16734709573Sraf __pid = getpid(); 1687c478bd9Sstevel@tonic-gate 169f841f6adSraf setup_cancelsig(SIGAIOCANCEL); 1707c478bd9Sstevel@tonic-gate 17134709573Sraf if (_kaio_supported_init() != 0) 17234709573Sraf goto out; 1737c478bd9Sstevel@tonic-gate 17434709573Sraf /* 17534709573Sraf * Allocate and initialize the hash table. 176f7499066Ssp * Do this only once, even if __uaio_init() is called twice. 
17734709573Sraf */ 178f7499066Ssp if (_aio_hash == NULL) { 179f7499066Ssp /* LINTED pointer cast */ 180f7499066Ssp _aio_hash = (aio_hash_t *)mmap(NULL, 181f7499066Ssp HASHSZ * sizeof (aio_hash_t), PROT_READ | PROT_WRITE, 182f7499066Ssp MAP_PRIVATE | MAP_ANON, -1, (off_t)0); 183f7499066Ssp if ((void *)_aio_hash == MAP_FAILED) { 184f7499066Ssp _aio_hash = NULL; 185f7499066Ssp goto out; 186f7499066Ssp } 187f7499066Ssp for (i = 0; i < HASHSZ; i++) 188f7499066Ssp (void) mutex_init(&_aio_hash[i].hash_lock, 189f7499066Ssp USYNC_THREAD, NULL); 1907c478bd9Sstevel@tonic-gate } 1917c478bd9Sstevel@tonic-gate 19234709573Sraf /* 19334709573Sraf * Initialize worker's signal mask to only catch SIGAIOCANCEL. 19434709573Sraf */ 19534709573Sraf (void) sigfillset(&_worker_set); 19634709573Sraf (void) sigdelset(&_worker_set, SIGAIOCANCEL); 19734709573Sraf 19834709573Sraf /* 199f7499066Ssp * Create one worker to send asynchronous notifications. 200f7499066Ssp * Do this only once, even if __uaio_init() is called twice. 20134709573Sraf */ 202f7499066Ssp if (__no_workerscnt == 0 && 203f7499066Ssp (_aio_create_worker(NULL, AIONOTIFY) != 0)) { 204f7499066Ssp errno = EAGAIN; 205f7499066Ssp goto out; 206f7499066Ssp } 20734709573Sraf 20834709573Sraf /* 209f7499066Ssp * Create the minimum number of read/write workers. 210f7499066Ssp * And later check whether atleast one worker is created; 211f7499066Ssp * lwp_create() calls could fail because of segkp exhaustion. 
21234709573Sraf */ 213f7499066Ssp for (i = 0; i < _min_workers; i++) 214f7499066Ssp (void) _aio_create_worker(NULL, AIOREAD); 215f7499066Ssp if (__rw_workerscnt == 0) { 216f7499066Ssp errno = EAGAIN; 217f7499066Ssp goto out; 218f7499066Ssp } 21934709573Sraf 22034709573Sraf ret = 0; 22134709573Sraf out: 222f841f6adSraf lmutex_lock(&__aio_initlock); 223f841f6adSraf if (ret == 0) 224f841f6adSraf __uaio_ok = 1; 225f841f6adSraf __aio_initbusy = 0; 226f841f6adSraf (void) cond_broadcast(&__aio_initcv); 227f841f6adSraf lmutex_unlock(&__aio_initlock); 22834709573Sraf return (ret); 2297c478bd9Sstevel@tonic-gate } 2307c478bd9Sstevel@tonic-gate 231f841f6adSraf /* 232f841f6adSraf * Called from close() before actually performing the real _close(). 233f841f6adSraf */ 234f841f6adSraf void 235f841f6adSraf _aio_close(int fd) 236f841f6adSraf { 237f841f6adSraf if (fd < 0) /* avoid cancelling everything */ 238f841f6adSraf return; 239f841f6adSraf /* 240f841f6adSraf * Cancel all outstanding aio requests for this file descriptor. 241f841f6adSraf */ 242f841f6adSraf if (__uaio_ok) 243f841f6adSraf (void) aiocancel_all(fd); 244f841f6adSraf /* 245f841f6adSraf * If we have allocated the bit array, clear the bit for this file. 246f841f6adSraf * The next open may re-use this file descriptor and the new file 247f841f6adSraf * may have different kaio() behaviour. 248f841f6adSraf */ 249f841f6adSraf if (_kaio_supported != NULL) 250f841f6adSraf CLEAR_KAIO_SUPPORTED(fd); 251f841f6adSraf } 252f841f6adSraf 2537c478bd9Sstevel@tonic-gate /* 2547c478bd9Sstevel@tonic-gate * special kaio cleanup thread sits in a loop in the 2557c478bd9Sstevel@tonic-gate * kernel waiting for pending kaio requests to complete. 
2567c478bd9Sstevel@tonic-gate */ 2577c478bd9Sstevel@tonic-gate void * 2587c478bd9Sstevel@tonic-gate _kaio_cleanup_thread(void *arg) 2597c478bd9Sstevel@tonic-gate { 26034709573Sraf if (pthread_setspecific(_aio_key, arg) != 0) 261f841f6adSraf aio_panic("_kaio_cleanup_thread, pthread_setspecific()"); 2627c478bd9Sstevel@tonic-gate (void) _kaio(AIOSTART); 2637c478bd9Sstevel@tonic-gate return (arg); 2647c478bd9Sstevel@tonic-gate } 2657c478bd9Sstevel@tonic-gate 2667c478bd9Sstevel@tonic-gate /* 2677c478bd9Sstevel@tonic-gate * initialize kaio. 2687c478bd9Sstevel@tonic-gate */ 2697c478bd9Sstevel@tonic-gate void 2707c478bd9Sstevel@tonic-gate _kaio_init() 2717c478bd9Sstevel@tonic-gate { 2727c478bd9Sstevel@tonic-gate int error; 27334709573Sraf sigset_t oset; 27434709573Sraf 275f841f6adSraf lmutex_lock(&__aio_initlock); 276f841f6adSraf while (__aio_initbusy) 277f841f6adSraf (void) _cond_wait(&__aio_initcv, &__aio_initlock); 278f841f6adSraf if (_kaio_ok) { /* already initialized */ 279f841f6adSraf lmutex_unlock(&__aio_initlock); 280f841f6adSraf return; 281f841f6adSraf } 282f841f6adSraf __aio_initbusy = 1; 283f841f6adSraf lmutex_unlock(&__aio_initlock); 284f841f6adSraf 28534709573Sraf if (_kaio_supported_init() != 0) 286f841f6adSraf error = ENOMEM; 287f841f6adSraf else if ((_kaiowp = _aio_worker_alloc()) == NULL) 288f841f6adSraf error = ENOMEM; 289f841f6adSraf else if ((error = (int)_kaio(AIOINIT)) == 0) { 290f841f6adSraf (void) pthread_sigmask(SIG_SETMASK, &maskset, &oset); 291f841f6adSraf error = thr_create(NULL, AIOSTKSIZE, _kaio_cleanup_thread, 292f841f6adSraf _kaiowp, THR_DAEMON, &_kaiowp->work_tid); 293f841f6adSraf (void) pthread_sigmask(SIG_SETMASK, &oset, NULL); 294f841f6adSraf } 295f841f6adSraf if (error && _kaiowp != NULL) { 296f841f6adSraf _aio_worker_free(_kaiowp); 297f841f6adSraf _kaiowp = NULL; 2987c478bd9Sstevel@tonic-gate } 299f841f6adSraf 300f841f6adSraf lmutex_lock(&__aio_initlock); 301f841f6adSraf if (error) 302f841f6adSraf _kaio_ok = -1; 303f841f6adSraf else 
304f841f6adSraf _kaio_ok = 1; 305f841f6adSraf __aio_initbusy = 0; 306f841f6adSraf (void) cond_broadcast(&__aio_initcv); 307f841f6adSraf lmutex_unlock(&__aio_initlock); 3087c478bd9Sstevel@tonic-gate } 3097c478bd9Sstevel@tonic-gate 3107c478bd9Sstevel@tonic-gate int 3117c478bd9Sstevel@tonic-gate aioread(int fd, caddr_t buf, int bufsz, off_t offset, int whence, 3127c478bd9Sstevel@tonic-gate aio_result_t *resultp) 3137c478bd9Sstevel@tonic-gate { 3147c478bd9Sstevel@tonic-gate return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOREAD)); 3157c478bd9Sstevel@tonic-gate } 3167c478bd9Sstevel@tonic-gate 3177c478bd9Sstevel@tonic-gate int 3187c478bd9Sstevel@tonic-gate aiowrite(int fd, caddr_t buf, int bufsz, off_t offset, int whence, 3197c478bd9Sstevel@tonic-gate aio_result_t *resultp) 3207c478bd9Sstevel@tonic-gate { 3217c478bd9Sstevel@tonic-gate return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOWRITE)); 3227c478bd9Sstevel@tonic-gate } 3237c478bd9Sstevel@tonic-gate 32434709573Sraf #if !defined(_LP64) 3257c478bd9Sstevel@tonic-gate int 3267c478bd9Sstevel@tonic-gate aioread64(int fd, caddr_t buf, int bufsz, off64_t offset, int whence, 3277c478bd9Sstevel@tonic-gate aio_result_t *resultp) 3287c478bd9Sstevel@tonic-gate { 3297c478bd9Sstevel@tonic-gate return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOAREAD64)); 3307c478bd9Sstevel@tonic-gate } 3317c478bd9Sstevel@tonic-gate 3327c478bd9Sstevel@tonic-gate int 3337c478bd9Sstevel@tonic-gate aiowrite64(int fd, caddr_t buf, int bufsz, off64_t offset, int whence, 3347c478bd9Sstevel@tonic-gate aio_result_t *resultp) 3357c478bd9Sstevel@tonic-gate { 3367c478bd9Sstevel@tonic-gate return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOAWRITE64)); 3377c478bd9Sstevel@tonic-gate } 33834709573Sraf #endif /* !defined(_LP64) */ 3397c478bd9Sstevel@tonic-gate 3407c478bd9Sstevel@tonic-gate int 3417c478bd9Sstevel@tonic-gate _aiorw(int fd, caddr_t buf, int bufsz, offset_t offset, int whence, 3427c478bd9Sstevel@tonic-gate aio_result_t 
*resultp, int mode) 3437c478bd9Sstevel@tonic-gate { 34434709573Sraf aio_req_t *reqp; 34534709573Sraf aio_args_t *ap; 34634709573Sraf offset_t loffset; 3477c478bd9Sstevel@tonic-gate struct stat stat; 34834709573Sraf int error = 0; 3497c478bd9Sstevel@tonic-gate int kerr; 3507c478bd9Sstevel@tonic-gate int umode; 3517c478bd9Sstevel@tonic-gate 3527c478bd9Sstevel@tonic-gate switch (whence) { 3537c478bd9Sstevel@tonic-gate 3547c478bd9Sstevel@tonic-gate case SEEK_SET: 3557c478bd9Sstevel@tonic-gate loffset = offset; 3567c478bd9Sstevel@tonic-gate break; 3577c478bd9Sstevel@tonic-gate case SEEK_CUR: 3587c478bd9Sstevel@tonic-gate if ((loffset = llseek(fd, 0, SEEK_CUR)) == -1) 35934709573Sraf error = -1; 3607c478bd9Sstevel@tonic-gate else 3617c478bd9Sstevel@tonic-gate loffset += offset; 3627c478bd9Sstevel@tonic-gate break; 3637c478bd9Sstevel@tonic-gate case SEEK_END: 3647c478bd9Sstevel@tonic-gate if (fstat(fd, &stat) == -1) 36534709573Sraf error = -1; 3667c478bd9Sstevel@tonic-gate else 3677c478bd9Sstevel@tonic-gate loffset = offset + stat.st_size; 3687c478bd9Sstevel@tonic-gate break; 3697c478bd9Sstevel@tonic-gate default: 3707c478bd9Sstevel@tonic-gate errno = EINVAL; 37134709573Sraf error = -1; 3727c478bd9Sstevel@tonic-gate } 3737c478bd9Sstevel@tonic-gate 37434709573Sraf if (error) 37534709573Sraf return (error); 3767c478bd9Sstevel@tonic-gate 3777c478bd9Sstevel@tonic-gate /* initialize kaio */ 3787c478bd9Sstevel@tonic-gate if (!_kaio_ok) 3797c478bd9Sstevel@tonic-gate _kaio_init(); 3807c478bd9Sstevel@tonic-gate 3817c478bd9Sstevel@tonic-gate /* 3827c478bd9Sstevel@tonic-gate * _aio_do_request() needs the original request code (mode) to be able 38334709573Sraf * to choose the appropiate 32/64 bit function. All other functions 3847c478bd9Sstevel@tonic-gate * only require the difference between READ and WRITE (umode). 
3857c478bd9Sstevel@tonic-gate */ 3867c478bd9Sstevel@tonic-gate if (mode == AIOAREAD64 || mode == AIOAWRITE64) 3877c478bd9Sstevel@tonic-gate umode = mode - AIOAREAD64; 3887c478bd9Sstevel@tonic-gate else 3897c478bd9Sstevel@tonic-gate umode = mode; 3907c478bd9Sstevel@tonic-gate 3917c478bd9Sstevel@tonic-gate /* 3927c478bd9Sstevel@tonic-gate * Try kernel aio first. 3937c478bd9Sstevel@tonic-gate * If errno is ENOTSUP/EBADFD, fall back to the thread implementation. 3947c478bd9Sstevel@tonic-gate */ 39534709573Sraf if (_kaio_ok > 0 && KAIO_SUPPORTED(fd)) { 3967c478bd9Sstevel@tonic-gate resultp->aio_errno = 0; 39734709573Sraf sig_mutex_lock(&__aio_mutex); 39834709573Sraf _kaio_outstand_cnt++; 3997c478bd9Sstevel@tonic-gate kerr = (int)_kaio(((resultp->aio_return == AIO_INPROGRESS) ? 4007c478bd9Sstevel@tonic-gate (umode | AIO_POLL_BIT) : umode), 4017c478bd9Sstevel@tonic-gate fd, buf, bufsz, loffset, resultp); 40234709573Sraf if (kerr == 0) { 403b9868792Sraf sig_mutex_unlock(&__aio_mutex); 4047c478bd9Sstevel@tonic-gate return (0); 40534709573Sraf } 40634709573Sraf _kaio_outstand_cnt--; 40734709573Sraf sig_mutex_unlock(&__aio_mutex); 40834709573Sraf if (errno != ENOTSUP && errno != EBADFD) 4097c478bd9Sstevel@tonic-gate return (-1); 4107c478bd9Sstevel@tonic-gate if (errno == EBADFD) 4117c478bd9Sstevel@tonic-gate SET_KAIO_NOT_SUPPORTED(fd); 4127c478bd9Sstevel@tonic-gate } 4137c478bd9Sstevel@tonic-gate 41434709573Sraf if (!__uaio_ok && __uaio_init() == -1) 41534709573Sraf return (-1); 41634709573Sraf 41734709573Sraf if ((reqp = _aio_req_alloc()) == NULL) { 4187c478bd9Sstevel@tonic-gate errno = EAGAIN; 4197c478bd9Sstevel@tonic-gate return (-1); 4207c478bd9Sstevel@tonic-gate } 4217c478bd9Sstevel@tonic-gate 4227c478bd9Sstevel@tonic-gate /* 42334709573Sraf * _aio_do_request() checks reqp->req_op to differentiate 4247c478bd9Sstevel@tonic-gate * between 32 and 64 bit access. 
4257c478bd9Sstevel@tonic-gate */ 42634709573Sraf reqp->req_op = mode; 42734709573Sraf reqp->req_resultp = resultp; 42834709573Sraf ap = &reqp->req_args; 4297c478bd9Sstevel@tonic-gate ap->fd = fd; 4307c478bd9Sstevel@tonic-gate ap->buf = buf; 4317c478bd9Sstevel@tonic-gate ap->bufsz = bufsz; 4327c478bd9Sstevel@tonic-gate ap->offset = loffset; 4337c478bd9Sstevel@tonic-gate 43434709573Sraf if (_aio_hash_insert(resultp, reqp) != 0) { 43534709573Sraf _aio_req_free(reqp); 4367c478bd9Sstevel@tonic-gate errno = EINVAL; 4377c478bd9Sstevel@tonic-gate return (-1); 4387c478bd9Sstevel@tonic-gate } 43934709573Sraf /* 44034709573Sraf * _aio_req_add() only needs the difference between READ and 44134709573Sraf * WRITE to choose the right worker queue. 44234709573Sraf */ 44334709573Sraf _aio_req_add(reqp, &__nextworker_rw, umode); 44434709573Sraf return (0); 4457c478bd9Sstevel@tonic-gate } 4467c478bd9Sstevel@tonic-gate 4477c478bd9Sstevel@tonic-gate int 4487c478bd9Sstevel@tonic-gate aiocancel(aio_result_t *resultp) 4497c478bd9Sstevel@tonic-gate { 45034709573Sraf aio_req_t *reqp; 45134709573Sraf aio_worker_t *aiowp; 45234709573Sraf int ret; 45334709573Sraf int done = 0; 45434709573Sraf int canceled = 0; 4557c478bd9Sstevel@tonic-gate 4567c478bd9Sstevel@tonic-gate if (!__uaio_ok) { 4577c478bd9Sstevel@tonic-gate errno = EINVAL; 4587c478bd9Sstevel@tonic-gate return (-1); 4597c478bd9Sstevel@tonic-gate } 4607c478bd9Sstevel@tonic-gate 46134709573Sraf sig_mutex_lock(&__aio_mutex); 46234709573Sraf reqp = _aio_hash_find(resultp); 46334709573Sraf if (reqp == NULL) { 4647c478bd9Sstevel@tonic-gate if (_aio_outstand_cnt == _aio_req_done_cnt) 4657c478bd9Sstevel@tonic-gate errno = EINVAL; 4667c478bd9Sstevel@tonic-gate else 4677c478bd9Sstevel@tonic-gate errno = EACCES; 46834709573Sraf ret = -1; 4697c478bd9Sstevel@tonic-gate } else { 47034709573Sraf aiowp = reqp->req_worker; 47134709573Sraf sig_mutex_lock(&aiowp->work_qlock1); 47234709573Sraf (void) _aio_cancel_req(aiowp, reqp, &canceled, &done); 
47334709573Sraf sig_mutex_unlock(&aiowp->work_qlock1); 4747c478bd9Sstevel@tonic-gate 4757c478bd9Sstevel@tonic-gate if (canceled) { 47634709573Sraf ret = 0; 4777c478bd9Sstevel@tonic-gate } else { 47834709573Sraf if (_aio_outstand_cnt == 0 || 47934709573Sraf _aio_outstand_cnt == _aio_req_done_cnt) 48034709573Sraf errno = EINVAL; 48134709573Sraf else 48234709573Sraf errno = EACCES; 48334709573Sraf ret = -1; 4847c478bd9Sstevel@tonic-gate } 4857c478bd9Sstevel@tonic-gate } 48634709573Sraf sig_mutex_unlock(&__aio_mutex); 48734709573Sraf return (ret); 4887c478bd9Sstevel@tonic-gate } 4897c478bd9Sstevel@tonic-gate 4907c478bd9Sstevel@tonic-gate /* 4917c478bd9Sstevel@tonic-gate * This must be asynch safe 4927c478bd9Sstevel@tonic-gate */ 4937c478bd9Sstevel@tonic-gate aio_result_t * 4947c478bd9Sstevel@tonic-gate aiowait(struct timeval *uwait) 4957c478bd9Sstevel@tonic-gate { 49634709573Sraf aio_result_t *uresultp; 49734709573Sraf aio_result_t *kresultp; 49834709573Sraf aio_result_t *resultp; 4997c478bd9Sstevel@tonic-gate int dontblock; 5007c478bd9Sstevel@tonic-gate int timedwait = 0; 5017c478bd9Sstevel@tonic-gate int kaio_errno = 0; 50234709573Sraf struct timeval twait; 50334709573Sraf struct timeval *wait = NULL; 5047c478bd9Sstevel@tonic-gate hrtime_t hrtend; 5057c478bd9Sstevel@tonic-gate hrtime_t hres; 5067c478bd9Sstevel@tonic-gate 5077c478bd9Sstevel@tonic-gate if (uwait) { 5087c478bd9Sstevel@tonic-gate /* 50934709573Sraf * Check for a valid specified wait time. 51034709573Sraf * If it is invalid, fail the call right away. 
5117c478bd9Sstevel@tonic-gate */ 5127c478bd9Sstevel@tonic-gate if (uwait->tv_sec < 0 || uwait->tv_usec < 0 || 5137c478bd9Sstevel@tonic-gate uwait->tv_usec >= MICROSEC) { 5147c478bd9Sstevel@tonic-gate errno = EINVAL; 5157c478bd9Sstevel@tonic-gate return ((aio_result_t *)-1); 5167c478bd9Sstevel@tonic-gate } 5177c478bd9Sstevel@tonic-gate 51834709573Sraf if (uwait->tv_sec > 0 || uwait->tv_usec > 0) { 5197c478bd9Sstevel@tonic-gate hrtend = gethrtime() + 520*34b3058fSpraks (hrtime_t)uwait->tv_sec * NANOSEC + 521*34b3058fSpraks (hrtime_t)uwait->tv_usec * (NANOSEC / MICROSEC); 5227c478bd9Sstevel@tonic-gate twait = *uwait; 5237c478bd9Sstevel@tonic-gate wait = &twait; 5247c478bd9Sstevel@tonic-gate timedwait++; 5257c478bd9Sstevel@tonic-gate } else { 5267c478bd9Sstevel@tonic-gate /* polling */ 52734709573Sraf sig_mutex_lock(&__aio_mutex); 52834709573Sraf if (_kaio_outstand_cnt == 0) { 52934709573Sraf kresultp = (aio_result_t *)-1; 53034709573Sraf } else { 53134709573Sraf kresultp = (aio_result_t *)_kaio(AIOWAIT, 53234709573Sraf (struct timeval *)-1, 1); 53334709573Sraf if (kresultp != (aio_result_t *)-1 && 53434709573Sraf kresultp != NULL && 53534709573Sraf kresultp != (aio_result_t *)1) { 53634709573Sraf _kaio_outstand_cnt--; 53734709573Sraf sig_mutex_unlock(&__aio_mutex); 53834709573Sraf return (kresultp); 53934709573Sraf } 54034709573Sraf } 5417c478bd9Sstevel@tonic-gate uresultp = _aio_req_done(); 54234709573Sraf sig_mutex_unlock(&__aio_mutex); 54334709573Sraf if (uresultp != NULL && 54434709573Sraf uresultp != (aio_result_t *)-1) { 5457c478bd9Sstevel@tonic-gate return (uresultp); 5467c478bd9Sstevel@tonic-gate } 5477c478bd9Sstevel@tonic-gate if (uresultp == (aio_result_t *)-1 && 5487c478bd9Sstevel@tonic-gate kresultp == (aio_result_t *)-1) { 5497c478bd9Sstevel@tonic-gate errno = EINVAL; 5507c478bd9Sstevel@tonic-gate return ((aio_result_t *)-1); 55134709573Sraf } else { 5527c478bd9Sstevel@tonic-gate return (NULL); 55334709573Sraf } 5547c478bd9Sstevel@tonic-gate } 
5557c478bd9Sstevel@tonic-gate } 5567c478bd9Sstevel@tonic-gate 5577c478bd9Sstevel@tonic-gate for (;;) { 55834709573Sraf sig_mutex_lock(&__aio_mutex); 5597c478bd9Sstevel@tonic-gate uresultp = _aio_req_done(); 5607c478bd9Sstevel@tonic-gate if (uresultp != NULL && uresultp != (aio_result_t *)-1) { 56134709573Sraf sig_mutex_unlock(&__aio_mutex); 5627c478bd9Sstevel@tonic-gate resultp = uresultp; 5637c478bd9Sstevel@tonic-gate break; 5647c478bd9Sstevel@tonic-gate } 5657c478bd9Sstevel@tonic-gate _aiowait_flag++; 5667c478bd9Sstevel@tonic-gate dontblock = (uresultp == (aio_result_t *)-1); 56734709573Sraf if (dontblock && _kaio_outstand_cnt == 0) { 56834709573Sraf kresultp = (aio_result_t *)-1; 56934709573Sraf kaio_errno = EINVAL; 57034709573Sraf } else { 57134709573Sraf sig_mutex_unlock(&__aio_mutex); 57234709573Sraf kresultp = (aio_result_t *)_kaio(AIOWAIT, 57334709573Sraf wait, dontblock); 57434709573Sraf sig_mutex_lock(&__aio_mutex); 57534709573Sraf kaio_errno = errno; 57634709573Sraf } 5777c478bd9Sstevel@tonic-gate _aiowait_flag--; 57834709573Sraf sig_mutex_unlock(&__aio_mutex); 5797c478bd9Sstevel@tonic-gate if (kresultp == (aio_result_t *)1) { 5807c478bd9Sstevel@tonic-gate /* aiowait() awakened by an aionotify() */ 5817c478bd9Sstevel@tonic-gate continue; 58234709573Sraf } else if (kresultp != NULL && 58334709573Sraf kresultp != (aio_result_t *)-1) { 5847c478bd9Sstevel@tonic-gate resultp = kresultp; 58534709573Sraf sig_mutex_lock(&__aio_mutex); 58634709573Sraf _kaio_outstand_cnt--; 58734709573Sraf sig_mutex_unlock(&__aio_mutex); 5887c478bd9Sstevel@tonic-gate break; 58934709573Sraf } else if (kresultp == (aio_result_t *)-1 && 59034709573Sraf kaio_errno == EINVAL && 59134709573Sraf uresultp == (aio_result_t *)-1) { 5927c478bd9Sstevel@tonic-gate errno = kaio_errno; 5937c478bd9Sstevel@tonic-gate resultp = (aio_result_t *)-1; 5947c478bd9Sstevel@tonic-gate break; 5957c478bd9Sstevel@tonic-gate } else if (kresultp == (aio_result_t *)-1 && 5967c478bd9Sstevel@tonic-gate kaio_errno 
== EINTR) { 5977c478bd9Sstevel@tonic-gate errno = kaio_errno; 5987c478bd9Sstevel@tonic-gate resultp = (aio_result_t *)-1; 5997c478bd9Sstevel@tonic-gate break; 6007c478bd9Sstevel@tonic-gate } else if (timedwait) { 6017c478bd9Sstevel@tonic-gate hres = hrtend - gethrtime(); 6027c478bd9Sstevel@tonic-gate if (hres <= 0) { 60334709573Sraf /* time is up; return */ 6047c478bd9Sstevel@tonic-gate resultp = NULL; 6057c478bd9Sstevel@tonic-gate break; 6067c478bd9Sstevel@tonic-gate } else { 6077c478bd9Sstevel@tonic-gate /* 60834709573Sraf * Some time left. Round up the remaining time 60934709573Sraf * in nanoseconds to microsec. Retry the call. 6107c478bd9Sstevel@tonic-gate */ 61134709573Sraf hres += (NANOSEC / MICROSEC) - 1; 6127c478bd9Sstevel@tonic-gate wait->tv_sec = hres / NANOSEC; 6137c478bd9Sstevel@tonic-gate wait->tv_usec = 614*34b3058fSpraks (hres % NANOSEC) / (NANOSEC / MICROSEC); 6157c478bd9Sstevel@tonic-gate } 6167c478bd9Sstevel@tonic-gate } else { 61734709573Sraf ASSERT(kresultp == NULL && uresultp == NULL); 6187c478bd9Sstevel@tonic-gate resultp = NULL; 6197c478bd9Sstevel@tonic-gate continue; 6207c478bd9Sstevel@tonic-gate } 6217c478bd9Sstevel@tonic-gate } 6227c478bd9Sstevel@tonic-gate return (resultp); 6237c478bd9Sstevel@tonic-gate } 6247c478bd9Sstevel@tonic-gate 6257c478bd9Sstevel@tonic-gate /* 6267c478bd9Sstevel@tonic-gate * _aio_get_timedelta calculates the remaining time and stores the result 62734709573Sraf * into timespec_t *wait. 
6287c478bd9Sstevel@tonic-gate */ 6297c478bd9Sstevel@tonic-gate 6307c478bd9Sstevel@tonic-gate int 63134709573Sraf _aio_get_timedelta(timespec_t *end, timespec_t *wait) 6327c478bd9Sstevel@tonic-gate { 6337c478bd9Sstevel@tonic-gate int ret = 0; 6347c478bd9Sstevel@tonic-gate struct timeval cur; 63534709573Sraf timespec_t curtime; 6367c478bd9Sstevel@tonic-gate 6377c478bd9Sstevel@tonic-gate (void) gettimeofday(&cur, NULL); 6387c478bd9Sstevel@tonic-gate curtime.tv_sec = cur.tv_sec; 6397c478bd9Sstevel@tonic-gate curtime.tv_nsec = cur.tv_usec * 1000; /* convert us to ns */ 6407c478bd9Sstevel@tonic-gate 6417c478bd9Sstevel@tonic-gate if (end->tv_sec >= curtime.tv_sec) { 6427c478bd9Sstevel@tonic-gate wait->tv_sec = end->tv_sec - curtime.tv_sec; 6437c478bd9Sstevel@tonic-gate if (end->tv_nsec >= curtime.tv_nsec) { 6447c478bd9Sstevel@tonic-gate wait->tv_nsec = end->tv_nsec - curtime.tv_nsec; 6457c478bd9Sstevel@tonic-gate if (wait->tv_sec == 0 && wait->tv_nsec == 0) 6467c478bd9Sstevel@tonic-gate ret = -1; /* timer expired */ 6477c478bd9Sstevel@tonic-gate } else { 6487c478bd9Sstevel@tonic-gate if (end->tv_sec > curtime.tv_sec) { 6497c478bd9Sstevel@tonic-gate wait->tv_sec -= 1; 6507c478bd9Sstevel@tonic-gate wait->tv_nsec = NANOSEC - 6517c478bd9Sstevel@tonic-gate (curtime.tv_nsec - end->tv_nsec); 6527c478bd9Sstevel@tonic-gate } else { 6537c478bd9Sstevel@tonic-gate ret = -1; /* timer expired */ 6547c478bd9Sstevel@tonic-gate } 6557c478bd9Sstevel@tonic-gate } 6567c478bd9Sstevel@tonic-gate } else { 6577c478bd9Sstevel@tonic-gate ret = -1; 6587c478bd9Sstevel@tonic-gate } 6597c478bd9Sstevel@tonic-gate return (ret); 6607c478bd9Sstevel@tonic-gate } 6617c478bd9Sstevel@tonic-gate 6627c478bd9Sstevel@tonic-gate /* 6637c478bd9Sstevel@tonic-gate * If closing by file descriptor: we will simply cancel all the outstanding 66434709573Sraf * aio`s and return. Those aio's in question will have either noticed the 6657c478bd9Sstevel@tonic-gate * cancellation notice before, during, or after initiating io. 
 */

/*
 * Cancel every outstanding request on file descriptor 'fd', or every
 * outstanding request when fd is negative.  Walks all read/write worker
 * queues and then purges matching entries from the done queue.
 * Returns AIO_CANCELED, AIO_ALLDONE, AIO_NOTCANCELED, or the result of
 * handing the cancel off to kernel aio when nothing was found in the
 * library queues and the fd supports kaio.
 */
int
aiocancel_all(int fd)
{
	aio_req_t *reqp;
	aio_req_t **reqpp;
	aio_worker_t *first;
	aio_worker_t *next;
	int canceled = 0;
	int done = 0;
	int cancelall = 0;

	sig_mutex_lock(&__aio_mutex);

	/* nothing outstanding anywhere: report everything already done */
	if (_aio_outstand_cnt == 0) {
		sig_mutex_unlock(&__aio_mutex);
		return (AIO_ALLDONE);
	}

	/*
	 * Cancel requests from the read/write workers' queues.
	 * The workers form a circular list, so walk until we are
	 * back at the starting worker.
	 */
	first = __nextworker_rw;
	next = first;
	do {
		_aio_cancel_work(next, fd, &canceled, &done);
	} while ((next = next->work_forw) != first);

	/*
	 * finally, check if there are requests on the done queue that
	 * should be canceled.  Unlink matching entries in place using
	 * a pointer-to-pointer walk starting at _aio_done_tail.
	 */
	if (fd < 0)
		cancelall = 1;
	reqpp = &_aio_done_tail;
	while ((reqp = *reqpp) != NULL) {
		if (cancelall || reqp->req_args.fd == fd) {
			*reqpp = reqp->req_next;
			_aio_donecnt--;
			(void) _aio_hash_del(reqp->req_resultp);
			_aio_req_free(reqp);
		} else
			reqpp = &reqp->req_next;
	}
	/*
	 * NOTE(review): _aio_done_head is only reset in the cancel-all
	 * case; the fd-specific path leaves it untouched even when the
	 * element it referenced was freed above — confirm the head/tail
	 * maintenance convention against _aio_work_done()/aiowait().
	 */
	if (cancelall) {
		ASSERT(_aio_donecnt == 0);
		_aio_done_head = NULL;
	}
	sig_mutex_unlock(&__aio_mutex);

	if (canceled && done == 0)
		return (AIO_CANCELED);
	else if (done && canceled == 0)
		return (AIO_ALLDONE);
	else if ((canceled + done == 0) && KAIO_SUPPORTED(fd))
		return ((int)_kaio(AIOCANCEL, fd, NULL));
	return (AIO_NOTCANCELED);
}

/*
 * Cancel requests from a given work queue.  If the file descriptor
 * parameter, fd, is non-negative, then only cancel those requests
 * in this queue that are to this file descriptor.  If the fd
 * parameter is -1, then cancel all requests.
 */
static void
_aio_cancel_work(aio_worker_t *aiowp, int fd, int *canceled, int *done)
{
	aio_req_t *reqp;

	sig_mutex_lock(&aiowp->work_qlock1);
	/*
	 * cancel queued requests first.
	 */
	reqp = aiowp->work_tail1;
	while (reqp != NULL) {
		if (fd < 0 || reqp->req_args.fd == fd) {
			if (_aio_cancel_req(aiowp, reqp, canceled, done)) {
				/*
				 * The caller's locks were dropped.
				 * reqp is invalid; start traversing
				 * the list from the beginning again.
				 */
				reqp = aiowp->work_tail1;
				continue;
			}
		}
		reqp = reqp->req_next;
	}
	/*
	 * Since the queued requests have been canceled, there can
	 * only be one inprogress request that should be canceled.
	 */
	if ((reqp = aiowp->work_req) != NULL &&
	    (fd < 0 || reqp->req_args.fd == fd))
		(void) _aio_cancel_req(aiowp, reqp, canceled, done);
	sig_mutex_unlock(&aiowp->work_qlock1);
}

/*
 * Cancel a request.  Return 1 if the caller's locks were temporarily
 * dropped, otherwise return 0.
 */
int
_aio_cancel_req(aio_worker_t *aiowp, aio_req_t *reqp, int *canceled, int *done)
{
	int ostate = reqp->req_state;

	/* both locks must be held on entry; see _aio_cancel_work() */
	ASSERT(MUTEX_HELD(&__aio_mutex));
	ASSERT(MUTEX_HELD(&aiowp->work_qlock1));
	if (ostate == AIO_REQ_CANCELED)
		return (0);
	if (ostate == AIO_REQ_DONE || ostate == AIO_REQ_DONEQ) {
		(*done)++;
		return (0);
	}
	if (reqp->req_op == AIOFSYNC && reqp != aiowp->work_req) {
		ASSERT(POSIX_AIO(reqp));
		/* Cancel the queued aio_fsync() request */
		if (!reqp->req_head->lio_canned) {
			reqp->req_head->lio_canned = 1;
			_aio_outstand_cnt--;
			(*canceled)++;
		}
		return (0);
	}
	reqp->req_state = AIO_REQ_CANCELED;
	_aio_req_del(aiowp, reqp, ostate);
	(void) _aio_hash_del(reqp->req_resultp);
	(*canceled)++;
	if (reqp == aiowp->work_req) {
		ASSERT(ostate == AIO_REQ_INPROGRESS);
		/*
		 * Set the result values now, before _aiodone() is called.
		 * We do this because the application can expect aio_return
		 * and aio_errno to be set to -1 and ECANCELED, respectively,
		 * immediately after a successful return from aiocancel()
		 * or aio_cancel().
		 */
		_aio_set_result(reqp, -1, ECANCELED);
		(void) thr_kill(aiowp->work_tid, SIGAIOCANCEL);
		return (0);
	}
	if (!POSIX_AIO(reqp)) {
		_aio_outstand_cnt--;
		_aio_set_result(reqp, -1, ECANCELED);
		return (0);
	}
	/*
	 * Posix case: the completion path (_aiodone) may block on
	 * notifications, so drop both locks around it.  The caller is
	 * told (return 1) that its locks were released and re-acquired.
	 */
	sig_mutex_unlock(&aiowp->work_qlock1);
	sig_mutex_unlock(&__aio_mutex);
	_aiodone(reqp, -1, ECANCELED);
	sig_mutex_lock(&__aio_mutex);
	sig_mutex_lock(&aiowp->work_qlock1);
	return (1);
}

/*
 * Create a new worker thread servicing the queue selected by 'mode'
 * (read/write workers or the notification worker), optionally seeding
 * its work queue with 'reqp'.  Returns 0 on success, -1 on failure.
 */
int
_aio_create_worker(aio_req_t *reqp, int mode)
{
	aio_worker_t *aiowp, **workers, **nextworker;
	int *aio_workerscnt;
	void *(*func)(void *);
	sigset_t oset;
	int error;

	/*
	 * Put the new worker thread in the right queue.
	 */
	switch (mode) {
	case AIOREAD:
	case AIOWRITE:
	case AIOAREAD:
	case AIOAWRITE:
#if !defined(_LP64)
	case AIOAREAD64:
	case AIOAWRITE64:
#endif
		workers = &__workers_rw;
		nextworker = &__nextworker_rw;
		aio_workerscnt = &__rw_workerscnt;
		func = _aio_do_request;
		break;
	case AIONOTIFY:
		workers = &__workers_no;
		nextworker = &__nextworker_no;
		func = _aio_do_notify;
		aio_workerscnt = &__no_workerscnt;
		break;
	default:
		aio_panic("_aio_create_worker: invalid mode");
		break;
	}

	if ((aiowp = _aio_worker_alloc()) == NULL)
		return (-1);

	/* pre-load the new worker's queue with the pending request */
	if (reqp) {
		reqp->req_state = AIO_REQ_QUEUED;
		reqp->req_worker = aiowp;
		aiowp->work_head1 = reqp;
		aiowp->work_tail1 = reqp;
		aiowp->work_next1 = reqp;
		aiowp->work_count1 = 1;
		aiowp->work_minload1 = 1;
	}

	/*
	 * Create the thread suspended, with all maskable signals blocked,
	 * so it cannot run (or take a signal) before it is fully linked
	 * into the worker ring below.
	 */
	(void) pthread_sigmask(SIG_SETMASK, &maskset, &oset);
	error = thr_create(NULL, AIOSTKSIZE, func, aiowp,
	    THR_DAEMON | THR_SUSPENDED, &aiowp->work_tid);
	(void) pthread_sigmask(SIG_SETMASK, &oset, NULL);
	if (error) {
		/* undo the queue seeding and release the worker */
		if (reqp) {
			reqp->req_state = 0;
			reqp->req_worker = NULL;
		}
		_aio_worker_free(aiowp);
		return (-1);
	}

	/* link the worker into the (circular, doubly-linked) worker ring */
	lmutex_lock(&__aio_mutex);
	(*aio_workerscnt)++;
	if (*workers == NULL) {
		aiowp->work_forw = aiowp;
		aiowp->work_backw = aiowp;
		*nextworker = aiowp;
		*workers = aiowp;
	} else {
		aiowp->work_backw = (*workers)->work_backw;
		aiowp->work_forw = (*workers);
		(*workers)->work_backw->work_forw = aiowp;
		(*workers)->work_backw = aiowp;
	}
	_aio_worker_cnt++;
	lmutex_unlock(&__aio_mutex);

	(void) thr_continue(aiowp->work_tid);

	return (0);
}

/*
 * This is the worker's main routine.
 * The task of this function is to execute all queued requests;
 * once the last pending request is executed this function will block
 * in _aio_idle().  A new incoming request must wakeup this thread to
 * restart the work.
 * Every worker has an own work queue.  The queue lock is required
 * to synchronize the addition of new requests for this worker or
 * cancellation of pending/running requests.
 *
 * Cancellation scenarios:
 * The cancellation of a request is being done asynchronously using
 * _aio_cancel_req() from another thread context.
 * A queued request can be cancelled in different manners:
 * a) request is queued but not "in progress" or "done" (AIO_REQ_QUEUED):
 *	- lock the queue -> remove the request -> unlock the queue
 *	- this function/thread does not detect this cancellation process
 * b) request is in progress (AIO_REQ_INPROGRESS):
 *	- this function first allows the cancellation of the running
 *	  request with the flag "work_cancel_flg=1"
 *		 see _aio_req_get() -> _aio_cancel_on()
 *	  During this phase, it is allowed to interrupt the worker
 *	  thread running the request (this thread) using the SIGAIOCANCEL
 *	  signal.
 *	  Once this thread returns from the kernel (because the request
 *	  is just done), then it must disable a possible cancellation
 *	  and proceed to finish the request.  To disable the cancellation
 *	  this thread must use _aio_cancel_off() to set "work_cancel_flg=0".
 * c) request is already done (AIO_REQ_DONE || AIO_REQ_DONEQ):
 *	  same procedure as in a)
 *
 * To b)
 *	This thread uses sigsetjmp() to define the position in the code
 *	where it wishes to continue working in the case that a SIGAIOCANCEL
 *	signal is detected.
 *	Normally this thread should get the cancellation signal during the
 *	kernel phase (reading or writing).  In that case the signal handler
 *	aiosigcancelhndlr() is activated using the worker thread context,
 *	which again will use the siglongjmp() function to break the standard
 *	code flow and jump to the "sigsetjmp" position, provided that
 *	"work_cancel_flg" is set to "1".
 *	Because the "work_cancel_flg" is only manipulated by this worker
 *	thread and it can only run on one CPU at a given time, it is not
 *	necessary to protect that flag with the queue lock.
 *	Returning from the kernel (read or write system call) we must
 *	first disable the use of the SIGAIOCANCEL signal and accordingly
 *	the use of the siglongjmp() function to prevent a possible deadlock:
 *	- It can happen that this worker thread returns from the kernel and
 *	  blocks in "work_qlock1",
 *	- then a second thread cancels the apparently "in progress" request
 *	  and sends the SIGAIOCANCEL signal to the worker thread,
 *	- the worker thread gets assigned the "work_qlock1" and will return
 *	  from the kernel,
 *	- the kernel detects the pending signal and activates the signal
 *	  handler instead,
 *	- if the "work_cancel_flg" is still set then the signal handler
 *	  should use siglongjmp() to cancel the "in progress" request and
 *	  it would try to acquire the same work_qlock1 in _aio_req_get()
 *	  for a second time => deadlock.
 * To avoid that situation we disable the cancellation of the request
 * in progress BEFORE we try to acquire the work_qlock1.
 * In that case the signal handler will not call siglongjmp() and the
 * worker thread will continue running the standard code flow.
 * Then this thread must check the AIO_REQ_CANCELED flag to emulate
 * an eventually required siglongjmp() freeing the work_qlock1 and
 * avoiding a deadlock.
 */
void *
_aio_do_request(void *arglist)
{
	aio_worker_t *aiowp = (aio_worker_t *)arglist;
	ulwp_t *self = curthread;
	struct aio_args *arg;
	aio_req_t *reqp;		/* current AIO request */
	ssize_t retval;
	int error;

	if (pthread_setspecific(_aio_key, aiowp) != 0)
		aio_panic("_aio_do_request, pthread_setspecific()");
	(void) pthread_sigmask(SIG_SETMASK, &_worker_set, NULL);
	ASSERT(aiowp->work_req == NULL);

	/*
	 * We resume here when an operation is cancelled.
	 * On first entry, aiowp->work_req == NULL, so all
	 * we do is block SIGAIOCANCEL.
	 */
	(void) sigsetjmp(aiowp->work_jmp_buf, 0);
	ASSERT(self->ul_sigdefer == 0);

	sigoff(self);	/* block SIGAIOCANCEL */
	/* non-NULL work_req here means we arrived via siglongjmp() */
	if (aiowp->work_req != NULL)
		_aio_finish_request(aiowp, -1, ECANCELED);

	for (;;) {
		/*
		 * Put completed requests on aio_done_list.  This has
		 * to be done as part of the main loop to ensure that
		 * we don't artificially starve any aiowait'ers.
		 */
		if (aiowp->work_done1)
			_aio_work_done(aiowp);

top:
		/* consume any deferred SIGAIOCANCEL signal here */
		sigon(self);
		sigoff(self);

		while ((reqp = _aio_req_get(aiowp)) == NULL) {
			if (_aio_idle(aiowp) != 0)
				goto top;
		}
		arg = &reqp->req_args;
		ASSERT(reqp->req_state == AIO_REQ_INPROGRESS ||
		    reqp->req_state == AIO_REQ_CANCELED);
		error = 0;

		/*
		 * For each read/write op: SIGAIOCANCEL is unblocked only
		 * for the duration of the (possibly blocking) system call,
		 * so a cancel can interrupt it via siglongjmp().  A pread/
		 * pwrite failing with ESPIPE (unseekable fd) falls back to
		 * plain read/write.
		 */
		switch (reqp->req_op) {
		case AIOREAD:
		case AIOAREAD:
			sigon(self);	/* unblock SIGAIOCANCEL */
			retval = pread(arg->fd, arg->buf,
			    arg->bufsz, arg->offset);
			if (retval == -1) {
				if (errno == ESPIPE) {
					retval = read(arg->fd,
					    arg->buf, arg->bufsz);
					if (retval == -1)
						error = errno;
				} else {
					error = errno;
				}
			}
			sigoff(self);	/* block SIGAIOCANCEL */
			break;
		case AIOWRITE:
		case AIOAWRITE:
			sigon(self);	/* unblock SIGAIOCANCEL */
			retval = pwrite(arg->fd, arg->buf,
			    arg->bufsz, arg->offset);
			if (retval == -1) {
				if (errno == ESPIPE) {
					retval = write(arg->fd,
					    arg->buf, arg->bufsz);
					if (retval == -1)
						error = errno;
				} else {
					error = errno;
				}
			}
			sigoff(self);	/* block SIGAIOCANCEL */
			break;
#if !defined(_LP64)
		case AIOAREAD64:
			sigon(self);	/* unblock SIGAIOCANCEL */
			retval = pread64(arg->fd, arg->buf,
			    arg->bufsz, arg->offset);
			if (retval == -1) {
				if (errno == ESPIPE) {
					retval = read(arg->fd,
					    arg->buf, arg->bufsz);
					if (retval == -1)
						error = errno;
				} else {
					error = errno;
				}
			}
			sigoff(self);	/* block SIGAIOCANCEL */
			break;
		case AIOAWRITE64:
			sigon(self);	/* unblock SIGAIOCANCEL */
			retval = pwrite64(arg->fd, arg->buf,
			    arg->bufsz, arg->offset);
			if (retval == -1) {
				if (errno == ESPIPE) {
					retval = write(arg->fd,
					    arg->buf, arg->bufsz);
					if (retval == -1)
						error = errno;
				} else {
					error = errno;
				}
			}
			sigoff(self);	/* block SIGAIOCANCEL */
			break;
#endif	/* !defined(_LP64) */
		case AIOFSYNC:
			if (_aio_fsync_del(aiowp, reqp))
				goto top;
			ASSERT(reqp->req_head == NULL);
			/*
			 * All writes for this fsync request are now
			 * acknowledged.  Now make these writes visible
			 * and put the final request into the hash table.
			 * NOTE(review): arg->offset appears to carry the
			 * O_SYNC/O_DSYNC selector for fsync requests, not
			 * a file offset — confirm against the aio_fsync()
			 * submission path.
			 */
			if (reqp->req_state == AIO_REQ_CANCELED) {
				/* EMPTY */;
			} else if (arg->offset == O_SYNC) {
				if ((retval = __fdsync(arg->fd, FSYNC)) == -1)
					error = errno;
			} else {
				if ((retval = __fdsync(arg->fd, FDSYNC)) == -1)
					error = errno;
			}
			if (_aio_hash_insert(reqp->req_resultp, reqp) != 0)
				aio_panic("_aio_do_request(): AIOFSYNC: "
				    "request already in hash table");
			break;
		default:
			aio_panic("_aio_do_request, bad op");
		}

		_aio_finish_request(aiowp, retval, error);
	}
	/* NOTREACHED */
	return (NULL);
}

/*
 * Perform the tail processing for _aio_do_request().
 * The in-progress request may or may not have been cancelled.
 */
static void
_aio_finish_request(aio_worker_t *aiowp, ssize_t retval, int error)
{
	aio_req_t *reqp;

	sig_mutex_lock(&aiowp->work_qlock1);
	/* work_req == NULL: the request was already disposed of elsewhere */
	if ((reqp = aiowp->work_req) == NULL)
		sig_mutex_unlock(&aiowp->work_qlock1);
	else {
		aiowp->work_req = NULL;
		/* a cancel that raced with completion wins */
		if (reqp->req_state == AIO_REQ_CANCELED) {
			retval = -1;
			error = ECANCELED;
		}
		if (!POSIX_AIO(reqp)) {
			int notify;
			sig_mutex_unlock(&aiowp->work_qlock1);
			sig_mutex_lock(&__aio_mutex);
			if (reqp->req_state == AIO_REQ_INPROGRESS)
				reqp->req_state = AIO_REQ_DONE;
			/*
			 * If it was canceled, this request will not be
			 * added to done list. Just free it.
			 */
			if (error == ECANCELED) {
				_aio_outstand_cnt--;
				_aio_req_free(reqp);
			} else {
				_aio_set_result(reqp, retval, error);
				_aio_req_done_cnt++;
			}
			/*
			 * Notify any thread that may have blocked
			 * because it saw an outstanding request.
			 */
			notify = 0;
			if (_aio_outstand_cnt == 0 && _aiowait_flag) {
				notify = 1;
			}
			sig_mutex_unlock(&__aio_mutex);
			if (notify) {
				(void) _kaio(AIONOTIFY);
			}
		} else {
			/* Posix aio: full completion semantics in _aiodone() */
			if (reqp->req_state == AIO_REQ_INPROGRESS)
				reqp->req_state = AIO_REQ_DONE;
			sig_mutex_unlock(&aiowp->work_qlock1);
			_aiodone(reqp, retval, error);
		}
	}
}

/*
 * Mark the application-visible aiocb as done; the largefile flag
 * selects between the aiocb_t and aiocb64_t layouts on 32-bit builds.
 */
void
_aio_req_mark_done(aio_req_t *reqp)
{
#if !defined(_LP64)
	if (reqp->req_largefile)
		((aiocb64_t *)reqp->req_aiocbp)->aio_state = USERAIO_DONE;
	else
#endif
		((aiocb_t *)reqp->req_aiocbp)->aio_state = USERAIO_DONE;
}

/*
 * Sleep for 'ticks' clock ticks to give somebody else a chance to run,
 * hopefully to consume one of our queued signals.
 */
static void
_aio_delay(int ticks)
{
	(void) usleep(ticks * (MICROSEC / hz));
}

/*
 * Actually send the notifications.
 * We could block indefinitely here if the application
 * is not listening for the signal or port notifications.
 */
static void
send_notification(notif_param_t *npp)
{
	extern int __sigqueue(pid_t pid, int signo,
	    /* const union sigval */ void *value, int si_code, int block);

	/* per-request notification: either a queued signal or an event port */
	if (npp->np_signo)
		(void) __sigqueue(__pid, npp->np_signo, npp->np_user,
		    SI_ASYNCIO, 1);
	else if (npp->np_port >= 0)
		(void) _port_dispatch(npp->np_port, 0, PORT_SOURCE_AIO,
		    npp->np_event, npp->np_object, npp->np_user);

	/* list-I/O (lio) completion notification, same two mechanisms */
	if (npp->np_lio_signo)
		(void) __sigqueue(__pid, npp->np_lio_signo, npp->np_lio_user,
		    SI_ASYNCIO, 1);
	else if (npp->np_lio_port >= 0)
		(void) _port_dispatch(npp->np_lio_port, 0, PORT_SOURCE_AIO,
		    npp->np_lio_event, npp->np_lio_object, npp->np_lio_user);
}

/*
 * Asynchronous notification worker.
 */
void *
_aio_do_notify(void *arg)
{
	aio_worker_t *aiowp = (aio_worker_t *)arg;
	aio_req_t *reqp;

	/*
	 * This isn't really necessary.  All signals are blocked.
	 */
	if (pthread_setspecific(_aio_key, aiowp) != 0)
		aio_panic("_aio_do_notify, pthread_setspecific()");

	/*
	 * Notifications are never cancelled.
	 * All signals remain blocked, forever.
	 */
	for (;;) {
		while ((reqp = _aio_req_get(aiowp)) == NULL) {
			if (_aio_idle(aiowp) != 0)
				aio_panic("_aio_do_notify: _aio_idle() failed");
		}
		send_notification(&reqp->req_notify);
		_aio_req_free(reqp);
	}

	/* NOTREACHED */
	return (NULL);
}

/*
 * Do the completion semantics for a request that was either canceled
 * by _aio_cancel_req() or was completed by _aio_do_request().
 */
static void
_aiodone(aio_req_t *reqp, ssize_t retval, int error)
{
	aio_result_t *resultp = reqp->req_resultp;
	int notify = 0;
	aio_lio_t *head;
	int sigev_none;
	int sigev_signal;
	int sigev_thread;
	int sigev_port;
	notif_param_t np;

	/*
	 * We call _aiodone() only for Posix I/O.
	 */
	ASSERT(POSIX_AIO(reqp));

	sigev_none = 0;
	sigev_signal = 0;
	sigev_thread = 0;
	sigev_port = 0;
	np.np_signo = 0;
	np.np_port = -1;
	np.np_lio_signo = 0;
	np.np_lio_port = -1;

	switch (reqp->req_sigevent.sigev_notify) {
	case SIGEV_NONE:
		sigev_none = 1;
		break;
	case SIGEV_SIGNAL:
		sigev_signal = 1;
		break;
	case SIGEV_THREAD:
		sigev_thread = 1;
		break;
	case SIGEV_PORT:
		sigev_port = 1;
		break;
	default:
		aio_panic("_aiodone: improper sigev_notify");
		break;
	}

	/*
	 * Figure out the notification parameters while holding __aio_mutex.
	 * Actually perform the notifications after dropping __aio_mutex.
	 * This allows us to sleep for a long time (if the notifications
	 * incur delays) without impeding other async I/O operations.
	 */

	sig_mutex_lock(&__aio_mutex);

	if (sigev_signal) {
		if ((np.np_signo = reqp->req_sigevent.sigev_signo) != 0)
			notify = 1;
		np.np_user = reqp->req_sigevent.sigev_value.sival_ptr;
	} else if (sigev_thread | sigev_port) {
		/* for thread/port, sigev_signo holds the event port fd */
		if ((np.np_port = reqp->req_sigevent.sigev_signo) >= 0)
			notify = 1;
		np.np_event = reqp->req_op;
		if (np.np_event == AIOFSYNC && reqp->req_largefile)
			np.np_event = AIOFSYNC64;
		np.np_object = (uintptr_t)reqp->req_aiocbp;
		np.np_user = reqp->req_sigevent.sigev_value.sival_ptr;
	}

	/* only store the result if nobody has done so yet */
	if (resultp->aio_errno == EINPROGRESS)
		_aio_set_result(reqp, retval, error);

	_aio_outstand_cnt--;

	head = reqp->req_head;
	reqp->req_head = NULL;

	if (sigev_none) {
		/* SIGEV_NONE: park on the done queue for aio_waitn() et al. */
		_aio_enq_doneq(reqp);
		reqp = NULL;
	} else {
		(void) _aio_hash_del(resultp);
		_aio_req_mark_done(reqp);
	}

	_aio_waitn_wakeup();

	/*
	 * __aio_waitn() sets AIO_WAIT_INPROGRESS and
	 * __aio_suspend() increments "_aio_kernel_suspend"
	 * when they are waiting in the kernel for completed I/Os.
	 *
	 * _kaio(AIONOTIFY) awakes the corresponding function
	 * in the kernel; then the corresponding __aio_waitn() or
	 * __aio_suspend() function could reap the recently
	 * completed I/Os (_aiodone()).
	 */
	if ((_aio_flags & AIO_WAIT_INPROGRESS) || _aio_kernel_suspend > 0)
		(void) _kaio(AIONOTIFY);

	sig_mutex_unlock(&__aio_mutex);

	if (head != NULL) {
		/*
		 * If all the lio requests have completed,
		 * prepare to notify the waiting thread.
		 */
		sig_mutex_lock(&head->lio_mutex);
		ASSERT(head->lio_refcnt == head->lio_nent);
		if (head->lio_refcnt == 1) {
			int waiting = 0;
			if (head->lio_mode == LIO_WAIT) {
				if ((waiting = head->lio_waiting) != 0)
					(void) cond_signal(&head->lio_cond_cv);
			} else if (head->lio_port < 0) { /* none or signal */
				if ((np.np_lio_signo = head->lio_signo) != 0)
					notify = 1;
				np.np_lio_user = head->lio_sigval.sival_ptr;
			} else {	/* thread or port */
				notify = 1;
				np.np_lio_port = head->lio_port;
				np.np_lio_event = head->lio_event;
				np.np_lio_object =
				    (uintptr_t)head->lio_sigevent;
				np.np_lio_user = head->lio_sigval.sival_ptr;
			}
			head->lio_nent = head->lio_refcnt = 0;
			sig_mutex_unlock(&head->lio_mutex);
			/* a LIO_WAIT waiter frees the head itself */
			if (waiting == 0)
				_aio_lio_free(head);
		} else {
			head->lio_nent--;
			head->lio_refcnt--;
			sig_mutex_unlock(&head->lio_mutex);
		}
	}

	/*
	 * The request is completed; now perform the notifications.
	 */
	if (notify) {
		if (reqp != NULL) {
			/*
			 * We usually put the request on the notification
			 * queue because we don't want to block and delay
			 * other operations behind us in the work queue.
			 * Also we must never block on a cancel notification
			 * because we are being called from an application
			 * thread in this case and that could lead to deadlock
			 * if no other thread is receiving notifications.
			 */
			reqp->req_notify = np;
			reqp->req_op = AIONOTIFY;
			_aio_req_add(reqp, &__workers_no, AIONOTIFY);
			reqp = NULL;
		} else {
			/*
			 * We already put the request on the done queue,
			 * so we can't queue it to the notification queue.
			 * Just do the notification directly.
			 */
			send_notification(&np);
		}
	}

	if (reqp != NULL)
		_aio_req_free(reqp);
}

/*
 * Delete fsync requests from list head until there is
 * only one left.  Return 0 when there is only one,
 * otherwise return a non-zero value.
 */
static int
_aio_fsync_del(aio_worker_t *aiowp, aio_req_t *reqp)
{
	aio_lio_t *head = reqp->req_head;
	int rval = 0;

	ASSERT(reqp == aiowp->work_req);
	/* lock order: worker queue lock first, then the list head's lock */
	sig_mutex_lock(&aiowp->work_qlock1);
	sig_mutex_lock(&head->lio_mutex);
	if (head->lio_refcnt > 1) {
		/*
		 * Not the last fsync request on the list: drop our
		 * reference, detach the request from the worker, and
		 * free it.  The sibling request that remains will do
		 * the actual fsync work.
		 */
		head->lio_refcnt--;
		head->lio_nent--;
		aiowp->work_req = NULL;
		sig_mutex_unlock(&head->lio_mutex);
		sig_mutex_unlock(&aiowp->work_qlock1);
		sig_mutex_lock(&__aio_mutex);
		_aio_outstand_cnt--;
		/* wake any __aio_waitn() caller; one fewer outstanding I/O */
		_aio_waitn_wakeup();
		sig_mutex_unlock(&__aio_mutex);
		_aio_req_free(reqp);
		return (1);
	}
	/* we hold the last reference on the list head */
	ASSERT(head->lio_nent == 1 && head->lio_refcnt == 1);
	reqp->req_head = NULL;
	if (head->lio_canned)
		reqp->req_state = AIO_REQ_CANCELED;
	if (head->lio_mode == LIO_DESTROY) {
		/* list is being torn down; this request is discarded too */
		aiowp->work_req = NULL;
		rval = 1;
	}
	sig_mutex_unlock(&head->lio_mutex);
	sig_mutex_unlock(&aiowp->work_qlock1);
	/*
	 * Safe to touch head after unlocking: we held the last
	 * reference, so no other thread can reach it now.
	 */
	head->lio_refcnt--;
	head->lio_nent--;
	_aio_lio_free(head);
	if (rval != 0)
		_aio_req_free(reqp);
	return (rval);
}

/*
 * A worker is set idle when its work queue is empty.
 * The worker checks again that it has no more work
 * and then goes to sleep waiting for more work.
14797c478bd9Sstevel@tonic-gate */ 1480f841f6adSraf int 14817c478bd9Sstevel@tonic-gate _aio_idle(aio_worker_t *aiowp) 14827c478bd9Sstevel@tonic-gate { 148334709573Sraf int error = 0; 148434709573Sraf 148534709573Sraf sig_mutex_lock(&aiowp->work_qlock1); 148634709573Sraf if (aiowp->work_count1 == 0) { 148734709573Sraf ASSERT(aiowp->work_minload1 == 0); 14887c478bd9Sstevel@tonic-gate aiowp->work_idleflg = 1; 14897c478bd9Sstevel@tonic-gate /* 149034709573Sraf * A cancellation handler is not needed here. 149134709573Sraf * aio worker threads are never cancelled via pthread_cancel(). 14927c478bd9Sstevel@tonic-gate */ 149334709573Sraf error = sig_cond_wait(&aiowp->work_idle_cv, 149434709573Sraf &aiowp->work_qlock1); 149534709573Sraf /* 149634709573Sraf * The idle flag is normally cleared before worker is awakened 149734709573Sraf * by aio_req_add(). On error (EINTR), we clear it ourself. 149834709573Sraf */ 149934709573Sraf if (error) 150034709573Sraf aiowp->work_idleflg = 0; 15017c478bd9Sstevel@tonic-gate } 150234709573Sraf sig_mutex_unlock(&aiowp->work_qlock1); 1503f841f6adSraf return (error); 15047c478bd9Sstevel@tonic-gate } 15057c478bd9Sstevel@tonic-gate 15067c478bd9Sstevel@tonic-gate /* 15077c478bd9Sstevel@tonic-gate * A worker's completed AIO requests are placed onto a global 150834709573Sraf * done queue. The application is only sent a SIGIO signal if 15097c478bd9Sstevel@tonic-gate * the process has a handler enabled and it is not waiting via 15107c478bd9Sstevel@tonic-gate * aiowait(). 
 */
static void
_aio_work_done(aio_worker_t *aiowp)
{
	aio_req_t *reqp;

	/*
	 * Detach the completed (non-POSIX) request from the worker's
	 * queue.  work_prev1 points at the request most recently
	 * marked done by _aio_req_get().
	 */
	sig_mutex_lock(&aiowp->work_qlock1);
	reqp = aiowp->work_prev1;
	reqp->req_next = NULL;
	aiowp->work_done1 = 0;
	aiowp->work_tail1 = aiowp->work_next1;
	if (aiowp->work_tail1 == NULL)
		aiowp->work_head1 = NULL;
	aiowp->work_prev1 = NULL;
	/* drop the queue lock before taking the global lock */
	sig_mutex_unlock(&aiowp->work_qlock1);
	sig_mutex_lock(&__aio_mutex);
	_aio_donecnt++;
	_aio_outstand_cnt--;
	_aio_req_done_cnt--;
	ASSERT(_aio_donecnt > 0 &&
	    _aio_outstand_cnt >= 0 &&
	    _aio_req_done_cnt >= 0);
	ASSERT(reqp != NULL);

	/* append to the global done queue (head is the append end) */
	if (_aio_done_tail == NULL) {
		_aio_done_head = _aio_done_tail = reqp;
	} else {
		_aio_done_head->req_next = reqp;
		_aio_done_head = reqp;
	}

	if (_aiowait_flag) {
		/* a thread is blocked in aiowait(); poke the kernel */
		sig_mutex_unlock(&__aio_mutex);
		(void) _kaio(AIONOTIFY);
	} else {
		sig_mutex_unlock(&__aio_mutex);
		/* only signal if the process installed a SIGIO handler */
		if (_sigio_enabled)
			(void) kill(__pid, SIGIO);
	}
}

/*
 * The done queue consists of AIO requests that are in either the
 * AIO_REQ_DONE or AIO_REQ_CANCELED state.  Requests that were cancelled
 * are discarded.  If the done queue is empty then NULL is returned.
 * Otherwise the address of a done aio_result_t is returned.
 */
aio_result_t *
_aio_req_done(void)
{
	aio_req_t *reqp;
	aio_result_t *resultp;

	ASSERT(MUTEX_HELD(&__aio_mutex));

	if ((reqp = _aio_done_tail) != NULL) {
		/* pop from the tail (removal end) of the done queue */
		if ((_aio_done_tail = reqp->req_next) == NULL)
			_aio_done_head = NULL;
		ASSERT(_aio_donecnt > 0);
		_aio_donecnt--;
		/* forget the result -> request mapping before freeing */
		(void) _aio_hash_del(reqp->req_resultp);
		resultp = reqp->req_resultp;
		ASSERT(reqp->req_state == AIO_REQ_DONE);
		_aio_req_free(reqp);
		return (resultp);
	}
	/* is queue empty? */
	/* reqp is necessarily NULL here; -1 marks "nothing outstanding" */
	if (reqp == NULL && _aio_outstand_cnt == 0) {
		return ((aio_result_t *)-1);
	}
	return (NULL);
}

/*
 * Set the return and errno values for the application's use.
 *
 * For the Posix interfaces, we must set the return value first followed
 * by the errno value because the Posix interfaces allow for a change
 * in the errno value from EINPROGRESS to something else to signal
 * the completion of the asynchronous request.
 *
 * The opposite is true for the Solaris interfaces.  These allow for
 * a change in the return value from AIO_INPROGRESS to something else
 * to signal the completion of the asynchronous request.
 */
void
_aio_set_result(aio_req_t *reqp, ssize_t retval, int error)
{
	aio_result_t *resultp = reqp->req_resultp;

	if (POSIX_AIO(reqp)) {
		/* Posix: value first, then errno (the completion flag) */
		resultp->aio_return = retval;
		membar_producer();
		resultp->aio_errno = error;
	} else {
		/* Solaris: errno first, then return (the completion flag) */
		resultp->aio_errno = error;
		membar_producer();
		resultp->aio_return = retval;
	}
}

/*
 * Add an AIO request onto the next work queue.
 * A circular list of workers is used to choose the next worker.
 */
void
_aio_req_add(aio_req_t *reqp, aio_worker_t **nextworker, int mode)
{
	ulwp_t *self = curthread;
	aio_worker_t *aiowp;
	aio_worker_t *first;
	int load_bal_flg = 1;	/* advance *nextworker when done */
	int found;

	ASSERT(reqp->req_state != AIO_REQ_DONEQ);
	reqp->req_next = NULL;
	/*
	 * Try to acquire the next worker's work queue.  If it is locked,
	 * then search the list of workers until a queue is found unlocked,
	 * or until the list is completely traversed at which point another
	 * worker will be created.
	 */
	sigoff(self);		/* defer SIGIO */
	sig_mutex_lock(&__aio_mutex);
	first = aiowp = *nextworker;
	if (mode != AIONOTIFY)
		_aio_outstand_cnt++;
	sig_mutex_unlock(&__aio_mutex);

	switch (mode) {
	case AIOREAD:
	case AIOWRITE:
	case AIOAREAD:
	case AIOAWRITE:
#if !defined(_LP64)
	case AIOAREAD64:
	case AIOAWRITE64:
#endif
		/* try to find an idle worker */
		found = 0;
		do {
			if (sig_mutex_trylock(&aiowp->work_qlock1) == 0) {
				if (aiowp->work_idleflg) {
					/* leave with the lock held */
					found = 1;
					break;
				}
				sig_mutex_unlock(&aiowp->work_qlock1);
			}
		} while ((aiowp = aiowp->work_forw) != first);

		if (found) {
			aiowp->work_minload1++;
			break;
		}

		/* try to acquire some worker's queue lock */
		do {
			if (sig_mutex_trylock(&aiowp->work_qlock1) == 0) {
				found = 1;
				break;
			}
		} while ((aiowp = aiowp->work_forw) != first);

		/*
		 * Create more workers when the workers appear overloaded.
		 * Either all the workers are busy draining their queues
		 * or no worker's queue lock could be acquired.
		 */
		if (!found) {
			if (_aio_worker_cnt < _max_workers) {
				if (_aio_create_worker(reqp, mode))
					aio_panic("_aio_req_add: add worker");
				sigon(self);	/* reenable SIGIO */
				return;
			}

			/*
			 * No worker available and we have created
			 * _max_workers, keep going through the
			 * list slowly until we get a lock
			 */
			while (sig_mutex_trylock(&aiowp->work_qlock1) != 0) {
				/*
				 * give someone else a chance
				 */
				_aio_delay(1);
				aiowp = aiowp->work_forw;
			}
		}

		ASSERT(MUTEX_HELD(&aiowp->work_qlock1));
		if (_aio_worker_cnt < _max_workers &&
		    aiowp->work_minload1 >= _minworkload) {
			/*
			 * This worker already carries at least the minimum
			 * load; spawn a fresh worker for this request.
			 */
			sig_mutex_unlock(&aiowp->work_qlock1);
			sig_mutex_lock(&__aio_mutex);
			*nextworker = aiowp->work_forw;
			sig_mutex_unlock(&__aio_mutex);
			if (_aio_create_worker(reqp, mode))
				aio_panic("aio_req_add: add worker");
			sigon(self);	/* reenable SIGIO */
			return;
		}
		aiowp->work_minload1++;
		break;
	case AIOFSYNC:
	case AIONOTIFY:
		/* fsync/notify requests are not load-balanced */
		load_bal_flg = 0;
		sig_mutex_lock(&aiowp->work_qlock1);
		break;
	default:
		aio_panic("_aio_req_add: invalid mode");
		break;
	}
	/*
	 * Put request onto worker's work queue.
	 */
	if (aiowp->work_tail1 == NULL) {
		ASSERT(aiowp->work_count1 == 0);
		aiowp->work_tail1 = reqp;
		aiowp->work_next1 = reqp;
	} else {
		aiowp->work_head1->req_next = reqp;
		if (aiowp->work_next1 == NULL)
			aiowp->work_next1 = reqp;
	}
	reqp->req_state = AIO_REQ_QUEUED;
	reqp->req_worker = aiowp;
	aiowp->work_head1 = reqp;
	/*
	 * Awaken worker if it is not currently active.
	 */
	if (aiowp->work_count1++ == 0 && aiowp->work_idleflg) {
		aiowp->work_idleflg = 0;
		(void) cond_signal(&aiowp->work_idle_cv);
	}
	sig_mutex_unlock(&aiowp->work_qlock1);

	if (load_bal_flg) {
		/* round-robin: start at the next worker next time */
		sig_mutex_lock(&__aio_mutex);
		*nextworker = aiowp->work_forw;
		sig_mutex_unlock(&__aio_mutex);
	}
	sigon(self);		/* reenable SIGIO */
}

/*
 * Get an AIO request for a specified worker.
 * If the work queue is empty, return NULL.
 */
aio_req_t *
_aio_req_get(aio_worker_t *aiowp)
{
	aio_req_t *reqp;

	sig_mutex_lock(&aiowp->work_qlock1);
	if ((reqp = aiowp->work_next1) != NULL) {
		/*
		 * Remove a POSIX request from the queue; the
		 * request queue is a singularly linked list
		 * with a previous pointer.  The request is
		 * removed by updating the previous pointer.
		 *
		 * Non-posix requests are left on the queue
		 * to eventually be placed on the done queue.
		 */

		if (POSIX_AIO(reqp)) {
			if (aiowp->work_prev1 == NULL) {
				aiowp->work_tail1 = reqp->req_next;
				if (aiowp->work_tail1 == NULL)
					aiowp->work_head1 = NULL;
			} else {
				aiowp->work_prev1->req_next = reqp->req_next;
				if (aiowp->work_head1 == reqp)
					aiowp->work_head1 = reqp->req_next;
			}

		} else {
			/* leave on the queue; it becomes the "done" marker */
			aiowp->work_prev1 = reqp;
			ASSERT(aiowp->work_done1 >= 0);
			aiowp->work_done1++;
		}
		ASSERT(reqp != reqp->req_next);
		aiowp->work_next1 = reqp->req_next;
		ASSERT(aiowp->work_count1 >= 1);
		aiowp->work_count1--;
		/* read/write requests also contribute to the load metric */
		switch (reqp->req_op) {
		case AIOREAD:
		case AIOWRITE:
		case AIOAREAD:
		case AIOAWRITE:
#if !defined(_LP64)
		case AIOAREAD64:
		case AIOAWRITE64:
#endif
			ASSERT(aiowp->work_minload1 > 0);
			aiowp->work_minload1--;
			break;
		}
		reqp->req_state = AIO_REQ_INPROGRESS;
	}
	aiowp->work_req = reqp;
	ASSERT(reqp != NULL || aiowp->work_count1 == 0);
	sig_mutex_unlock(&aiowp->work_qlock1);
	return (reqp);
}

/*
 * Remove a request from a worker's queue.  Caller holds work_qlock1.
 * For POSIX requests this only applies while still AIO_REQ_QUEUED;
 * in-progress non-POSIX requests are unlinked from the done portion.
 */
static void
_aio_req_del(aio_worker_t *aiowp, aio_req_t *reqp, int ostate)
{
	aio_req_t **last;
	aio_req_t *lastrp;
	aio_req_t *next;

	ASSERT(aiowp != NULL);
	ASSERT(MUTEX_HELD(&aiowp->work_qlock1));
	if (POSIX_AIO(reqp)) {
		if (ostate != AIO_REQ_QUEUED)
			return;
	}
	last = &aiowp->work_tail1;
	lastrp = aiowp->work_tail1;
	ASSERT(ostate == AIO_REQ_QUEUED || ostate == AIO_REQ_INPROGRESS);
	while ((next = *last) != NULL) {
		if (next == reqp) {
			/* unlink from the singly linked queue */
			*last = next->req_next;
			if (aiowp->work_next1 == next)
				aiowp->work_next1 = next->req_next;

			/* fix up head/prev cursors that pointed at reqp */
			if ((next->req_next != NULL) ||
			    (aiowp->work_done1 == 0)) {
				if (aiowp->work_head1 == next)
					aiowp->work_head1 = next->req_next;
				if (aiowp->work_prev1 == next)
					aiowp->work_prev1 = next->req_next;
			} else {
				if (aiowp->work_head1 == next)
					aiowp->work_head1 = lastrp;
				if (aiowp->work_prev1 == next)
					aiowp->work_prev1 = lastrp;
			}

			if (ostate == AIO_REQ_QUEUED) {
				ASSERT(aiowp->work_count1 >= 1);
				aiowp->work_count1--;
				ASSERT(aiowp->work_minload1 >= 1);
				aiowp->work_minload1--;
			} else {
				ASSERT(ostate == AIO_REQ_INPROGRESS &&
				    !POSIX_AIO(reqp));
				aiowp->work_done1--;
			}
			return;
		}
		last = &next->req_next;
		lastrp = next;
	}
	/* NOTREACHED */
}

/*
 * Append a request to the circular doubly-linked done queue
 * and mark it AIO_REQ_DONEQ.
 */
static void
_aio_enq_doneq(aio_req_t *reqp)
{
	if (_aio_doneq == NULL) {
		_aio_doneq = reqp;
		reqp->req_next = reqp->req_prev = reqp;
	} else {
		/* insert before the head, i.e. at the tail of the ring */
		reqp->req_next = _aio_doneq;
		reqp->req_prev = _aio_doneq->req_prev;
		_aio_doneq->req_prev->req_next = reqp;
		_aio_doneq->req_prev = reqp;
	}
	reqp->req_state = AIO_REQ_DONEQ;
	_aio_doneq_cnt++;
}

/*
 * caller owns the _aio_mutex
 */
aio_req_t *
_aio_req_remove(aio_req_t *reqp)
{
	/* a non-NULL argument must already be on the done queue */
	if (reqp && reqp->req_state != AIO_REQ_DONEQ)
		return (NULL);

	if (reqp) {
		/* request in done queue */
		if (_aio_doneq == reqp)
			_aio_doneq = reqp->req_next;
		if (_aio_doneq == reqp) {
			/* only one request on queue */
			_aio_doneq = NULL;
		} else {
			aio_req_t *tmp = reqp->req_next;
			reqp->req_prev->req_next = tmp;
			tmp->req_prev = reqp->req_prev;
		}
	} else if ((reqp = _aio_doneq) != NULL) {
		/* NULL argument: remove the request at the head */
		if (reqp == reqp->req_next) {
			/* only one request on queue */
			_aio_doneq = NULL;
		} else {
			reqp->req_prev->req_next = _aio_doneq = reqp->req_next;
			_aio_doneq->req_prev = reqp->req_prev;
		}
	}
	if (reqp) {
		_aio_doneq_cnt--;
		/* make it a singleton ring and mark it done */
		reqp->req_next = reqp->req_prev = reqp;
		reqp->req_state = AIO_REQ_DONE;
	}
	return (reqp);
}

/*
 * An AIO request is identified by an aio_result_t pointer.
The library 192734709573Sraf * maps this aio_result_t pointer to its internal representation using a 192834709573Sraf * hash table. This function adds an aio_result_t pointer to the hash table. 19297c478bd9Sstevel@tonic-gate */ 19307c478bd9Sstevel@tonic-gate static int 193134709573Sraf _aio_hash_insert(aio_result_t *resultp, aio_req_t *reqp) 19327c478bd9Sstevel@tonic-gate { 193334709573Sraf aio_hash_t *hashp; 193434709573Sraf aio_req_t **prev; 193534709573Sraf aio_req_t *next; 19367c478bd9Sstevel@tonic-gate 193734709573Sraf hashp = _aio_hash + AIOHASH(resultp); 1938f841f6adSraf lmutex_lock(&hashp->hash_lock); 193934709573Sraf prev = &hashp->hash_ptr; 19407c478bd9Sstevel@tonic-gate while ((next = *prev) != NULL) { 19417c478bd9Sstevel@tonic-gate if (resultp == next->req_resultp) { 1942f841f6adSraf lmutex_unlock(&hashp->hash_lock); 194334709573Sraf return (-1); 19447c478bd9Sstevel@tonic-gate } 19457c478bd9Sstevel@tonic-gate prev = &next->req_link; 19467c478bd9Sstevel@tonic-gate } 194734709573Sraf *prev = reqp; 194834709573Sraf ASSERT(reqp->req_link == NULL); 1949f841f6adSraf lmutex_unlock(&hashp->hash_lock); 195034709573Sraf return (0); 19517c478bd9Sstevel@tonic-gate } 19527c478bd9Sstevel@tonic-gate 19537c478bd9Sstevel@tonic-gate /* 195434709573Sraf * Remove an entry from the hash table. 
 */
aio_req_t *
_aio_hash_del(aio_result_t *resultp)
{
	aio_hash_t *hashp;
	aio_req_t **prev;
	aio_req_t *next = NULL;

	/* _aio_hash is NULL until userland aio has been initialized */
	if (_aio_hash != NULL) {
		hashp = _aio_hash + AIOHASH(resultp);
		lmutex_lock(&hashp->hash_lock);
		prev = &hashp->hash_ptr;
		while ((next = *prev) != NULL) {
			if (resultp == next->req_resultp) {
				/* unlink and detach the chain pointer */
				*prev = next->req_link;
				next->req_link = NULL;
				break;
			}
			prev = &next->req_link;
		}
		lmutex_unlock(&hashp->hash_lock);
	}
	/* NULL if the result pointer was not found */
	return (next);
}

/*
 * find an entry in the hash table
 */
aio_req_t *
_aio_hash_find(aio_result_t *resultp)
{
	aio_hash_t *hashp;
	aio_req_t **prev;
	aio_req_t *next = NULL;

	if (_aio_hash != NULL) {
		hashp = _aio_hash + AIOHASH(resultp);
		lmutex_lock(&hashp->hash_lock);
		prev = &hashp->hash_ptr;
		while ((next = *prev) != NULL) {
			if (resultp == next->req_resultp)
				break;
			prev = &next->req_link;
		}
		lmutex_unlock(&hashp->hash_lock);
	}
	/* NULL if not found; the entry stays on the chain */
	return (next);
}

/*
 * AIO interface for POSIX
 */
/*
 * Submit one POSIX aio request (aiocbp) in the given mode
 * (AIOAREAD/AIOAWRITE/AIOFSYNC).  Tries kernel aio first when
 * AIO_KAIO is set in flg, falling back to the userland worker
 * implementation.  lio_head is non-NULL for lio_listio() members.
 * Returns 0 on success, -1 with errno set on failure.
 */
int
_aio_rw(aiocb_t *aiocbp, aio_lio_t *lio_head, aio_worker_t **nextworker,
    int mode, int flg)
{
	aio_req_t *reqp;
	aio_args_t *ap;
	int kerr;

	if (aiocbp == NULL) {
		errno = EINVAL;
		return (-1);
	}

	/* initialize kaio */
	if (!_kaio_ok)
		_kaio_init();

	aiocbp->aio_state = NOCHECK;

	/*
	 * If we have been called because a list I/O
	 * kaio() failed, we dont want to repeat the
	 * system call
	 */

	if (flg & AIO_KAIO) {
		/*
		 * Try kernel aio first.
		 * If errno is ENOTSUP/EBADFD,
		 * fall back to the thread implementation.
		 */
		if (_kaio_ok > 0 && KAIO_SUPPORTED(aiocbp->aio_fildes)) {
			aiocbp->aio_resultp.aio_errno = EINPROGRESS;
			aiocbp->aio_state = CHECK;
			kerr = (int)_kaio(mode, aiocbp);
			if (kerr == 0)
				return (0);
			/* any other error is final */
			if (errno != ENOTSUP && errno != EBADFD) {
				aiocbp->aio_resultp.aio_errno = errno;
				aiocbp->aio_resultp.aio_return = -1;
				aiocbp->aio_state = NOCHECK;
				return (-1);
			}
			/* remember that this fd cannot use kernel aio */
			if (errno == EBADFD)
				SET_KAIO_NOT_SUPPORTED(aiocbp->aio_fildes);
		}
	}

	aiocbp->aio_resultp.aio_errno = EINPROGRESS;
	aiocbp->aio_state = USERAIO;

	if (!__uaio_ok && __uaio_init() == -1)
		return (-1);

	if ((reqp = _aio_req_alloc()) == NULL) {
		errno = EAGAIN;
		return (-1);
	}

	/*
	 * If an LIO request, add the list head to the aio request
	 */
	reqp->req_head = lio_head;
	reqp->req_type = AIO_POSIX_REQ;
	reqp->req_op = mode;
	reqp->req_largefile = 0;

	if (aiocbp->aio_sigevent.sigev_notify == SIGEV_NONE) {
		reqp->req_sigevent.sigev_notify = SIGEV_NONE;
	} else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
		reqp->req_sigevent.sigev_notify = SIGEV_SIGNAL;
		reqp->req_sigevent.sigev_signo =
		    aiocbp->aio_sigevent.sigev_signo;
		reqp->req_sigevent.sigev_value.sival_ptr =
		    aiocbp->aio_sigevent.sigev_value.sival_ptr;
	} else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_PORT) {
		port_notify_t *pn = aiocbp->aio_sigevent.sigev_value.sival_ptr;
		reqp->req_sigevent.sigev_notify = SIGEV_PORT;
		/*
		 * Reuse the sigevent structure to contain the port number
		 * and the user value.  Same for SIGEV_THREAD, below.
		 */
		reqp->req_sigevent.sigev_signo =
		    pn->portnfy_port;
		reqp->req_sigevent.sigev_value.sival_ptr =
		    pn->portnfy_user;
	} else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_THREAD) {
		reqp->req_sigevent.sigev_notify = SIGEV_THREAD;
		/*
		 * The sigevent structure contains the port number
		 * and the user value.  Same for SIGEV_PORT, above.
		 */
		reqp->req_sigevent.sigev_signo =
		    aiocbp->aio_sigevent.sigev_signo;
		reqp->req_sigevent.sigev_value.sival_ptr =
		    aiocbp->aio_sigevent.sigev_value.sival_ptr;
	}

	reqp->req_resultp = &aiocbp->aio_resultp;
	reqp->req_aiocbp = aiocbp;
	ap = &reqp->req_args;
	ap->fd = aiocbp->aio_fildes;
	ap->buf = (caddr_t)aiocbp->aio_buf;
	ap->bufsz = aiocbp->aio_nbytes;
	ap->offset = aiocbp->aio_offset;

	/* a duplicate submission of the same aiocb is a caller bug */
	if ((flg & AIO_NO_DUPS) &&
	    _aio_hash_insert(&aiocbp->aio_resultp, reqp) != 0) {
		aio_panic("_aio_rw(): request already in hash table");
		_aio_req_free(reqp);
		errno = EINVAL;
		return (-1);
	}
	_aio_req_add(reqp, nextworker, mode);
	return (0);
}

#if !defined(_LP64)
/*
 * 64-bit AIO interface for POSIX
 */
int
_aio_rw64(aiocb64_t *aiocbp, aio_lio_t *lio_head, aio_worker_t **nextworker,
    int mode, int flg)
{
	aio_req_t *reqp;
	aio_args_t *ap;
	int kerr;

	if (aiocbp == NULL) {
		errno = EINVAL;
		return (-1);
	}

	/* initialize kaio */
	if (!_kaio_ok)
_kaio_init(); 21447c478bd9Sstevel@tonic-gate 214534709573Sraf aiocbp->aio_state = NOCHECK; 21467c478bd9Sstevel@tonic-gate 21477c478bd9Sstevel@tonic-gate /* 214834709573Sraf * If we have been called because a list I/O 21497c478bd9Sstevel@tonic-gate * kaio() failed, we dont want to repeat the 21507c478bd9Sstevel@tonic-gate * system call 21517c478bd9Sstevel@tonic-gate */ 21527c478bd9Sstevel@tonic-gate 21537c478bd9Sstevel@tonic-gate if (flg & AIO_KAIO) { 21547c478bd9Sstevel@tonic-gate /* 21557c478bd9Sstevel@tonic-gate * Try kernel aio first. 21567c478bd9Sstevel@tonic-gate * If errno is ENOTSUP/EBADFD, 21577c478bd9Sstevel@tonic-gate * fall back to the thread implementation. 21587c478bd9Sstevel@tonic-gate */ 215934709573Sraf if (_kaio_ok > 0 && KAIO_SUPPORTED(aiocbp->aio_fildes)) { 216034709573Sraf aiocbp->aio_resultp.aio_errno = EINPROGRESS; 216134709573Sraf aiocbp->aio_state = CHECK; 216234709573Sraf kerr = (int)_kaio(mode, aiocbp); 21637c478bd9Sstevel@tonic-gate if (kerr == 0) 21647c478bd9Sstevel@tonic-gate return (0); 216534709573Sraf if (errno != ENOTSUP && errno != EBADFD) { 216634709573Sraf aiocbp->aio_resultp.aio_errno = errno; 216734709573Sraf aiocbp->aio_resultp.aio_return = -1; 216834709573Sraf aiocbp->aio_state = NOCHECK; 21697c478bd9Sstevel@tonic-gate return (-1); 21707c478bd9Sstevel@tonic-gate } 21717c478bd9Sstevel@tonic-gate if (errno == EBADFD) 217234709573Sraf SET_KAIO_NOT_SUPPORTED(aiocbp->aio_fildes); 21737c478bd9Sstevel@tonic-gate } 21747c478bd9Sstevel@tonic-gate } 21757c478bd9Sstevel@tonic-gate 217634709573Sraf aiocbp->aio_resultp.aio_errno = EINPROGRESS; 217734709573Sraf aiocbp->aio_state = USERAIO; 21787c478bd9Sstevel@tonic-gate 217934709573Sraf if (!__uaio_ok && __uaio_init() == -1) 218034709573Sraf return (-1); 21817c478bd9Sstevel@tonic-gate 218234709573Sraf if ((reqp = _aio_req_alloc()) == NULL) { 21837c478bd9Sstevel@tonic-gate errno = EAGAIN; 21847c478bd9Sstevel@tonic-gate return (-1); 21857c478bd9Sstevel@tonic-gate } 
21867c478bd9Sstevel@tonic-gate 21877c478bd9Sstevel@tonic-gate /* 218834709573Sraf * If an LIO request, add the list head to the aio request 21897c478bd9Sstevel@tonic-gate */ 219034709573Sraf reqp->req_head = lio_head; 219134709573Sraf reqp->req_type = AIO_POSIX_REQ; 219234709573Sraf reqp->req_op = mode; 219334709573Sraf reqp->req_largefile = 1; 219434709573Sraf 219534709573Sraf if (aiocbp->aio_sigevent.sigev_notify == SIGEV_NONE) { 219634709573Sraf reqp->req_sigevent.sigev_notify = SIGEV_NONE; 219734709573Sraf } else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_SIGNAL) { 219834709573Sraf reqp->req_sigevent.sigev_notify = SIGEV_SIGNAL; 219934709573Sraf reqp->req_sigevent.sigev_signo = 220034709573Sraf aiocbp->aio_sigevent.sigev_signo; 220134709573Sraf reqp->req_sigevent.sigev_value.sival_ptr = 220234709573Sraf aiocbp->aio_sigevent.sigev_value.sival_ptr; 220334709573Sraf } else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_PORT) { 220434709573Sraf port_notify_t *pn = aiocbp->aio_sigevent.sigev_value.sival_ptr; 220534709573Sraf reqp->req_sigevent.sigev_notify = SIGEV_PORT; 220634709573Sraf reqp->req_sigevent.sigev_signo = 220734709573Sraf pn->portnfy_port; 220834709573Sraf reqp->req_sigevent.sigev_value.sival_ptr = 220934709573Sraf pn->portnfy_user; 221034709573Sraf } else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_THREAD) { 221134709573Sraf reqp->req_sigevent.sigev_notify = SIGEV_THREAD; 221234709573Sraf reqp->req_sigevent.sigev_signo = 221334709573Sraf aiocbp->aio_sigevent.sigev_signo; 221434709573Sraf reqp->req_sigevent.sigev_value.sival_ptr = 221534709573Sraf aiocbp->aio_sigevent.sigev_value.sival_ptr; 22167c478bd9Sstevel@tonic-gate } 22177c478bd9Sstevel@tonic-gate 221834709573Sraf reqp->req_resultp = &aiocbp->aio_resultp; 221934709573Sraf reqp->req_aiocbp = aiocbp; 222034709573Sraf ap = &reqp->req_args; 222134709573Sraf ap->fd = aiocbp->aio_fildes; 222234709573Sraf ap->buf = (caddr_t)aiocbp->aio_buf; 222334709573Sraf ap->bufsz = aiocbp->aio_nbytes; 
222434709573Sraf ap->offset = aiocbp->aio_offset; 222534709573Sraf 222634709573Sraf if ((flg & AIO_NO_DUPS) && 222734709573Sraf _aio_hash_insert(&aiocbp->aio_resultp, reqp) != 0) { 2228f841f6adSraf aio_panic("_aio_rw64(): request already in hash table"); 222934709573Sraf _aio_req_free(reqp); 22307c478bd9Sstevel@tonic-gate errno = EINVAL; 22317c478bd9Sstevel@tonic-gate return (-1); 22327c478bd9Sstevel@tonic-gate } 223334709573Sraf _aio_req_add(reqp, nextworker, mode); 223434709573Sraf return (0); 22357c478bd9Sstevel@tonic-gate } 223634709573Sraf #endif /* !defined(_LP64) */ 2237