17c478bd9Sstevel@tonic-gate /* 27c478bd9Sstevel@tonic-gate * CDDL HEADER START 37c478bd9Sstevel@tonic-gate * 47c478bd9Sstevel@tonic-gate * The contents of this file are subject to the terms of the 534709573Sraf * Common Development and Distribution License (the "License"). 634709573Sraf * You may not use this file except in compliance with the License. 77c478bd9Sstevel@tonic-gate * 87c478bd9Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 97c478bd9Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing. 107c478bd9Sstevel@tonic-gate * See the License for the specific language governing permissions 117c478bd9Sstevel@tonic-gate * and limitations under the License. 127c478bd9Sstevel@tonic-gate * 137c478bd9Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each 147c478bd9Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 157c478bd9Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the 167c478bd9Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying 177c478bd9Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner] 187c478bd9Sstevel@tonic-gate * 197c478bd9Sstevel@tonic-gate * CDDL HEADER END 207c478bd9Sstevel@tonic-gate */ 2134709573Sraf 227c478bd9Sstevel@tonic-gate /* 2334709573Sraf * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 247c478bd9Sstevel@tonic-gate * Use is subject to license terms. 
257c478bd9Sstevel@tonic-gate */ 267c478bd9Sstevel@tonic-gate 277c478bd9Sstevel@tonic-gate #pragma ident "%Z%%M% %I% %E% SMI" 287c478bd9Sstevel@tonic-gate 29*f841f6adSraf #include "synonyms.h" 30*f841f6adSraf #include "thr_uberdata.h" 31*f841f6adSraf #include "asyncio.h" 3234709573Sraf #include <atomic.h> 337c478bd9Sstevel@tonic-gate #include <sys/param.h> 347c478bd9Sstevel@tonic-gate #include <sys/file.h> 357c478bd9Sstevel@tonic-gate #include <sys/port.h> 367c478bd9Sstevel@tonic-gate 377c478bd9Sstevel@tonic-gate static int _aio_hash_insert(aio_result_t *, aio_req_t *); 387c478bd9Sstevel@tonic-gate static aio_req_t *_aio_req_get(aio_worker_t *); 397c478bd9Sstevel@tonic-gate static void _aio_req_add(aio_req_t *, aio_worker_t **, int); 407c478bd9Sstevel@tonic-gate static void _aio_req_del(aio_worker_t *, aio_req_t *, int); 417c478bd9Sstevel@tonic-gate static void _aio_work_done(aio_worker_t *); 4234709573Sraf static void _aio_enq_doneq(aio_req_t *); 437c478bd9Sstevel@tonic-gate 4434709573Sraf extern void _aio_lio_free(aio_lio_t *); 457c478bd9Sstevel@tonic-gate 4634709573Sraf extern int __fdsync(int, int); 477c478bd9Sstevel@tonic-gate extern int _port_dispatch(int, int, int, int, uintptr_t, void *); 487c478bd9Sstevel@tonic-gate 49c2575b5eSraf static int _aio_fsync_del(aio_worker_t *, aio_req_t *); 5034709573Sraf static void _aiodone(aio_req_t *, ssize_t, int); 517c478bd9Sstevel@tonic-gate static void _aio_cancel_work(aio_worker_t *, int, int *, int *); 5234709573Sraf static void _aio_finish_request(aio_worker_t *, ssize_t, int); 537c478bd9Sstevel@tonic-gate 547c478bd9Sstevel@tonic-gate /* 557c478bd9Sstevel@tonic-gate * switch for kernel async I/O 567c478bd9Sstevel@tonic-gate */ 5734709573Sraf int _kaio_ok = 0; /* 0 = disabled, 1 = on, -1 = error */ 587c478bd9Sstevel@tonic-gate 597c478bd9Sstevel@tonic-gate /* 607c478bd9Sstevel@tonic-gate * Key for thread-specific data 617c478bd9Sstevel@tonic-gate */ 6234709573Sraf pthread_key_t _aio_key; 637c478bd9Sstevel@tonic-gate 
647c478bd9Sstevel@tonic-gate /* 6534709573Sraf * Array for determining whether or not a file supports kaio. 6634709573Sraf * Initialized in _kaio_init(). 677c478bd9Sstevel@tonic-gate */ 6834709573Sraf uint32_t *_kaio_supported = NULL; 697c478bd9Sstevel@tonic-gate 707c478bd9Sstevel@tonic-gate /* 7134709573Sraf * workers for read/write requests 7234709573Sraf * (__aio_mutex lock protects circular linked list of workers) 737c478bd9Sstevel@tonic-gate */ 7434709573Sraf aio_worker_t *__workers_rw; /* circular list of AIO workers */ 7534709573Sraf aio_worker_t *__nextworker_rw; /* next worker in list of workers */ 7634709573Sraf int __rw_workerscnt; /* number of read/write workers */ 777c478bd9Sstevel@tonic-gate 787c478bd9Sstevel@tonic-gate /* 7934709573Sraf * worker for notification requests. 807c478bd9Sstevel@tonic-gate */ 8134709573Sraf aio_worker_t *__workers_no; /* circular list of AIO workers */ 8234709573Sraf aio_worker_t *__nextworker_no; /* next worker in list of workers */ 8334709573Sraf int __no_workerscnt; /* number of write workers */ 847c478bd9Sstevel@tonic-gate 8534709573Sraf aio_req_t *_aio_done_tail; /* list of done requests */ 8634709573Sraf aio_req_t *_aio_done_head; 877c478bd9Sstevel@tonic-gate 887c478bd9Sstevel@tonic-gate mutex_t __aio_initlock = DEFAULTMUTEX; /* makes aio initialization atomic */ 89*f841f6adSraf cond_t __aio_initcv = DEFAULTCV; 90*f841f6adSraf int __aio_initbusy = 0; 91*f841f6adSraf 927c478bd9Sstevel@tonic-gate mutex_t __aio_mutex = DEFAULTMUTEX; /* protects counts, and linked lists */ 937c478bd9Sstevel@tonic-gate cond_t _aio_iowait_cv = DEFAULTCV; /* wait for userland I/Os */ 947c478bd9Sstevel@tonic-gate 957c478bd9Sstevel@tonic-gate pid_t __pid = (pid_t)-1; /* initialize as invalid pid */ 9634709573Sraf int _sigio_enabled = 0; /* when set, send SIGIO signal */ 977c478bd9Sstevel@tonic-gate 9834709573Sraf aio_hash_t *_aio_hash; 997c478bd9Sstevel@tonic-gate 10034709573Sraf aio_req_t *_aio_doneq; /* double linked done queue list */ 
1017c478bd9Sstevel@tonic-gate 1027c478bd9Sstevel@tonic-gate int _aio_donecnt = 0; 10334709573Sraf int _aio_waitncnt = 0; /* # of requests for aio_waitn */ 1047c478bd9Sstevel@tonic-gate int _aio_doneq_cnt = 0; 10534709573Sraf int _aio_outstand_cnt = 0; /* # of outstanding requests */ 10634709573Sraf int _kaio_outstand_cnt = 0; /* # of outstanding kaio requests */ 1077c478bd9Sstevel@tonic-gate int _aio_req_done_cnt = 0; /* req. done but not in "done queue" */ 1087c478bd9Sstevel@tonic-gate int _aio_kernel_suspend = 0; /* active kernel kaio calls */ 1097c478bd9Sstevel@tonic-gate int _aio_suscv_cnt = 0; /* aio_suspend calls waiting on cv's */ 1107c478bd9Sstevel@tonic-gate 1117c478bd9Sstevel@tonic-gate int _max_workers = 256; /* max number of workers permitted */ 112*f841f6adSraf int _min_workers = 4; /* min number of workers */ 1137c478bd9Sstevel@tonic-gate int _minworkload = 2; /* min number of request in q */ 1147c478bd9Sstevel@tonic-gate int _aio_worker_cnt = 0; /* number of workers to do requests */ 1157c478bd9Sstevel@tonic-gate int __uaio_ok = 0; /* AIO has been enabled */ 1167c478bd9Sstevel@tonic-gate sigset_t _worker_set; /* worker's signal mask */ 1177c478bd9Sstevel@tonic-gate 1187c478bd9Sstevel@tonic-gate int _aiowait_flag = 0; /* when set, aiowait() is inprogress */ 119*f841f6adSraf int _aio_flags = 0; /* see asyncio.h defines for */ 1207c478bd9Sstevel@tonic-gate 121*f841f6adSraf aio_worker_t *_kaiowp = NULL; /* points to kaio cleanup thread */ 1227c478bd9Sstevel@tonic-gate 12334709573Sraf int hz; /* clock ticks per second */ 1247c478bd9Sstevel@tonic-gate 12534709573Sraf static int 12634709573Sraf _kaio_supported_init(void) 1277c478bd9Sstevel@tonic-gate { 12834709573Sraf void *ptr; 12934709573Sraf size_t size; 13034709573Sraf 13134709573Sraf if (_kaio_supported != NULL) /* already initialized */ 13234709573Sraf return (0); 13334709573Sraf 13434709573Sraf size = MAX_KAIO_FDARRAY_SIZE * sizeof (uint32_t); 13534709573Sraf ptr = mmap(NULL, size, PROT_READ | 
PROT_WRITE, 13634709573Sraf MAP_PRIVATE | MAP_ANON, -1, (off_t)0); 13734709573Sraf if (ptr == MAP_FAILED) 13834709573Sraf return (-1); 13934709573Sraf _kaio_supported = ptr; 14034709573Sraf return (0); 1417c478bd9Sstevel@tonic-gate } 1427c478bd9Sstevel@tonic-gate 1437c478bd9Sstevel@tonic-gate /* 144*f841f6adSraf * The aio subsystem is initialized when an AIO request is made. 145*f841f6adSraf * Constants are initialized like the max number of workers that 146*f841f6adSraf * the subsystem can create, and the minimum number of workers 147*f841f6adSraf * permitted before imposing some restrictions. Also, some 148*f841f6adSraf * workers are created. 1497c478bd9Sstevel@tonic-gate */ 1507c478bd9Sstevel@tonic-gate int 1517c478bd9Sstevel@tonic-gate __uaio_init(void) 1527c478bd9Sstevel@tonic-gate { 153*f841f6adSraf int ret = -1; 1547c478bd9Sstevel@tonic-gate int i; 1557c478bd9Sstevel@tonic-gate 156*f841f6adSraf lmutex_lock(&__aio_initlock); 157*f841f6adSraf while (__aio_initbusy) 158*f841f6adSraf (void) _cond_wait(&__aio_initcv, &__aio_initlock); 15934709573Sraf if (__uaio_ok) { /* already initialized */ 160*f841f6adSraf lmutex_unlock(&__aio_initlock); 16134709573Sraf return (0); 16234709573Sraf } 163*f841f6adSraf __aio_initbusy = 1; 164*f841f6adSraf lmutex_unlock(&__aio_initlock); 1657c478bd9Sstevel@tonic-gate 16634709573Sraf hz = (int)sysconf(_SC_CLK_TCK); 16734709573Sraf __pid = getpid(); 1687c478bd9Sstevel@tonic-gate 169*f841f6adSraf setup_cancelsig(SIGAIOCANCEL); 1707c478bd9Sstevel@tonic-gate 17134709573Sraf if (_kaio_supported_init() != 0) 17234709573Sraf goto out; 1737c478bd9Sstevel@tonic-gate 17434709573Sraf /* 17534709573Sraf * Allocate and initialize the hash table. 
17634709573Sraf */ 17734709573Sraf /* LINTED pointer cast */ 17834709573Sraf _aio_hash = (aio_hash_t *)mmap(NULL, 17934709573Sraf HASHSZ * sizeof (aio_hash_t), PROT_READ | PROT_WRITE, 18034709573Sraf MAP_PRIVATE | MAP_ANON, -1, (off_t)0); 18134709573Sraf if ((void *)_aio_hash == MAP_FAILED) { 18234709573Sraf _aio_hash = NULL; 18334709573Sraf goto out; 1847c478bd9Sstevel@tonic-gate } 18534709573Sraf for (i = 0; i < HASHSZ; i++) 18634709573Sraf (void) mutex_init(&_aio_hash[i].hash_lock, USYNC_THREAD, NULL); 1877c478bd9Sstevel@tonic-gate 18834709573Sraf /* 18934709573Sraf * Initialize worker's signal mask to only catch SIGAIOCANCEL. 19034709573Sraf */ 19134709573Sraf (void) sigfillset(&_worker_set); 19234709573Sraf (void) sigdelset(&_worker_set, SIGAIOCANCEL); 19334709573Sraf 19434709573Sraf /* 195*f841f6adSraf * Create the minimum number of read/write workers. 19634709573Sraf */ 19734709573Sraf for (i = 0; i < _min_workers; i++) 19834709573Sraf (void) _aio_create_worker(NULL, AIOREAD); 19934709573Sraf 20034709573Sraf /* 20134709573Sraf * Create one worker to send asynchronous notifications. 20234709573Sraf */ 20334709573Sraf (void) _aio_create_worker(NULL, AIONOTIFY); 20434709573Sraf 20534709573Sraf ret = 0; 20634709573Sraf out: 207*f841f6adSraf lmutex_lock(&__aio_initlock); 208*f841f6adSraf if (ret == 0) 209*f841f6adSraf __uaio_ok = 1; 210*f841f6adSraf __aio_initbusy = 0; 211*f841f6adSraf (void) cond_broadcast(&__aio_initcv); 212*f841f6adSraf lmutex_unlock(&__aio_initlock); 21334709573Sraf return (ret); 2147c478bd9Sstevel@tonic-gate } 2157c478bd9Sstevel@tonic-gate 216*f841f6adSraf /* 217*f841f6adSraf * Called from close() before actually performing the real _close(). 218*f841f6adSraf */ 219*f841f6adSraf void 220*f841f6adSraf _aio_close(int fd) 221*f841f6adSraf { 222*f841f6adSraf if (fd < 0) /* avoid cancelling everything */ 223*f841f6adSraf return; 224*f841f6adSraf /* 225*f841f6adSraf * Cancel all outstanding aio requests for this file descriptor. 
226*f841f6adSraf */ 227*f841f6adSraf if (__uaio_ok) 228*f841f6adSraf (void) aiocancel_all(fd); 229*f841f6adSraf /* 230*f841f6adSraf * If we have allocated the bit array, clear the bit for this file. 231*f841f6adSraf * The next open may re-use this file descriptor and the new file 232*f841f6adSraf * may have different kaio() behaviour. 233*f841f6adSraf */ 234*f841f6adSraf if (_kaio_supported != NULL) 235*f841f6adSraf CLEAR_KAIO_SUPPORTED(fd); 236*f841f6adSraf } 237*f841f6adSraf 2387c478bd9Sstevel@tonic-gate /* 2397c478bd9Sstevel@tonic-gate * special kaio cleanup thread sits in a loop in the 2407c478bd9Sstevel@tonic-gate * kernel waiting for pending kaio requests to complete. 2417c478bd9Sstevel@tonic-gate */ 2427c478bd9Sstevel@tonic-gate void * 2437c478bd9Sstevel@tonic-gate _kaio_cleanup_thread(void *arg) 2447c478bd9Sstevel@tonic-gate { 24534709573Sraf if (pthread_setspecific(_aio_key, arg) != 0) 246*f841f6adSraf aio_panic("_kaio_cleanup_thread, pthread_setspecific()"); 2477c478bd9Sstevel@tonic-gate (void) _kaio(AIOSTART); 2487c478bd9Sstevel@tonic-gate return (arg); 2497c478bd9Sstevel@tonic-gate } 2507c478bd9Sstevel@tonic-gate 2517c478bd9Sstevel@tonic-gate /* 2527c478bd9Sstevel@tonic-gate * initialize kaio. 
2537c478bd9Sstevel@tonic-gate */ 2547c478bd9Sstevel@tonic-gate void 2557c478bd9Sstevel@tonic-gate _kaio_init() 2567c478bd9Sstevel@tonic-gate { 2577c478bd9Sstevel@tonic-gate int error; 25834709573Sraf sigset_t oset; 25934709573Sraf 260*f841f6adSraf lmutex_lock(&__aio_initlock); 261*f841f6adSraf while (__aio_initbusy) 262*f841f6adSraf (void) _cond_wait(&__aio_initcv, &__aio_initlock); 263*f841f6adSraf if (_kaio_ok) { /* already initialized */ 264*f841f6adSraf lmutex_unlock(&__aio_initlock); 265*f841f6adSraf return; 266*f841f6adSraf } 267*f841f6adSraf __aio_initbusy = 1; 268*f841f6adSraf lmutex_unlock(&__aio_initlock); 269*f841f6adSraf 27034709573Sraf if (_kaio_supported_init() != 0) 271*f841f6adSraf error = ENOMEM; 272*f841f6adSraf else if ((_kaiowp = _aio_worker_alloc()) == NULL) 273*f841f6adSraf error = ENOMEM; 274*f841f6adSraf else if ((error = (int)_kaio(AIOINIT)) == 0) { 275*f841f6adSraf (void) pthread_sigmask(SIG_SETMASK, &maskset, &oset); 276*f841f6adSraf error = thr_create(NULL, AIOSTKSIZE, _kaio_cleanup_thread, 277*f841f6adSraf _kaiowp, THR_DAEMON, &_kaiowp->work_tid); 278*f841f6adSraf (void) pthread_sigmask(SIG_SETMASK, &oset, NULL); 279*f841f6adSraf } 280*f841f6adSraf if (error && _kaiowp != NULL) { 281*f841f6adSraf _aio_worker_free(_kaiowp); 282*f841f6adSraf _kaiowp = NULL; 2837c478bd9Sstevel@tonic-gate } 284*f841f6adSraf 285*f841f6adSraf lmutex_lock(&__aio_initlock); 286*f841f6adSraf if (error) 287*f841f6adSraf _kaio_ok = -1; 288*f841f6adSraf else 289*f841f6adSraf _kaio_ok = 1; 290*f841f6adSraf __aio_initbusy = 0; 291*f841f6adSraf (void) cond_broadcast(&__aio_initcv); 292*f841f6adSraf lmutex_unlock(&__aio_initlock); 2937c478bd9Sstevel@tonic-gate } 2947c478bd9Sstevel@tonic-gate 2957c478bd9Sstevel@tonic-gate int 2967c478bd9Sstevel@tonic-gate aioread(int fd, caddr_t buf, int bufsz, off_t offset, int whence, 2977c478bd9Sstevel@tonic-gate aio_result_t *resultp) 2987c478bd9Sstevel@tonic-gate { 2997c478bd9Sstevel@tonic-gate return (_aiorw(fd, buf, bufsz, 
offset, whence, resultp, AIOREAD)); 3007c478bd9Sstevel@tonic-gate } 3017c478bd9Sstevel@tonic-gate 3027c478bd9Sstevel@tonic-gate int 3037c478bd9Sstevel@tonic-gate aiowrite(int fd, caddr_t buf, int bufsz, off_t offset, int whence, 3047c478bd9Sstevel@tonic-gate aio_result_t *resultp) 3057c478bd9Sstevel@tonic-gate { 3067c478bd9Sstevel@tonic-gate return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOWRITE)); 3077c478bd9Sstevel@tonic-gate } 3087c478bd9Sstevel@tonic-gate 30934709573Sraf #if !defined(_LP64) 3107c478bd9Sstevel@tonic-gate int 3117c478bd9Sstevel@tonic-gate aioread64(int fd, caddr_t buf, int bufsz, off64_t offset, int whence, 3127c478bd9Sstevel@tonic-gate aio_result_t *resultp) 3137c478bd9Sstevel@tonic-gate { 3147c478bd9Sstevel@tonic-gate return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOAREAD64)); 3157c478bd9Sstevel@tonic-gate } 3167c478bd9Sstevel@tonic-gate 3177c478bd9Sstevel@tonic-gate int 3187c478bd9Sstevel@tonic-gate aiowrite64(int fd, caddr_t buf, int bufsz, off64_t offset, int whence, 3197c478bd9Sstevel@tonic-gate aio_result_t *resultp) 3207c478bd9Sstevel@tonic-gate { 3217c478bd9Sstevel@tonic-gate return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOAWRITE64)); 3227c478bd9Sstevel@tonic-gate } 32334709573Sraf #endif /* !defined(_LP64) */ 3247c478bd9Sstevel@tonic-gate 3257c478bd9Sstevel@tonic-gate int 3267c478bd9Sstevel@tonic-gate _aiorw(int fd, caddr_t buf, int bufsz, offset_t offset, int whence, 3277c478bd9Sstevel@tonic-gate aio_result_t *resultp, int mode) 3287c478bd9Sstevel@tonic-gate { 32934709573Sraf aio_req_t *reqp; 33034709573Sraf aio_args_t *ap; 33134709573Sraf offset_t loffset; 3327c478bd9Sstevel@tonic-gate struct stat stat; 33334709573Sraf int error = 0; 3347c478bd9Sstevel@tonic-gate int kerr; 3357c478bd9Sstevel@tonic-gate int umode; 3367c478bd9Sstevel@tonic-gate 3377c478bd9Sstevel@tonic-gate switch (whence) { 3387c478bd9Sstevel@tonic-gate 3397c478bd9Sstevel@tonic-gate case SEEK_SET: 3407c478bd9Sstevel@tonic-gate loffset = 
offset; 3417c478bd9Sstevel@tonic-gate break; 3427c478bd9Sstevel@tonic-gate case SEEK_CUR: 3437c478bd9Sstevel@tonic-gate if ((loffset = llseek(fd, 0, SEEK_CUR)) == -1) 34434709573Sraf error = -1; 3457c478bd9Sstevel@tonic-gate else 3467c478bd9Sstevel@tonic-gate loffset += offset; 3477c478bd9Sstevel@tonic-gate break; 3487c478bd9Sstevel@tonic-gate case SEEK_END: 3497c478bd9Sstevel@tonic-gate if (fstat(fd, &stat) == -1) 35034709573Sraf error = -1; 3517c478bd9Sstevel@tonic-gate else 3527c478bd9Sstevel@tonic-gate loffset = offset + stat.st_size; 3537c478bd9Sstevel@tonic-gate break; 3547c478bd9Sstevel@tonic-gate default: 3557c478bd9Sstevel@tonic-gate errno = EINVAL; 35634709573Sraf error = -1; 3577c478bd9Sstevel@tonic-gate } 3587c478bd9Sstevel@tonic-gate 35934709573Sraf if (error) 36034709573Sraf return (error); 3617c478bd9Sstevel@tonic-gate 3627c478bd9Sstevel@tonic-gate /* initialize kaio */ 3637c478bd9Sstevel@tonic-gate if (!_kaio_ok) 3647c478bd9Sstevel@tonic-gate _kaio_init(); 3657c478bd9Sstevel@tonic-gate 3667c478bd9Sstevel@tonic-gate /* 3677c478bd9Sstevel@tonic-gate * _aio_do_request() needs the original request code (mode) to be able 36834709573Sraf * to choose the appropiate 32/64 bit function. All other functions 3697c478bd9Sstevel@tonic-gate * only require the difference between READ and WRITE (umode). 3707c478bd9Sstevel@tonic-gate */ 3717c478bd9Sstevel@tonic-gate if (mode == AIOAREAD64 || mode == AIOAWRITE64) 3727c478bd9Sstevel@tonic-gate umode = mode - AIOAREAD64; 3737c478bd9Sstevel@tonic-gate else 3747c478bd9Sstevel@tonic-gate umode = mode; 3757c478bd9Sstevel@tonic-gate 3767c478bd9Sstevel@tonic-gate /* 3777c478bd9Sstevel@tonic-gate * Try kernel aio first. 3787c478bd9Sstevel@tonic-gate * If errno is ENOTSUP/EBADFD, fall back to the thread implementation. 
3797c478bd9Sstevel@tonic-gate */ 38034709573Sraf if (_kaio_ok > 0 && KAIO_SUPPORTED(fd)) { 3817c478bd9Sstevel@tonic-gate resultp->aio_errno = 0; 38234709573Sraf sig_mutex_lock(&__aio_mutex); 38334709573Sraf _kaio_outstand_cnt++; 3847c478bd9Sstevel@tonic-gate kerr = (int)_kaio(((resultp->aio_return == AIO_INPROGRESS) ? 3857c478bd9Sstevel@tonic-gate (umode | AIO_POLL_BIT) : umode), 3867c478bd9Sstevel@tonic-gate fd, buf, bufsz, loffset, resultp); 38734709573Sraf if (kerr == 0) { 388b9868792Sraf sig_mutex_unlock(&__aio_mutex); 3897c478bd9Sstevel@tonic-gate return (0); 39034709573Sraf } 39134709573Sraf _kaio_outstand_cnt--; 39234709573Sraf sig_mutex_unlock(&__aio_mutex); 39334709573Sraf if (errno != ENOTSUP && errno != EBADFD) 3947c478bd9Sstevel@tonic-gate return (-1); 3957c478bd9Sstevel@tonic-gate if (errno == EBADFD) 3967c478bd9Sstevel@tonic-gate SET_KAIO_NOT_SUPPORTED(fd); 3977c478bd9Sstevel@tonic-gate } 3987c478bd9Sstevel@tonic-gate 39934709573Sraf if (!__uaio_ok && __uaio_init() == -1) 40034709573Sraf return (-1); 40134709573Sraf 40234709573Sraf if ((reqp = _aio_req_alloc()) == NULL) { 4037c478bd9Sstevel@tonic-gate errno = EAGAIN; 4047c478bd9Sstevel@tonic-gate return (-1); 4057c478bd9Sstevel@tonic-gate } 4067c478bd9Sstevel@tonic-gate 4077c478bd9Sstevel@tonic-gate /* 40834709573Sraf * _aio_do_request() checks reqp->req_op to differentiate 4097c478bd9Sstevel@tonic-gate * between 32 and 64 bit access. 
4107c478bd9Sstevel@tonic-gate */ 41134709573Sraf reqp->req_op = mode; 41234709573Sraf reqp->req_resultp = resultp; 41334709573Sraf ap = &reqp->req_args; 4147c478bd9Sstevel@tonic-gate ap->fd = fd; 4157c478bd9Sstevel@tonic-gate ap->buf = buf; 4167c478bd9Sstevel@tonic-gate ap->bufsz = bufsz; 4177c478bd9Sstevel@tonic-gate ap->offset = loffset; 4187c478bd9Sstevel@tonic-gate 41934709573Sraf if (_aio_hash_insert(resultp, reqp) != 0) { 42034709573Sraf _aio_req_free(reqp); 4217c478bd9Sstevel@tonic-gate errno = EINVAL; 4227c478bd9Sstevel@tonic-gate return (-1); 4237c478bd9Sstevel@tonic-gate } 42434709573Sraf /* 42534709573Sraf * _aio_req_add() only needs the difference between READ and 42634709573Sraf * WRITE to choose the right worker queue. 42734709573Sraf */ 42834709573Sraf _aio_req_add(reqp, &__nextworker_rw, umode); 42934709573Sraf return (0); 4307c478bd9Sstevel@tonic-gate } 4317c478bd9Sstevel@tonic-gate 4327c478bd9Sstevel@tonic-gate int 4337c478bd9Sstevel@tonic-gate aiocancel(aio_result_t *resultp) 4347c478bd9Sstevel@tonic-gate { 43534709573Sraf aio_req_t *reqp; 43634709573Sraf aio_worker_t *aiowp; 43734709573Sraf int ret; 43834709573Sraf int done = 0; 43934709573Sraf int canceled = 0; 4407c478bd9Sstevel@tonic-gate 4417c478bd9Sstevel@tonic-gate if (!__uaio_ok) { 4427c478bd9Sstevel@tonic-gate errno = EINVAL; 4437c478bd9Sstevel@tonic-gate return (-1); 4447c478bd9Sstevel@tonic-gate } 4457c478bd9Sstevel@tonic-gate 44634709573Sraf sig_mutex_lock(&__aio_mutex); 44734709573Sraf reqp = _aio_hash_find(resultp); 44834709573Sraf if (reqp == NULL) { 4497c478bd9Sstevel@tonic-gate if (_aio_outstand_cnt == _aio_req_done_cnt) 4507c478bd9Sstevel@tonic-gate errno = EINVAL; 4517c478bd9Sstevel@tonic-gate else 4527c478bd9Sstevel@tonic-gate errno = EACCES; 45334709573Sraf ret = -1; 4547c478bd9Sstevel@tonic-gate } else { 45534709573Sraf aiowp = reqp->req_worker; 45634709573Sraf sig_mutex_lock(&aiowp->work_qlock1); 45734709573Sraf (void) _aio_cancel_req(aiowp, reqp, &canceled, &done); 
45834709573Sraf sig_mutex_unlock(&aiowp->work_qlock1); 4597c478bd9Sstevel@tonic-gate 4607c478bd9Sstevel@tonic-gate if (canceled) { 46134709573Sraf ret = 0; 4627c478bd9Sstevel@tonic-gate } else { 46334709573Sraf if (_aio_outstand_cnt == 0 || 46434709573Sraf _aio_outstand_cnt == _aio_req_done_cnt) 46534709573Sraf errno = EINVAL; 46634709573Sraf else 46734709573Sraf errno = EACCES; 46834709573Sraf ret = -1; 4697c478bd9Sstevel@tonic-gate } 4707c478bd9Sstevel@tonic-gate } 47134709573Sraf sig_mutex_unlock(&__aio_mutex); 47234709573Sraf return (ret); 4737c478bd9Sstevel@tonic-gate } 4747c478bd9Sstevel@tonic-gate 4757c478bd9Sstevel@tonic-gate /* 4767c478bd9Sstevel@tonic-gate * This must be asynch safe 4777c478bd9Sstevel@tonic-gate */ 4787c478bd9Sstevel@tonic-gate aio_result_t * 4797c478bd9Sstevel@tonic-gate aiowait(struct timeval *uwait) 4807c478bd9Sstevel@tonic-gate { 48134709573Sraf aio_result_t *uresultp; 48234709573Sraf aio_result_t *kresultp; 48334709573Sraf aio_result_t *resultp; 4847c478bd9Sstevel@tonic-gate int dontblock; 4857c478bd9Sstevel@tonic-gate int timedwait = 0; 4867c478bd9Sstevel@tonic-gate int kaio_errno = 0; 48734709573Sraf struct timeval twait; 48834709573Sraf struct timeval *wait = NULL; 4897c478bd9Sstevel@tonic-gate hrtime_t hrtend; 4907c478bd9Sstevel@tonic-gate hrtime_t hres; 4917c478bd9Sstevel@tonic-gate 4927c478bd9Sstevel@tonic-gate if (uwait) { 4937c478bd9Sstevel@tonic-gate /* 49434709573Sraf * Check for a valid specified wait time. 49534709573Sraf * If it is invalid, fail the call right away. 
4967c478bd9Sstevel@tonic-gate */ 4977c478bd9Sstevel@tonic-gate if (uwait->tv_sec < 0 || uwait->tv_usec < 0 || 4987c478bd9Sstevel@tonic-gate uwait->tv_usec >= MICROSEC) { 4997c478bd9Sstevel@tonic-gate errno = EINVAL; 5007c478bd9Sstevel@tonic-gate return ((aio_result_t *)-1); 5017c478bd9Sstevel@tonic-gate } 5027c478bd9Sstevel@tonic-gate 50334709573Sraf if (uwait->tv_sec > 0 || uwait->tv_usec > 0) { 5047c478bd9Sstevel@tonic-gate hrtend = gethrtime() + 5057c478bd9Sstevel@tonic-gate (hrtime_t)uwait->tv_sec * NANOSEC + 5067c478bd9Sstevel@tonic-gate (hrtime_t)uwait->tv_usec * (NANOSEC / MICROSEC); 5077c478bd9Sstevel@tonic-gate twait = *uwait; 5087c478bd9Sstevel@tonic-gate wait = &twait; 5097c478bd9Sstevel@tonic-gate timedwait++; 5107c478bd9Sstevel@tonic-gate } else { 5117c478bd9Sstevel@tonic-gate /* polling */ 51234709573Sraf sig_mutex_lock(&__aio_mutex); 51334709573Sraf if (_kaio_outstand_cnt == 0) { 51434709573Sraf kresultp = (aio_result_t *)-1; 51534709573Sraf } else { 51634709573Sraf kresultp = (aio_result_t *)_kaio(AIOWAIT, 51734709573Sraf (struct timeval *)-1, 1); 51834709573Sraf if (kresultp != (aio_result_t *)-1 && 51934709573Sraf kresultp != NULL && 52034709573Sraf kresultp != (aio_result_t *)1) { 52134709573Sraf _kaio_outstand_cnt--; 52234709573Sraf sig_mutex_unlock(&__aio_mutex); 52334709573Sraf return (kresultp); 52434709573Sraf } 52534709573Sraf } 5267c478bd9Sstevel@tonic-gate uresultp = _aio_req_done(); 52734709573Sraf sig_mutex_unlock(&__aio_mutex); 52834709573Sraf if (uresultp != NULL && 52934709573Sraf uresultp != (aio_result_t *)-1) { 5307c478bd9Sstevel@tonic-gate return (uresultp); 5317c478bd9Sstevel@tonic-gate } 5327c478bd9Sstevel@tonic-gate if (uresultp == (aio_result_t *)-1 && 5337c478bd9Sstevel@tonic-gate kresultp == (aio_result_t *)-1) { 5347c478bd9Sstevel@tonic-gate errno = EINVAL; 5357c478bd9Sstevel@tonic-gate return ((aio_result_t *)-1); 53634709573Sraf } else { 5377c478bd9Sstevel@tonic-gate return (NULL); 53834709573Sraf } 
5397c478bd9Sstevel@tonic-gate } 5407c478bd9Sstevel@tonic-gate } 5417c478bd9Sstevel@tonic-gate 5427c478bd9Sstevel@tonic-gate for (;;) { 54334709573Sraf sig_mutex_lock(&__aio_mutex); 5447c478bd9Sstevel@tonic-gate uresultp = _aio_req_done(); 5457c478bd9Sstevel@tonic-gate if (uresultp != NULL && uresultp != (aio_result_t *)-1) { 54634709573Sraf sig_mutex_unlock(&__aio_mutex); 5477c478bd9Sstevel@tonic-gate resultp = uresultp; 5487c478bd9Sstevel@tonic-gate break; 5497c478bd9Sstevel@tonic-gate } 5507c478bd9Sstevel@tonic-gate _aiowait_flag++; 5517c478bd9Sstevel@tonic-gate dontblock = (uresultp == (aio_result_t *)-1); 55234709573Sraf if (dontblock && _kaio_outstand_cnt == 0) { 55334709573Sraf kresultp = (aio_result_t *)-1; 55434709573Sraf kaio_errno = EINVAL; 55534709573Sraf } else { 55634709573Sraf sig_mutex_unlock(&__aio_mutex); 55734709573Sraf kresultp = (aio_result_t *)_kaio(AIOWAIT, 55834709573Sraf wait, dontblock); 55934709573Sraf sig_mutex_lock(&__aio_mutex); 56034709573Sraf kaio_errno = errno; 56134709573Sraf } 5627c478bd9Sstevel@tonic-gate _aiowait_flag--; 56334709573Sraf sig_mutex_unlock(&__aio_mutex); 5647c478bd9Sstevel@tonic-gate if (kresultp == (aio_result_t *)1) { 5657c478bd9Sstevel@tonic-gate /* aiowait() awakened by an aionotify() */ 5667c478bd9Sstevel@tonic-gate continue; 56734709573Sraf } else if (kresultp != NULL && 56834709573Sraf kresultp != (aio_result_t *)-1) { 5697c478bd9Sstevel@tonic-gate resultp = kresultp; 57034709573Sraf sig_mutex_lock(&__aio_mutex); 57134709573Sraf _kaio_outstand_cnt--; 57234709573Sraf sig_mutex_unlock(&__aio_mutex); 5737c478bd9Sstevel@tonic-gate break; 57434709573Sraf } else if (kresultp == (aio_result_t *)-1 && 57534709573Sraf kaio_errno == EINVAL && 57634709573Sraf uresultp == (aio_result_t *)-1) { 5777c478bd9Sstevel@tonic-gate errno = kaio_errno; 5787c478bd9Sstevel@tonic-gate resultp = (aio_result_t *)-1; 5797c478bd9Sstevel@tonic-gate break; 5807c478bd9Sstevel@tonic-gate } else if (kresultp == (aio_result_t *)-1 && 
5817c478bd9Sstevel@tonic-gate kaio_errno == EINTR) { 5827c478bd9Sstevel@tonic-gate errno = kaio_errno; 5837c478bd9Sstevel@tonic-gate resultp = (aio_result_t *)-1; 5847c478bd9Sstevel@tonic-gate break; 5857c478bd9Sstevel@tonic-gate } else if (timedwait) { 5867c478bd9Sstevel@tonic-gate hres = hrtend - gethrtime(); 5877c478bd9Sstevel@tonic-gate if (hres <= 0) { 58834709573Sraf /* time is up; return */ 5897c478bd9Sstevel@tonic-gate resultp = NULL; 5907c478bd9Sstevel@tonic-gate break; 5917c478bd9Sstevel@tonic-gate } else { 5927c478bd9Sstevel@tonic-gate /* 59334709573Sraf * Some time left. Round up the remaining time 59434709573Sraf * in nanoseconds to microsec. Retry the call. 5957c478bd9Sstevel@tonic-gate */ 59634709573Sraf hres += (NANOSEC / MICROSEC) - 1; 5977c478bd9Sstevel@tonic-gate wait->tv_sec = hres / NANOSEC; 5987c478bd9Sstevel@tonic-gate wait->tv_usec = 5997c478bd9Sstevel@tonic-gate (hres % NANOSEC) / (NANOSEC / MICROSEC); 6007c478bd9Sstevel@tonic-gate } 6017c478bd9Sstevel@tonic-gate } else { 60234709573Sraf ASSERT(kresultp == NULL && uresultp == NULL); 6037c478bd9Sstevel@tonic-gate resultp = NULL; 6047c478bd9Sstevel@tonic-gate continue; 6057c478bd9Sstevel@tonic-gate } 6067c478bd9Sstevel@tonic-gate } 6077c478bd9Sstevel@tonic-gate return (resultp); 6087c478bd9Sstevel@tonic-gate } 6097c478bd9Sstevel@tonic-gate 6107c478bd9Sstevel@tonic-gate /* 6117c478bd9Sstevel@tonic-gate * _aio_get_timedelta calculates the remaining time and stores the result 61234709573Sraf * into timespec_t *wait. 
6137c478bd9Sstevel@tonic-gate */ 6147c478bd9Sstevel@tonic-gate 6157c478bd9Sstevel@tonic-gate int 61634709573Sraf _aio_get_timedelta(timespec_t *end, timespec_t *wait) 6177c478bd9Sstevel@tonic-gate { 6187c478bd9Sstevel@tonic-gate int ret = 0; 6197c478bd9Sstevel@tonic-gate struct timeval cur; 62034709573Sraf timespec_t curtime; 6217c478bd9Sstevel@tonic-gate 6227c478bd9Sstevel@tonic-gate (void) gettimeofday(&cur, NULL); 6237c478bd9Sstevel@tonic-gate curtime.tv_sec = cur.tv_sec; 6247c478bd9Sstevel@tonic-gate curtime.tv_nsec = cur.tv_usec * 1000; /* convert us to ns */ 6257c478bd9Sstevel@tonic-gate 6267c478bd9Sstevel@tonic-gate if (end->tv_sec >= curtime.tv_sec) { 6277c478bd9Sstevel@tonic-gate wait->tv_sec = end->tv_sec - curtime.tv_sec; 6287c478bd9Sstevel@tonic-gate if (end->tv_nsec >= curtime.tv_nsec) { 6297c478bd9Sstevel@tonic-gate wait->tv_nsec = end->tv_nsec - curtime.tv_nsec; 6307c478bd9Sstevel@tonic-gate if (wait->tv_sec == 0 && wait->tv_nsec == 0) 6317c478bd9Sstevel@tonic-gate ret = -1; /* timer expired */ 6327c478bd9Sstevel@tonic-gate } else { 6337c478bd9Sstevel@tonic-gate if (end->tv_sec > curtime.tv_sec) { 6347c478bd9Sstevel@tonic-gate wait->tv_sec -= 1; 6357c478bd9Sstevel@tonic-gate wait->tv_nsec = NANOSEC - 6367c478bd9Sstevel@tonic-gate (curtime.tv_nsec - end->tv_nsec); 6377c478bd9Sstevel@tonic-gate } else { 6387c478bd9Sstevel@tonic-gate ret = -1; /* timer expired */ 6397c478bd9Sstevel@tonic-gate } 6407c478bd9Sstevel@tonic-gate } 6417c478bd9Sstevel@tonic-gate } else { 6427c478bd9Sstevel@tonic-gate ret = -1; 6437c478bd9Sstevel@tonic-gate } 6447c478bd9Sstevel@tonic-gate return (ret); 6457c478bd9Sstevel@tonic-gate } 6467c478bd9Sstevel@tonic-gate 6477c478bd9Sstevel@tonic-gate /* 6487c478bd9Sstevel@tonic-gate * If closing by file descriptor: we will simply cancel all the outstanding 64934709573Sraf * aio`s and return. Those aio's in question will have either noticed the 6507c478bd9Sstevel@tonic-gate * cancellation notice before, during, or after initiating io. 
6517c478bd9Sstevel@tonic-gate */ 6527c478bd9Sstevel@tonic-gate int 6537c478bd9Sstevel@tonic-gate aiocancel_all(int fd) 6547c478bd9Sstevel@tonic-gate { 65534709573Sraf aio_req_t *reqp; 65634709573Sraf aio_req_t **reqpp; 65734709573Sraf aio_worker_t *first; 65834709573Sraf aio_worker_t *next; 6597c478bd9Sstevel@tonic-gate int canceled = 0; 6607c478bd9Sstevel@tonic-gate int done = 0; 6617c478bd9Sstevel@tonic-gate int cancelall = 0; 6627c478bd9Sstevel@tonic-gate 66334709573Sraf sig_mutex_lock(&__aio_mutex); 6647c478bd9Sstevel@tonic-gate 66534709573Sraf if (_aio_outstand_cnt == 0) { 66634709573Sraf sig_mutex_unlock(&__aio_mutex); 66734709573Sraf return (AIO_ALLDONE); 66834709573Sraf } 6697c478bd9Sstevel@tonic-gate 6707c478bd9Sstevel@tonic-gate /* 67134709573Sraf * Cancel requests from the read/write workers' queues. 6727c478bd9Sstevel@tonic-gate */ 67334709573Sraf first = __nextworker_rw; 6747c478bd9Sstevel@tonic-gate next = first; 6757c478bd9Sstevel@tonic-gate do { 6767c478bd9Sstevel@tonic-gate _aio_cancel_work(next, fd, &canceled, &done); 6777c478bd9Sstevel@tonic-gate } while ((next = next->work_forw) != first); 6787c478bd9Sstevel@tonic-gate 6797c478bd9Sstevel@tonic-gate /* 6807c478bd9Sstevel@tonic-gate * finally, check if there are requests on the done queue that 6817c478bd9Sstevel@tonic-gate * should be canceled. 
6827c478bd9Sstevel@tonic-gate */ 6837c478bd9Sstevel@tonic-gate if (fd < 0) 6847c478bd9Sstevel@tonic-gate cancelall = 1; 68534709573Sraf reqpp = &_aio_done_tail; 68634709573Sraf while ((reqp = *reqpp) != NULL) { 68734709573Sraf if (cancelall || reqp->req_args.fd == fd) { 68834709573Sraf *reqpp = reqp->req_next; 6897c478bd9Sstevel@tonic-gate _aio_donecnt--; 69034709573Sraf (void) _aio_hash_del(reqp->req_resultp); 69134709573Sraf _aio_req_free(reqp); 6927c478bd9Sstevel@tonic-gate } else 69334709573Sraf reqpp = &reqp->req_next; 6947c478bd9Sstevel@tonic-gate } 6957c478bd9Sstevel@tonic-gate if (cancelall) { 6967c478bd9Sstevel@tonic-gate ASSERT(_aio_donecnt == 0); 6977c478bd9Sstevel@tonic-gate _aio_done_head = NULL; 6987c478bd9Sstevel@tonic-gate } 69934709573Sraf sig_mutex_unlock(&__aio_mutex); 7007c478bd9Sstevel@tonic-gate 7017c478bd9Sstevel@tonic-gate if (canceled && done == 0) 7027c478bd9Sstevel@tonic-gate return (AIO_CANCELED); 7037c478bd9Sstevel@tonic-gate else if (done && canceled == 0) 7047c478bd9Sstevel@tonic-gate return (AIO_ALLDONE); 7057c478bd9Sstevel@tonic-gate else if ((canceled + done == 0) && KAIO_SUPPORTED(fd)) 7067c478bd9Sstevel@tonic-gate return ((int)_kaio(AIOCANCEL, fd, NULL)); 7077c478bd9Sstevel@tonic-gate return (AIO_NOTCANCELED); 7087c478bd9Sstevel@tonic-gate } 7097c478bd9Sstevel@tonic-gate 7107c478bd9Sstevel@tonic-gate /* 71134709573Sraf * Cancel requests from a given work queue. If the file descriptor 71234709573Sraf * parameter, fd, is non-negative, then only cancel those requests 71334709573Sraf * in this queue that are to this file descriptor. If the fd 7147c478bd9Sstevel@tonic-gate * parameter is -1, then cancel all requests. 
7157c478bd9Sstevel@tonic-gate */ 7167c478bd9Sstevel@tonic-gate static void 7177c478bd9Sstevel@tonic-gate _aio_cancel_work(aio_worker_t *aiowp, int fd, int *canceled, int *done) 7187c478bd9Sstevel@tonic-gate { 71934709573Sraf aio_req_t *reqp; 7207c478bd9Sstevel@tonic-gate 72134709573Sraf sig_mutex_lock(&aiowp->work_qlock1); 7227c478bd9Sstevel@tonic-gate /* 7237c478bd9Sstevel@tonic-gate * cancel queued requests first. 7247c478bd9Sstevel@tonic-gate */ 72534709573Sraf reqp = aiowp->work_tail1; 72634709573Sraf while (reqp != NULL) { 72734709573Sraf if (fd < 0 || reqp->req_args.fd == fd) { 72834709573Sraf if (_aio_cancel_req(aiowp, reqp, canceled, done)) { 7297c478bd9Sstevel@tonic-gate /* 73034709573Sraf * Callers locks were dropped. 73134709573Sraf * reqp is invalid; start traversing 73234709573Sraf * the list from the beginning again. 7337c478bd9Sstevel@tonic-gate */ 73434709573Sraf reqp = aiowp->work_tail1; 7357c478bd9Sstevel@tonic-gate continue; 7367c478bd9Sstevel@tonic-gate } 7377c478bd9Sstevel@tonic-gate } 73834709573Sraf reqp = reqp->req_next; 7397c478bd9Sstevel@tonic-gate } 7407c478bd9Sstevel@tonic-gate /* 74134709573Sraf * Since the queued requests have been canceled, there can 74234709573Sraf * only be one inprogress request that should be canceled. 7437c478bd9Sstevel@tonic-gate */ 74434709573Sraf if ((reqp = aiowp->work_req) != NULL && 74534709573Sraf (fd < 0 || reqp->req_args.fd == fd)) 74634709573Sraf (void) _aio_cancel_req(aiowp, reqp, canceled, done); 74734709573Sraf sig_mutex_unlock(&aiowp->work_qlock1); 7487c478bd9Sstevel@tonic-gate } 7497c478bd9Sstevel@tonic-gate 7507c478bd9Sstevel@tonic-gate /* 75134709573Sraf * Cancel a request. Return 1 if the callers locks were temporarily 7527c478bd9Sstevel@tonic-gate * dropped, otherwise return 0. 
7537c478bd9Sstevel@tonic-gate */ 7547c478bd9Sstevel@tonic-gate int 75534709573Sraf _aio_cancel_req(aio_worker_t *aiowp, aio_req_t *reqp, int *canceled, int *done) 7567c478bd9Sstevel@tonic-gate { 75734709573Sraf int ostate = reqp->req_state; 7587c478bd9Sstevel@tonic-gate 7597c478bd9Sstevel@tonic-gate ASSERT(MUTEX_HELD(&__aio_mutex)); 7607c478bd9Sstevel@tonic-gate ASSERT(MUTEX_HELD(&aiowp->work_qlock1)); 76134709573Sraf if (ostate == AIO_REQ_CANCELED) 7627c478bd9Sstevel@tonic-gate return (0); 7637c478bd9Sstevel@tonic-gate if (ostate == AIO_REQ_DONE || ostate == AIO_REQ_DONEQ) { 7647c478bd9Sstevel@tonic-gate (*done)++; 7657c478bd9Sstevel@tonic-gate return (0); 7667c478bd9Sstevel@tonic-gate } 767c2575b5eSraf if (reqp->req_op == AIOFSYNC && reqp != aiowp->work_req) { 76834709573Sraf ASSERT(POSIX_AIO(reqp)); 769c2575b5eSraf /* Cancel the queued aio_fsync() request */ 77034709573Sraf if (!reqp->req_head->lio_canned) { 77134709573Sraf reqp->req_head->lio_canned = 1; 77234709573Sraf _aio_outstand_cnt--; 77334709573Sraf (*canceled)++; 77434709573Sraf } 7757c478bd9Sstevel@tonic-gate return (0); 7767c478bd9Sstevel@tonic-gate } 77734709573Sraf reqp->req_state = AIO_REQ_CANCELED; 77834709573Sraf _aio_req_del(aiowp, reqp, ostate); 77934709573Sraf (void) _aio_hash_del(reqp->req_resultp); 7807c478bd9Sstevel@tonic-gate (*canceled)++; 78134709573Sraf if (reqp == aiowp->work_req) { 78234709573Sraf ASSERT(ostate == AIO_REQ_INPROGRESS); 78334709573Sraf /* 78434709573Sraf * Set the result values now, before _aiodone() is called. 78534709573Sraf * We do this because the application can expect aio_return 78634709573Sraf * and aio_errno to be set to -1 and ECANCELED, respectively, 78734709573Sraf * immediately after a successful return from aiocancel() 78834709573Sraf * or aio_cancel(). 
78934709573Sraf */ 79034709573Sraf _aio_set_result(reqp, -1, ECANCELED); 79134709573Sraf (void) thr_kill(aiowp->work_tid, SIGAIOCANCEL); 79234709573Sraf return (0); 79334709573Sraf } 79434709573Sraf if (!POSIX_AIO(reqp)) { 79534709573Sraf _aio_outstand_cnt--; 79634709573Sraf _aio_set_result(reqp, -1, ECANCELED); 79734709573Sraf return (0); 79834709573Sraf } 79934709573Sraf sig_mutex_unlock(&aiowp->work_qlock1); 80034709573Sraf sig_mutex_unlock(&__aio_mutex); 80134709573Sraf _aiodone(reqp, -1, ECANCELED); 80234709573Sraf sig_mutex_lock(&__aio_mutex); 80334709573Sraf sig_mutex_lock(&aiowp->work_qlock1); 8047c478bd9Sstevel@tonic-gate return (1); 8057c478bd9Sstevel@tonic-gate } 8067c478bd9Sstevel@tonic-gate 807*f841f6adSraf int 808*f841f6adSraf _aio_create_worker(aio_req_t *reqp, int mode) 809*f841f6adSraf { 810*f841f6adSraf aio_worker_t *aiowp, **workers, **nextworker; 811*f841f6adSraf int *aio_workerscnt; 812*f841f6adSraf void *(*func)(void *); 813*f841f6adSraf sigset_t oset; 814*f841f6adSraf int error; 815*f841f6adSraf 816*f841f6adSraf /* 817*f841f6adSraf * Put the new worker thread in the right queue. 
818*f841f6adSraf */ 819*f841f6adSraf switch (mode) { 820*f841f6adSraf case AIOREAD: 821*f841f6adSraf case AIOWRITE: 822*f841f6adSraf case AIOAREAD: 823*f841f6adSraf case AIOAWRITE: 824*f841f6adSraf #if !defined(_LP64) 825*f841f6adSraf case AIOAREAD64: 826*f841f6adSraf case AIOAWRITE64: 827*f841f6adSraf #endif 828*f841f6adSraf workers = &__workers_rw; 829*f841f6adSraf nextworker = &__nextworker_rw; 830*f841f6adSraf aio_workerscnt = &__rw_workerscnt; 831*f841f6adSraf func = _aio_do_request; 832*f841f6adSraf break; 833*f841f6adSraf case AIONOTIFY: 834*f841f6adSraf workers = &__workers_no; 835*f841f6adSraf nextworker = &__nextworker_no; 836*f841f6adSraf func = _aio_do_notify; 837*f841f6adSraf aio_workerscnt = &__no_workerscnt; 838*f841f6adSraf break; 839*f841f6adSraf default: 840*f841f6adSraf aio_panic("_aio_create_worker: invalid mode"); 841*f841f6adSraf break; 842*f841f6adSraf } 843*f841f6adSraf 844*f841f6adSraf if ((aiowp = _aio_worker_alloc()) == NULL) 845*f841f6adSraf return (-1); 846*f841f6adSraf 847*f841f6adSraf if (reqp) { 848*f841f6adSraf reqp->req_state = AIO_REQ_QUEUED; 849*f841f6adSraf reqp->req_worker = aiowp; 850*f841f6adSraf aiowp->work_head1 = reqp; 851*f841f6adSraf aiowp->work_tail1 = reqp; 852*f841f6adSraf aiowp->work_next1 = reqp; 853*f841f6adSraf aiowp->work_count1 = 1; 854*f841f6adSraf aiowp->work_minload1 = 1; 855*f841f6adSraf } 856*f841f6adSraf 857*f841f6adSraf (void) pthread_sigmask(SIG_SETMASK, &maskset, &oset); 858*f841f6adSraf error = thr_create(NULL, AIOSTKSIZE, func, aiowp, 859*f841f6adSraf THR_DAEMON | THR_SUSPENDED, &aiowp->work_tid); 860*f841f6adSraf (void) pthread_sigmask(SIG_SETMASK, &oset, NULL); 861*f841f6adSraf if (error) { 862*f841f6adSraf if (reqp) { 863*f841f6adSraf reqp->req_state = 0; 864*f841f6adSraf reqp->req_worker = NULL; 865*f841f6adSraf } 866*f841f6adSraf _aio_worker_free(aiowp); 867*f841f6adSraf return (-1); 868*f841f6adSraf } 869*f841f6adSraf 870*f841f6adSraf lmutex_lock(&__aio_mutex); 871*f841f6adSraf 
(*aio_workerscnt)++; 872*f841f6adSraf if (*workers == NULL) { 873*f841f6adSraf aiowp->work_forw = aiowp; 874*f841f6adSraf aiowp->work_backw = aiowp; 875*f841f6adSraf *nextworker = aiowp; 876*f841f6adSraf *workers = aiowp; 877*f841f6adSraf } else { 878*f841f6adSraf aiowp->work_backw = (*workers)->work_backw; 879*f841f6adSraf aiowp->work_forw = (*workers); 880*f841f6adSraf (*workers)->work_backw->work_forw = aiowp; 881*f841f6adSraf (*workers)->work_backw = aiowp; 882*f841f6adSraf } 883*f841f6adSraf _aio_worker_cnt++; 884*f841f6adSraf lmutex_unlock(&__aio_mutex); 885*f841f6adSraf 886*f841f6adSraf (void) thr_continue(aiowp->work_tid); 887*f841f6adSraf 888*f841f6adSraf return (0); 889*f841f6adSraf } 890*f841f6adSraf 8917c478bd9Sstevel@tonic-gate /* 8927c478bd9Sstevel@tonic-gate * This is the worker's main routine. 8937c478bd9Sstevel@tonic-gate * The task of this function is to execute all queued requests; 8947c478bd9Sstevel@tonic-gate * once the last pending request is executed this function will block 89534709573Sraf * in _aio_idle(). A new incoming request must wakeup this thread to 8967c478bd9Sstevel@tonic-gate * restart the work. 89734709573Sraf * Every worker has an own work queue. The queue lock is required 8987c478bd9Sstevel@tonic-gate * to synchronize the addition of new requests for this worker or 8997c478bd9Sstevel@tonic-gate * cancellation of pending/running requests. 9007c478bd9Sstevel@tonic-gate * 9017c478bd9Sstevel@tonic-gate * Cancellation scenarios: 9027c478bd9Sstevel@tonic-gate * The cancellation of a request is being done asynchronously using 9037c478bd9Sstevel@tonic-gate * _aio_cancel_req() from another thread context. 
 * A queued request can be cancelled in different ways:
 * a) request is queued but not "in progress" or "done" (AIO_REQ_QUEUED):
 *	- lock the queue -> remove the request -> unlock the queue
 *	- this function/thread does not detect this cancellation process
 * b) request is in progress (AIO_REQ_INPROGRESS):
 *	- this function first allows the cancellation of the running
 *	  request with the flag "work_cancel_flg=1"
 *		see _aio_req_get() -> _aio_cancel_on()
 *	  During this phase, it is allowed to interrupt the worker
 *	  thread running the request (this thread) using the SIGAIOCANCEL
 *	  signal.
 *	  Once this thread returns from the kernel (because the request
 *	  is just done), then it must disable a possible cancellation
 *	  and proceed to finish the request.  To disable the cancellation
 *	  this thread must use _aio_cancel_off() to set "work_cancel_flg=0".
 * c) request is already done (AIO_REQ_DONE || AIO_REQ_DONEQ):
 *	  same procedure as in a)
 *
 * To b)
 *	This thread uses sigsetjmp() to define the position in the code where
 *	it wishes to continue working in the case that a SIGAIOCANCEL signal
 *	is detected.
 *	Normally this thread should get the cancellation signal during the
 *	kernel phase (reading or writing).
 *	In that case the signal handler
 *	aiosigcancelhndlr() is activated using the worker thread context,
 *	which again will use the siglongjmp() function to break the standard
 *	code flow and jump to the "sigsetjmp" position, provided that
 *	"work_cancel_flg" is set to "1".
 *	Because the "work_cancel_flg" is only manipulated by this worker
 *	thread and it can only run on one CPU at a given time, it is not
 *	necessary to protect that flag with the queue lock.
 *	Returning from the kernel (read or write system call) we must
 *	first disable the use of the SIGAIOCANCEL signal and accordingly
 *	the use of the siglongjmp() function to prevent a possible deadlock:
 *	- It can happen that this worker thread returns from the kernel and
 *	  blocks in "work_qlock1",
 *	- then a second thread cancels the apparently "in progress" request
 *	  and sends the SIGAIOCANCEL signal to the worker thread,
 *	- the worker thread gets assigned the "work_qlock1" and will return
 *	  from the kernel,
 *	- the kernel detects the pending signal and activates the signal
 *	  handler instead,
 *	- if the "work_cancel_flg" is still set then the signal handler
 *	  should use siglongjmp() to cancel the "in progress" request and
 *	  it would try to acquire the same work_qlock1 in _aio_req_get()
 *	  for a second time => deadlock.
9507c478bd9Sstevel@tonic-gate * To avoid that situation we disable the cancellation of the request 9517c478bd9Sstevel@tonic-gate * in progress BEFORE we try to acquire the work_qlock1. 9527c478bd9Sstevel@tonic-gate * In that case the signal handler will not call siglongjmp() and the 9537c478bd9Sstevel@tonic-gate * worker thread will continue running the standard code flow. 9547c478bd9Sstevel@tonic-gate * Then this thread must check the AIO_REQ_CANCELED flag to emulate 9557c478bd9Sstevel@tonic-gate * an eventually required siglongjmp() freeing the work_qlock1 and 9567c478bd9Sstevel@tonic-gate * avoiding a deadlock. 9577c478bd9Sstevel@tonic-gate */ 9587c478bd9Sstevel@tonic-gate void * 9597c478bd9Sstevel@tonic-gate _aio_do_request(void *arglist) 9607c478bd9Sstevel@tonic-gate { 9617c478bd9Sstevel@tonic-gate aio_worker_t *aiowp = (aio_worker_t *)arglist; 962*f841f6adSraf ulwp_t *self = curthread; 9637c478bd9Sstevel@tonic-gate struct aio_args *arg; 96434709573Sraf aio_req_t *reqp; /* current AIO request */ 9657c478bd9Sstevel@tonic-gate ssize_t retval; 96634709573Sraf int error; 9677c478bd9Sstevel@tonic-gate 96834709573Sraf if (pthread_setspecific(_aio_key, aiowp) != 0) 969*f841f6adSraf aio_panic("_aio_do_request, pthread_setspecific()"); 97034709573Sraf (void) pthread_sigmask(SIG_SETMASK, &_worker_set, NULL); 97134709573Sraf ASSERT(aiowp->work_req == NULL); 9727c478bd9Sstevel@tonic-gate 97334709573Sraf /* 97434709573Sraf * We resume here when an operation is cancelled. 97534709573Sraf * On first entry, aiowp->work_req == NULL, so all 97634709573Sraf * we do is block SIGAIOCANCEL. 
97734709573Sraf */ 97834709573Sraf (void) sigsetjmp(aiowp->work_jmp_buf, 0); 979*f841f6adSraf ASSERT(self->ul_sigdefer == 0); 9807c478bd9Sstevel@tonic-gate 981*f841f6adSraf sigoff(self); /* block SIGAIOCANCEL */ 98234709573Sraf if (aiowp->work_req != NULL) 98334709573Sraf _aio_finish_request(aiowp, -1, ECANCELED); 9847c478bd9Sstevel@tonic-gate 98534709573Sraf for (;;) { 9867c478bd9Sstevel@tonic-gate /* 98734709573Sraf * Put completed requests on aio_done_list. This has 9887c478bd9Sstevel@tonic-gate * to be done as part of the main loop to ensure that 9897c478bd9Sstevel@tonic-gate * we don't artificially starve any aiowait'ers. 9907c478bd9Sstevel@tonic-gate */ 9917c478bd9Sstevel@tonic-gate if (aiowp->work_done1) 9927c478bd9Sstevel@tonic-gate _aio_work_done(aiowp); 9937c478bd9Sstevel@tonic-gate 99434709573Sraf top: 99534709573Sraf /* consume any deferred SIGAIOCANCEL signal here */ 996*f841f6adSraf sigon(self); 997*f841f6adSraf sigoff(self); 99834709573Sraf 999*f841f6adSraf while ((reqp = _aio_req_get(aiowp)) == NULL) { 1000*f841f6adSraf if (_aio_idle(aiowp) != 0) 1001*f841f6adSraf goto top; 1002*f841f6adSraf } 100334709573Sraf arg = &reqp->req_args; 100434709573Sraf ASSERT(reqp->req_state == AIO_REQ_INPROGRESS || 100534709573Sraf reqp->req_state == AIO_REQ_CANCELED); 100634709573Sraf error = 0; 100734709573Sraf 100834709573Sraf switch (reqp->req_op) { 100934709573Sraf case AIOREAD: 101034709573Sraf case AIOAREAD: 1011*f841f6adSraf sigon(self); /* unblock SIGAIOCANCEL */ 101234709573Sraf retval = pread(arg->fd, arg->buf, 101334709573Sraf arg->bufsz, arg->offset); 101434709573Sraf if (retval == -1) { 101534709573Sraf if (errno == ESPIPE) { 101634709573Sraf retval = read(arg->fd, 101734709573Sraf arg->buf, arg->bufsz); 101834709573Sraf if (retval == -1) 101934709573Sraf error = errno; 102034709573Sraf } else { 102134709573Sraf error = errno; 10227c478bd9Sstevel@tonic-gate } 102334709573Sraf } 1024*f841f6adSraf sigoff(self); /* block SIGAIOCANCEL */ 102534709573Sraf 
break; 102634709573Sraf case AIOWRITE: 102734709573Sraf case AIOAWRITE: 1028*f841f6adSraf sigon(self); /* unblock SIGAIOCANCEL */ 102934709573Sraf retval = pwrite(arg->fd, arg->buf, 103034709573Sraf arg->bufsz, arg->offset); 103134709573Sraf if (retval == -1) { 103234709573Sraf if (errno == ESPIPE) { 103334709573Sraf retval = write(arg->fd, 103434709573Sraf arg->buf, arg->bufsz); 103534709573Sraf if (retval == -1) 103634709573Sraf error = errno; 103734709573Sraf } else { 103834709573Sraf error = errno; 10397c478bd9Sstevel@tonic-gate } 104034709573Sraf } 1041*f841f6adSraf sigoff(self); /* block SIGAIOCANCEL */ 104234709573Sraf break; 104334709573Sraf #if !defined(_LP64) 104434709573Sraf case AIOAREAD64: 1045*f841f6adSraf sigon(self); /* unblock SIGAIOCANCEL */ 104634709573Sraf retval = pread64(arg->fd, arg->buf, 104734709573Sraf arg->bufsz, arg->offset); 104834709573Sraf if (retval == -1) { 104934709573Sraf if (errno == ESPIPE) { 105034709573Sraf retval = read(arg->fd, 105134709573Sraf arg->buf, arg->bufsz); 105234709573Sraf if (retval == -1) 105334709573Sraf error = errno; 105434709573Sraf } else { 105534709573Sraf error = errno; 10567c478bd9Sstevel@tonic-gate } 105734709573Sraf } 1058*f841f6adSraf sigoff(self); /* block SIGAIOCANCEL */ 105934709573Sraf break; 106034709573Sraf case AIOAWRITE64: 1061*f841f6adSraf sigon(self); /* unblock SIGAIOCANCEL */ 106234709573Sraf retval = pwrite64(arg->fd, arg->buf, 106334709573Sraf arg->bufsz, arg->offset); 106434709573Sraf if (retval == -1) { 106534709573Sraf if (errno == ESPIPE) { 106634709573Sraf retval = write(arg->fd, 106734709573Sraf arg->buf, arg->bufsz); 106834709573Sraf if (retval == -1) 106934709573Sraf error = errno; 107034709573Sraf } else { 107134709573Sraf error = errno; 10727c478bd9Sstevel@tonic-gate } 107334709573Sraf } 1074*f841f6adSraf sigoff(self); /* block SIGAIOCANCEL */ 107534709573Sraf break; 107634709573Sraf #endif /* !defined(_LP64) */ 107734709573Sraf case AIOFSYNC: 1078c2575b5eSraf if 
(_aio_fsync_del(aiowp, reqp)) 107934709573Sraf goto top; 108034709573Sraf ASSERT(reqp->req_head == NULL); 108134709573Sraf /* 108234709573Sraf * All writes for this fsync request are now 108334709573Sraf * acknowledged. Now make these writes visible 108434709573Sraf * and put the final request into the hash table. 108534709573Sraf */ 108634709573Sraf if (reqp->req_state == AIO_REQ_CANCELED) { 108734709573Sraf /* EMPTY */; 108834709573Sraf } else if (arg->offset == O_SYNC) { 108934709573Sraf if ((retval = __fdsync(arg->fd, FSYNC)) == -1) 109034709573Sraf error = errno; 109134709573Sraf } else { 109234709573Sraf if ((retval = __fdsync(arg->fd, FDSYNC)) == -1) 109334709573Sraf error = errno; 109434709573Sraf } 109534709573Sraf if (_aio_hash_insert(reqp->req_resultp, reqp) != 0) 1096*f841f6adSraf aio_panic("_aio_do_request(): AIOFSYNC: " 109734709573Sraf "request already in hash table"); 109834709573Sraf break; 109934709573Sraf default: 1100*f841f6adSraf aio_panic("_aio_do_request, bad op"); 11017c478bd9Sstevel@tonic-gate } 11027c478bd9Sstevel@tonic-gate 110334709573Sraf _aio_finish_request(aiowp, retval, error); 110434709573Sraf } 110534709573Sraf /* NOTREACHED */ 110634709573Sraf return (NULL); 110734709573Sraf } 110834709573Sraf 110934709573Sraf /* 111034709573Sraf * Perform the tail processing for _aio_do_request(). 111134709573Sraf * The in-progress request may or may not have been cancelled. 
111234709573Sraf */ 111334709573Sraf static void 111434709573Sraf _aio_finish_request(aio_worker_t *aiowp, ssize_t retval, int error) 111534709573Sraf { 111634709573Sraf aio_req_t *reqp; 111734709573Sraf 111834709573Sraf sig_mutex_lock(&aiowp->work_qlock1); 111934709573Sraf if ((reqp = aiowp->work_req) == NULL) 112034709573Sraf sig_mutex_unlock(&aiowp->work_qlock1); 112134709573Sraf else { 112234709573Sraf aiowp->work_req = NULL; 112334709573Sraf if (reqp->req_state == AIO_REQ_CANCELED) { 112434709573Sraf retval = -1; 112534709573Sraf error = ECANCELED; 112634709573Sraf } 112734709573Sraf if (!POSIX_AIO(reqp)) { 112834709573Sraf sig_mutex_unlock(&aiowp->work_qlock1); 112934709573Sraf sig_mutex_lock(&__aio_mutex); 113034709573Sraf if (reqp->req_state == AIO_REQ_INPROGRESS) 113134709573Sraf reqp->req_state = AIO_REQ_DONE; 113234709573Sraf _aio_req_done_cnt++; 113334709573Sraf _aio_set_result(reqp, retval, error); 113434709573Sraf if (error == ECANCELED) 113534709573Sraf _aio_outstand_cnt--; 113634709573Sraf sig_mutex_unlock(&__aio_mutex); 113734709573Sraf } else { 113834709573Sraf if (reqp->req_state == AIO_REQ_INPROGRESS) 113934709573Sraf reqp->req_state = AIO_REQ_DONE; 114034709573Sraf sig_mutex_unlock(&aiowp->work_qlock1); 114134709573Sraf _aiodone(reqp, retval, error); 114234709573Sraf } 114334709573Sraf } 114434709573Sraf } 11457c478bd9Sstevel@tonic-gate 114634709573Sraf void 114734709573Sraf _aio_req_mark_done(aio_req_t *reqp) 114834709573Sraf { 114934709573Sraf #if !defined(_LP64) 115034709573Sraf if (reqp->req_largefile) 115134709573Sraf ((aiocb64_t *)reqp->req_aiocbp)->aio_state = USERAIO_DONE; 115234709573Sraf else 115334709573Sraf #endif 115434709573Sraf ((aiocb_t *)reqp->req_aiocbp)->aio_state = USERAIO_DONE; 115534709573Sraf } 11567c478bd9Sstevel@tonic-gate 115734709573Sraf /* 115834709573Sraf * Sleep for 'ticks' clock ticks to give somebody else a chance to run, 115934709573Sraf * hopefully to consume one of our queued signals. 
116034709573Sraf */ 116134709573Sraf static void 116234709573Sraf _aio_delay(int ticks) 116334709573Sraf { 116434709573Sraf (void) usleep(ticks * (MICROSEC / hz)); 116534709573Sraf } 11667c478bd9Sstevel@tonic-gate 116734709573Sraf /* 116834709573Sraf * Actually send the notifications. 116934709573Sraf * We could block indefinitely here if the application 117034709573Sraf * is not listening for the signal or port notifications. 117134709573Sraf */ 117234709573Sraf static void 117334709573Sraf send_notification(notif_param_t *npp) 117434709573Sraf { 1175*f841f6adSraf extern int __sigqueue(pid_t pid, int signo, 1176*f841f6adSraf /* const union sigval */ void *value, int si_code, int block); 1177*f841f6adSraf 1178*f841f6adSraf if (npp->np_signo) 1179*f841f6adSraf (void) __sigqueue(__pid, npp->np_signo, npp->np_user, 1180*f841f6adSraf SI_ASYNCIO, 1); 1181*f841f6adSraf else if (npp->np_port >= 0) 118234709573Sraf (void) _port_dispatch(npp->np_port, 0, PORT_SOURCE_AIO, 118334709573Sraf npp->np_event, npp->np_object, npp->np_user); 1184*f841f6adSraf 1185*f841f6adSraf if (npp->np_lio_signo) 1186*f841f6adSraf (void) __sigqueue(__pid, npp->np_lio_signo, npp->np_lio_user, 1187*f841f6adSraf SI_ASYNCIO, 1); 1188*f841f6adSraf else if (npp->np_lio_port >= 0) 118934709573Sraf (void) _port_dispatch(npp->np_lio_port, 0, PORT_SOURCE_AIO, 119034709573Sraf npp->np_lio_event, npp->np_lio_object, npp->np_lio_user); 11917c478bd9Sstevel@tonic-gate } 11927c478bd9Sstevel@tonic-gate 11937c478bd9Sstevel@tonic-gate /* 119434709573Sraf * Asynchronous notification worker. 11957c478bd9Sstevel@tonic-gate */ 11967c478bd9Sstevel@tonic-gate void * 119734709573Sraf _aio_do_notify(void *arg) 11987c478bd9Sstevel@tonic-gate { 11997c478bd9Sstevel@tonic-gate aio_worker_t *aiowp = (aio_worker_t *)arg; 120034709573Sraf aio_req_t *reqp; 12017c478bd9Sstevel@tonic-gate 120234709573Sraf /* 120334709573Sraf * This isn't really necessary. All signals are blocked. 
120434709573Sraf */ 120534709573Sraf if (pthread_setspecific(_aio_key, aiowp) != 0) 1206*f841f6adSraf aio_panic("_aio_do_notify, pthread_setspecific()"); 120734709573Sraf 120834709573Sraf /* 120934709573Sraf * Notifications are never cancelled. 121034709573Sraf * All signals remain blocked, forever. 121134709573Sraf */ 12127c478bd9Sstevel@tonic-gate for (;;) { 1213*f841f6adSraf while ((reqp = _aio_req_get(aiowp)) == NULL) { 1214*f841f6adSraf if (_aio_idle(aiowp) != 0) 1215*f841f6adSraf aio_panic("_aio_do_notify: _aio_idle() failed"); 1216*f841f6adSraf } 121734709573Sraf send_notification(&reqp->req_notify); 121834709573Sraf _aio_req_free(reqp); 12197c478bd9Sstevel@tonic-gate } 122034709573Sraf 12217c478bd9Sstevel@tonic-gate /* NOTREACHED */ 12227c478bd9Sstevel@tonic-gate return (NULL); 12237c478bd9Sstevel@tonic-gate } 12247c478bd9Sstevel@tonic-gate 12257c478bd9Sstevel@tonic-gate /* 122634709573Sraf * Do the completion semantics for a request that was either canceled 122734709573Sraf * by _aio_cancel_req() or was completed by _aio_do_request(). 12287c478bd9Sstevel@tonic-gate */ 122934709573Sraf static void 123034709573Sraf _aiodone(aio_req_t *reqp, ssize_t retval, int error) 12317c478bd9Sstevel@tonic-gate { 123234709573Sraf aio_result_t *resultp = reqp->req_resultp; 123334709573Sraf int notify = 0; 123434709573Sraf aio_lio_t *head; 123534709573Sraf int sigev_none; 123634709573Sraf int sigev_signal; 123734709573Sraf int sigev_thread; 123834709573Sraf int sigev_port; 123934709573Sraf notif_param_t np; 12407c478bd9Sstevel@tonic-gate 124134709573Sraf /* 124234709573Sraf * We call _aiodone() only for Posix I/O. 
124334709573Sraf */ 124434709573Sraf ASSERT(POSIX_AIO(reqp)); 124534709573Sraf 124634709573Sraf sigev_none = 0; 124734709573Sraf sigev_signal = 0; 124834709573Sraf sigev_thread = 0; 124934709573Sraf sigev_port = 0; 125034709573Sraf np.np_signo = 0; 125134709573Sraf np.np_port = -1; 125234709573Sraf np.np_lio_signo = 0; 125334709573Sraf np.np_lio_port = -1; 125434709573Sraf 125534709573Sraf switch (reqp->req_sigevent.sigev_notify) { 125634709573Sraf case SIGEV_NONE: 125734709573Sraf sigev_none = 1; 125834709573Sraf break; 125934709573Sraf case SIGEV_SIGNAL: 126034709573Sraf sigev_signal = 1; 126134709573Sraf break; 126234709573Sraf case SIGEV_THREAD: 126334709573Sraf sigev_thread = 1; 126434709573Sraf break; 126534709573Sraf case SIGEV_PORT: 126634709573Sraf sigev_port = 1; 126734709573Sraf break; 126834709573Sraf default: 1269*f841f6adSraf aio_panic("_aiodone: improper sigev_notify"); 127034709573Sraf break; 127134709573Sraf } 12727c478bd9Sstevel@tonic-gate 127334709573Sraf /* 127434709573Sraf * Figure out the notification parameters while holding __aio_mutex. 127534709573Sraf * Actually perform the notifications after dropping __aio_mutex. 127634709573Sraf * This allows us to sleep for a long time (if the notifications 127734709573Sraf * incur delays) without impeding other async I/O operations. 
127834709573Sraf */ 12797c478bd9Sstevel@tonic-gate 128034709573Sraf sig_mutex_lock(&__aio_mutex); 128134709573Sraf 128234709573Sraf if (sigev_signal) { 128334709573Sraf if ((np.np_signo = reqp->req_sigevent.sigev_signo) != 0) 128434709573Sraf notify = 1; 128534709573Sraf np.np_user = reqp->req_sigevent.sigev_value.sival_ptr; 128634709573Sraf } else if (sigev_thread | sigev_port) { 128734709573Sraf if ((np.np_port = reqp->req_sigevent.sigev_signo) >= 0) 128834709573Sraf notify = 1; 128934709573Sraf np.np_event = reqp->req_op; 129034709573Sraf if (np.np_event == AIOFSYNC && reqp->req_largefile) 129134709573Sraf np.np_event = AIOFSYNC64; 129234709573Sraf np.np_object = (uintptr_t)reqp->req_aiocbp; 129334709573Sraf np.np_user = reqp->req_sigevent.sigev_value.sival_ptr; 129434709573Sraf } 12957c478bd9Sstevel@tonic-gate 129634709573Sraf if (resultp->aio_errno == EINPROGRESS) 129734709573Sraf _aio_set_result(reqp, retval, error); 12987c478bd9Sstevel@tonic-gate 129934709573Sraf _aio_outstand_cnt--; 13007c478bd9Sstevel@tonic-gate 130134709573Sraf head = reqp->req_head; 130234709573Sraf reqp->req_head = NULL; 13037c478bd9Sstevel@tonic-gate 130434709573Sraf if (sigev_none) { 130534709573Sraf _aio_enq_doneq(reqp); 130634709573Sraf reqp = NULL; 130734709573Sraf } else { 130834709573Sraf (void) _aio_hash_del(resultp); 130934709573Sraf _aio_req_mark_done(reqp); 131034709573Sraf } 13117c478bd9Sstevel@tonic-gate 131234709573Sraf _aio_waitn_wakeup(); 13137c478bd9Sstevel@tonic-gate 131434709573Sraf /* 131534709573Sraf * __aio_waitn() sets AIO_WAIT_INPROGRESS and 131634709573Sraf * __aio_suspend() increments "_aio_kernel_suspend" 131734709573Sraf * when they are waiting in the kernel for completed I/Os. 131834709573Sraf * 131934709573Sraf * _kaio(AIONOTIFY) awakes the corresponding function 132034709573Sraf * in the kernel; then the corresponding __aio_waitn() or 132134709573Sraf * __aio_suspend() function could reap the recently 132234709573Sraf * completed I/Os (_aiodone()). 
132334709573Sraf */ 132434709573Sraf if ((_aio_flags & AIO_WAIT_INPROGRESS) || _aio_kernel_suspend > 0) 132534709573Sraf (void) _kaio(AIONOTIFY); 13267c478bd9Sstevel@tonic-gate 132734709573Sraf sig_mutex_unlock(&__aio_mutex); 13287c478bd9Sstevel@tonic-gate 132934709573Sraf if (head != NULL) { 13307c478bd9Sstevel@tonic-gate /* 133134709573Sraf * If all the lio requests have completed, 133234709573Sraf * prepare to notify the waiting thread. 13337c478bd9Sstevel@tonic-gate */ 133434709573Sraf sig_mutex_lock(&head->lio_mutex); 133534709573Sraf ASSERT(head->lio_refcnt == head->lio_nent); 133634709573Sraf if (head->lio_refcnt == 1) { 133734709573Sraf int waiting = 0; 133834709573Sraf if (head->lio_mode == LIO_WAIT) { 133934709573Sraf if ((waiting = head->lio_waiting) != 0) 134034709573Sraf (void) cond_signal(&head->lio_cond_cv); 134134709573Sraf } else if (head->lio_port < 0) { /* none or signal */ 134234709573Sraf if ((np.np_lio_signo = head->lio_signo) != 0) 134334709573Sraf notify = 1; 134434709573Sraf np.np_lio_user = head->lio_sigval.sival_ptr; 134534709573Sraf } else { /* thread or port */ 134634709573Sraf notify = 1; 134734709573Sraf np.np_lio_port = head->lio_port; 134834709573Sraf np.np_lio_event = head->lio_event; 134934709573Sraf np.np_lio_object = 135034709573Sraf (uintptr_t)head->lio_sigevent; 135134709573Sraf np.np_lio_user = head->lio_sigval.sival_ptr; 13527c478bd9Sstevel@tonic-gate } 135334709573Sraf head->lio_nent = head->lio_refcnt = 0; 135434709573Sraf sig_mutex_unlock(&head->lio_mutex); 135534709573Sraf if (waiting == 0) 135634709573Sraf _aio_lio_free(head); 135734709573Sraf } else { 135834709573Sraf head->lio_nent--; 135934709573Sraf head->lio_refcnt--; 136034709573Sraf sig_mutex_unlock(&head->lio_mutex); 13617c478bd9Sstevel@tonic-gate } 136234709573Sraf } 13637c478bd9Sstevel@tonic-gate 136434709573Sraf /* 136534709573Sraf * The request is completed; now perform the notifications. 
136634709573Sraf */ 136734709573Sraf if (notify) { 136834709573Sraf if (reqp != NULL) { 13697c478bd9Sstevel@tonic-gate /* 137034709573Sraf * We usually put the request on the notification 137134709573Sraf * queue because we don't want to block and delay 137234709573Sraf * other operations behind us in the work queue. 137334709573Sraf * Also we must never block on a cancel notification 137434709573Sraf * because we are being called from an application 137534709573Sraf * thread in this case and that could lead to deadlock 137634709573Sraf * if no other thread is receiving notificatins. 13777c478bd9Sstevel@tonic-gate */ 137834709573Sraf reqp->req_notify = np; 137934709573Sraf reqp->req_op = AIONOTIFY; 138034709573Sraf _aio_req_add(reqp, &__workers_no, AIONOTIFY); 138134709573Sraf reqp = NULL; 138234709573Sraf } else { 138334709573Sraf /* 138434709573Sraf * We already put the request on the done queue, 138534709573Sraf * so we can't queue it to the notification queue. 138634709573Sraf * Just do the notification directly. 138734709573Sraf */ 138834709573Sraf send_notification(&np); 13897c478bd9Sstevel@tonic-gate } 13907c478bd9Sstevel@tonic-gate } 139134709573Sraf 139234709573Sraf if (reqp != NULL) 139334709573Sraf _aio_req_free(reqp); 13947c478bd9Sstevel@tonic-gate } 13957c478bd9Sstevel@tonic-gate 13967c478bd9Sstevel@tonic-gate /* 139734709573Sraf * Delete fsync requests from list head until there is 139834709573Sraf * only one left. Return 0 when there is only one, 139934709573Sraf * otherwise return a non-zero value. 
14007c478bd9Sstevel@tonic-gate */ 14017c478bd9Sstevel@tonic-gate static int 1402c2575b5eSraf _aio_fsync_del(aio_worker_t *aiowp, aio_req_t *reqp) 14037c478bd9Sstevel@tonic-gate { 140434709573Sraf aio_lio_t *head = reqp->req_head; 140534709573Sraf int rval = 0; 140634709573Sraf 1407c2575b5eSraf ASSERT(reqp == aiowp->work_req); 1408c2575b5eSraf sig_mutex_lock(&aiowp->work_qlock1); 140934709573Sraf sig_mutex_lock(&head->lio_mutex); 141034709573Sraf if (head->lio_refcnt > 1) { 141134709573Sraf head->lio_refcnt--; 141234709573Sraf head->lio_nent--; 1413c2575b5eSraf aiowp->work_req = NULL; 141434709573Sraf sig_mutex_unlock(&head->lio_mutex); 1415c2575b5eSraf sig_mutex_unlock(&aiowp->work_qlock1); 141634709573Sraf sig_mutex_lock(&__aio_mutex); 141734709573Sraf _aio_outstand_cnt--; 141834709573Sraf _aio_waitn_wakeup(); 141934709573Sraf sig_mutex_unlock(&__aio_mutex); 142034709573Sraf _aio_req_free(reqp); 142134709573Sraf return (1); 14227c478bd9Sstevel@tonic-gate } 142334709573Sraf ASSERT(head->lio_nent == 1 && head->lio_refcnt == 1); 142434709573Sraf reqp->req_head = NULL; 142534709573Sraf if (head->lio_canned) 142634709573Sraf reqp->req_state = AIO_REQ_CANCELED; 142734709573Sraf if (head->lio_mode == LIO_DESTROY) { 1428c2575b5eSraf aiowp->work_req = NULL; 142934709573Sraf rval = 1; 143034709573Sraf } 1431c2575b5eSraf sig_mutex_unlock(&head->lio_mutex); 1432c2575b5eSraf sig_mutex_unlock(&aiowp->work_qlock1); 143334709573Sraf head->lio_refcnt--; 143434709573Sraf head->lio_nent--; 143534709573Sraf _aio_lio_free(head); 1436c2575b5eSraf if (rval != 0) 1437c2575b5eSraf _aio_req_free(reqp); 143834709573Sraf return (rval); 14397c478bd9Sstevel@tonic-gate } 14407c478bd9Sstevel@tonic-gate 14417c478bd9Sstevel@tonic-gate /* 1442*f841f6adSraf * A worker is set idle when its work queue is empty. 1443*f841f6adSraf * The worker checks again that it has no more work 1444*f841f6adSraf * and then goes to sleep waiting for more work. 
14457c478bd9Sstevel@tonic-gate */ 1446*f841f6adSraf int 14477c478bd9Sstevel@tonic-gate _aio_idle(aio_worker_t *aiowp) 14487c478bd9Sstevel@tonic-gate { 144934709573Sraf int error = 0; 145034709573Sraf 145134709573Sraf sig_mutex_lock(&aiowp->work_qlock1); 145234709573Sraf if (aiowp->work_count1 == 0) { 145334709573Sraf ASSERT(aiowp->work_minload1 == 0); 14547c478bd9Sstevel@tonic-gate aiowp->work_idleflg = 1; 14557c478bd9Sstevel@tonic-gate /* 145634709573Sraf * A cancellation handler is not needed here. 145734709573Sraf * aio worker threads are never cancelled via pthread_cancel(). 14587c478bd9Sstevel@tonic-gate */ 145934709573Sraf error = sig_cond_wait(&aiowp->work_idle_cv, 146034709573Sraf &aiowp->work_qlock1); 146134709573Sraf /* 146234709573Sraf * The idle flag is normally cleared before worker is awakened 146334709573Sraf * by aio_req_add(). On error (EINTR), we clear it ourself. 146434709573Sraf */ 146534709573Sraf if (error) 146634709573Sraf aiowp->work_idleflg = 0; 14677c478bd9Sstevel@tonic-gate } 146834709573Sraf sig_mutex_unlock(&aiowp->work_qlock1); 1469*f841f6adSraf return (error); 14707c478bd9Sstevel@tonic-gate } 14717c478bd9Sstevel@tonic-gate 14727c478bd9Sstevel@tonic-gate /* 14737c478bd9Sstevel@tonic-gate * A worker's completed AIO requests are placed onto a global 147434709573Sraf * done queue. The application is only sent a SIGIO signal if 14757c478bd9Sstevel@tonic-gate * the process has a handler enabled and it is not waiting via 14767c478bd9Sstevel@tonic-gate * aiowait(). 
14777c478bd9Sstevel@tonic-gate */ 14787c478bd9Sstevel@tonic-gate static void 147934709573Sraf _aio_work_done(aio_worker_t *aiowp) 14807c478bd9Sstevel@tonic-gate { 148134709573Sraf aio_req_t *reqp; 14827c478bd9Sstevel@tonic-gate 148334709573Sraf sig_mutex_lock(&aiowp->work_qlock1); 148434709573Sraf reqp = aiowp->work_prev1; 148534709573Sraf reqp->req_next = NULL; 14867c478bd9Sstevel@tonic-gate aiowp->work_done1 = 0; 14877c478bd9Sstevel@tonic-gate aiowp->work_tail1 = aiowp->work_next1; 14887c478bd9Sstevel@tonic-gate if (aiowp->work_tail1 == NULL) 14897c478bd9Sstevel@tonic-gate aiowp->work_head1 = NULL; 14907c478bd9Sstevel@tonic-gate aiowp->work_prev1 = NULL; 149134709573Sraf sig_mutex_unlock(&aiowp->work_qlock1); 149234709573Sraf sig_mutex_lock(&__aio_mutex); 14937c478bd9Sstevel@tonic-gate _aio_donecnt++; 14947c478bd9Sstevel@tonic-gate _aio_outstand_cnt--; 14957c478bd9Sstevel@tonic-gate _aio_req_done_cnt--; 149634709573Sraf ASSERT(_aio_donecnt > 0 && 149734709573Sraf _aio_outstand_cnt >= 0 && 149834709573Sraf _aio_req_done_cnt >= 0); 149934709573Sraf ASSERT(reqp != NULL); 15007c478bd9Sstevel@tonic-gate 15017c478bd9Sstevel@tonic-gate if (_aio_done_tail == NULL) { 150234709573Sraf _aio_done_head = _aio_done_tail = reqp; 15037c478bd9Sstevel@tonic-gate } else { 150434709573Sraf _aio_done_head->req_next = reqp; 150534709573Sraf _aio_done_head = reqp; 15067c478bd9Sstevel@tonic-gate } 15077c478bd9Sstevel@tonic-gate 15087c478bd9Sstevel@tonic-gate if (_aiowait_flag) { 150934709573Sraf sig_mutex_unlock(&__aio_mutex); 15107c478bd9Sstevel@tonic-gate (void) _kaio(AIONOTIFY); 15117c478bd9Sstevel@tonic-gate } else { 151234709573Sraf sig_mutex_unlock(&__aio_mutex); 151334709573Sraf if (_sigio_enabled) 15147c478bd9Sstevel@tonic-gate (void) kill(__pid, SIGIO); 15157c478bd9Sstevel@tonic-gate } 15167c478bd9Sstevel@tonic-gate } 15177c478bd9Sstevel@tonic-gate 15187c478bd9Sstevel@tonic-gate /* 151934709573Sraf * The done queue consists of AIO requests that are in either the 
152034709573Sraf * AIO_REQ_DONE or AIO_REQ_CANCELED state. Requests that were cancelled 152134709573Sraf * are discarded. If the done queue is empty then NULL is returned. 152234709573Sraf * Otherwise the address of a done aio_result_t is returned. 15237c478bd9Sstevel@tonic-gate */ 152434709573Sraf aio_result_t * 15257c478bd9Sstevel@tonic-gate _aio_req_done(void) 15267c478bd9Sstevel@tonic-gate { 152734709573Sraf aio_req_t *reqp; 15287c478bd9Sstevel@tonic-gate aio_result_t *resultp; 15297c478bd9Sstevel@tonic-gate 15307c478bd9Sstevel@tonic-gate ASSERT(MUTEX_HELD(&__aio_mutex)); 15317c478bd9Sstevel@tonic-gate 153234709573Sraf if ((reqp = _aio_done_tail) != NULL) { 153334709573Sraf if ((_aio_done_tail = reqp->req_next) == NULL) 153434709573Sraf _aio_done_head = NULL; 15357c478bd9Sstevel@tonic-gate ASSERT(_aio_donecnt > 0); 15367c478bd9Sstevel@tonic-gate _aio_donecnt--; 153734709573Sraf (void) _aio_hash_del(reqp->req_resultp); 153834709573Sraf resultp = reqp->req_resultp; 153934709573Sraf ASSERT(reqp->req_state == AIO_REQ_DONE); 154034709573Sraf _aio_req_free(reqp); 15417c478bd9Sstevel@tonic-gate return (resultp); 15427c478bd9Sstevel@tonic-gate } 15437c478bd9Sstevel@tonic-gate /* is queue empty? */ 154434709573Sraf if (reqp == NULL && _aio_outstand_cnt == 0) { 15457c478bd9Sstevel@tonic-gate return ((aio_result_t *)-1); 15467c478bd9Sstevel@tonic-gate } 15477c478bd9Sstevel@tonic-gate return (NULL); 15487c478bd9Sstevel@tonic-gate } 15497c478bd9Sstevel@tonic-gate 15507c478bd9Sstevel@tonic-gate /* 155134709573Sraf * Set the return and errno values for the application's use. 155234709573Sraf * 155334709573Sraf * For the Posix interfaces, we must set the return value first followed 155434709573Sraf * by the errno value because the Posix interfaces allow for a change 155534709573Sraf * in the errno value from EINPROGRESS to something else to signal 155634709573Sraf * the completion of the asynchronous request. 
155734709573Sraf * 155834709573Sraf * The opposite is true for the Solaris interfaces. These allow for 155934709573Sraf * a change in the return value from AIO_INPROGRESS to something else 156034709573Sraf * to signal the completion of the asynchronous request. 15617c478bd9Sstevel@tonic-gate */ 15627c478bd9Sstevel@tonic-gate void 156334709573Sraf _aio_set_result(aio_req_t *reqp, ssize_t retval, int error) 15647c478bd9Sstevel@tonic-gate { 156534709573Sraf aio_result_t *resultp = reqp->req_resultp; 156634709573Sraf 156734709573Sraf if (POSIX_AIO(reqp)) { 156834709573Sraf resultp->aio_return = retval; 156934709573Sraf membar_producer(); 157034709573Sraf resultp->aio_errno = error; 157134709573Sraf } else { 157234709573Sraf resultp->aio_errno = error; 157334709573Sraf membar_producer(); 157434709573Sraf resultp->aio_return = retval; 157534709573Sraf } 157634709573Sraf } 157734709573Sraf 157834709573Sraf /* 157934709573Sraf * Add an AIO request onto the next work queue. 158034709573Sraf * A circular list of workers is used to choose the next worker. 158134709573Sraf */ 158234709573Sraf void 158334709573Sraf _aio_req_add(aio_req_t *reqp, aio_worker_t **nextworker, int mode) 158434709573Sraf { 1585*f841f6adSraf ulwp_t *self = curthread; 158634709573Sraf aio_worker_t *aiowp; 158734709573Sraf aio_worker_t *first; 158834709573Sraf int load_bal_flg = 1; 158934709573Sraf int found; 159034709573Sraf 159134709573Sraf ASSERT(reqp->req_state != AIO_REQ_DONEQ); 159234709573Sraf reqp->req_next = NULL; 15937c478bd9Sstevel@tonic-gate /* 159434709573Sraf * Try to acquire the next worker's work queue. If it is locked, 15957c478bd9Sstevel@tonic-gate * then search the list of workers until a queue is found unlocked, 15967c478bd9Sstevel@tonic-gate * or until the list is completely traversed at which point another 15977c478bd9Sstevel@tonic-gate * worker will be created. 
15987c478bd9Sstevel@tonic-gate */ 1599*f841f6adSraf sigoff(self); /* defer SIGIO */ 160034709573Sraf sig_mutex_lock(&__aio_mutex); 160134709573Sraf first = aiowp = *nextworker; 160234709573Sraf if (mode != AIONOTIFY) 16037c478bd9Sstevel@tonic-gate _aio_outstand_cnt++; 160434709573Sraf sig_mutex_unlock(&__aio_mutex); 160534709573Sraf 16067c478bd9Sstevel@tonic-gate switch (mode) { 160734709573Sraf case AIOREAD: 160834709573Sraf case AIOWRITE: 160934709573Sraf case AIOAREAD: 161034709573Sraf case AIOAWRITE: 161134709573Sraf #if !defined(_LP64) 161234709573Sraf case AIOAREAD64: 161334709573Sraf case AIOAWRITE64: 16147c478bd9Sstevel@tonic-gate #endif 161534709573Sraf /* try to find an idle worker */ 161634709573Sraf found = 0; 161734709573Sraf do { 161834709573Sraf if (sig_mutex_trylock(&aiowp->work_qlock1) == 0) { 161934709573Sraf if (aiowp->work_idleflg) { 162034709573Sraf found = 1; 16217c478bd9Sstevel@tonic-gate break; 16227c478bd9Sstevel@tonic-gate } 162334709573Sraf sig_mutex_unlock(&aiowp->work_qlock1); 16247c478bd9Sstevel@tonic-gate } 162534709573Sraf } while ((aiowp = aiowp->work_forw) != first); 16267c478bd9Sstevel@tonic-gate 162734709573Sraf if (found) { 162834709573Sraf aiowp->work_minload1++; 162934709573Sraf break; 163034709573Sraf } 16317c478bd9Sstevel@tonic-gate 163234709573Sraf /* try to acquire some worker's queue lock */ 163334709573Sraf do { 163434709573Sraf if (sig_mutex_trylock(&aiowp->work_qlock1) == 0) { 163534709573Sraf found = 1; 163634709573Sraf break; 16377c478bd9Sstevel@tonic-gate } 163834709573Sraf } while ((aiowp = aiowp->work_forw) != first); 163934709573Sraf 164034709573Sraf /* 164134709573Sraf * Create more workers when the workers appear overloaded. 164234709573Sraf * Either all the workers are busy draining their queues 164334709573Sraf * or no worker's queue lock could be acquired. 
164434709573Sraf */ 164534709573Sraf if (!found) { 164634709573Sraf if (_aio_worker_cnt < _max_workers) { 164734709573Sraf if (_aio_create_worker(reqp, mode)) 1648*f841f6adSraf aio_panic("_aio_req_add: add worker"); 1649*f841f6adSraf sigon(self); /* reenable SIGIO */ 16507c478bd9Sstevel@tonic-gate return; 16517c478bd9Sstevel@tonic-gate } 165234709573Sraf 165334709573Sraf /* 165434709573Sraf * No worker available and we have created 165534709573Sraf * _max_workers, keep going through the 165634709573Sraf * list slowly until we get a lock 165734709573Sraf */ 165834709573Sraf while (sig_mutex_trylock(&aiowp->work_qlock1) != 0) { 165934709573Sraf /* 166034709573Sraf * give someone else a chance 166134709573Sraf */ 166234709573Sraf _aio_delay(1); 166334709573Sraf aiowp = aiowp->work_forw; 166434709573Sraf } 166534709573Sraf } 166634709573Sraf 166734709573Sraf ASSERT(MUTEX_HELD(&aiowp->work_qlock1)); 166834709573Sraf if (_aio_worker_cnt < _max_workers && 166934709573Sraf aiowp->work_minload1 >= _minworkload) { 167034709573Sraf sig_mutex_unlock(&aiowp->work_qlock1); 167134709573Sraf sig_mutex_lock(&__aio_mutex); 167234709573Sraf *nextworker = aiowp->work_forw; 167334709573Sraf sig_mutex_unlock(&__aio_mutex); 167434709573Sraf if (_aio_create_worker(reqp, mode)) 1675*f841f6adSraf aio_panic("aio_req_add: add worker"); 1676*f841f6adSraf sigon(self); /* reenable SIGIO */ 167734709573Sraf return; 167834709573Sraf } 167934709573Sraf aiowp->work_minload1++; 168034709573Sraf break; 168134709573Sraf case AIOFSYNC: 168234709573Sraf case AIONOTIFY: 168334709573Sraf load_bal_flg = 0; 168434709573Sraf sig_mutex_lock(&aiowp->work_qlock1); 168534709573Sraf break; 168634709573Sraf default: 1687*f841f6adSraf aio_panic("_aio_req_add: invalid mode"); 168834709573Sraf break; 16897c478bd9Sstevel@tonic-gate } 16907c478bd9Sstevel@tonic-gate /* 16917c478bd9Sstevel@tonic-gate * Put request onto worker's work queue. 
16927c478bd9Sstevel@tonic-gate */ 16937c478bd9Sstevel@tonic-gate if (aiowp->work_tail1 == NULL) { 169434709573Sraf ASSERT(aiowp->work_count1 == 0); 169534709573Sraf aiowp->work_tail1 = reqp; 169634709573Sraf aiowp->work_next1 = reqp; 16977c478bd9Sstevel@tonic-gate } else { 169834709573Sraf aiowp->work_head1->req_next = reqp; 16997c478bd9Sstevel@tonic-gate if (aiowp->work_next1 == NULL) 170034709573Sraf aiowp->work_next1 = reqp; 17017c478bd9Sstevel@tonic-gate } 170234709573Sraf reqp->req_state = AIO_REQ_QUEUED; 170334709573Sraf reqp->req_worker = aiowp; 170434709573Sraf aiowp->work_head1 = reqp; 17057c478bd9Sstevel@tonic-gate /* 17067c478bd9Sstevel@tonic-gate * Awaken worker if it is not currently active. 17077c478bd9Sstevel@tonic-gate */ 170834709573Sraf if (aiowp->work_count1++ == 0 && aiowp->work_idleflg) { 17097c478bd9Sstevel@tonic-gate aiowp->work_idleflg = 0; 171034709573Sraf (void) cond_signal(&aiowp->work_idle_cv); 17117c478bd9Sstevel@tonic-gate } 171234709573Sraf sig_mutex_unlock(&aiowp->work_qlock1); 171334709573Sraf 171434709573Sraf if (load_bal_flg) { 171534709573Sraf sig_mutex_lock(&__aio_mutex); 171634709573Sraf *nextworker = aiowp->work_forw; 171734709573Sraf sig_mutex_unlock(&__aio_mutex); 171834709573Sraf } 1719*f841f6adSraf sigon(self); /* reenable SIGIO */ 17207c478bd9Sstevel@tonic-gate } 17217c478bd9Sstevel@tonic-gate 17227c478bd9Sstevel@tonic-gate /* 172334709573Sraf * Get an AIO request for a specified worker. 172434709573Sraf * If the work queue is empty, return NULL. 
17257c478bd9Sstevel@tonic-gate */ 17267c478bd9Sstevel@tonic-gate aio_req_t * 17277c478bd9Sstevel@tonic-gate _aio_req_get(aio_worker_t *aiowp) 17287c478bd9Sstevel@tonic-gate { 172934709573Sraf aio_req_t *reqp; 17307c478bd9Sstevel@tonic-gate 173134709573Sraf sig_mutex_lock(&aiowp->work_qlock1); 173234709573Sraf if ((reqp = aiowp->work_next1) != NULL) { 17337c478bd9Sstevel@tonic-gate /* 173434709573Sraf * Remove a POSIX request from the queue; the 17357c478bd9Sstevel@tonic-gate * request queue is a singularly linked list 173634709573Sraf * with a previous pointer. The request is 173734709573Sraf * removed by updating the previous pointer. 17387c478bd9Sstevel@tonic-gate * 173934709573Sraf * Non-posix requests are left on the queue 174034709573Sraf * to eventually be placed on the done queue. 17417c478bd9Sstevel@tonic-gate */ 17427c478bd9Sstevel@tonic-gate 174334709573Sraf if (POSIX_AIO(reqp)) { 17447c478bd9Sstevel@tonic-gate if (aiowp->work_prev1 == NULL) { 174534709573Sraf aiowp->work_tail1 = reqp->req_next; 17467c478bd9Sstevel@tonic-gate if (aiowp->work_tail1 == NULL) 17477c478bd9Sstevel@tonic-gate aiowp->work_head1 = NULL; 17487c478bd9Sstevel@tonic-gate } else { 174934709573Sraf aiowp->work_prev1->req_next = reqp->req_next; 175034709573Sraf if (aiowp->work_head1 == reqp) 175134709573Sraf aiowp->work_head1 = reqp->req_next; 17527c478bd9Sstevel@tonic-gate } 17537c478bd9Sstevel@tonic-gate 17547c478bd9Sstevel@tonic-gate } else { 175534709573Sraf aiowp->work_prev1 = reqp; 17567c478bd9Sstevel@tonic-gate ASSERT(aiowp->work_done1 >= 0); 17577c478bd9Sstevel@tonic-gate aiowp->work_done1++; 17587c478bd9Sstevel@tonic-gate } 175934709573Sraf ASSERT(reqp != reqp->req_next); 176034709573Sraf aiowp->work_next1 = reqp->req_next; 176134709573Sraf ASSERT(aiowp->work_count1 >= 1); 176234709573Sraf aiowp->work_count1--; 176334709573Sraf switch (reqp->req_op) { 176434709573Sraf case AIOREAD: 176534709573Sraf case AIOWRITE: 176634709573Sraf case AIOAREAD: 176734709573Sraf case AIOAWRITE: 
176834709573Sraf #if !defined(_LP64) 176934709573Sraf case AIOAREAD64: 177034709573Sraf case AIOAWRITE64: 17717c478bd9Sstevel@tonic-gate #endif 177234709573Sraf ASSERT(aiowp->work_minload1 > 0); 177334709573Sraf aiowp->work_minload1--; 177434709573Sraf break; 177534709573Sraf } 177634709573Sraf reqp->req_state = AIO_REQ_INPROGRESS; 17777c478bd9Sstevel@tonic-gate } 177834709573Sraf aiowp->work_req = reqp; 177934709573Sraf ASSERT(reqp != NULL || aiowp->work_count1 == 0); 178034709573Sraf sig_mutex_unlock(&aiowp->work_qlock1); 178134709573Sraf return (reqp); 17827c478bd9Sstevel@tonic-gate } 17837c478bd9Sstevel@tonic-gate 17847c478bd9Sstevel@tonic-gate static void 178534709573Sraf _aio_req_del(aio_worker_t *aiowp, aio_req_t *reqp, int ostate) 17867c478bd9Sstevel@tonic-gate { 178734709573Sraf aio_req_t **last; 178834709573Sraf aio_req_t *lastrp; 178934709573Sraf aio_req_t *next; 17907c478bd9Sstevel@tonic-gate 17917c478bd9Sstevel@tonic-gate ASSERT(aiowp != NULL); 17927c478bd9Sstevel@tonic-gate ASSERT(MUTEX_HELD(&aiowp->work_qlock1)); 179334709573Sraf if (POSIX_AIO(reqp)) { 17947c478bd9Sstevel@tonic-gate if (ostate != AIO_REQ_QUEUED) 17957c478bd9Sstevel@tonic-gate return; 17967c478bd9Sstevel@tonic-gate } 17977c478bd9Sstevel@tonic-gate last = &aiowp->work_tail1; 17987c478bd9Sstevel@tonic-gate lastrp = aiowp->work_tail1; 17997c478bd9Sstevel@tonic-gate ASSERT(ostate == AIO_REQ_QUEUED || ostate == AIO_REQ_INPROGRESS); 18007c478bd9Sstevel@tonic-gate while ((next = *last) != NULL) { 180134709573Sraf if (next == reqp) { 18027c478bd9Sstevel@tonic-gate *last = next->req_next; 18037c478bd9Sstevel@tonic-gate if (aiowp->work_next1 == next) 18047c478bd9Sstevel@tonic-gate aiowp->work_next1 = next->req_next; 18057c478bd9Sstevel@tonic-gate 18067c478bd9Sstevel@tonic-gate if ((next->req_next != NULL) || 18077c478bd9Sstevel@tonic-gate (aiowp->work_done1 == 0)) { 18087c478bd9Sstevel@tonic-gate if (aiowp->work_head1 == next) 18097c478bd9Sstevel@tonic-gate aiowp->work_head1 = next->req_next; 
18107c478bd9Sstevel@tonic-gate if (aiowp->work_prev1 == next) 18117c478bd9Sstevel@tonic-gate aiowp->work_prev1 = next->req_next; 18127c478bd9Sstevel@tonic-gate } else { 18137c478bd9Sstevel@tonic-gate if (aiowp->work_head1 == next) 18147c478bd9Sstevel@tonic-gate aiowp->work_head1 = lastrp; 18157c478bd9Sstevel@tonic-gate if (aiowp->work_prev1 == next) 18167c478bd9Sstevel@tonic-gate aiowp->work_prev1 = lastrp; 18177c478bd9Sstevel@tonic-gate } 18187c478bd9Sstevel@tonic-gate 18197c478bd9Sstevel@tonic-gate if (ostate == AIO_REQ_QUEUED) { 182034709573Sraf ASSERT(aiowp->work_count1 >= 1); 182134709573Sraf aiowp->work_count1--; 182234709573Sraf ASSERT(aiowp->work_minload1 >= 1); 182334709573Sraf aiowp->work_minload1--; 18247c478bd9Sstevel@tonic-gate } else { 18257c478bd9Sstevel@tonic-gate ASSERT(ostate == AIO_REQ_INPROGRESS && 182634709573Sraf !POSIX_AIO(reqp)); 18277c478bd9Sstevel@tonic-gate aiowp->work_done1--; 18287c478bd9Sstevel@tonic-gate } 18297c478bd9Sstevel@tonic-gate return; 18307c478bd9Sstevel@tonic-gate } 18317c478bd9Sstevel@tonic-gate last = &next->req_next; 18327c478bd9Sstevel@tonic-gate lastrp = next; 18337c478bd9Sstevel@tonic-gate } 18347c478bd9Sstevel@tonic-gate /* NOTREACHED */ 18357c478bd9Sstevel@tonic-gate } 18367c478bd9Sstevel@tonic-gate 18377c478bd9Sstevel@tonic-gate static void 18387c478bd9Sstevel@tonic-gate _aio_enq_doneq(aio_req_t *reqp) 18397c478bd9Sstevel@tonic-gate { 18407c478bd9Sstevel@tonic-gate if (_aio_doneq == NULL) { 18417c478bd9Sstevel@tonic-gate _aio_doneq = reqp; 184234709573Sraf reqp->req_next = reqp->req_prev = reqp; 18437c478bd9Sstevel@tonic-gate } else { 18447c478bd9Sstevel@tonic-gate reqp->req_next = _aio_doneq; 18457c478bd9Sstevel@tonic-gate reqp->req_prev = _aio_doneq->req_prev; 184634709573Sraf _aio_doneq->req_prev->req_next = reqp; 18477c478bd9Sstevel@tonic-gate _aio_doneq->req_prev = reqp; 18487c478bd9Sstevel@tonic-gate } 18497c478bd9Sstevel@tonic-gate reqp->req_state = AIO_REQ_DONEQ; 18507c478bd9Sstevel@tonic-gate 
_aio_doneq_cnt++; 18517c478bd9Sstevel@tonic-gate } 18527c478bd9Sstevel@tonic-gate 18537c478bd9Sstevel@tonic-gate /* 18547c478bd9Sstevel@tonic-gate * caller owns the _aio_mutex 18557c478bd9Sstevel@tonic-gate */ 18567c478bd9Sstevel@tonic-gate aio_req_t * 18577c478bd9Sstevel@tonic-gate _aio_req_remove(aio_req_t *reqp) 18587c478bd9Sstevel@tonic-gate { 18597c478bd9Sstevel@tonic-gate if (reqp && reqp->req_state != AIO_REQ_DONEQ) 18607c478bd9Sstevel@tonic-gate return (NULL); 18617c478bd9Sstevel@tonic-gate 18627c478bd9Sstevel@tonic-gate if (reqp) { 18637c478bd9Sstevel@tonic-gate /* request in done queue */ 186434709573Sraf if (_aio_doneq == reqp) 186534709573Sraf _aio_doneq = reqp->req_next; 186634709573Sraf if (_aio_doneq == reqp) { 18677c478bd9Sstevel@tonic-gate /* only one request on queue */ 18687c478bd9Sstevel@tonic-gate _aio_doneq = NULL; 18697c478bd9Sstevel@tonic-gate } else { 187034709573Sraf aio_req_t *tmp = reqp->req_next; 187134709573Sraf reqp->req_prev->req_next = tmp; 187234709573Sraf tmp->req_prev = reqp->req_prev; 18737c478bd9Sstevel@tonic-gate } 187434709573Sraf } else if ((reqp = _aio_doneq) != NULL) { 187534709573Sraf if (reqp == reqp->req_next) { 18767c478bd9Sstevel@tonic-gate /* only one request on queue */ 18777c478bd9Sstevel@tonic-gate _aio_doneq = NULL; 18787c478bd9Sstevel@tonic-gate } else { 187934709573Sraf reqp->req_prev->req_next = _aio_doneq = reqp->req_next; 188034709573Sraf _aio_doneq->req_prev = reqp->req_prev; 18817c478bd9Sstevel@tonic-gate } 188234709573Sraf } 188334709573Sraf if (reqp) { 18847c478bd9Sstevel@tonic-gate _aio_doneq_cnt--; 188534709573Sraf reqp->req_next = reqp->req_prev = reqp; 188634709573Sraf reqp->req_state = AIO_REQ_DONE; 18877c478bd9Sstevel@tonic-gate } 188834709573Sraf return (reqp); 18897c478bd9Sstevel@tonic-gate } 18907c478bd9Sstevel@tonic-gate 18917c478bd9Sstevel@tonic-gate /* 189234709573Sraf * An AIO request is identified by an aio_result_t pointer. 
The library 189334709573Sraf * maps this aio_result_t pointer to its internal representation using a 189434709573Sraf * hash table. This function adds an aio_result_t pointer to the hash table. 18957c478bd9Sstevel@tonic-gate */ 18967c478bd9Sstevel@tonic-gate static int 189734709573Sraf _aio_hash_insert(aio_result_t *resultp, aio_req_t *reqp) 18987c478bd9Sstevel@tonic-gate { 189934709573Sraf aio_hash_t *hashp; 190034709573Sraf aio_req_t **prev; 190134709573Sraf aio_req_t *next; 19027c478bd9Sstevel@tonic-gate 190334709573Sraf hashp = _aio_hash + AIOHASH(resultp); 1904*f841f6adSraf lmutex_lock(&hashp->hash_lock); 190534709573Sraf prev = &hashp->hash_ptr; 19067c478bd9Sstevel@tonic-gate while ((next = *prev) != NULL) { 19077c478bd9Sstevel@tonic-gate if (resultp == next->req_resultp) { 1908*f841f6adSraf lmutex_unlock(&hashp->hash_lock); 190934709573Sraf return (-1); 19107c478bd9Sstevel@tonic-gate } 19117c478bd9Sstevel@tonic-gate prev = &next->req_link; 19127c478bd9Sstevel@tonic-gate } 191334709573Sraf *prev = reqp; 191434709573Sraf ASSERT(reqp->req_link == NULL); 1915*f841f6adSraf lmutex_unlock(&hashp->hash_lock); 191634709573Sraf return (0); 19177c478bd9Sstevel@tonic-gate } 19187c478bd9Sstevel@tonic-gate 19197c478bd9Sstevel@tonic-gate /* 192034709573Sraf * Remove an entry from the hash table. 
19217c478bd9Sstevel@tonic-gate */ 192234709573Sraf aio_req_t * 192334709573Sraf _aio_hash_del(aio_result_t *resultp) 19247c478bd9Sstevel@tonic-gate { 192534709573Sraf aio_hash_t *hashp; 192634709573Sraf aio_req_t **prev; 192734709573Sraf aio_req_t *next = NULL; 192834709573Sraf 192934709573Sraf if (_aio_hash != NULL) { 193034709573Sraf hashp = _aio_hash + AIOHASH(resultp); 1931*f841f6adSraf lmutex_lock(&hashp->hash_lock); 193234709573Sraf prev = &hashp->hash_ptr; 193334709573Sraf while ((next = *prev) != NULL) { 193434709573Sraf if (resultp == next->req_resultp) { 193534709573Sraf *prev = next->req_link; 193634709573Sraf next->req_link = NULL; 193734709573Sraf break; 193834709573Sraf } 193934709573Sraf prev = &next->req_link; 19407c478bd9Sstevel@tonic-gate } 1941*f841f6adSraf lmutex_unlock(&hashp->hash_lock); 19427c478bd9Sstevel@tonic-gate } 194334709573Sraf return (next); 19447c478bd9Sstevel@tonic-gate } 19457c478bd9Sstevel@tonic-gate 19467c478bd9Sstevel@tonic-gate /* 194734709573Sraf * find an entry in the hash table 19487c478bd9Sstevel@tonic-gate */ 19497c478bd9Sstevel@tonic-gate aio_req_t * 195034709573Sraf _aio_hash_find(aio_result_t *resultp) 19517c478bd9Sstevel@tonic-gate { 195234709573Sraf aio_hash_t *hashp; 195334709573Sraf aio_req_t **prev; 195434709573Sraf aio_req_t *next = NULL; 195534709573Sraf 195634709573Sraf if (_aio_hash != NULL) { 195734709573Sraf hashp = _aio_hash + AIOHASH(resultp); 1958*f841f6adSraf lmutex_lock(&hashp->hash_lock); 195934709573Sraf prev = &hashp->hash_ptr; 196034709573Sraf while ((next = *prev) != NULL) { 196134709573Sraf if (resultp == next->req_resultp) 196234709573Sraf break; 196334709573Sraf prev = &next->req_link; 196434709573Sraf } 1965*f841f6adSraf lmutex_unlock(&hashp->hash_lock); 19667c478bd9Sstevel@tonic-gate } 196734709573Sraf return (next); 19687c478bd9Sstevel@tonic-gate } 19697c478bd9Sstevel@tonic-gate 19707c478bd9Sstevel@tonic-gate /* 19717c478bd9Sstevel@tonic-gate * AIO interface for POSIX 
19727c478bd9Sstevel@tonic-gate */ 19737c478bd9Sstevel@tonic-gate int 197434709573Sraf _aio_rw(aiocb_t *aiocbp, aio_lio_t *lio_head, aio_worker_t **nextworker, 197534709573Sraf int mode, int flg) 19767c478bd9Sstevel@tonic-gate { 197734709573Sraf aio_req_t *reqp; 197834709573Sraf aio_args_t *ap; 19797c478bd9Sstevel@tonic-gate int kerr; 19807c478bd9Sstevel@tonic-gate 198134709573Sraf if (aiocbp == NULL) { 19827c478bd9Sstevel@tonic-gate errno = EINVAL; 19837c478bd9Sstevel@tonic-gate return (-1); 19847c478bd9Sstevel@tonic-gate } 19857c478bd9Sstevel@tonic-gate 19867c478bd9Sstevel@tonic-gate /* initialize kaio */ 19877c478bd9Sstevel@tonic-gate if (!_kaio_ok) 19887c478bd9Sstevel@tonic-gate _kaio_init(); 19897c478bd9Sstevel@tonic-gate 199034709573Sraf aiocbp->aio_state = NOCHECK; 19917c478bd9Sstevel@tonic-gate 19927c478bd9Sstevel@tonic-gate /* 199334709573Sraf * If we have been called because a list I/O 19947c478bd9Sstevel@tonic-gate * kaio() failed, we dont want to repeat the 19957c478bd9Sstevel@tonic-gate * system call 19967c478bd9Sstevel@tonic-gate */ 19977c478bd9Sstevel@tonic-gate 19987c478bd9Sstevel@tonic-gate if (flg & AIO_KAIO) { 19997c478bd9Sstevel@tonic-gate /* 20007c478bd9Sstevel@tonic-gate * Try kernel aio first. 20017c478bd9Sstevel@tonic-gate * If errno is ENOTSUP/EBADFD, 20027c478bd9Sstevel@tonic-gate * fall back to the thread implementation. 
20037c478bd9Sstevel@tonic-gate */ 200434709573Sraf if (_kaio_ok > 0 && KAIO_SUPPORTED(aiocbp->aio_fildes)) { 200534709573Sraf aiocbp->aio_resultp.aio_errno = EINPROGRESS; 200634709573Sraf aiocbp->aio_state = CHECK; 200734709573Sraf kerr = (int)_kaio(mode, aiocbp); 20087c478bd9Sstevel@tonic-gate if (kerr == 0) 20097c478bd9Sstevel@tonic-gate return (0); 201034709573Sraf if (errno != ENOTSUP && errno != EBADFD) { 201134709573Sraf aiocbp->aio_resultp.aio_errno = errno; 201234709573Sraf aiocbp->aio_resultp.aio_return = -1; 201334709573Sraf aiocbp->aio_state = NOCHECK; 20147c478bd9Sstevel@tonic-gate return (-1); 20157c478bd9Sstevel@tonic-gate } 20167c478bd9Sstevel@tonic-gate if (errno == EBADFD) 201734709573Sraf SET_KAIO_NOT_SUPPORTED(aiocbp->aio_fildes); 20187c478bd9Sstevel@tonic-gate } 20197c478bd9Sstevel@tonic-gate } 20207c478bd9Sstevel@tonic-gate 202134709573Sraf aiocbp->aio_resultp.aio_errno = EINPROGRESS; 202234709573Sraf aiocbp->aio_state = USERAIO; 20237c478bd9Sstevel@tonic-gate 202434709573Sraf if (!__uaio_ok && __uaio_init() == -1) 202534709573Sraf return (-1); 20267c478bd9Sstevel@tonic-gate 202734709573Sraf if ((reqp = _aio_req_alloc()) == NULL) { 20287c478bd9Sstevel@tonic-gate errno = EAGAIN; 20297c478bd9Sstevel@tonic-gate return (-1); 20307c478bd9Sstevel@tonic-gate } 20317c478bd9Sstevel@tonic-gate 20327c478bd9Sstevel@tonic-gate /* 203334709573Sraf * If an LIO request, add the list head to the aio request 20347c478bd9Sstevel@tonic-gate */ 203534709573Sraf reqp->req_head = lio_head; 203634709573Sraf reqp->req_type = AIO_POSIX_REQ; 203734709573Sraf reqp->req_op = mode; 203834709573Sraf reqp->req_largefile = 0; 203934709573Sraf 204034709573Sraf if (aiocbp->aio_sigevent.sigev_notify == SIGEV_NONE) { 204134709573Sraf reqp->req_sigevent.sigev_notify = SIGEV_NONE; 204234709573Sraf } else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_SIGNAL) { 204334709573Sraf reqp->req_sigevent.sigev_notify = SIGEV_SIGNAL; 204434709573Sraf reqp->req_sigevent.sigev_signo = 
204534709573Sraf aiocbp->aio_sigevent.sigev_signo; 204634709573Sraf reqp->req_sigevent.sigev_value.sival_ptr = 204734709573Sraf aiocbp->aio_sigevent.sigev_value.sival_ptr; 204834709573Sraf } else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_PORT) { 204934709573Sraf port_notify_t *pn = aiocbp->aio_sigevent.sigev_value.sival_ptr; 205034709573Sraf reqp->req_sigevent.sigev_notify = SIGEV_PORT; 205134709573Sraf /* 205234709573Sraf * Reuse the sigevent structure to contain the port number 205334709573Sraf * and the user value. Same for SIGEV_THREAD, below. 205434709573Sraf */ 205534709573Sraf reqp->req_sigevent.sigev_signo = 205634709573Sraf pn->portnfy_port; 205734709573Sraf reqp->req_sigevent.sigev_value.sival_ptr = 205834709573Sraf pn->portnfy_user; 205934709573Sraf } else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_THREAD) { 206034709573Sraf reqp->req_sigevent.sigev_notify = SIGEV_THREAD; 206134709573Sraf /* 206234709573Sraf * The sigevent structure contains the port number 206334709573Sraf * and the user value. Same for SIGEV_PORT, above. 
206434709573Sraf */ 206534709573Sraf reqp->req_sigevent.sigev_signo = 206634709573Sraf aiocbp->aio_sigevent.sigev_signo; 206734709573Sraf reqp->req_sigevent.sigev_value.sival_ptr = 206834709573Sraf aiocbp->aio_sigevent.sigev_value.sival_ptr; 20697c478bd9Sstevel@tonic-gate } 20707c478bd9Sstevel@tonic-gate 207134709573Sraf reqp->req_resultp = &aiocbp->aio_resultp; 207234709573Sraf reqp->req_aiocbp = aiocbp; 207334709573Sraf ap = &reqp->req_args; 207434709573Sraf ap->fd = aiocbp->aio_fildes; 207534709573Sraf ap->buf = (caddr_t)aiocbp->aio_buf; 207634709573Sraf ap->bufsz = aiocbp->aio_nbytes; 207734709573Sraf ap->offset = aiocbp->aio_offset; 207834709573Sraf 207934709573Sraf if ((flg & AIO_NO_DUPS) && 208034709573Sraf _aio_hash_insert(&aiocbp->aio_resultp, reqp) != 0) { 2081*f841f6adSraf aio_panic("_aio_rw(): request already in hash table"); 208234709573Sraf _aio_req_free(reqp); 20837c478bd9Sstevel@tonic-gate errno = EINVAL; 20847c478bd9Sstevel@tonic-gate return (-1); 20857c478bd9Sstevel@tonic-gate } 208634709573Sraf _aio_req_add(reqp, nextworker, mode); 208734709573Sraf return (0); 20887c478bd9Sstevel@tonic-gate } 20897c478bd9Sstevel@tonic-gate 209034709573Sraf #if !defined(_LP64) 20917c478bd9Sstevel@tonic-gate /* 20927c478bd9Sstevel@tonic-gate * 64-bit AIO interface for POSIX 20937c478bd9Sstevel@tonic-gate */ 20947c478bd9Sstevel@tonic-gate int 209534709573Sraf _aio_rw64(aiocb64_t *aiocbp, aio_lio_t *lio_head, aio_worker_t **nextworker, 209634709573Sraf int mode, int flg) 20977c478bd9Sstevel@tonic-gate { 209834709573Sraf aio_req_t *reqp; 209934709573Sraf aio_args_t *ap; 21007c478bd9Sstevel@tonic-gate int kerr; 21017c478bd9Sstevel@tonic-gate 210234709573Sraf if (aiocbp == NULL) { 21037c478bd9Sstevel@tonic-gate errno = EINVAL; 21047c478bd9Sstevel@tonic-gate return (-1); 21057c478bd9Sstevel@tonic-gate } 21067c478bd9Sstevel@tonic-gate 21077c478bd9Sstevel@tonic-gate /* initialize kaio */ 21087c478bd9Sstevel@tonic-gate if (!_kaio_ok) 21097c478bd9Sstevel@tonic-gate 
_kaio_init(); 21107c478bd9Sstevel@tonic-gate 211134709573Sraf aiocbp->aio_state = NOCHECK; 21127c478bd9Sstevel@tonic-gate 21137c478bd9Sstevel@tonic-gate /* 211434709573Sraf * If we have been called because a list I/O 21157c478bd9Sstevel@tonic-gate * kaio() failed, we dont want to repeat the 21167c478bd9Sstevel@tonic-gate * system call 21177c478bd9Sstevel@tonic-gate */ 21187c478bd9Sstevel@tonic-gate 21197c478bd9Sstevel@tonic-gate if (flg & AIO_KAIO) { 21207c478bd9Sstevel@tonic-gate /* 21217c478bd9Sstevel@tonic-gate * Try kernel aio first. 21227c478bd9Sstevel@tonic-gate * If errno is ENOTSUP/EBADFD, 21237c478bd9Sstevel@tonic-gate * fall back to the thread implementation. 21247c478bd9Sstevel@tonic-gate */ 212534709573Sraf if (_kaio_ok > 0 && KAIO_SUPPORTED(aiocbp->aio_fildes)) { 212634709573Sraf aiocbp->aio_resultp.aio_errno = EINPROGRESS; 212734709573Sraf aiocbp->aio_state = CHECK; 212834709573Sraf kerr = (int)_kaio(mode, aiocbp); 21297c478bd9Sstevel@tonic-gate if (kerr == 0) 21307c478bd9Sstevel@tonic-gate return (0); 213134709573Sraf if (errno != ENOTSUP && errno != EBADFD) { 213234709573Sraf aiocbp->aio_resultp.aio_errno = errno; 213334709573Sraf aiocbp->aio_resultp.aio_return = -1; 213434709573Sraf aiocbp->aio_state = NOCHECK; 21357c478bd9Sstevel@tonic-gate return (-1); 21367c478bd9Sstevel@tonic-gate } 21377c478bd9Sstevel@tonic-gate if (errno == EBADFD) 213834709573Sraf SET_KAIO_NOT_SUPPORTED(aiocbp->aio_fildes); 21397c478bd9Sstevel@tonic-gate } 21407c478bd9Sstevel@tonic-gate } 21417c478bd9Sstevel@tonic-gate 214234709573Sraf aiocbp->aio_resultp.aio_errno = EINPROGRESS; 214334709573Sraf aiocbp->aio_state = USERAIO; 21447c478bd9Sstevel@tonic-gate 214534709573Sraf if (!__uaio_ok && __uaio_init() == -1) 214634709573Sraf return (-1); 21477c478bd9Sstevel@tonic-gate 214834709573Sraf if ((reqp = _aio_req_alloc()) == NULL) { 21497c478bd9Sstevel@tonic-gate errno = EAGAIN; 21507c478bd9Sstevel@tonic-gate return (-1); 21517c478bd9Sstevel@tonic-gate } 
21527c478bd9Sstevel@tonic-gate 21537c478bd9Sstevel@tonic-gate /* 215434709573Sraf * If an LIO request, add the list head to the aio request 21557c478bd9Sstevel@tonic-gate */ 215634709573Sraf reqp->req_head = lio_head; 215734709573Sraf reqp->req_type = AIO_POSIX_REQ; 215834709573Sraf reqp->req_op = mode; 215934709573Sraf reqp->req_largefile = 1; 216034709573Sraf 216134709573Sraf if (aiocbp->aio_sigevent.sigev_notify == SIGEV_NONE) { 216234709573Sraf reqp->req_sigevent.sigev_notify = SIGEV_NONE; 216334709573Sraf } else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_SIGNAL) { 216434709573Sraf reqp->req_sigevent.sigev_notify = SIGEV_SIGNAL; 216534709573Sraf reqp->req_sigevent.sigev_signo = 216634709573Sraf aiocbp->aio_sigevent.sigev_signo; 216734709573Sraf reqp->req_sigevent.sigev_value.sival_ptr = 216834709573Sraf aiocbp->aio_sigevent.sigev_value.sival_ptr; 216934709573Sraf } else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_PORT) { 217034709573Sraf port_notify_t *pn = aiocbp->aio_sigevent.sigev_value.sival_ptr; 217134709573Sraf reqp->req_sigevent.sigev_notify = SIGEV_PORT; 217234709573Sraf reqp->req_sigevent.sigev_signo = 217334709573Sraf pn->portnfy_port; 217434709573Sraf reqp->req_sigevent.sigev_value.sival_ptr = 217534709573Sraf pn->portnfy_user; 217634709573Sraf } else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_THREAD) { 217734709573Sraf reqp->req_sigevent.sigev_notify = SIGEV_THREAD; 217834709573Sraf reqp->req_sigevent.sigev_signo = 217934709573Sraf aiocbp->aio_sigevent.sigev_signo; 218034709573Sraf reqp->req_sigevent.sigev_value.sival_ptr = 218134709573Sraf aiocbp->aio_sigevent.sigev_value.sival_ptr; 21827c478bd9Sstevel@tonic-gate } 21837c478bd9Sstevel@tonic-gate 218434709573Sraf reqp->req_resultp = &aiocbp->aio_resultp; 218534709573Sraf reqp->req_aiocbp = aiocbp; 218634709573Sraf ap = &reqp->req_args; 218734709573Sraf ap->fd = aiocbp->aio_fildes; 218834709573Sraf ap->buf = (caddr_t)aiocbp->aio_buf; 218934709573Sraf ap->bufsz = aiocbp->aio_nbytes; 
219034709573Sraf ap->offset = aiocbp->aio_offset; 219134709573Sraf 219234709573Sraf if ((flg & AIO_NO_DUPS) && 219334709573Sraf _aio_hash_insert(&aiocbp->aio_resultp, reqp) != 0) { 2194*f841f6adSraf aio_panic("_aio_rw64(): request already in hash table"); 219534709573Sraf _aio_req_free(reqp); 21967c478bd9Sstevel@tonic-gate errno = EINVAL; 21977c478bd9Sstevel@tonic-gate return (-1); 21987c478bd9Sstevel@tonic-gate } 219934709573Sraf _aio_req_add(reqp, nextworker, mode); 220034709573Sraf return (0); 22017c478bd9Sstevel@tonic-gate } 220234709573Sraf #endif /* !defined(_LP64) */ 2203