1*7c478bd9Sstevel@tonic-gate /* 2*7c478bd9Sstevel@tonic-gate * CDDL HEADER START 3*7c478bd9Sstevel@tonic-gate * 4*7c478bd9Sstevel@tonic-gate * The contents of this file are subject to the terms of the 5*7c478bd9Sstevel@tonic-gate * Common Development and Distribution License, Version 1.0 only 6*7c478bd9Sstevel@tonic-gate * (the "License"). You may not use this file except in compliance 7*7c478bd9Sstevel@tonic-gate * with the License. 8*7c478bd9Sstevel@tonic-gate * 9*7c478bd9Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10*7c478bd9Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing. 11*7c478bd9Sstevel@tonic-gate * See the License for the specific language governing permissions 12*7c478bd9Sstevel@tonic-gate * and limitations under the License. 13*7c478bd9Sstevel@tonic-gate * 14*7c478bd9Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each 15*7c478bd9Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16*7c478bd9Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the 17*7c478bd9Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying 18*7c478bd9Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner] 19*7c478bd9Sstevel@tonic-gate * 20*7c478bd9Sstevel@tonic-gate * CDDL HEADER END 21*7c478bd9Sstevel@tonic-gate */ 22*7c478bd9Sstevel@tonic-gate /* 23*7c478bd9Sstevel@tonic-gate * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24*7c478bd9Sstevel@tonic-gate * Use is subject to license terms. 
25*7c478bd9Sstevel@tonic-gate */ 26*7c478bd9Sstevel@tonic-gate 27*7c478bd9Sstevel@tonic-gate /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 28*7c478bd9Sstevel@tonic-gate /* All Rights Reserved */ 29*7c478bd9Sstevel@tonic-gate 30*7c478bd9Sstevel@tonic-gate /* 31*7c478bd9Sstevel@tonic-gate * University Copyright- Copyright (c) 1982, 1986, 1988 32*7c478bd9Sstevel@tonic-gate * The Regents of the University of California 33*7c478bd9Sstevel@tonic-gate * All Rights Reserved 34*7c478bd9Sstevel@tonic-gate * 35*7c478bd9Sstevel@tonic-gate * University Acknowledgment- Portions of this document are derived from 36*7c478bd9Sstevel@tonic-gate * software developed by the University of California, Berkeley, and its 37*7c478bd9Sstevel@tonic-gate * contributors. 38*7c478bd9Sstevel@tonic-gate */ 39*7c478bd9Sstevel@tonic-gate 40*7c478bd9Sstevel@tonic-gate #pragma ident "%Z%%M% %I% %E% SMI" 41*7c478bd9Sstevel@tonic-gate 42*7c478bd9Sstevel@tonic-gate #include <sys/types.h> 43*7c478bd9Sstevel@tonic-gate #include <sys/t_lock.h> 44*7c478bd9Sstevel@tonic-gate #include <sys/sysmacros.h> 45*7c478bd9Sstevel@tonic-gate #include <sys/conf.h> 46*7c478bd9Sstevel@tonic-gate #include <sys/cpuvar.h> 47*7c478bd9Sstevel@tonic-gate #include <sys/errno.h> 48*7c478bd9Sstevel@tonic-gate #include <sys/debug.h> 49*7c478bd9Sstevel@tonic-gate #include <sys/buf.h> 50*7c478bd9Sstevel@tonic-gate #include <sys/var.h> 51*7c478bd9Sstevel@tonic-gate #include <sys/vnode.h> 52*7c478bd9Sstevel@tonic-gate #include <sys/bitmap.h> 53*7c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h> 54*7c478bd9Sstevel@tonic-gate #include <sys/kmem.h> 55*7c478bd9Sstevel@tonic-gate #include <sys/vmem.h> 56*7c478bd9Sstevel@tonic-gate #include <sys/atomic.h> 57*7c478bd9Sstevel@tonic-gate #include <vm/seg_kmem.h> 58*7c478bd9Sstevel@tonic-gate #include <vm/page.h> 59*7c478bd9Sstevel@tonic-gate #include <vm/pvn.h> 60*7c478bd9Sstevel@tonic-gate #include <sys/vtrace.h> 61*7c478bd9Sstevel@tonic-gate #include <sys/tnf_probe.h> 
62*7c478bd9Sstevel@tonic-gate #include <sys/fs/ufs_inode.h> 63*7c478bd9Sstevel@tonic-gate #include <sys/fs/ufs_bio.h> 64*7c478bd9Sstevel@tonic-gate #include <sys/fs/ufs_log.h> 65*7c478bd9Sstevel@tonic-gate #include <sys/systm.h> 66*7c478bd9Sstevel@tonic-gate #include <sys/vfs.h> 67*7c478bd9Sstevel@tonic-gate #include <sys/sdt.h> 68*7c478bd9Sstevel@tonic-gate 69*7c478bd9Sstevel@tonic-gate /* Locks */ 70*7c478bd9Sstevel@tonic-gate static kmutex_t blist_lock; /* protects b_list */ 71*7c478bd9Sstevel@tonic-gate static kmutex_t bhdr_lock; /* protects the bhdrlist */ 72*7c478bd9Sstevel@tonic-gate static kmutex_t bfree_lock; /* protects the bfreelist structure */ 73*7c478bd9Sstevel@tonic-gate 74*7c478bd9Sstevel@tonic-gate struct hbuf *hbuf; /* Hash buckets */ 75*7c478bd9Sstevel@tonic-gate struct dwbuf *dwbuf; /* Delayed write buckets */ 76*7c478bd9Sstevel@tonic-gate static struct buf *bhdrlist; /* buf header free list */ 77*7c478bd9Sstevel@tonic-gate static int nbuf; /* number of buffer headers allocated */ 78*7c478bd9Sstevel@tonic-gate 79*7c478bd9Sstevel@tonic-gate static int lastindex; /* Reference point on where to start */ 80*7c478bd9Sstevel@tonic-gate /* when looking for free buffers */ 81*7c478bd9Sstevel@tonic-gate 82*7c478bd9Sstevel@tonic-gate #define bio_bhash(dev, bn) (hash2ints((dev), (int)(bn)) & v.v_hmask) 83*7c478bd9Sstevel@tonic-gate #define EMPTY_LIST ((struct buf *)-1) 84*7c478bd9Sstevel@tonic-gate 85*7c478bd9Sstevel@tonic-gate static kcondvar_t bio_mem_cv; /* Condition variables */ 86*7c478bd9Sstevel@tonic-gate static kcondvar_t bio_flushinval_cv; 87*7c478bd9Sstevel@tonic-gate static int bio_doingflush; /* flush in progress */ 88*7c478bd9Sstevel@tonic-gate static int bio_doinginval; /* inval in progress */ 89*7c478bd9Sstevel@tonic-gate static int bio_flinv_cv_wanted; /* someone waiting for cv */ 90*7c478bd9Sstevel@tonic-gate 91*7c478bd9Sstevel@tonic-gate /* 92*7c478bd9Sstevel@tonic-gate * Statistics on the buffer cache 93*7c478bd9Sstevel@tonic-gate */ 
94*7c478bd9Sstevel@tonic-gate struct biostats biostats = { 95*7c478bd9Sstevel@tonic-gate { "buffer_cache_lookups", KSTAT_DATA_UINT32 }, 96*7c478bd9Sstevel@tonic-gate { "buffer_cache_hits", KSTAT_DATA_UINT32 }, 97*7c478bd9Sstevel@tonic-gate { "new_buffer_requests", KSTAT_DATA_UINT32 }, 98*7c478bd9Sstevel@tonic-gate { "waits_for_buffer_allocs", KSTAT_DATA_UINT32 }, 99*7c478bd9Sstevel@tonic-gate { "buffers_locked_by_someone", KSTAT_DATA_UINT32 }, 100*7c478bd9Sstevel@tonic-gate { "duplicate_buffers_found", KSTAT_DATA_UINT32 } 101*7c478bd9Sstevel@tonic-gate }; 102*7c478bd9Sstevel@tonic-gate 103*7c478bd9Sstevel@tonic-gate /* 104*7c478bd9Sstevel@tonic-gate * kstat data 105*7c478bd9Sstevel@tonic-gate */ 106*7c478bd9Sstevel@tonic-gate kstat_named_t *biostats_ptr = (kstat_named_t *)&biostats; 107*7c478bd9Sstevel@tonic-gate uint_t biostats_ndata = (uint_t)(sizeof (biostats) / 108*7c478bd9Sstevel@tonic-gate sizeof (kstat_named_t)); 109*7c478bd9Sstevel@tonic-gate 110*7c478bd9Sstevel@tonic-gate /* 111*7c478bd9Sstevel@tonic-gate * Statistics on ufs buffer cache 112*7c478bd9Sstevel@tonic-gate * Not protected by locks 113*7c478bd9Sstevel@tonic-gate */ 114*7c478bd9Sstevel@tonic-gate struct ufsbiostats ub = { 115*7c478bd9Sstevel@tonic-gate { "breads", KSTAT_DATA_UINT32 }, 116*7c478bd9Sstevel@tonic-gate { "bwrites", KSTAT_DATA_UINT32 }, 117*7c478bd9Sstevel@tonic-gate { "fbiwrites", KSTAT_DATA_UINT32 }, 118*7c478bd9Sstevel@tonic-gate { "getpages", KSTAT_DATA_UINT32 }, 119*7c478bd9Sstevel@tonic-gate { "getras", KSTAT_DATA_UINT32 }, 120*7c478bd9Sstevel@tonic-gate { "putsyncs", KSTAT_DATA_UINT32 }, 121*7c478bd9Sstevel@tonic-gate { "putasyncs", KSTAT_DATA_UINT32 }, 122*7c478bd9Sstevel@tonic-gate { "putpageios", KSTAT_DATA_UINT32 }, 123*7c478bd9Sstevel@tonic-gate }; 124*7c478bd9Sstevel@tonic-gate 125*7c478bd9Sstevel@tonic-gate /* 126*7c478bd9Sstevel@tonic-gate * more UFS Logging eccentricities... 
127*7c478bd9Sstevel@tonic-gate * 128*7c478bd9Sstevel@tonic-gate * required since "#pragma weak ..." doesn't work in reverse order. 129*7c478bd9Sstevel@tonic-gate * i.e.: genunix (bio.c) is loaded before the ufs modules and pointers 130*7c478bd9Sstevel@tonic-gate * to ufs routines don't get plugged into bio.c calls so 131*7c478bd9Sstevel@tonic-gate * we initialize it when setting up the "lufsops" table 132*7c478bd9Sstevel@tonic-gate * in "lufs.c:_init()" 133*7c478bd9Sstevel@tonic-gate */ 134*7c478bd9Sstevel@tonic-gate void (*bio_lufs_strategy)(void *, buf_t *); 135*7c478bd9Sstevel@tonic-gate void (*bio_snapshot_strategy)(void *, buf_t *); 136*7c478bd9Sstevel@tonic-gate 137*7c478bd9Sstevel@tonic-gate 138*7c478bd9Sstevel@tonic-gate /* Private routines */ 139*7c478bd9Sstevel@tonic-gate static struct buf *bio_getfreeblk(long); 140*7c478bd9Sstevel@tonic-gate static void bio_mem_get(long); 141*7c478bd9Sstevel@tonic-gate static void bio_bhdr_free(struct buf *); 142*7c478bd9Sstevel@tonic-gate static struct buf *bio_bhdr_alloc(void); 143*7c478bd9Sstevel@tonic-gate static void bio_recycle(int, long); 144*7c478bd9Sstevel@tonic-gate static void bio_pageio_done(struct buf *); 145*7c478bd9Sstevel@tonic-gate static int bio_incore(dev_t, daddr_t); 146*7c478bd9Sstevel@tonic-gate 147*7c478bd9Sstevel@tonic-gate /* 148*7c478bd9Sstevel@tonic-gate * Buffer cache constants 149*7c478bd9Sstevel@tonic-gate */ 150*7c478bd9Sstevel@tonic-gate #define BIO_BUF_PERCENT (100/2) /* default: 2% of memory */ 151*7c478bd9Sstevel@tonic-gate #define BIO_MAX_PERCENT (100/20) /* max is 20% of real memory */ 152*7c478bd9Sstevel@tonic-gate #define BIO_BHDR_POOL 100 /* Default bhdr pool size */ 153*7c478bd9Sstevel@tonic-gate #define BIO_MIN_HDR 10 /* Minimum number of buffer headers */ 154*7c478bd9Sstevel@tonic-gate #define BIO_MIN_HWM (BIO_MIN_HDR * MAXBSIZE / 1024) 155*7c478bd9Sstevel@tonic-gate #define BIO_HASHLEN 4 /* Target length of hash chains */ 156*7c478bd9Sstevel@tonic-gate 
157*7c478bd9Sstevel@tonic-gate 158*7c478bd9Sstevel@tonic-gate /* Flags for bio_recycle() */ 159*7c478bd9Sstevel@tonic-gate #define BIO_HEADER 0x01 160*7c478bd9Sstevel@tonic-gate #define BIO_MEM 0x02 161*7c478bd9Sstevel@tonic-gate 162*7c478bd9Sstevel@tonic-gate extern int bufhwm; /* User tunable - high water mark for mem */ 163*7c478bd9Sstevel@tonic-gate extern int bufhwm_pct; /* ditto - given in % of physmem */ 164*7c478bd9Sstevel@tonic-gate 165*7c478bd9Sstevel@tonic-gate /* 166*7c478bd9Sstevel@tonic-gate * The following routines allocate and free 167*7c478bd9Sstevel@tonic-gate * buffers with various side effects. In general the 168*7c478bd9Sstevel@tonic-gate * arguments to an allocate routine are a device and 169*7c478bd9Sstevel@tonic-gate * a block number, and the value is a pointer to 170*7c478bd9Sstevel@tonic-gate * to the buffer header; the buffer returned is locked with a 171*7c478bd9Sstevel@tonic-gate * binary semaphore so that no one else can touch it. If the block was 172*7c478bd9Sstevel@tonic-gate * already in core, no I/O need be done; if it is 173*7c478bd9Sstevel@tonic-gate * already locked, the process waits until it becomes free. 174*7c478bd9Sstevel@tonic-gate * The following routines allocate a buffer: 175*7c478bd9Sstevel@tonic-gate * getblk 176*7c478bd9Sstevel@tonic-gate * bread/BREAD 177*7c478bd9Sstevel@tonic-gate * breada 178*7c478bd9Sstevel@tonic-gate * Eventually the buffer must be released, possibly with the 179*7c478bd9Sstevel@tonic-gate * side effect of writing it out, by using one of 180*7c478bd9Sstevel@tonic-gate * bwrite/BWRITE/brwrite 181*7c478bd9Sstevel@tonic-gate * bdwrite/bdrwrite 182*7c478bd9Sstevel@tonic-gate * bawrite 183*7c478bd9Sstevel@tonic-gate * brelse 184*7c478bd9Sstevel@tonic-gate * 185*7c478bd9Sstevel@tonic-gate * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization. 
186*7c478bd9Sstevel@tonic-gate * Instead, a binary semaphore, b_sem is used to gain exclusive access to 187*7c478bd9Sstevel@tonic-gate * a buffer and a binary semaphore, b_io is used for I/O synchronization. 188*7c478bd9Sstevel@tonic-gate * B_DONE is still used to denote a buffer with I/O complete on it. 189*7c478bd9Sstevel@tonic-gate * 190*7c478bd9Sstevel@tonic-gate * The bfreelist.b_bcount field is computed everytime fsflush runs. It is 191*7c478bd9Sstevel@tonic-gate * should not be used where a very accurate count of the free buffers is 192*7c478bd9Sstevel@tonic-gate * needed. 193*7c478bd9Sstevel@tonic-gate */ 194*7c478bd9Sstevel@tonic-gate 195*7c478bd9Sstevel@tonic-gate /* 196*7c478bd9Sstevel@tonic-gate * Read in (if necessary) the block and return a buffer pointer. 197*7c478bd9Sstevel@tonic-gate * 198*7c478bd9Sstevel@tonic-gate * This interface is provided for binary compatibility. Using 199*7c478bd9Sstevel@tonic-gate * BREAD() directly avoids the extra function call overhead invoked 200*7c478bd9Sstevel@tonic-gate * by calling this routine. 201*7c478bd9Sstevel@tonic-gate */ 202*7c478bd9Sstevel@tonic-gate struct buf * 203*7c478bd9Sstevel@tonic-gate bread(dev_t dev, daddr_t blkno, long bsize) 204*7c478bd9Sstevel@tonic-gate { 205*7c478bd9Sstevel@tonic-gate return (BREAD(dev, blkno, bsize)); 206*7c478bd9Sstevel@tonic-gate } 207*7c478bd9Sstevel@tonic-gate 208*7c478bd9Sstevel@tonic-gate /* 209*7c478bd9Sstevel@tonic-gate * Common code for reading a buffer with various options 210*7c478bd9Sstevel@tonic-gate * 211*7c478bd9Sstevel@tonic-gate * Read in (if necessary) the block and return a buffer pointer. 
212*7c478bd9Sstevel@tonic-gate */ 213*7c478bd9Sstevel@tonic-gate struct buf * 214*7c478bd9Sstevel@tonic-gate bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize) 215*7c478bd9Sstevel@tonic-gate { 216*7c478bd9Sstevel@tonic-gate struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg; 217*7c478bd9Sstevel@tonic-gate struct buf *bp; 218*7c478bd9Sstevel@tonic-gate klwp_t *lwp = ttolwp(curthread); 219*7c478bd9Sstevel@tonic-gate 220*7c478bd9Sstevel@tonic-gate CPU_STATS_ADD_K(sys, lread, 1); 221*7c478bd9Sstevel@tonic-gate bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1); 222*7c478bd9Sstevel@tonic-gate if (bp->b_flags & B_DONE) 223*7c478bd9Sstevel@tonic-gate return (bp); 224*7c478bd9Sstevel@tonic-gate bp->b_flags |= B_READ; 225*7c478bd9Sstevel@tonic-gate ASSERT(bp->b_bcount == bsize); 226*7c478bd9Sstevel@tonic-gate if (ufsvfsp == NULL) { /* !ufs */ 227*7c478bd9Sstevel@tonic-gate (void) bdev_strategy(bp); 228*7c478bd9Sstevel@tonic-gate } else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) { 229*7c478bd9Sstevel@tonic-gate /* ufs && logging */ 230*7c478bd9Sstevel@tonic-gate (*bio_lufs_strategy)(ufsvfsp->vfs_log, bp); 231*7c478bd9Sstevel@tonic-gate } else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) { 232*7c478bd9Sstevel@tonic-gate /* ufs && snapshots */ 233*7c478bd9Sstevel@tonic-gate (*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp); 234*7c478bd9Sstevel@tonic-gate } else { 235*7c478bd9Sstevel@tonic-gate ufsvfsp->vfs_iotstamp = lbolt; 236*7c478bd9Sstevel@tonic-gate ub.ub_breads.value.ul++; /* ufs && !logging */ 237*7c478bd9Sstevel@tonic-gate (void) bdev_strategy(bp); 238*7c478bd9Sstevel@tonic-gate } 239*7c478bd9Sstevel@tonic-gate if (lwp != NULL) 240*7c478bd9Sstevel@tonic-gate lwp->lwp_ru.inblock++; 241*7c478bd9Sstevel@tonic-gate CPU_STATS_ADD_K(sys, bread, 1); 242*7c478bd9Sstevel@tonic-gate (void) biowait(bp); 243*7c478bd9Sstevel@tonic-gate return (bp); 244*7c478bd9Sstevel@tonic-gate } 245*7c478bd9Sstevel@tonic-gate 
246*7c478bd9Sstevel@tonic-gate /* 247*7c478bd9Sstevel@tonic-gate * Read in the block, like bread, but also start I/O on the 248*7c478bd9Sstevel@tonic-gate * read-ahead block (which is not allocated to the caller). 249*7c478bd9Sstevel@tonic-gate */ 250*7c478bd9Sstevel@tonic-gate struct buf * 251*7c478bd9Sstevel@tonic-gate breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize) 252*7c478bd9Sstevel@tonic-gate { 253*7c478bd9Sstevel@tonic-gate struct buf *bp, *rabp; 254*7c478bd9Sstevel@tonic-gate klwp_t *lwp = ttolwp(curthread); 255*7c478bd9Sstevel@tonic-gate 256*7c478bd9Sstevel@tonic-gate bp = NULL; 257*7c478bd9Sstevel@tonic-gate if (!bio_incore(dev, blkno)) { 258*7c478bd9Sstevel@tonic-gate CPU_STATS_ADD_K(sys, lread, 1); 259*7c478bd9Sstevel@tonic-gate bp = GETBLK(dev, blkno, bsize); 260*7c478bd9Sstevel@tonic-gate if ((bp->b_flags & B_DONE) == 0) { 261*7c478bd9Sstevel@tonic-gate bp->b_flags |= B_READ; 262*7c478bd9Sstevel@tonic-gate bp->b_bcount = bsize; 263*7c478bd9Sstevel@tonic-gate (void) bdev_strategy(bp); 264*7c478bd9Sstevel@tonic-gate if (lwp != NULL) 265*7c478bd9Sstevel@tonic-gate lwp->lwp_ru.inblock++; 266*7c478bd9Sstevel@tonic-gate CPU_STATS_ADD_K(sys, bread, 1); 267*7c478bd9Sstevel@tonic-gate } 268*7c478bd9Sstevel@tonic-gate } 269*7c478bd9Sstevel@tonic-gate if (rablkno && bfreelist.b_bcount > 1 && 270*7c478bd9Sstevel@tonic-gate !bio_incore(dev, rablkno)) { 271*7c478bd9Sstevel@tonic-gate rabp = GETBLK(dev, rablkno, bsize); 272*7c478bd9Sstevel@tonic-gate if (rabp->b_flags & B_DONE) 273*7c478bd9Sstevel@tonic-gate brelse(rabp); 274*7c478bd9Sstevel@tonic-gate else { 275*7c478bd9Sstevel@tonic-gate rabp->b_flags |= B_READ|B_ASYNC; 276*7c478bd9Sstevel@tonic-gate rabp->b_bcount = bsize; 277*7c478bd9Sstevel@tonic-gate (void) bdev_strategy(rabp); 278*7c478bd9Sstevel@tonic-gate if (lwp != NULL) 279*7c478bd9Sstevel@tonic-gate lwp->lwp_ru.inblock++; 280*7c478bd9Sstevel@tonic-gate CPU_STATS_ADD_K(sys, bread, 1); 281*7c478bd9Sstevel@tonic-gate } 
282*7c478bd9Sstevel@tonic-gate } 283*7c478bd9Sstevel@tonic-gate if (bp == NULL) 284*7c478bd9Sstevel@tonic-gate return (BREAD(dev, blkno, bsize)); 285*7c478bd9Sstevel@tonic-gate (void) biowait(bp); 286*7c478bd9Sstevel@tonic-gate return (bp); 287*7c478bd9Sstevel@tonic-gate } 288*7c478bd9Sstevel@tonic-gate 289*7c478bd9Sstevel@tonic-gate /* 290*7c478bd9Sstevel@tonic-gate * Common code for writing a buffer with various options. 291*7c478bd9Sstevel@tonic-gate * 292*7c478bd9Sstevel@tonic-gate * force_wait - wait for write completion regardless of B_ASYNC flag 293*7c478bd9Sstevel@tonic-gate * do_relse - release the buffer when we are done 294*7c478bd9Sstevel@tonic-gate * clear_flags - flags to clear from the buffer 295*7c478bd9Sstevel@tonic-gate */ 296*7c478bd9Sstevel@tonic-gate void 297*7c478bd9Sstevel@tonic-gate bwrite_common(void *arg, struct buf *bp, int force_wait, 298*7c478bd9Sstevel@tonic-gate int do_relse, int clear_flags) 299*7c478bd9Sstevel@tonic-gate { 300*7c478bd9Sstevel@tonic-gate register int do_wait; 301*7c478bd9Sstevel@tonic-gate struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg; 302*7c478bd9Sstevel@tonic-gate int flag; 303*7c478bd9Sstevel@tonic-gate klwp_t *lwp = ttolwp(curthread); 304*7c478bd9Sstevel@tonic-gate struct cpu *cpup; 305*7c478bd9Sstevel@tonic-gate 306*7c478bd9Sstevel@tonic-gate ASSERT(SEMA_HELD(&bp->b_sem)); 307*7c478bd9Sstevel@tonic-gate flag = bp->b_flags; 308*7c478bd9Sstevel@tonic-gate bp->b_flags &= ~clear_flags; 309*7c478bd9Sstevel@tonic-gate if (lwp != NULL) 310*7c478bd9Sstevel@tonic-gate lwp->lwp_ru.oublock++; 311*7c478bd9Sstevel@tonic-gate CPU_STATS_ENTER_K(); 312*7c478bd9Sstevel@tonic-gate cpup = CPU; /* get pointer AFTER preemption is disabled */ 313*7c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, sys, lwrite, 1); 314*7c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, sys, bwrite, 1); 315*7c478bd9Sstevel@tonic-gate do_wait = ((flag & B_ASYNC) == 0 || force_wait); 316*7c478bd9Sstevel@tonic-gate if (do_wait == 0) 
317*7c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, sys, bawrite, 1); 318*7c478bd9Sstevel@tonic-gate CPU_STATS_EXIT_K(); 319*7c478bd9Sstevel@tonic-gate if (ufsvfsp == NULL) { 320*7c478bd9Sstevel@tonic-gate (void) bdev_strategy(bp); 321*7c478bd9Sstevel@tonic-gate } else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) { 322*7c478bd9Sstevel@tonic-gate /* ufs && logging */ 323*7c478bd9Sstevel@tonic-gate (*bio_lufs_strategy)(ufsvfsp->vfs_log, bp); 324*7c478bd9Sstevel@tonic-gate } else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) { 325*7c478bd9Sstevel@tonic-gate /* ufs && snapshots */ 326*7c478bd9Sstevel@tonic-gate (*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp); 327*7c478bd9Sstevel@tonic-gate } else { 328*7c478bd9Sstevel@tonic-gate ub.ub_bwrites.value.ul++; /* ufs && !logging */ 329*7c478bd9Sstevel@tonic-gate (void) bdev_strategy(bp); 330*7c478bd9Sstevel@tonic-gate } 331*7c478bd9Sstevel@tonic-gate if (do_wait) { 332*7c478bd9Sstevel@tonic-gate (void) biowait(bp); 333*7c478bd9Sstevel@tonic-gate if (do_relse) { 334*7c478bd9Sstevel@tonic-gate brelse(bp); 335*7c478bd9Sstevel@tonic-gate } 336*7c478bd9Sstevel@tonic-gate } 337*7c478bd9Sstevel@tonic-gate } 338*7c478bd9Sstevel@tonic-gate 339*7c478bd9Sstevel@tonic-gate /* 340*7c478bd9Sstevel@tonic-gate * Write the buffer, waiting for completion (unless B_ASYNC is set). 341*7c478bd9Sstevel@tonic-gate * Then release the buffer. 342*7c478bd9Sstevel@tonic-gate * This interface is provided for binary compatibility. Using 343*7c478bd9Sstevel@tonic-gate * BWRITE() directly avoids the extra function call overhead invoked 344*7c478bd9Sstevel@tonic-gate * by calling this routine. 
345*7c478bd9Sstevel@tonic-gate */ 346*7c478bd9Sstevel@tonic-gate void 347*7c478bd9Sstevel@tonic-gate bwrite(struct buf *bp) 348*7c478bd9Sstevel@tonic-gate { 349*7c478bd9Sstevel@tonic-gate BWRITE(bp); 350*7c478bd9Sstevel@tonic-gate } 351*7c478bd9Sstevel@tonic-gate 352*7c478bd9Sstevel@tonic-gate /* 353*7c478bd9Sstevel@tonic-gate * Write the buffer, waiting for completion. 354*7c478bd9Sstevel@tonic-gate * But don't release the buffer afterwards. 355*7c478bd9Sstevel@tonic-gate * This interface is provided for binary compatibility. Using 356*7c478bd9Sstevel@tonic-gate * BWRITE2() directly avoids the extra function call overhead. 357*7c478bd9Sstevel@tonic-gate */ 358*7c478bd9Sstevel@tonic-gate void 359*7c478bd9Sstevel@tonic-gate bwrite2(struct buf *bp) 360*7c478bd9Sstevel@tonic-gate { 361*7c478bd9Sstevel@tonic-gate BWRITE2(bp); 362*7c478bd9Sstevel@tonic-gate } 363*7c478bd9Sstevel@tonic-gate 364*7c478bd9Sstevel@tonic-gate /* 365*7c478bd9Sstevel@tonic-gate * Release the buffer, marking it so that if it is grabbed 366*7c478bd9Sstevel@tonic-gate * for another purpose it will be written out before being 367*7c478bd9Sstevel@tonic-gate * given up (e.g. when writing a partial block where it is 368*7c478bd9Sstevel@tonic-gate * assumed that another write for the same block will soon follow). 369*7c478bd9Sstevel@tonic-gate * Also save the time that the block is first marked as delayed 370*7c478bd9Sstevel@tonic-gate * so that it will be written in a reasonable time. 
371*7c478bd9Sstevel@tonic-gate */ 372*7c478bd9Sstevel@tonic-gate void 373*7c478bd9Sstevel@tonic-gate bdwrite(struct buf *bp) 374*7c478bd9Sstevel@tonic-gate { 375*7c478bd9Sstevel@tonic-gate ASSERT(SEMA_HELD(&bp->b_sem)); 376*7c478bd9Sstevel@tonic-gate CPU_STATS_ADD_K(sys, lwrite, 1); 377*7c478bd9Sstevel@tonic-gate if ((bp->b_flags & B_DELWRI) == 0) 378*7c478bd9Sstevel@tonic-gate bp->b_start = lbolt; 379*7c478bd9Sstevel@tonic-gate /* 380*7c478bd9Sstevel@tonic-gate * B_DONE allows others to use the buffer, B_DELWRI causes the 381*7c478bd9Sstevel@tonic-gate * buffer to be written before being reused, and setting b_resid 382*7c478bd9Sstevel@tonic-gate * to zero says the buffer is complete. 383*7c478bd9Sstevel@tonic-gate */ 384*7c478bd9Sstevel@tonic-gate bp->b_flags |= B_DELWRI | B_DONE; 385*7c478bd9Sstevel@tonic-gate bp->b_resid = 0; 386*7c478bd9Sstevel@tonic-gate brelse(bp); 387*7c478bd9Sstevel@tonic-gate } 388*7c478bd9Sstevel@tonic-gate 389*7c478bd9Sstevel@tonic-gate /* 390*7c478bd9Sstevel@tonic-gate * Release the buffer, start I/O on it, but don't wait for completion. 391*7c478bd9Sstevel@tonic-gate */ 392*7c478bd9Sstevel@tonic-gate void 393*7c478bd9Sstevel@tonic-gate bawrite(struct buf *bp) 394*7c478bd9Sstevel@tonic-gate { 395*7c478bd9Sstevel@tonic-gate ASSERT(SEMA_HELD(&bp->b_sem)); 396*7c478bd9Sstevel@tonic-gate 397*7c478bd9Sstevel@tonic-gate /* Use bfreelist.b_bcount as a weird-ass heuristic */ 398*7c478bd9Sstevel@tonic-gate if (bfreelist.b_bcount > 4) 399*7c478bd9Sstevel@tonic-gate bp->b_flags |= B_ASYNC; 400*7c478bd9Sstevel@tonic-gate BWRITE(bp); 401*7c478bd9Sstevel@tonic-gate } 402*7c478bd9Sstevel@tonic-gate 403*7c478bd9Sstevel@tonic-gate /* 404*7c478bd9Sstevel@tonic-gate * Release the buffer, with no I/O implied. 
405*7c478bd9Sstevel@tonic-gate */ 406*7c478bd9Sstevel@tonic-gate void 407*7c478bd9Sstevel@tonic-gate brelse(struct buf *bp) 408*7c478bd9Sstevel@tonic-gate { 409*7c478bd9Sstevel@tonic-gate struct buf **backp; 410*7c478bd9Sstevel@tonic-gate uint_t index; 411*7c478bd9Sstevel@tonic-gate kmutex_t *hmp; 412*7c478bd9Sstevel@tonic-gate struct buf *dp; 413*7c478bd9Sstevel@tonic-gate struct hbuf *hp; 414*7c478bd9Sstevel@tonic-gate 415*7c478bd9Sstevel@tonic-gate 416*7c478bd9Sstevel@tonic-gate ASSERT(SEMA_HELD(&bp->b_sem)); 417*7c478bd9Sstevel@tonic-gate 418*7c478bd9Sstevel@tonic-gate /* 419*7c478bd9Sstevel@tonic-gate * Clear the retry write flag if the buffer was written without 420*7c478bd9Sstevel@tonic-gate * error. The presence of B_DELWRI means the buffer has not yet 421*7c478bd9Sstevel@tonic-gate * been written and the presence of B_ERROR means that an error 422*7c478bd9Sstevel@tonic-gate * is still occurring. 423*7c478bd9Sstevel@tonic-gate */ 424*7c478bd9Sstevel@tonic-gate if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) { 425*7c478bd9Sstevel@tonic-gate bp->b_flags &= ~B_RETRYWRI; 426*7c478bd9Sstevel@tonic-gate } 427*7c478bd9Sstevel@tonic-gate 428*7c478bd9Sstevel@tonic-gate /* Check for anomalous conditions */ 429*7c478bd9Sstevel@tonic-gate if (bp->b_flags & (B_ERROR|B_NOCACHE)) { 430*7c478bd9Sstevel@tonic-gate if (bp->b_flags & B_NOCACHE) { 431*7c478bd9Sstevel@tonic-gate /* Don't add to the freelist. Destroy it now */ 432*7c478bd9Sstevel@tonic-gate kmem_free(bp->b_un.b_addr, bp->b_bufsize); 433*7c478bd9Sstevel@tonic-gate sema_destroy(&bp->b_sem); 434*7c478bd9Sstevel@tonic-gate sema_destroy(&bp->b_io); 435*7c478bd9Sstevel@tonic-gate kmem_free(bp, sizeof (struct buf)); 436*7c478bd9Sstevel@tonic-gate return; 437*7c478bd9Sstevel@tonic-gate } 438*7c478bd9Sstevel@tonic-gate /* 439*7c478bd9Sstevel@tonic-gate * If a write failed and we are supposed to retry write, 440*7c478bd9Sstevel@tonic-gate * don't toss the buffer. 
Keep it around and mark it 441*7c478bd9Sstevel@tonic-gate * delayed write in the hopes that it will eventually 442*7c478bd9Sstevel@tonic-gate * get flushed (and still keep the system running.) 443*7c478bd9Sstevel@tonic-gate */ 444*7c478bd9Sstevel@tonic-gate if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) { 445*7c478bd9Sstevel@tonic-gate bp->b_flags |= B_DELWRI; 446*7c478bd9Sstevel@tonic-gate /* keep fsflush from trying continuously to flush */ 447*7c478bd9Sstevel@tonic-gate bp->b_start = lbolt; 448*7c478bd9Sstevel@tonic-gate } else 449*7c478bd9Sstevel@tonic-gate bp->b_flags |= B_AGE|B_STALE; 450*7c478bd9Sstevel@tonic-gate bp->b_flags &= ~B_ERROR; 451*7c478bd9Sstevel@tonic-gate bp->b_error = 0; 452*7c478bd9Sstevel@tonic-gate } 453*7c478bd9Sstevel@tonic-gate 454*7c478bd9Sstevel@tonic-gate /* 455*7c478bd9Sstevel@tonic-gate * If delayed write is set then put in on the delayed 456*7c478bd9Sstevel@tonic-gate * write list instead of the free buffer list. 457*7c478bd9Sstevel@tonic-gate */ 458*7c478bd9Sstevel@tonic-gate index = bio_bhash(bp->b_edev, bp->b_blkno); 459*7c478bd9Sstevel@tonic-gate hmp = &hbuf[index].b_lock; 460*7c478bd9Sstevel@tonic-gate 461*7c478bd9Sstevel@tonic-gate mutex_enter(hmp); 462*7c478bd9Sstevel@tonic-gate hp = &hbuf[index]; 463*7c478bd9Sstevel@tonic-gate dp = (struct buf *)hp; 464*7c478bd9Sstevel@tonic-gate 465*7c478bd9Sstevel@tonic-gate /* 466*7c478bd9Sstevel@tonic-gate * Make sure that the number of entries on this list are 467*7c478bd9Sstevel@tonic-gate * Zero <= count <= total # buffers 468*7c478bd9Sstevel@tonic-gate */ 469*7c478bd9Sstevel@tonic-gate ASSERT(hp->b_length >= 0); 470*7c478bd9Sstevel@tonic-gate ASSERT(hp->b_length < nbuf); 471*7c478bd9Sstevel@tonic-gate 472*7c478bd9Sstevel@tonic-gate hp->b_length++; /* We are adding this buffer */ 473*7c478bd9Sstevel@tonic-gate 474*7c478bd9Sstevel@tonic-gate if (bp->b_flags & B_DELWRI) { 475*7c478bd9Sstevel@tonic-gate /* 476*7c478bd9Sstevel@tonic-gate * This buffer goes on the delayed write 
buffer list 477*7c478bd9Sstevel@tonic-gate */ 478*7c478bd9Sstevel@tonic-gate dp = (struct buf *)&dwbuf[index]; 479*7c478bd9Sstevel@tonic-gate } 480*7c478bd9Sstevel@tonic-gate ASSERT(bp->b_bufsize > 0); 481*7c478bd9Sstevel@tonic-gate ASSERT(bp->b_bcount > 0); 482*7c478bd9Sstevel@tonic-gate ASSERT(bp->b_un.b_addr != NULL); 483*7c478bd9Sstevel@tonic-gate 484*7c478bd9Sstevel@tonic-gate if (bp->b_flags & B_AGE) { 485*7c478bd9Sstevel@tonic-gate backp = &dp->av_forw; 486*7c478bd9Sstevel@tonic-gate (*backp)->av_back = bp; 487*7c478bd9Sstevel@tonic-gate bp->av_forw = *backp; 488*7c478bd9Sstevel@tonic-gate *backp = bp; 489*7c478bd9Sstevel@tonic-gate bp->av_back = dp; 490*7c478bd9Sstevel@tonic-gate } else { 491*7c478bd9Sstevel@tonic-gate backp = &dp->av_back; 492*7c478bd9Sstevel@tonic-gate (*backp)->av_forw = bp; 493*7c478bd9Sstevel@tonic-gate bp->av_back = *backp; 494*7c478bd9Sstevel@tonic-gate *backp = bp; 495*7c478bd9Sstevel@tonic-gate bp->av_forw = dp; 496*7c478bd9Sstevel@tonic-gate } 497*7c478bd9Sstevel@tonic-gate mutex_exit(hmp); 498*7c478bd9Sstevel@tonic-gate 499*7c478bd9Sstevel@tonic-gate if (bfreelist.b_flags & B_WANTED) { 500*7c478bd9Sstevel@tonic-gate /* 501*7c478bd9Sstevel@tonic-gate * Should come here very very rarely. 502*7c478bd9Sstevel@tonic-gate */ 503*7c478bd9Sstevel@tonic-gate mutex_enter(&bfree_lock); 504*7c478bd9Sstevel@tonic-gate if (bfreelist.b_flags & B_WANTED) { 505*7c478bd9Sstevel@tonic-gate bfreelist.b_flags &= ~B_WANTED; 506*7c478bd9Sstevel@tonic-gate cv_broadcast(&bio_mem_cv); 507*7c478bd9Sstevel@tonic-gate } 508*7c478bd9Sstevel@tonic-gate mutex_exit(&bfree_lock); 509*7c478bd9Sstevel@tonic-gate } 510*7c478bd9Sstevel@tonic-gate 511*7c478bd9Sstevel@tonic-gate bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC); 512*7c478bd9Sstevel@tonic-gate /* 513*7c478bd9Sstevel@tonic-gate * Don't let anyone get the buffer off the freelist before we 514*7c478bd9Sstevel@tonic-gate * release our hold on it. 
515*7c478bd9Sstevel@tonic-gate */ 516*7c478bd9Sstevel@tonic-gate sema_v(&bp->b_sem); 517*7c478bd9Sstevel@tonic-gate } 518*7c478bd9Sstevel@tonic-gate 519*7c478bd9Sstevel@tonic-gate /* 520*7c478bd9Sstevel@tonic-gate * Return a count of the number of B_BUSY buffers in the system 521*7c478bd9Sstevel@tonic-gate * Can only be used as a good estimate. If 'cleanit' is set, 522*7c478bd9Sstevel@tonic-gate * try to flush all bufs. 523*7c478bd9Sstevel@tonic-gate */ 524*7c478bd9Sstevel@tonic-gate int 525*7c478bd9Sstevel@tonic-gate bio_busy(int cleanit) 526*7c478bd9Sstevel@tonic-gate { 527*7c478bd9Sstevel@tonic-gate struct buf *bp, *dp; 528*7c478bd9Sstevel@tonic-gate int busy = 0; 529*7c478bd9Sstevel@tonic-gate int i; 530*7c478bd9Sstevel@tonic-gate kmutex_t *hmp; 531*7c478bd9Sstevel@tonic-gate 532*7c478bd9Sstevel@tonic-gate for (i = 0; i < v.v_hbuf; i++) { 533*7c478bd9Sstevel@tonic-gate vfs_syncprogress(); 534*7c478bd9Sstevel@tonic-gate dp = (struct buf *)&hbuf[i]; 535*7c478bd9Sstevel@tonic-gate hmp = &hbuf[i].b_lock; 536*7c478bd9Sstevel@tonic-gate 537*7c478bd9Sstevel@tonic-gate mutex_enter(hmp); 538*7c478bd9Sstevel@tonic-gate for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) { 539*7c478bd9Sstevel@tonic-gate if (bp->b_flags & B_BUSY) 540*7c478bd9Sstevel@tonic-gate busy++; 541*7c478bd9Sstevel@tonic-gate } 542*7c478bd9Sstevel@tonic-gate mutex_exit(hmp); 543*7c478bd9Sstevel@tonic-gate } 544*7c478bd9Sstevel@tonic-gate 545*7c478bd9Sstevel@tonic-gate if (cleanit && busy != 0) { 546*7c478bd9Sstevel@tonic-gate bflush(NODEV); 547*7c478bd9Sstevel@tonic-gate } 548*7c478bd9Sstevel@tonic-gate 549*7c478bd9Sstevel@tonic-gate return (busy); 550*7c478bd9Sstevel@tonic-gate } 551*7c478bd9Sstevel@tonic-gate 552*7c478bd9Sstevel@tonic-gate /* 553*7c478bd9Sstevel@tonic-gate * this interface is provided for binary compatibility. 554*7c478bd9Sstevel@tonic-gate * 555*7c478bd9Sstevel@tonic-gate * Assign a buffer for the given block. 
If the appropriate 556*7c478bd9Sstevel@tonic-gate * block is already associated, return it; otherwise search 557*7c478bd9Sstevel@tonic-gate * for the oldest non-busy buffer and reassign it. 558*7c478bd9Sstevel@tonic-gate */ 559*7c478bd9Sstevel@tonic-gate struct buf * 560*7c478bd9Sstevel@tonic-gate getblk(dev_t dev, daddr_t blkno, long bsize) 561*7c478bd9Sstevel@tonic-gate { 562*7c478bd9Sstevel@tonic-gate return (getblk_common(/* ufsvfsp */ NULL, dev, 563*7c478bd9Sstevel@tonic-gate blkno, bsize, /* errflg */ 0)); 564*7c478bd9Sstevel@tonic-gate } 565*7c478bd9Sstevel@tonic-gate 566*7c478bd9Sstevel@tonic-gate /* 567*7c478bd9Sstevel@tonic-gate * Assign a buffer for the given block. If the appropriate 568*7c478bd9Sstevel@tonic-gate * block is already associated, return it; otherwise search 569*7c478bd9Sstevel@tonic-gate * for the oldest non-busy buffer and reassign it. 570*7c478bd9Sstevel@tonic-gate */ 571*7c478bd9Sstevel@tonic-gate struct buf * 572*7c478bd9Sstevel@tonic-gate getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, int errflg) 573*7c478bd9Sstevel@tonic-gate { 574*7c478bd9Sstevel@tonic-gate ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg; 575*7c478bd9Sstevel@tonic-gate struct buf *bp; 576*7c478bd9Sstevel@tonic-gate struct buf *dp; 577*7c478bd9Sstevel@tonic-gate struct buf *nbp = NULL; 578*7c478bd9Sstevel@tonic-gate struct buf *errbp; 579*7c478bd9Sstevel@tonic-gate uint_t index; 580*7c478bd9Sstevel@tonic-gate kmutex_t *hmp; 581*7c478bd9Sstevel@tonic-gate struct hbuf *hp; 582*7c478bd9Sstevel@tonic-gate 583*7c478bd9Sstevel@tonic-gate if (getmajor(dev) >= devcnt) 584*7c478bd9Sstevel@tonic-gate cmn_err(CE_PANIC, "blkdev"); 585*7c478bd9Sstevel@tonic-gate 586*7c478bd9Sstevel@tonic-gate biostats.bio_lookup.value.ui32++; 587*7c478bd9Sstevel@tonic-gate 588*7c478bd9Sstevel@tonic-gate index = bio_bhash(dev, blkno); 589*7c478bd9Sstevel@tonic-gate hp = &hbuf[index]; 590*7c478bd9Sstevel@tonic-gate dp = (struct buf *)hp; 591*7c478bd9Sstevel@tonic-gate hmp = &hp->b_lock; 
592*7c478bd9Sstevel@tonic-gate 593*7c478bd9Sstevel@tonic-gate mutex_enter(hmp); 594*7c478bd9Sstevel@tonic-gate loop: 595*7c478bd9Sstevel@tonic-gate for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) { 596*7c478bd9Sstevel@tonic-gate if (bp->b_blkno != blkno || bp->b_edev != dev || 597*7c478bd9Sstevel@tonic-gate (bp->b_flags & B_STALE)) 598*7c478bd9Sstevel@tonic-gate continue; 599*7c478bd9Sstevel@tonic-gate /* 600*7c478bd9Sstevel@tonic-gate * Avoid holding the hash lock in the event that 601*7c478bd9Sstevel@tonic-gate * the buffer is locked by someone. Since the hash chain 602*7c478bd9Sstevel@tonic-gate * may change when we drop the hash lock 603*7c478bd9Sstevel@tonic-gate * we have to start at the beginning of the chain if the 604*7c478bd9Sstevel@tonic-gate * buffer identity/contents aren't valid. 605*7c478bd9Sstevel@tonic-gate */ 606*7c478bd9Sstevel@tonic-gate if (!sema_tryp(&bp->b_sem)) { 607*7c478bd9Sstevel@tonic-gate biostats.bio_bufbusy.value.ui32++; 608*7c478bd9Sstevel@tonic-gate mutex_exit(hmp); 609*7c478bd9Sstevel@tonic-gate /* 610*7c478bd9Sstevel@tonic-gate * OK, we are dealing with a busy buffer. 611*7c478bd9Sstevel@tonic-gate * In the case that we are panicking and we 612*7c478bd9Sstevel@tonic-gate * got called from bread(), we have some chance 613*7c478bd9Sstevel@tonic-gate * for error recovery. So better bail out from 614*7c478bd9Sstevel@tonic-gate * here since sema_p() won't block. If we got 615*7c478bd9Sstevel@tonic-gate * called directly from ufs routines, there is 616*7c478bd9Sstevel@tonic-gate * no way to report an error yet. 617*7c478bd9Sstevel@tonic-gate */ 618*7c478bd9Sstevel@tonic-gate if (panicstr && errflg) 619*7c478bd9Sstevel@tonic-gate goto errout; 620*7c478bd9Sstevel@tonic-gate /* 621*7c478bd9Sstevel@tonic-gate * For the following line of code to work 622*7c478bd9Sstevel@tonic-gate * correctly never kmem_free the buffer "header". 
623*7c478bd9Sstevel@tonic-gate */ 624*7c478bd9Sstevel@tonic-gate sema_p(&bp->b_sem); 625*7c478bd9Sstevel@tonic-gate if (bp->b_blkno != blkno || bp->b_edev != dev || 626*7c478bd9Sstevel@tonic-gate (bp->b_flags & B_STALE)) { 627*7c478bd9Sstevel@tonic-gate sema_v(&bp->b_sem); 628*7c478bd9Sstevel@tonic-gate mutex_enter(hmp); 629*7c478bd9Sstevel@tonic-gate goto loop; /* start over */ 630*7c478bd9Sstevel@tonic-gate } 631*7c478bd9Sstevel@tonic-gate mutex_enter(hmp); 632*7c478bd9Sstevel@tonic-gate } 633*7c478bd9Sstevel@tonic-gate /* Found */ 634*7c478bd9Sstevel@tonic-gate biostats.bio_hit.value.ui32++; 635*7c478bd9Sstevel@tonic-gate bp->b_flags &= ~B_AGE; 636*7c478bd9Sstevel@tonic-gate 637*7c478bd9Sstevel@tonic-gate /* 638*7c478bd9Sstevel@tonic-gate * Yank it off the free/delayed write lists 639*7c478bd9Sstevel@tonic-gate */ 640*7c478bd9Sstevel@tonic-gate hp->b_length--; 641*7c478bd9Sstevel@tonic-gate notavail(bp); 642*7c478bd9Sstevel@tonic-gate mutex_exit(hmp); 643*7c478bd9Sstevel@tonic-gate 644*7c478bd9Sstevel@tonic-gate ASSERT((bp->b_flags & B_NOCACHE) == NULL); 645*7c478bd9Sstevel@tonic-gate 646*7c478bd9Sstevel@tonic-gate if (nbp == NULL) { 647*7c478bd9Sstevel@tonic-gate /* 648*7c478bd9Sstevel@tonic-gate * Make the common path short. 649*7c478bd9Sstevel@tonic-gate */ 650*7c478bd9Sstevel@tonic-gate ASSERT(SEMA_HELD(&bp->b_sem)); 651*7c478bd9Sstevel@tonic-gate return (bp); 652*7c478bd9Sstevel@tonic-gate } 653*7c478bd9Sstevel@tonic-gate 654*7c478bd9Sstevel@tonic-gate biostats.bio_bufdup.value.ui32++; 655*7c478bd9Sstevel@tonic-gate 656*7c478bd9Sstevel@tonic-gate /* 657*7c478bd9Sstevel@tonic-gate * The buffer must have entered during the lock upgrade 658*7c478bd9Sstevel@tonic-gate * so free the new buffer we allocated and return the 659*7c478bd9Sstevel@tonic-gate * found buffer. 
660*7c478bd9Sstevel@tonic-gate */ 661*7c478bd9Sstevel@tonic-gate kmem_free(nbp->b_un.b_addr, nbp->b_bufsize); 662*7c478bd9Sstevel@tonic-gate nbp->b_un.b_addr = NULL; 663*7c478bd9Sstevel@tonic-gate 664*7c478bd9Sstevel@tonic-gate /* 665*7c478bd9Sstevel@tonic-gate * Account for the memory 666*7c478bd9Sstevel@tonic-gate */ 667*7c478bd9Sstevel@tonic-gate mutex_enter(&bfree_lock); 668*7c478bd9Sstevel@tonic-gate bfreelist.b_bufsize += nbp->b_bufsize; 669*7c478bd9Sstevel@tonic-gate mutex_exit(&bfree_lock); 670*7c478bd9Sstevel@tonic-gate 671*7c478bd9Sstevel@tonic-gate /* 672*7c478bd9Sstevel@tonic-gate * Destroy buf identity, and place on avail list 673*7c478bd9Sstevel@tonic-gate */ 674*7c478bd9Sstevel@tonic-gate nbp->b_dev = (o_dev_t)NODEV; 675*7c478bd9Sstevel@tonic-gate nbp->b_edev = NODEV; 676*7c478bd9Sstevel@tonic-gate nbp->b_flags = 0; 677*7c478bd9Sstevel@tonic-gate nbp->b_file = NULL; 678*7c478bd9Sstevel@tonic-gate nbp->b_offset = -1; 679*7c478bd9Sstevel@tonic-gate 680*7c478bd9Sstevel@tonic-gate sema_v(&nbp->b_sem); 681*7c478bd9Sstevel@tonic-gate bio_bhdr_free(nbp); 682*7c478bd9Sstevel@tonic-gate 683*7c478bd9Sstevel@tonic-gate ASSERT(SEMA_HELD(&bp->b_sem)); 684*7c478bd9Sstevel@tonic-gate return (bp); 685*7c478bd9Sstevel@tonic-gate } 686*7c478bd9Sstevel@tonic-gate 687*7c478bd9Sstevel@tonic-gate /* 688*7c478bd9Sstevel@tonic-gate * bio_getfreeblk may block so check the hash chain again. 689*7c478bd9Sstevel@tonic-gate */ 690*7c478bd9Sstevel@tonic-gate if (nbp == NULL) { 691*7c478bd9Sstevel@tonic-gate mutex_exit(hmp); 692*7c478bd9Sstevel@tonic-gate nbp = bio_getfreeblk(bsize); 693*7c478bd9Sstevel@tonic-gate mutex_enter(hmp); 694*7c478bd9Sstevel@tonic-gate goto loop; 695*7c478bd9Sstevel@tonic-gate } 696*7c478bd9Sstevel@tonic-gate 697*7c478bd9Sstevel@tonic-gate /* 698*7c478bd9Sstevel@tonic-gate * New buffer. Assign nbp and stick it on the hash. 
699*7c478bd9Sstevel@tonic-gate */ 700*7c478bd9Sstevel@tonic-gate nbp->b_flags = B_BUSY; 701*7c478bd9Sstevel@tonic-gate nbp->b_edev = dev; 702*7c478bd9Sstevel@tonic-gate nbp->b_dev = (o_dev_t)cmpdev(dev); 703*7c478bd9Sstevel@tonic-gate nbp->b_blkno = blkno; 704*7c478bd9Sstevel@tonic-gate nbp->b_iodone = NULL; 705*7c478bd9Sstevel@tonic-gate nbp->b_bcount = bsize; 706*7c478bd9Sstevel@tonic-gate /* 707*7c478bd9Sstevel@tonic-gate * If we are given a ufsvfsp and the vfs_root field is NULL 708*7c478bd9Sstevel@tonic-gate * then this must be I/O for a superblock. A superblock's 709*7c478bd9Sstevel@tonic-gate * buffer is set up in mountfs() and there is no root vnode 710*7c478bd9Sstevel@tonic-gate * at that point. 711*7c478bd9Sstevel@tonic-gate */ 712*7c478bd9Sstevel@tonic-gate if (ufsvfsp && ufsvfsp->vfs_root) { 713*7c478bd9Sstevel@tonic-gate nbp->b_vp = ufsvfsp->vfs_root; 714*7c478bd9Sstevel@tonic-gate } else { 715*7c478bd9Sstevel@tonic-gate nbp->b_vp = NULL; 716*7c478bd9Sstevel@tonic-gate } 717*7c478bd9Sstevel@tonic-gate 718*7c478bd9Sstevel@tonic-gate ASSERT((nbp->b_flags & B_NOCACHE) == NULL); 719*7c478bd9Sstevel@tonic-gate 720*7c478bd9Sstevel@tonic-gate binshash(nbp, dp); 721*7c478bd9Sstevel@tonic-gate mutex_exit(hmp); 722*7c478bd9Sstevel@tonic-gate 723*7c478bd9Sstevel@tonic-gate ASSERT(SEMA_HELD(&nbp->b_sem)); 724*7c478bd9Sstevel@tonic-gate 725*7c478bd9Sstevel@tonic-gate return (nbp); 726*7c478bd9Sstevel@tonic-gate 727*7c478bd9Sstevel@tonic-gate 728*7c478bd9Sstevel@tonic-gate /* 729*7c478bd9Sstevel@tonic-gate * Come here in case of an internal error. At this point we couldn't 730*7c478bd9Sstevel@tonic-gate * get a buffer, but he have to return one. Hence we allocate some 731*7c478bd9Sstevel@tonic-gate * kind of error reply buffer on the fly. This buffer is marked as 732*7c478bd9Sstevel@tonic-gate * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following: 733*7c478bd9Sstevel@tonic-gate * - B_ERROR will indicate error to the caller. 
734*7c478bd9Sstevel@tonic-gate * - B_DONE will prevent us from reading the buffer from 735*7c478bd9Sstevel@tonic-gate * the device. 736*7c478bd9Sstevel@tonic-gate * - B_NOCACHE will cause that this buffer gets free'd in 737*7c478bd9Sstevel@tonic-gate * brelse(). 738*7c478bd9Sstevel@tonic-gate */ 739*7c478bd9Sstevel@tonic-gate 740*7c478bd9Sstevel@tonic-gate errout: 741*7c478bd9Sstevel@tonic-gate errbp = geteblk(); 742*7c478bd9Sstevel@tonic-gate sema_p(&errbp->b_sem); 743*7c478bd9Sstevel@tonic-gate errbp->b_flags &= ~B_BUSY; 744*7c478bd9Sstevel@tonic-gate errbp->b_flags |= (B_ERROR | B_DONE); 745*7c478bd9Sstevel@tonic-gate return (errbp); 746*7c478bd9Sstevel@tonic-gate } 747*7c478bd9Sstevel@tonic-gate 748*7c478bd9Sstevel@tonic-gate /* 749*7c478bd9Sstevel@tonic-gate * Get an empty block, not assigned to any particular device. 750*7c478bd9Sstevel@tonic-gate * Returns a locked buffer that is not on any hash or free list. 751*7c478bd9Sstevel@tonic-gate */ 752*7c478bd9Sstevel@tonic-gate struct buf * 753*7c478bd9Sstevel@tonic-gate ngeteblk(long bsize) 754*7c478bd9Sstevel@tonic-gate { 755*7c478bd9Sstevel@tonic-gate struct buf *bp; 756*7c478bd9Sstevel@tonic-gate 757*7c478bd9Sstevel@tonic-gate bp = kmem_alloc(sizeof (struct buf), KM_SLEEP); 758*7c478bd9Sstevel@tonic-gate bioinit(bp); 759*7c478bd9Sstevel@tonic-gate bp->av_forw = bp->av_back = NULL; 760*7c478bd9Sstevel@tonic-gate bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP); 761*7c478bd9Sstevel@tonic-gate bp->b_bufsize = bsize; 762*7c478bd9Sstevel@tonic-gate bp->b_flags = B_BUSY | B_NOCACHE | B_AGE; 763*7c478bd9Sstevel@tonic-gate bp->b_dev = (o_dev_t)NODEV; 764*7c478bd9Sstevel@tonic-gate bp->b_edev = NODEV; 765*7c478bd9Sstevel@tonic-gate bp->b_lblkno = 0; 766*7c478bd9Sstevel@tonic-gate bp->b_bcount = bsize; 767*7c478bd9Sstevel@tonic-gate bp->b_iodone = NULL; 768*7c478bd9Sstevel@tonic-gate return (bp); 769*7c478bd9Sstevel@tonic-gate } 770*7c478bd9Sstevel@tonic-gate 771*7c478bd9Sstevel@tonic-gate /* 
 * Interface of geteblk() is kept intact to maintain driver compatibility.
 * Use ngeteblk() to allocate block size other than 1 KB.
 */
struct buf *
geteblk(void)
{
	/* Historical fixed 1 KB empty buffer; see ngeteblk() for other sizes. */
	return (ngeteblk((long)1024));
}

/*
 * Return a buffer w/o sleeping
 *
 * Looks up the cached buffer for (dev, blkno); returns it with b_sem
 * held only if every lock can be taken without blocking and the buffer
 * contents are valid (B_DONE).  Returns NULL otherwise.
 */
struct buf *
trygetblk(dev_t dev, daddr_t blkno)
{
	struct buf *bp;
	struct buf *dp;
	struct hbuf *hp;
	kmutex_t *hmp;
	uint_t index;

	index = bio_bhash(dev, blkno);
	hp = &hbuf[index];
	hmp = &hp->b_lock;

	/* never block, even on the hash-chain lock */
	if (!mutex_tryenter(hmp))
		return (NULL);

	dp = (struct buf *)hp;
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		/*
		 * Get access to a valid buffer without sleeping
		 */
		if (sema_tryp(&bp->b_sem)) {
			if (bp->b_flags & B_DONE) {
				/* pull it off the free list before returning */
				hp->b_length--;
				notavail(bp);
				mutex_exit(hmp);
				return (bp);
			} else {
				/* found but contents not valid: give up */
				sema_v(&bp->b_sem);
				break;
			}
		}
		/* at most one matching buffer per chain; stop after it */
		break;
	}
	mutex_exit(hmp);
	return (NULL);
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.  Caller must hold b_sem.
 */
int
iowait(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	return (biowait(bp));
}

/*
 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
 * and wake up anyone waiting for it.  Caller must hold b_sem.
 */
void
iodone(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	(void) biodone(bp);
}

/*
 * Zero the core associated with a buffer.
 */
void
clrbuf(struct buf *bp)
{
	/* Caller must hold b_sem; zeroes b_bcount bytes of data. */
	ASSERT(SEMA_HELD(&bp->b_sem));
	bzero(bp->b_un.b_addr, bp->b_bcount);
	bp->b_resid = 0;
}


/*
 * Make sure all write-behind blocks on dev (or NODEV for all)
 * are flushed out.
 */
void
bflush(dev_t dev)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	struct buf *delwri_list = EMPTY_LIST;
	int i, index;
	kmutex_t *hmp;

	mutex_enter(&blist_lock);
	/*
	 * Wait for any invalidates or flushes ahead of us to finish.
	 * We really could split blist_lock up per device for better
	 * parallelism here.
	 */
	while (bio_doinginval || bio_doingflush) {
		bio_flinv_cv_wanted = 1;
		cv_wait(&bio_flushinval_cv, &blist_lock);
	}
	bio_doingflush++;
	/*
	 * Gather all B_DELWRI buffer for device.
	 * Lock ordering is b_sem > hash lock (brelse).
	 * Since we are finding the buffer via the delayed write list,
	 * it may be busy and we would block trying to get the
	 * b_sem lock while holding hash lock. So transfer all the
	 * candidates on the delwri_list and then drop the hash locks.
	 */
	for (i = 0; i < v.v_hbuf; i++) {
		vfs_syncprogress();
		hmp = &hbuf[i].b_lock;
		dp = (struct buf *)&dwbuf[i];
		mutex_enter(hmp);
		for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
			if (dev == NODEV || bp->b_edev == dev) {
				/*
				 * b_list threads the private candidate
				 * list; b_list == NULL means the buffer
				 * is not yet on anyone's list.
				 */
				if (bp->b_list == NULL) {
					bp->b_list = delwri_list;
					delwri_list = bp;
				}
			}
		}
		mutex_exit(hmp);
	}
	mutex_exit(&blist_lock);

	/*
	 * Now that the hash locks have been dropped grab the semaphores
	 * and write back all the buffers that have B_DELWRI set.
	 */
	while (delwri_list != EMPTY_LIST) {
		vfs_syncprogress();
		bp = delwri_list;

		sema_p(&bp->b_sem);	/* may block */
		if ((dev != bp->b_edev && dev != NODEV) ||
		    (panicstr && bp->b_flags & B_BUSY)) {
			sema_v(&bp->b_sem);
			delwri_list = bp->b_list;
			bp->b_list = NULL;
			continue;	/* No longer a candidate */
		}
		if (bp->b_flags & B_DELWRI) {
			index = bio_bhash(bp->b_edev, bp->b_blkno);
			hp = &hbuf[index];
			hmp = &hp->b_lock;
			dp = (struct buf *)hp;

			/* write it back asynchronously */
			bp->b_flags |= B_ASYNC;
			mutex_enter(hmp);
			hp->b_length--;
			notavail(bp);
			mutex_exit(hmp);
			if (bp->b_vp == NULL) {		/* !ufs */
				BWRITE(bp);
			} else {			/* ufs */
				UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
			}
		} else {
			/* flushed by someone else while we waited */
			sema_v(&bp->b_sem);
		}
		delwri_list = bp->b_list;
		bp->b_list = NULL;
	}
	mutex_enter(&blist_lock);
	bio_doingflush--;
	if (bio_flinv_cv_wanted) {
		bio_flinv_cv_wanted = 0;
		cv_broadcast(&bio_flushinval_cv);
	}
	mutex_exit(&blist_lock);
}

/*
 * Ensure that a specified block is up-to-date on disk.
 * If the cached buffer for (dev, blkno) is delayed-write,
 * write it out synchronously; otherwise do nothing.
 */
void
blkflush(dev_t dev, daddr_t blkno)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	struct buf *sbp = NULL;
	uint_t index;
	kmutex_t *hmp;

	index = bio_bhash(dev, blkno);
	hp = &hbuf[index];
	dp = (struct buf *)hp;
	hmp = &hp->b_lock;

	/*
	 * Identify the buffer in the cache belonging to
	 * this device and blkno (if any).
	 */
	mutex_enter(hmp);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		sbp = bp;
		break;
	}
	mutex_exit(hmp);
	if (sbp == NULL)
		return;
	/*
	 * Now check the buffer we have identified and
	 * make sure it still belongs to the device and is B_DELWRI
	 * (it may have been recycled while the hash lock was dropped).
	 */
	sema_p(&sbp->b_sem);
	if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
	    (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
		mutex_enter(hmp);
		hp->b_length--;
		notavail(sbp);
		mutex_exit(hmp);
		/*
		 * XXX - There is nothing to guarantee a synchronous
		 * write here if the B_ASYNC flag is set.  This needs
		 * some investigation.
		 */
		if (sbp->b_vp == NULL) {		/* !ufs */
			BWRITE(sbp);	/* synchronous write */
		} else {			/* ufs */
			UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
		}
	} else {
		sema_v(&sbp->b_sem);
	}
}

/*
 * Same as binval, except can force-invalidate delayed-write buffers
 * (which are not be already flushed because of device errors).  Also
 * makes sure that the retry write flag is cleared.
 * Returns 0 on success, EIO if a delayed-write buffer could not be
 * invalidated (only possible when force is not set).
 */
int
bfinval(dev_t dev, int force)
{
	struct buf *dp;
	struct buf *bp;
	struct buf *binval_list = EMPTY_LIST;
	int i, error = 0;
	kmutex_t *hmp;
	uint_t index;
	struct buf **backp;

	mutex_enter(&blist_lock);
	/*
	 * Wait for any flushes ahead of us to finish, it's ok to
	 * do invalidates in parallel.
	 */
	while (bio_doingflush) {
		bio_flinv_cv_wanted = 1;
		cv_wait(&bio_flushinval_cv, &blist_lock);
	}
	bio_doinginval++;

	/* Gather bp's */
	for (i = 0; i < v.v_hbuf; i++) {
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			if (bp->b_edev == dev) {
				/* thread candidates through b_list */
				if (bp->b_list == NULL) {
					bp->b_list = binval_list;
					binval_list = bp;
				}
			}
		}
		mutex_exit(hmp);
	}
	mutex_exit(&blist_lock);

	/* Invalidate all bp's found */
	while (binval_list != EMPTY_LIST) {
		bp = binval_list;

		sema_p(&bp->b_sem);
		if (bp->b_edev == dev) {
			if (force && (bp->b_flags & B_DELWRI)) {
				/* clear B_DELWRI, move to non-dw freelist */
				index = bio_bhash(bp->b_edev, bp->b_blkno);
				hmp = &hbuf[index].b_lock;
				dp = (struct buf *)&hbuf[index];
				mutex_enter(hmp);

				/* remove from delayed write freelist */
				notavail(bp);

				/* add to B_AGE side of non-dw freelist */
				backp = &dp->av_forw;
				(*backp)->av_back = bp;
				bp->av_forw = *backp;
				*backp = bp;
				bp->av_back = dp;

				/*
				 * make sure write retries and busy are cleared
				 */
				bp->b_flags &=
				    ~(B_BUSY | B_DELWRI | B_RETRYWRI);
				mutex_exit(hmp);
			}
			if ((bp->b_flags & B_DELWRI) == 0)
				bp->b_flags |= B_STALE|B_AGE;
			else
				error = EIO;
		}
		sema_v(&bp->b_sem);
		binval_list = bp->b_list;
		bp->b_list = NULL;
	}
	mutex_enter(&blist_lock);
	bio_doinginval--;
	if (bio_flinv_cv_wanted) {
		cv_broadcast(&bio_flushinval_cv);
		bio_flinv_cv_wanted = 0;
	}
	mutex_exit(&blist_lock);
	return (error);
}

/*
 * If possible, invalidate blocks for a dev on demand
 */
void
binval(dev_t dev)
{
	/* non-forcing invalidate; delayed-write buffers are left alone */
	(void) bfinval(dev, 0);
}

/*
 * Initialize the buffer I/O system by freeing
 * all buffers and setting all device hash buffer lists to empty.
 */
void
binit(void)
{
	struct buf *bp;
	unsigned int i, pct;
	ulong_t	bio_max_hwm, bio_default_hwm;

	/*
	 * Maximum/Default values for bufhwm are set to the smallest of:
	 *	- BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
	 *	- 1/4 of kernel virtual memory
	 *	- INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
	 * Additionally, in order to allow simple tuning by percentage of
	 * physical memory, bufhwm_pct is used to calculate the default if
	 * the value of this tunable is between 0 and BIO_MAX_PERCENT.
	 *
	 * Since the unit for v.v_bufhwm is kilobytes, this allows for
	 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
	 */
	bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
	bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);

	/*
	 * NOTE(review): pct here is a divisor (physmem / pct), i.e. the
	 * reciprocal of a percentage; pct < BIO_MAX_PERCENT therefore
	 * means bufhwm_pct is *above* the allowed maximum percentage.
	 */
	pct = BIO_BUF_PERCENT;
	if (bufhwm_pct != 0 &&
	    ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
		pct = BIO_BUF_PERCENT;
		/*
		 * Invalid user specified value, emit a warning.
		 */
		cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of \
range(1..%d). Using %d as default.",
		    bufhwm_pct,
		    100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
	}

	bio_default_hwm = MIN(physmem / pct,
	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
	bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);

	/* bufhwm is the /etc/system tunable; 0 means "use the default" */
	if ((v.v_bufhwm = bufhwm) == 0)
		v.v_bufhwm = bio_default_hwm;

	if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
		v.v_bufhwm = (int)bio_max_hwm;
		/*
		 * Invalid user specified value, emit a warning.
		 */
		cmn_err(CE_WARN,
		    "binit: bufhwm(%d) out \
of range(%d..%lu). Using %lu as default",
		    bufhwm,
		    BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
	}

	/*
	 * Determine the number of hash buckets.  Default is to
	 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
	 * Round up number to the next power of 2.
	 */
	v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
	    BIO_HASHLEN);
	v.v_hmask = v.v_hbuf - 1;
	v.v_buf = BIO_BHDR_POOL;

	hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);

	dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);

	bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;
	bp = &bfreelist;
	/* bfreelist is its own circular-list head on all four links */
	bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;

	for (i = 0; i < v.v_hbuf; i++) {
		/* each hash bucket starts as an empty circular list */
		hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
		hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];

		/*
		 * Initialize the delayed write buffer list.
		 */
		dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
		dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
	}
}

/*
 * Wait for I/O completion on the buffer; return error code.
 * If bp was for synchronous I/O, bp is invalid and associated
 * resources are freed on return.
 */
int
biowait(struct buf *bp)
{
	int error = 0;
	struct cpu *cpup;

	ASSERT(SEMA_HELD(&bp->b_sem));

	/* account this thread as waiting on I/O for the current CPU */
	cpup = CPU;
	atomic_add_64(&cpup->cpu_stats.sys.iowait, 1);
	DTRACE_IO1(wait__start, struct buf *, bp);

	/*
	 * In case of panic, busy wait for completion
	 */
	if (panicstr) {
		while ((bp->b_flags & B_DONE) == 0)
			drv_usecwait(10);
	} else
		sema_p(&bp->b_io);

	DTRACE_IO1(wait__done, struct buf *, bp);
	atomic_add_64(&cpup->cpu_stats.sys.iowait, -1);

	error = geterror(bp);
	if ((bp->b_flags & B_ASYNC) == 0) {
if (bp->b_flags & B_REMAPPED) 1237*7c478bd9Sstevel@tonic-gate bp_mapout(bp); 1238*7c478bd9Sstevel@tonic-gate } 1239*7c478bd9Sstevel@tonic-gate return (error); 1240*7c478bd9Sstevel@tonic-gate } 1241*7c478bd9Sstevel@tonic-gate 1242*7c478bd9Sstevel@tonic-gate static void 1243*7c478bd9Sstevel@tonic-gate biodone_tnf_probe(struct buf *bp) 1244*7c478bd9Sstevel@tonic-gate { 1245*7c478bd9Sstevel@tonic-gate /* Kernel probe */ 1246*7c478bd9Sstevel@tonic-gate TNF_PROBE_3(biodone, "io blockio", /* CSTYLED */, 1247*7c478bd9Sstevel@tonic-gate tnf_device, device, bp->b_edev, 1248*7c478bd9Sstevel@tonic-gate tnf_diskaddr, block, bp->b_lblkno, 1249*7c478bd9Sstevel@tonic-gate tnf_opaque, buf, bp); 1250*7c478bd9Sstevel@tonic-gate } 1251*7c478bd9Sstevel@tonic-gate 1252*7c478bd9Sstevel@tonic-gate /* 1253*7c478bd9Sstevel@tonic-gate * Mark I/O complete on a buffer, release it if I/O is asynchronous, 1254*7c478bd9Sstevel@tonic-gate * and wake up anyone waiting for it. 1255*7c478bd9Sstevel@tonic-gate */ 1256*7c478bd9Sstevel@tonic-gate void 1257*7c478bd9Sstevel@tonic-gate biodone(struct buf *bp) 1258*7c478bd9Sstevel@tonic-gate { 1259*7c478bd9Sstevel@tonic-gate if (bp->b_flags & B_STARTED) { 1260*7c478bd9Sstevel@tonic-gate DTRACE_IO1(done, struct buf *, bp); 1261*7c478bd9Sstevel@tonic-gate bp->b_flags &= ~B_STARTED; 1262*7c478bd9Sstevel@tonic-gate } 1263*7c478bd9Sstevel@tonic-gate 1264*7c478bd9Sstevel@tonic-gate /* 1265*7c478bd9Sstevel@tonic-gate * Call the TNF probe here instead of the inline code 1266*7c478bd9Sstevel@tonic-gate * to force our compiler to use the tail call optimization. 
1267*7c478bd9Sstevel@tonic-gate */ 1268*7c478bd9Sstevel@tonic-gate biodone_tnf_probe(bp); 1269*7c478bd9Sstevel@tonic-gate 1270*7c478bd9Sstevel@tonic-gate if (bp->b_iodone != NULL) { 1271*7c478bd9Sstevel@tonic-gate (*(bp->b_iodone))(bp); 1272*7c478bd9Sstevel@tonic-gate return; 1273*7c478bd9Sstevel@tonic-gate } 1274*7c478bd9Sstevel@tonic-gate ASSERT((bp->b_flags & B_DONE) == 0); 1275*7c478bd9Sstevel@tonic-gate ASSERT(SEMA_HELD(&bp->b_sem)); 1276*7c478bd9Sstevel@tonic-gate bp->b_flags |= B_DONE; 1277*7c478bd9Sstevel@tonic-gate if (bp->b_flags & B_ASYNC) { 1278*7c478bd9Sstevel@tonic-gate if (bp->b_flags & (B_PAGEIO|B_REMAPPED)) 1279*7c478bd9Sstevel@tonic-gate bio_pageio_done(bp); 1280*7c478bd9Sstevel@tonic-gate else 1281*7c478bd9Sstevel@tonic-gate brelse(bp); /* release bp to freelist */ 1282*7c478bd9Sstevel@tonic-gate } else { 1283*7c478bd9Sstevel@tonic-gate sema_v(&bp->b_io); 1284*7c478bd9Sstevel@tonic-gate } 1285*7c478bd9Sstevel@tonic-gate } 1286*7c478bd9Sstevel@tonic-gate 1287*7c478bd9Sstevel@tonic-gate /* 1288*7c478bd9Sstevel@tonic-gate * Pick up the device's error number and pass it to the user; 1289*7c478bd9Sstevel@tonic-gate * if there is an error but the number is 0 set a generalized code. 1290*7c478bd9Sstevel@tonic-gate */ 1291*7c478bd9Sstevel@tonic-gate int 1292*7c478bd9Sstevel@tonic-gate geterror(struct buf *bp) 1293*7c478bd9Sstevel@tonic-gate { 1294*7c478bd9Sstevel@tonic-gate int error = 0; 1295*7c478bd9Sstevel@tonic-gate 1296*7c478bd9Sstevel@tonic-gate ASSERT(SEMA_HELD(&bp->b_sem)); 1297*7c478bd9Sstevel@tonic-gate if (bp->b_flags & B_ERROR) { 1298*7c478bd9Sstevel@tonic-gate error = bp->b_error; 1299*7c478bd9Sstevel@tonic-gate if (!error) 1300*7c478bd9Sstevel@tonic-gate error = EIO; 1301*7c478bd9Sstevel@tonic-gate } 1302*7c478bd9Sstevel@tonic-gate return (error); 1303*7c478bd9Sstevel@tonic-gate } 1304*7c478bd9Sstevel@tonic-gate 1305*7c478bd9Sstevel@tonic-gate /* 1306*7c478bd9Sstevel@tonic-gate * Support for pageio buffers. 
1307*7c478bd9Sstevel@tonic-gate * 1308*7c478bd9Sstevel@tonic-gate * This stuff should be generalized to provide a generalized bp 1309*7c478bd9Sstevel@tonic-gate * header facility that can be used for things other than pageio. 1310*7c478bd9Sstevel@tonic-gate */ 1311*7c478bd9Sstevel@tonic-gate 1312*7c478bd9Sstevel@tonic-gate /* 1313*7c478bd9Sstevel@tonic-gate * Allocate and initialize a buf struct for use with pageio. 1314*7c478bd9Sstevel@tonic-gate */ 1315*7c478bd9Sstevel@tonic-gate struct buf * 1316*7c478bd9Sstevel@tonic-gate pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags) 1317*7c478bd9Sstevel@tonic-gate { 1318*7c478bd9Sstevel@tonic-gate struct buf *bp; 1319*7c478bd9Sstevel@tonic-gate struct cpu *cpup; 1320*7c478bd9Sstevel@tonic-gate 1321*7c478bd9Sstevel@tonic-gate if (flags & B_READ) { 1322*7c478bd9Sstevel@tonic-gate CPU_STATS_ENTER_K(); 1323*7c478bd9Sstevel@tonic-gate cpup = CPU; /* get pointer AFTER preemption is disabled */ 1324*7c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, pgin, 1); 1325*7c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len)); 1326*7c478bd9Sstevel@tonic-gate if ((flags & B_ASYNC) == 0) { 1327*7c478bd9Sstevel@tonic-gate klwp_t *lwp = ttolwp(curthread); 1328*7c478bd9Sstevel@tonic-gate if (lwp != NULL) 1329*7c478bd9Sstevel@tonic-gate lwp->lwp_ru.majflt++; 1330*7c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, maj_fault, 1); 1331*7c478bd9Sstevel@tonic-gate /* Kernel probe */ 1332*7c478bd9Sstevel@tonic-gate TNF_PROBE_2(major_fault, "vm pagefault", /* CSTYLED */, 1333*7c478bd9Sstevel@tonic-gate tnf_opaque, vnode, pp->p_vnode, 1334*7c478bd9Sstevel@tonic-gate tnf_offset, offset, pp->p_offset); 1335*7c478bd9Sstevel@tonic-gate } 1336*7c478bd9Sstevel@tonic-gate /* 1337*7c478bd9Sstevel@tonic-gate * Update statistics for pages being paged in 1338*7c478bd9Sstevel@tonic-gate */ 1339*7c478bd9Sstevel@tonic-gate if (pp != NULL && pp->p_vnode != NULL) { 1340*7c478bd9Sstevel@tonic-gate if (IS_SWAPFSVP(pp->p_vnode)) { 
1341*7c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, anonpgin, 1342*7c478bd9Sstevel@tonic-gate btopr(len)); 1343*7c478bd9Sstevel@tonic-gate } else { 1344*7c478bd9Sstevel@tonic-gate if (pp->p_vnode->v_flag & VVMEXEC) { 1345*7c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, execpgin, 1346*7c478bd9Sstevel@tonic-gate btopr(len)); 1347*7c478bd9Sstevel@tonic-gate } else { 1348*7c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, fspgin, 1349*7c478bd9Sstevel@tonic-gate btopr(len)); 1350*7c478bd9Sstevel@tonic-gate } 1351*7c478bd9Sstevel@tonic-gate } 1352*7c478bd9Sstevel@tonic-gate } 1353*7c478bd9Sstevel@tonic-gate CPU_STATS_EXIT_K(); 1354*7c478bd9Sstevel@tonic-gate TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN, 1355*7c478bd9Sstevel@tonic-gate "page_ws_in:pp %p", pp); 1356*7c478bd9Sstevel@tonic-gate /* Kernel probe */ 1357*7c478bd9Sstevel@tonic-gate TNF_PROBE_3(pagein, "vm pageio io", /* CSTYLED */, 1358*7c478bd9Sstevel@tonic-gate tnf_opaque, vnode, pp->p_vnode, 1359*7c478bd9Sstevel@tonic-gate tnf_offset, offset, pp->p_offset, 1360*7c478bd9Sstevel@tonic-gate tnf_size, size, len); 1361*7c478bd9Sstevel@tonic-gate } 1362*7c478bd9Sstevel@tonic-gate 1363*7c478bd9Sstevel@tonic-gate bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP); 1364*7c478bd9Sstevel@tonic-gate bp->b_bcount = len; 1365*7c478bd9Sstevel@tonic-gate bp->b_bufsize = len; 1366*7c478bd9Sstevel@tonic-gate bp->b_pages = pp; 1367*7c478bd9Sstevel@tonic-gate bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags; 1368*7c478bd9Sstevel@tonic-gate bp->b_offset = -1; 1369*7c478bd9Sstevel@tonic-gate sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL); 1370*7c478bd9Sstevel@tonic-gate 1371*7c478bd9Sstevel@tonic-gate /* Initialize bp->b_sem in "locked" state */ 1372*7c478bd9Sstevel@tonic-gate sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL); 1373*7c478bd9Sstevel@tonic-gate 1374*7c478bd9Sstevel@tonic-gate VN_HOLD(vp); 1375*7c478bd9Sstevel@tonic-gate bp->b_vp = vp; 1376*7c478bd9Sstevel@tonic-gate THREAD_KPRI_RELEASE_N(btopr(len)); /* release 
kpri from page_locks */ 1377*7c478bd9Sstevel@tonic-gate 1378*7c478bd9Sstevel@tonic-gate /* 1379*7c478bd9Sstevel@tonic-gate * Caller sets dev & blkno and can adjust 1380*7c478bd9Sstevel@tonic-gate * b_addr for page offset and can use bp_mapin 1381*7c478bd9Sstevel@tonic-gate * to make pages kernel addressable. 1382*7c478bd9Sstevel@tonic-gate */ 1383*7c478bd9Sstevel@tonic-gate return (bp); 1384*7c478bd9Sstevel@tonic-gate } 1385*7c478bd9Sstevel@tonic-gate 1386*7c478bd9Sstevel@tonic-gate void 1387*7c478bd9Sstevel@tonic-gate pageio_done(struct buf *bp) 1388*7c478bd9Sstevel@tonic-gate { 1389*7c478bd9Sstevel@tonic-gate ASSERT(SEMA_HELD(&bp->b_sem)); 1390*7c478bd9Sstevel@tonic-gate if (bp->b_flags & B_REMAPPED) 1391*7c478bd9Sstevel@tonic-gate bp_mapout(bp); 1392*7c478bd9Sstevel@tonic-gate VN_RELE(bp->b_vp); 1393*7c478bd9Sstevel@tonic-gate bp->b_vp = NULL; 1394*7c478bd9Sstevel@tonic-gate ASSERT((bp->b_flags & B_NOCACHE) != 0); 1395*7c478bd9Sstevel@tonic-gate 1396*7c478bd9Sstevel@tonic-gate /* A sema_v(bp->b_sem) is implied if we are destroying it */ 1397*7c478bd9Sstevel@tonic-gate sema_destroy(&bp->b_sem); 1398*7c478bd9Sstevel@tonic-gate sema_destroy(&bp->b_io); 1399*7c478bd9Sstevel@tonic-gate kmem_free(bp, sizeof (struct buf)); 1400*7c478bd9Sstevel@tonic-gate } 1401*7c478bd9Sstevel@tonic-gate 1402*7c478bd9Sstevel@tonic-gate /* 1403*7c478bd9Sstevel@tonic-gate * Check to see whether the buffers, except the one pointed by sbp, 1404*7c478bd9Sstevel@tonic-gate * associated with the device are busy. 1405*7c478bd9Sstevel@tonic-gate * NOTE: This expensive operation shall be improved together with ufs_icheck(). 
1406*7c478bd9Sstevel@tonic-gate */ 1407*7c478bd9Sstevel@tonic-gate int 1408*7c478bd9Sstevel@tonic-gate bcheck(dev_t dev, struct buf *sbp) 1409*7c478bd9Sstevel@tonic-gate { 1410*7c478bd9Sstevel@tonic-gate struct buf *bp; 1411*7c478bd9Sstevel@tonic-gate struct buf *dp; 1412*7c478bd9Sstevel@tonic-gate int i; 1413*7c478bd9Sstevel@tonic-gate kmutex_t *hmp; 1414*7c478bd9Sstevel@tonic-gate 1415*7c478bd9Sstevel@tonic-gate /* 1416*7c478bd9Sstevel@tonic-gate * check for busy bufs for this filesystem 1417*7c478bd9Sstevel@tonic-gate */ 1418*7c478bd9Sstevel@tonic-gate for (i = 0; i < v.v_hbuf; i++) { 1419*7c478bd9Sstevel@tonic-gate dp = (struct buf *)&hbuf[i]; 1420*7c478bd9Sstevel@tonic-gate hmp = &hbuf[i].b_lock; 1421*7c478bd9Sstevel@tonic-gate 1422*7c478bd9Sstevel@tonic-gate mutex_enter(hmp); 1423*7c478bd9Sstevel@tonic-gate for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) { 1424*7c478bd9Sstevel@tonic-gate /* 1425*7c478bd9Sstevel@tonic-gate * if buf is busy or dirty, then filesystem is busy 1426*7c478bd9Sstevel@tonic-gate */ 1427*7c478bd9Sstevel@tonic-gate if ((bp->b_edev == dev) && 1428*7c478bd9Sstevel@tonic-gate ((bp->b_flags & B_STALE) == 0) && 1429*7c478bd9Sstevel@tonic-gate (bp->b_flags & (B_DELWRI|B_BUSY)) && 1430*7c478bd9Sstevel@tonic-gate (bp != sbp)) { 1431*7c478bd9Sstevel@tonic-gate mutex_exit(hmp); 1432*7c478bd9Sstevel@tonic-gate return (1); 1433*7c478bd9Sstevel@tonic-gate } 1434*7c478bd9Sstevel@tonic-gate } 1435*7c478bd9Sstevel@tonic-gate mutex_exit(hmp); 1436*7c478bd9Sstevel@tonic-gate } 1437*7c478bd9Sstevel@tonic-gate return (0); 1438*7c478bd9Sstevel@tonic-gate } 1439*7c478bd9Sstevel@tonic-gate 1440*7c478bd9Sstevel@tonic-gate /* 1441*7c478bd9Sstevel@tonic-gate * Hash two 32 bit entities. 
1442*7c478bd9Sstevel@tonic-gate */ 1443*7c478bd9Sstevel@tonic-gate int 1444*7c478bd9Sstevel@tonic-gate hash2ints(int x, int y) 1445*7c478bd9Sstevel@tonic-gate { 1446*7c478bd9Sstevel@tonic-gate int hash = 0; 1447*7c478bd9Sstevel@tonic-gate 1448*7c478bd9Sstevel@tonic-gate hash = x - 1; 1449*7c478bd9Sstevel@tonic-gate hash = ((hash * 7) + (x >> 8)) - 1; 1450*7c478bd9Sstevel@tonic-gate hash = ((hash * 7) + (x >> 16)) - 1; 1451*7c478bd9Sstevel@tonic-gate hash = ((hash * 7) + (x >> 24)) - 1; 1452*7c478bd9Sstevel@tonic-gate hash = ((hash * 7) + y) - 1; 1453*7c478bd9Sstevel@tonic-gate hash = ((hash * 7) + (y >> 8)) - 1; 1454*7c478bd9Sstevel@tonic-gate hash = ((hash * 7) + (y >> 16)) - 1; 1455*7c478bd9Sstevel@tonic-gate hash = ((hash * 7) + (y >> 24)) - 1; 1456*7c478bd9Sstevel@tonic-gate 1457*7c478bd9Sstevel@tonic-gate return (hash); 1458*7c478bd9Sstevel@tonic-gate } 1459*7c478bd9Sstevel@tonic-gate 1460*7c478bd9Sstevel@tonic-gate 1461*7c478bd9Sstevel@tonic-gate /* 1462*7c478bd9Sstevel@tonic-gate * Return a new buffer struct. 1463*7c478bd9Sstevel@tonic-gate * Create a new buffer if we haven't gone over our high water 1464*7c478bd9Sstevel@tonic-gate * mark for memory, otherwise try to get one off the freelist. 1465*7c478bd9Sstevel@tonic-gate * 1466*7c478bd9Sstevel@tonic-gate * Returns a locked buf that has no id and is not on any hash or free 1467*7c478bd9Sstevel@tonic-gate * list. 
1468*7c478bd9Sstevel@tonic-gate */ 1469*7c478bd9Sstevel@tonic-gate static struct buf * 1470*7c478bd9Sstevel@tonic-gate bio_getfreeblk(long bsize) 1471*7c478bd9Sstevel@tonic-gate { 1472*7c478bd9Sstevel@tonic-gate struct buf *bp, *dp; 1473*7c478bd9Sstevel@tonic-gate struct hbuf *hp; 1474*7c478bd9Sstevel@tonic-gate kmutex_t *hmp; 1475*7c478bd9Sstevel@tonic-gate uint_t start, end; 1476*7c478bd9Sstevel@tonic-gate 1477*7c478bd9Sstevel@tonic-gate /* 1478*7c478bd9Sstevel@tonic-gate * mutex_enter(&bfree_lock); 1479*7c478bd9Sstevel@tonic-gate * bfreelist.b_bufsize represents the amount of memory 1480*7c478bd9Sstevel@tonic-gate * mutex_exit(&bfree_lock); protect ref to bfreelist 1481*7c478bd9Sstevel@tonic-gate * we are allowed to allocate in the cache before we hit our hwm. 1482*7c478bd9Sstevel@tonic-gate */ 1483*7c478bd9Sstevel@tonic-gate bio_mem_get(bsize); /* Account for our memory request */ 1484*7c478bd9Sstevel@tonic-gate 1485*7c478bd9Sstevel@tonic-gate again: 1486*7c478bd9Sstevel@tonic-gate bp = bio_bhdr_alloc(); /* Get a buf hdr */ 1487*7c478bd9Sstevel@tonic-gate sema_p(&bp->b_sem); /* Should never fail */ 1488*7c478bd9Sstevel@tonic-gate 1489*7c478bd9Sstevel@tonic-gate ASSERT(bp->b_un.b_addr == NULL); 1490*7c478bd9Sstevel@tonic-gate bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP); 1491*7c478bd9Sstevel@tonic-gate if (bp->b_un.b_addr != NULL) { 1492*7c478bd9Sstevel@tonic-gate /* 1493*7c478bd9Sstevel@tonic-gate * Make the common path short 1494*7c478bd9Sstevel@tonic-gate */ 1495*7c478bd9Sstevel@tonic-gate bp->b_bufsize = bsize; 1496*7c478bd9Sstevel@tonic-gate ASSERT(SEMA_HELD(&bp->b_sem)); 1497*7c478bd9Sstevel@tonic-gate return (bp); 1498*7c478bd9Sstevel@tonic-gate } else { 1499*7c478bd9Sstevel@tonic-gate struct buf *save; 1500*7c478bd9Sstevel@tonic-gate 1501*7c478bd9Sstevel@tonic-gate save = bp; /* Save bp we allocated */ 1502*7c478bd9Sstevel@tonic-gate start = end = lastindex; 1503*7c478bd9Sstevel@tonic-gate 1504*7c478bd9Sstevel@tonic-gate 
biostats.bio_bufwant.value.ui32++; 1505*7c478bd9Sstevel@tonic-gate 1506*7c478bd9Sstevel@tonic-gate /* 1507*7c478bd9Sstevel@tonic-gate * Memory isn't available from the system now. Scan 1508*7c478bd9Sstevel@tonic-gate * the hash buckets till enough space is found. 1509*7c478bd9Sstevel@tonic-gate */ 1510*7c478bd9Sstevel@tonic-gate do { 1511*7c478bd9Sstevel@tonic-gate hp = &hbuf[start]; 1512*7c478bd9Sstevel@tonic-gate hmp = &hp->b_lock; 1513*7c478bd9Sstevel@tonic-gate dp = (struct buf *)hp; 1514*7c478bd9Sstevel@tonic-gate 1515*7c478bd9Sstevel@tonic-gate mutex_enter(hmp); 1516*7c478bd9Sstevel@tonic-gate bp = dp->av_forw; 1517*7c478bd9Sstevel@tonic-gate 1518*7c478bd9Sstevel@tonic-gate while (bp != dp) { 1519*7c478bd9Sstevel@tonic-gate 1520*7c478bd9Sstevel@tonic-gate ASSERT(bp != NULL); 1521*7c478bd9Sstevel@tonic-gate 1522*7c478bd9Sstevel@tonic-gate if (!sema_tryp(&bp->b_sem)) { 1523*7c478bd9Sstevel@tonic-gate bp = bp->av_forw; 1524*7c478bd9Sstevel@tonic-gate continue; 1525*7c478bd9Sstevel@tonic-gate } 1526*7c478bd9Sstevel@tonic-gate 1527*7c478bd9Sstevel@tonic-gate /* 1528*7c478bd9Sstevel@tonic-gate * Since we are going down the freelist 1529*7c478bd9Sstevel@tonic-gate * associated with this hash bucket the 1530*7c478bd9Sstevel@tonic-gate * B_DELWRI flag should not be set. 1531*7c478bd9Sstevel@tonic-gate */ 1532*7c478bd9Sstevel@tonic-gate ASSERT(!(bp->b_flags & B_DELWRI)); 1533*7c478bd9Sstevel@tonic-gate 1534*7c478bd9Sstevel@tonic-gate if (bp->b_bufsize == bsize) { 1535*7c478bd9Sstevel@tonic-gate hp->b_length--; 1536*7c478bd9Sstevel@tonic-gate notavail(bp); 1537*7c478bd9Sstevel@tonic-gate bremhash(bp); 1538*7c478bd9Sstevel@tonic-gate mutex_exit(hmp); 1539*7c478bd9Sstevel@tonic-gate 1540*7c478bd9Sstevel@tonic-gate /* 1541*7c478bd9Sstevel@tonic-gate * Didn't kmem_alloc any more, so don't 1542*7c478bd9Sstevel@tonic-gate * count it twice. 
1543*7c478bd9Sstevel@tonic-gate */ 1544*7c478bd9Sstevel@tonic-gate mutex_enter(&bfree_lock); 1545*7c478bd9Sstevel@tonic-gate bfreelist.b_bufsize += bsize; 1546*7c478bd9Sstevel@tonic-gate mutex_exit(&bfree_lock); 1547*7c478bd9Sstevel@tonic-gate 1548*7c478bd9Sstevel@tonic-gate /* 1549*7c478bd9Sstevel@tonic-gate * Update the lastindex value. 1550*7c478bd9Sstevel@tonic-gate */ 1551*7c478bd9Sstevel@tonic-gate lastindex = start; 1552*7c478bd9Sstevel@tonic-gate 1553*7c478bd9Sstevel@tonic-gate /* 1554*7c478bd9Sstevel@tonic-gate * Put our saved bp back on the list 1555*7c478bd9Sstevel@tonic-gate */ 1556*7c478bd9Sstevel@tonic-gate sema_v(&save->b_sem); 1557*7c478bd9Sstevel@tonic-gate bio_bhdr_free(save); 1558*7c478bd9Sstevel@tonic-gate ASSERT(SEMA_HELD(&bp->b_sem)); 1559*7c478bd9Sstevel@tonic-gate return (bp); 1560*7c478bd9Sstevel@tonic-gate } 1561*7c478bd9Sstevel@tonic-gate sema_v(&bp->b_sem); 1562*7c478bd9Sstevel@tonic-gate bp = bp->av_forw; 1563*7c478bd9Sstevel@tonic-gate } 1564*7c478bd9Sstevel@tonic-gate mutex_exit(hmp); 1565*7c478bd9Sstevel@tonic-gate start = ((start + 1) % v.v_hbuf); 1566*7c478bd9Sstevel@tonic-gate } while (start != end); 1567*7c478bd9Sstevel@tonic-gate 1568*7c478bd9Sstevel@tonic-gate biostats.bio_bufwait.value.ui32++; 1569*7c478bd9Sstevel@tonic-gate bp = save; /* Use original bp */ 1570*7c478bd9Sstevel@tonic-gate bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP); 1571*7c478bd9Sstevel@tonic-gate } 1572*7c478bd9Sstevel@tonic-gate 1573*7c478bd9Sstevel@tonic-gate bp->b_bufsize = bsize; 1574*7c478bd9Sstevel@tonic-gate ASSERT(SEMA_HELD(&bp->b_sem)); 1575*7c478bd9Sstevel@tonic-gate return (bp); 1576*7c478bd9Sstevel@tonic-gate } 1577*7c478bd9Sstevel@tonic-gate 1578*7c478bd9Sstevel@tonic-gate /* 1579*7c478bd9Sstevel@tonic-gate * Allocate a buffer header. If none currently available, allocate 1580*7c478bd9Sstevel@tonic-gate * a new pool. 
1581*7c478bd9Sstevel@tonic-gate */ 1582*7c478bd9Sstevel@tonic-gate static struct buf * 1583*7c478bd9Sstevel@tonic-gate bio_bhdr_alloc(void) 1584*7c478bd9Sstevel@tonic-gate { 1585*7c478bd9Sstevel@tonic-gate struct buf *dp, *sdp; 1586*7c478bd9Sstevel@tonic-gate struct buf *bp; 1587*7c478bd9Sstevel@tonic-gate int i; 1588*7c478bd9Sstevel@tonic-gate 1589*7c478bd9Sstevel@tonic-gate for (;;) { 1590*7c478bd9Sstevel@tonic-gate mutex_enter(&bhdr_lock); 1591*7c478bd9Sstevel@tonic-gate if (bhdrlist != NULL) { 1592*7c478bd9Sstevel@tonic-gate bp = bhdrlist; 1593*7c478bd9Sstevel@tonic-gate bhdrlist = bp->av_forw; 1594*7c478bd9Sstevel@tonic-gate mutex_exit(&bhdr_lock); 1595*7c478bd9Sstevel@tonic-gate bp->av_forw = NULL; 1596*7c478bd9Sstevel@tonic-gate return (bp); 1597*7c478bd9Sstevel@tonic-gate } 1598*7c478bd9Sstevel@tonic-gate mutex_exit(&bhdr_lock); 1599*7c478bd9Sstevel@tonic-gate 1600*7c478bd9Sstevel@tonic-gate /* 1601*7c478bd9Sstevel@tonic-gate * Need to allocate a new pool. If the system is currently 1602*7c478bd9Sstevel@tonic-gate * out of memory, then try freeing things on the freelist. 1603*7c478bd9Sstevel@tonic-gate */ 1604*7c478bd9Sstevel@tonic-gate dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP); 1605*7c478bd9Sstevel@tonic-gate if (dp == NULL) { 1606*7c478bd9Sstevel@tonic-gate /* 1607*7c478bd9Sstevel@tonic-gate * System can't give us a pool of headers, try 1608*7c478bd9Sstevel@tonic-gate * recycling from the free lists. 
1609*7c478bd9Sstevel@tonic-gate */ 1610*7c478bd9Sstevel@tonic-gate bio_recycle(BIO_HEADER, 0); 1611*7c478bd9Sstevel@tonic-gate } else { 1612*7c478bd9Sstevel@tonic-gate sdp = dp; 1613*7c478bd9Sstevel@tonic-gate for (i = 0; i < v.v_buf; i++, dp++) { 1614*7c478bd9Sstevel@tonic-gate /* 1615*7c478bd9Sstevel@tonic-gate * The next two lines are needed since NODEV 1616*7c478bd9Sstevel@tonic-gate * is -1 and not NULL 1617*7c478bd9Sstevel@tonic-gate */ 1618*7c478bd9Sstevel@tonic-gate dp->b_dev = (o_dev_t)NODEV; 1619*7c478bd9Sstevel@tonic-gate dp->b_edev = NODEV; 1620*7c478bd9Sstevel@tonic-gate dp->av_forw = dp + 1; 1621*7c478bd9Sstevel@tonic-gate sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT, 1622*7c478bd9Sstevel@tonic-gate NULL); 1623*7c478bd9Sstevel@tonic-gate sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT, 1624*7c478bd9Sstevel@tonic-gate NULL); 1625*7c478bd9Sstevel@tonic-gate dp->b_offset = -1; 1626*7c478bd9Sstevel@tonic-gate } 1627*7c478bd9Sstevel@tonic-gate mutex_enter(&bhdr_lock); 1628*7c478bd9Sstevel@tonic-gate (--dp)->av_forw = bhdrlist; /* Fix last pointer */ 1629*7c478bd9Sstevel@tonic-gate bhdrlist = sdp; 1630*7c478bd9Sstevel@tonic-gate nbuf += v.v_buf; 1631*7c478bd9Sstevel@tonic-gate bp = bhdrlist; 1632*7c478bd9Sstevel@tonic-gate bhdrlist = bp->av_forw; 1633*7c478bd9Sstevel@tonic-gate mutex_exit(&bhdr_lock); 1634*7c478bd9Sstevel@tonic-gate 1635*7c478bd9Sstevel@tonic-gate bp->av_forw = NULL; 1636*7c478bd9Sstevel@tonic-gate return (bp); 1637*7c478bd9Sstevel@tonic-gate } 1638*7c478bd9Sstevel@tonic-gate } 1639*7c478bd9Sstevel@tonic-gate } 1640*7c478bd9Sstevel@tonic-gate 1641*7c478bd9Sstevel@tonic-gate static void 1642*7c478bd9Sstevel@tonic-gate bio_bhdr_free(struct buf *bp) 1643*7c478bd9Sstevel@tonic-gate { 1644*7c478bd9Sstevel@tonic-gate ASSERT(bp->b_back == NULL); 1645*7c478bd9Sstevel@tonic-gate ASSERT(bp->b_forw == NULL); 1646*7c478bd9Sstevel@tonic-gate ASSERT(bp->av_back == NULL); 1647*7c478bd9Sstevel@tonic-gate ASSERT(bp->av_forw == NULL); 
1648*7c478bd9Sstevel@tonic-gate ASSERT(bp->b_un.b_addr == NULL); 1649*7c478bd9Sstevel@tonic-gate ASSERT(bp->b_dev == (o_dev_t)NODEV); 1650*7c478bd9Sstevel@tonic-gate ASSERT(bp->b_edev == NODEV); 1651*7c478bd9Sstevel@tonic-gate ASSERT(bp->b_flags == 0); 1652*7c478bd9Sstevel@tonic-gate 1653*7c478bd9Sstevel@tonic-gate mutex_enter(&bhdr_lock); 1654*7c478bd9Sstevel@tonic-gate bp->av_forw = bhdrlist; 1655*7c478bd9Sstevel@tonic-gate bhdrlist = bp; 1656*7c478bd9Sstevel@tonic-gate mutex_exit(&bhdr_lock); 1657*7c478bd9Sstevel@tonic-gate } 1658*7c478bd9Sstevel@tonic-gate 1659*7c478bd9Sstevel@tonic-gate /* 1660*7c478bd9Sstevel@tonic-gate * If we haven't gone over the high water mark, it's o.k. to 1661*7c478bd9Sstevel@tonic-gate * allocate more buffer space, otherwise recycle buffers 1662*7c478bd9Sstevel@tonic-gate * from the freelist until enough memory is free for a bsize request. 1663*7c478bd9Sstevel@tonic-gate * 1664*7c478bd9Sstevel@tonic-gate * We account for this memory, even though 1665*7c478bd9Sstevel@tonic-gate * we don't allocate it here. 1666*7c478bd9Sstevel@tonic-gate */ 1667*7c478bd9Sstevel@tonic-gate static void 1668*7c478bd9Sstevel@tonic-gate bio_mem_get(long bsize) 1669*7c478bd9Sstevel@tonic-gate { 1670*7c478bd9Sstevel@tonic-gate mutex_enter(&bfree_lock); 1671*7c478bd9Sstevel@tonic-gate if (bfreelist.b_bufsize > bsize) { 1672*7c478bd9Sstevel@tonic-gate bfreelist.b_bufsize -= bsize; 1673*7c478bd9Sstevel@tonic-gate mutex_exit(&bfree_lock); 1674*7c478bd9Sstevel@tonic-gate return; 1675*7c478bd9Sstevel@tonic-gate } 1676*7c478bd9Sstevel@tonic-gate mutex_exit(&bfree_lock); 1677*7c478bd9Sstevel@tonic-gate bio_recycle(BIO_MEM, bsize); 1678*7c478bd9Sstevel@tonic-gate } 1679*7c478bd9Sstevel@tonic-gate 1680*7c478bd9Sstevel@tonic-gate /* 1681*7c478bd9Sstevel@tonic-gate * flush a list of delayed write buffers. 1682*7c478bd9Sstevel@tonic-gate * (currently used only by bio_recycle below.) 
1683*7c478bd9Sstevel@tonic-gate */ 1684*7c478bd9Sstevel@tonic-gate static void 1685*7c478bd9Sstevel@tonic-gate bio_flushlist(struct buf *delwri_list) 1686*7c478bd9Sstevel@tonic-gate { 1687*7c478bd9Sstevel@tonic-gate struct buf *bp; 1688*7c478bd9Sstevel@tonic-gate 1689*7c478bd9Sstevel@tonic-gate while (delwri_list != EMPTY_LIST) { 1690*7c478bd9Sstevel@tonic-gate bp = delwri_list; 1691*7c478bd9Sstevel@tonic-gate bp->b_flags |= B_AGE | B_ASYNC; 1692*7c478bd9Sstevel@tonic-gate if (bp->b_vp == NULL) { /* !ufs */ 1693*7c478bd9Sstevel@tonic-gate BWRITE(bp); 1694*7c478bd9Sstevel@tonic-gate } else { /* ufs */ 1695*7c478bd9Sstevel@tonic-gate UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp); 1696*7c478bd9Sstevel@tonic-gate } 1697*7c478bd9Sstevel@tonic-gate delwri_list = bp->b_list; 1698*7c478bd9Sstevel@tonic-gate bp->b_list = NULL; 1699*7c478bd9Sstevel@tonic-gate } 1700*7c478bd9Sstevel@tonic-gate } 1701*7c478bd9Sstevel@tonic-gate 1702*7c478bd9Sstevel@tonic-gate /* 1703*7c478bd9Sstevel@tonic-gate * Start recycling buffers on the freelist for one of 2 reasons: 1704*7c478bd9Sstevel@tonic-gate * - we need a buffer header 1705*7c478bd9Sstevel@tonic-gate * - we need to free up memory 1706*7c478bd9Sstevel@tonic-gate * Once started we continue to recycle buffers until the B_AGE 1707*7c478bd9Sstevel@tonic-gate * buffers are gone. 1708*7c478bd9Sstevel@tonic-gate */ 1709*7c478bd9Sstevel@tonic-gate static void 1710*7c478bd9Sstevel@tonic-gate bio_recycle(int want, long bsize) 1711*7c478bd9Sstevel@tonic-gate { 1712*7c478bd9Sstevel@tonic-gate struct buf *bp, *dp, *dwp, *nbp; 1713*7c478bd9Sstevel@tonic-gate struct hbuf *hp; 1714*7c478bd9Sstevel@tonic-gate int found = 0; 1715*7c478bd9Sstevel@tonic-gate kmutex_t *hmp; 1716*7c478bd9Sstevel@tonic-gate int start, end; 1717*7c478bd9Sstevel@tonic-gate struct buf *delwri_list = EMPTY_LIST; 1718*7c478bd9Sstevel@tonic-gate 1719*7c478bd9Sstevel@tonic-gate /* 1720*7c478bd9Sstevel@tonic-gate * Recycle buffers. 
1721*7c478bd9Sstevel@tonic-gate */ 1722*7c478bd9Sstevel@tonic-gate top: 1723*7c478bd9Sstevel@tonic-gate start = end = lastindex; 1724*7c478bd9Sstevel@tonic-gate do { 1725*7c478bd9Sstevel@tonic-gate hp = &hbuf[start]; 1726*7c478bd9Sstevel@tonic-gate hmp = &hp->b_lock; 1727*7c478bd9Sstevel@tonic-gate dp = (struct buf *)hp; 1728*7c478bd9Sstevel@tonic-gate 1729*7c478bd9Sstevel@tonic-gate mutex_enter(hmp); 1730*7c478bd9Sstevel@tonic-gate bp = dp->av_forw; 1731*7c478bd9Sstevel@tonic-gate 1732*7c478bd9Sstevel@tonic-gate while (bp != dp) { 1733*7c478bd9Sstevel@tonic-gate 1734*7c478bd9Sstevel@tonic-gate ASSERT(bp != NULL); 1735*7c478bd9Sstevel@tonic-gate 1736*7c478bd9Sstevel@tonic-gate if (!sema_tryp(&bp->b_sem)) { 1737*7c478bd9Sstevel@tonic-gate bp = bp->av_forw; 1738*7c478bd9Sstevel@tonic-gate continue; 1739*7c478bd9Sstevel@tonic-gate } 1740*7c478bd9Sstevel@tonic-gate /* 1741*7c478bd9Sstevel@tonic-gate * Do we really want to nuke all of the B_AGE stuff?? 1742*7c478bd9Sstevel@tonic-gate */ 1743*7c478bd9Sstevel@tonic-gate if ((bp->b_flags & B_AGE) == 0 && found) { 1744*7c478bd9Sstevel@tonic-gate sema_v(&bp->b_sem); 1745*7c478bd9Sstevel@tonic-gate mutex_exit(hmp); 1746*7c478bd9Sstevel@tonic-gate lastindex = start; 1747*7c478bd9Sstevel@tonic-gate return; /* All done */ 1748*7c478bd9Sstevel@tonic-gate } 1749*7c478bd9Sstevel@tonic-gate 1750*7c478bd9Sstevel@tonic-gate ASSERT(MUTEX_HELD(&hp->b_lock)); 1751*7c478bd9Sstevel@tonic-gate ASSERT(!(bp->b_flags & B_DELWRI)); 1752*7c478bd9Sstevel@tonic-gate hp->b_length--; 1753*7c478bd9Sstevel@tonic-gate notavail(bp); 1754*7c478bd9Sstevel@tonic-gate 1755*7c478bd9Sstevel@tonic-gate /* 1756*7c478bd9Sstevel@tonic-gate * Remove bhdr from cache, free up memory, 1757*7c478bd9Sstevel@tonic-gate * and add the hdr to the freelist. 
1758*7c478bd9Sstevel@tonic-gate */ 1759*7c478bd9Sstevel@tonic-gate bremhash(bp); 1760*7c478bd9Sstevel@tonic-gate mutex_exit(hmp); 1761*7c478bd9Sstevel@tonic-gate 1762*7c478bd9Sstevel@tonic-gate if (bp->b_bufsize) { 1763*7c478bd9Sstevel@tonic-gate kmem_free(bp->b_un.b_addr, bp->b_bufsize); 1764*7c478bd9Sstevel@tonic-gate bp->b_un.b_addr = NULL; 1765*7c478bd9Sstevel@tonic-gate mutex_enter(&bfree_lock); 1766*7c478bd9Sstevel@tonic-gate bfreelist.b_bufsize += bp->b_bufsize; 1767*7c478bd9Sstevel@tonic-gate mutex_exit(&bfree_lock); 1768*7c478bd9Sstevel@tonic-gate } 1769*7c478bd9Sstevel@tonic-gate 1770*7c478bd9Sstevel@tonic-gate bp->b_dev = (o_dev_t)NODEV; 1771*7c478bd9Sstevel@tonic-gate bp->b_edev = NODEV; 1772*7c478bd9Sstevel@tonic-gate bp->b_flags = 0; 1773*7c478bd9Sstevel@tonic-gate sema_v(&bp->b_sem); 1774*7c478bd9Sstevel@tonic-gate bio_bhdr_free(bp); 1775*7c478bd9Sstevel@tonic-gate if (want == BIO_HEADER) { 1776*7c478bd9Sstevel@tonic-gate found = 1; 1777*7c478bd9Sstevel@tonic-gate } else { 1778*7c478bd9Sstevel@tonic-gate ASSERT(want == BIO_MEM); 1779*7c478bd9Sstevel@tonic-gate if (!found && bfreelist.b_bufsize >= bsize) { 1780*7c478bd9Sstevel@tonic-gate /* Account for the memory we want */ 1781*7c478bd9Sstevel@tonic-gate mutex_enter(&bfree_lock); 1782*7c478bd9Sstevel@tonic-gate if (bfreelist.b_bufsize >= bsize) { 1783*7c478bd9Sstevel@tonic-gate bfreelist.b_bufsize -= bsize; 1784*7c478bd9Sstevel@tonic-gate found = 1; 1785*7c478bd9Sstevel@tonic-gate } 1786*7c478bd9Sstevel@tonic-gate mutex_exit(&bfree_lock); 1787*7c478bd9Sstevel@tonic-gate } 1788*7c478bd9Sstevel@tonic-gate } 1789*7c478bd9Sstevel@tonic-gate 1790*7c478bd9Sstevel@tonic-gate /* 1791*7c478bd9Sstevel@tonic-gate * Since we dropped hmp start from the 1792*7c478bd9Sstevel@tonic-gate * begining. 
1793*7c478bd9Sstevel@tonic-gate */ 1794*7c478bd9Sstevel@tonic-gate mutex_enter(hmp); 1795*7c478bd9Sstevel@tonic-gate bp = dp->av_forw; 1796*7c478bd9Sstevel@tonic-gate } 1797*7c478bd9Sstevel@tonic-gate mutex_exit(hmp); 1798*7c478bd9Sstevel@tonic-gate 1799*7c478bd9Sstevel@tonic-gate /* 1800*7c478bd9Sstevel@tonic-gate * Look at the delayed write list. 1801*7c478bd9Sstevel@tonic-gate * First gather into a private list, then write them. 1802*7c478bd9Sstevel@tonic-gate */ 1803*7c478bd9Sstevel@tonic-gate dwp = (struct buf *)&dwbuf[start]; 1804*7c478bd9Sstevel@tonic-gate mutex_enter(&blist_lock); 1805*7c478bd9Sstevel@tonic-gate bio_doingflush++; 1806*7c478bd9Sstevel@tonic-gate mutex_enter(hmp); 1807*7c478bd9Sstevel@tonic-gate for (bp = dwp->av_forw; bp != dwp; bp = nbp) { 1808*7c478bd9Sstevel@tonic-gate 1809*7c478bd9Sstevel@tonic-gate ASSERT(bp != NULL); 1810*7c478bd9Sstevel@tonic-gate nbp = bp->av_forw; 1811*7c478bd9Sstevel@tonic-gate 1812*7c478bd9Sstevel@tonic-gate if (!sema_tryp(&bp->b_sem)) 1813*7c478bd9Sstevel@tonic-gate continue; 1814*7c478bd9Sstevel@tonic-gate ASSERT(bp->b_flags & B_DELWRI); 1815*7c478bd9Sstevel@tonic-gate /* 1816*7c478bd9Sstevel@tonic-gate * Do we really want to nuke all of the B_AGE stuff?? 
1817*7c478bd9Sstevel@tonic-gate */ 1818*7c478bd9Sstevel@tonic-gate 1819*7c478bd9Sstevel@tonic-gate if ((bp->b_flags & B_AGE) == 0 && found) { 1820*7c478bd9Sstevel@tonic-gate sema_v(&bp->b_sem); 1821*7c478bd9Sstevel@tonic-gate mutex_exit(hmp); 1822*7c478bd9Sstevel@tonic-gate lastindex = start; 1823*7c478bd9Sstevel@tonic-gate mutex_exit(&blist_lock); 1824*7c478bd9Sstevel@tonic-gate bio_flushlist(delwri_list); 1825*7c478bd9Sstevel@tonic-gate mutex_enter(&blist_lock); 1826*7c478bd9Sstevel@tonic-gate bio_doingflush--; 1827*7c478bd9Sstevel@tonic-gate if (bio_flinv_cv_wanted) { 1828*7c478bd9Sstevel@tonic-gate bio_flinv_cv_wanted = 0; 1829*7c478bd9Sstevel@tonic-gate cv_broadcast(&bio_flushinval_cv); 1830*7c478bd9Sstevel@tonic-gate } 1831*7c478bd9Sstevel@tonic-gate mutex_exit(&blist_lock); 1832*7c478bd9Sstevel@tonic-gate return; /* All done */ 1833*7c478bd9Sstevel@tonic-gate } 1834*7c478bd9Sstevel@tonic-gate 1835*7c478bd9Sstevel@tonic-gate /* 1836*7c478bd9Sstevel@tonic-gate * If the buffer is already on a flush or 1837*7c478bd9Sstevel@tonic-gate * invalidate list then just skip it. 1838*7c478bd9Sstevel@tonic-gate */ 1839*7c478bd9Sstevel@tonic-gate if (bp->b_list != NULL) { 1840*7c478bd9Sstevel@tonic-gate sema_v(&bp->b_sem); 1841*7c478bd9Sstevel@tonic-gate continue; 1842*7c478bd9Sstevel@tonic-gate } 1843*7c478bd9Sstevel@tonic-gate /* 1844*7c478bd9Sstevel@tonic-gate * We are still on the same bucket. 
1845*7c478bd9Sstevel@tonic-gate */ 1846*7c478bd9Sstevel@tonic-gate hp->b_length--; 1847*7c478bd9Sstevel@tonic-gate notavail(bp); 1848*7c478bd9Sstevel@tonic-gate bp->b_list = delwri_list; 1849*7c478bd9Sstevel@tonic-gate delwri_list = bp; 1850*7c478bd9Sstevel@tonic-gate } 1851*7c478bd9Sstevel@tonic-gate mutex_exit(hmp); 1852*7c478bd9Sstevel@tonic-gate mutex_exit(&blist_lock); 1853*7c478bd9Sstevel@tonic-gate bio_flushlist(delwri_list); 1854*7c478bd9Sstevel@tonic-gate delwri_list = EMPTY_LIST; 1855*7c478bd9Sstevel@tonic-gate mutex_enter(&blist_lock); 1856*7c478bd9Sstevel@tonic-gate bio_doingflush--; 1857*7c478bd9Sstevel@tonic-gate if (bio_flinv_cv_wanted) { 1858*7c478bd9Sstevel@tonic-gate bio_flinv_cv_wanted = 0; 1859*7c478bd9Sstevel@tonic-gate cv_broadcast(&bio_flushinval_cv); 1860*7c478bd9Sstevel@tonic-gate } 1861*7c478bd9Sstevel@tonic-gate mutex_exit(&blist_lock); 1862*7c478bd9Sstevel@tonic-gate start = (start + 1) % v.v_hbuf; 1863*7c478bd9Sstevel@tonic-gate 1864*7c478bd9Sstevel@tonic-gate } while (start != end); 1865*7c478bd9Sstevel@tonic-gate 1866*7c478bd9Sstevel@tonic-gate if (found) 1867*7c478bd9Sstevel@tonic-gate return; 1868*7c478bd9Sstevel@tonic-gate 1869*7c478bd9Sstevel@tonic-gate /* 1870*7c478bd9Sstevel@tonic-gate * Free lists exhausted and we haven't satisfied the request. 1871*7c478bd9Sstevel@tonic-gate * Wait here for more entries to be added to freelist. 1872*7c478bd9Sstevel@tonic-gate * Because this might have just happened, make it timed. 
1873*7c478bd9Sstevel@tonic-gate */ 1874*7c478bd9Sstevel@tonic-gate mutex_enter(&bfree_lock); 1875*7c478bd9Sstevel@tonic-gate bfreelist.b_flags |= B_WANTED; 1876*7c478bd9Sstevel@tonic-gate (void) cv_timedwait(&bio_mem_cv, &bfree_lock, lbolt+hz); 1877*7c478bd9Sstevel@tonic-gate mutex_exit(&bfree_lock); 1878*7c478bd9Sstevel@tonic-gate goto top; 1879*7c478bd9Sstevel@tonic-gate } 1880*7c478bd9Sstevel@tonic-gate 1881*7c478bd9Sstevel@tonic-gate /* 1882*7c478bd9Sstevel@tonic-gate * See if the block is associated with some buffer 1883*7c478bd9Sstevel@tonic-gate * (mainly to avoid getting hung up on a wait in breada). 1884*7c478bd9Sstevel@tonic-gate */ 1885*7c478bd9Sstevel@tonic-gate static int 1886*7c478bd9Sstevel@tonic-gate bio_incore(dev_t dev, daddr_t blkno) 1887*7c478bd9Sstevel@tonic-gate { 1888*7c478bd9Sstevel@tonic-gate struct buf *bp; 1889*7c478bd9Sstevel@tonic-gate struct buf *dp; 1890*7c478bd9Sstevel@tonic-gate uint_t index; 1891*7c478bd9Sstevel@tonic-gate kmutex_t *hmp; 1892*7c478bd9Sstevel@tonic-gate 1893*7c478bd9Sstevel@tonic-gate index = bio_bhash(dev, blkno); 1894*7c478bd9Sstevel@tonic-gate dp = (struct buf *)&hbuf[index]; 1895*7c478bd9Sstevel@tonic-gate hmp = &hbuf[index].b_lock; 1896*7c478bd9Sstevel@tonic-gate 1897*7c478bd9Sstevel@tonic-gate mutex_enter(hmp); 1898*7c478bd9Sstevel@tonic-gate for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) { 1899*7c478bd9Sstevel@tonic-gate if (bp->b_blkno == blkno && bp->b_edev == dev && 1900*7c478bd9Sstevel@tonic-gate (bp->b_flags & B_STALE) == 0) { 1901*7c478bd9Sstevel@tonic-gate mutex_exit(hmp); 1902*7c478bd9Sstevel@tonic-gate return (1); 1903*7c478bd9Sstevel@tonic-gate } 1904*7c478bd9Sstevel@tonic-gate } 1905*7c478bd9Sstevel@tonic-gate mutex_exit(hmp); 1906*7c478bd9Sstevel@tonic-gate return (0); 1907*7c478bd9Sstevel@tonic-gate } 1908*7c478bd9Sstevel@tonic-gate 1909*7c478bd9Sstevel@tonic-gate static void 1910*7c478bd9Sstevel@tonic-gate bio_pageio_done(struct buf *bp) 1911*7c478bd9Sstevel@tonic-gate { 
1912*7c478bd9Sstevel@tonic-gate if (bp->b_flags & B_PAGEIO) { 1913*7c478bd9Sstevel@tonic-gate 1914*7c478bd9Sstevel@tonic-gate if (bp->b_flags & B_REMAPPED) 1915*7c478bd9Sstevel@tonic-gate bp_mapout(bp); 1916*7c478bd9Sstevel@tonic-gate 1917*7c478bd9Sstevel@tonic-gate if (bp->b_flags & B_READ) 1918*7c478bd9Sstevel@tonic-gate pvn_read_done(bp->b_pages, bp->b_flags); 1919*7c478bd9Sstevel@tonic-gate else 1920*7c478bd9Sstevel@tonic-gate pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags); 1921*7c478bd9Sstevel@tonic-gate pageio_done(bp); 1922*7c478bd9Sstevel@tonic-gate } else { 1923*7c478bd9Sstevel@tonic-gate ASSERT(bp->b_flags & B_REMAPPED); 1924*7c478bd9Sstevel@tonic-gate bp_mapout(bp); 1925*7c478bd9Sstevel@tonic-gate brelse(bp); 1926*7c478bd9Sstevel@tonic-gate } 1927*7c478bd9Sstevel@tonic-gate } 1928*7c478bd9Sstevel@tonic-gate 1929*7c478bd9Sstevel@tonic-gate /* 1930*7c478bd9Sstevel@tonic-gate * bioerror(9F) - indicate error in buffer header 1931*7c478bd9Sstevel@tonic-gate * If 'error' is zero, remove the error indication. 
1932*7c478bd9Sstevel@tonic-gate */ 1933*7c478bd9Sstevel@tonic-gate void 1934*7c478bd9Sstevel@tonic-gate bioerror(struct buf *bp, int error) 1935*7c478bd9Sstevel@tonic-gate { 1936*7c478bd9Sstevel@tonic-gate ASSERT(bp != NULL); 1937*7c478bd9Sstevel@tonic-gate ASSERT(error >= 0); 1938*7c478bd9Sstevel@tonic-gate ASSERT(SEMA_HELD(&bp->b_sem)); 1939*7c478bd9Sstevel@tonic-gate 1940*7c478bd9Sstevel@tonic-gate if (error != 0) { 1941*7c478bd9Sstevel@tonic-gate bp->b_flags |= B_ERROR; 1942*7c478bd9Sstevel@tonic-gate } else { 1943*7c478bd9Sstevel@tonic-gate bp->b_flags &= ~B_ERROR; 1944*7c478bd9Sstevel@tonic-gate } 1945*7c478bd9Sstevel@tonic-gate bp->b_error = error; 1946*7c478bd9Sstevel@tonic-gate } 1947*7c478bd9Sstevel@tonic-gate 1948*7c478bd9Sstevel@tonic-gate /* 1949*7c478bd9Sstevel@tonic-gate * bioreset(9F) - reuse a private buffer header after I/O is complete 1950*7c478bd9Sstevel@tonic-gate */ 1951*7c478bd9Sstevel@tonic-gate void 1952*7c478bd9Sstevel@tonic-gate bioreset(struct buf *bp) 1953*7c478bd9Sstevel@tonic-gate { 1954*7c478bd9Sstevel@tonic-gate ASSERT(bp != NULL); 1955*7c478bd9Sstevel@tonic-gate 1956*7c478bd9Sstevel@tonic-gate biofini(bp); 1957*7c478bd9Sstevel@tonic-gate bioinit(bp); 1958*7c478bd9Sstevel@tonic-gate } 1959*7c478bd9Sstevel@tonic-gate 1960*7c478bd9Sstevel@tonic-gate /* 1961*7c478bd9Sstevel@tonic-gate * biosize(9F) - return size of a buffer header 1962*7c478bd9Sstevel@tonic-gate */ 1963*7c478bd9Sstevel@tonic-gate size_t 1964*7c478bd9Sstevel@tonic-gate biosize(void) 1965*7c478bd9Sstevel@tonic-gate { 1966*7c478bd9Sstevel@tonic-gate return (sizeof (struct buf)); 1967*7c478bd9Sstevel@tonic-gate } 1968*7c478bd9Sstevel@tonic-gate 1969*7c478bd9Sstevel@tonic-gate /* 1970*7c478bd9Sstevel@tonic-gate * biomodified(9F) - check if buffer is modified 1971*7c478bd9Sstevel@tonic-gate */ 1972*7c478bd9Sstevel@tonic-gate int 1973*7c478bd9Sstevel@tonic-gate biomodified(struct buf *bp) 1974*7c478bd9Sstevel@tonic-gate { 1975*7c478bd9Sstevel@tonic-gate int npf; 
1976*7c478bd9Sstevel@tonic-gate int ppattr; 1977*7c478bd9Sstevel@tonic-gate struct page *pp; 1978*7c478bd9Sstevel@tonic-gate 1979*7c478bd9Sstevel@tonic-gate ASSERT(bp != NULL); 1980*7c478bd9Sstevel@tonic-gate 1981*7c478bd9Sstevel@tonic-gate if ((bp->b_flags & B_PAGEIO) == 0) { 1982*7c478bd9Sstevel@tonic-gate return (-1); 1983*7c478bd9Sstevel@tonic-gate } 1984*7c478bd9Sstevel@tonic-gate pp = bp->b_pages; 1985*7c478bd9Sstevel@tonic-gate npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET)); 1986*7c478bd9Sstevel@tonic-gate 1987*7c478bd9Sstevel@tonic-gate while (npf > 0) { 1988*7c478bd9Sstevel@tonic-gate ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO | 1989*7c478bd9Sstevel@tonic-gate HAT_SYNC_STOPON_MOD); 1990*7c478bd9Sstevel@tonic-gate if (ppattr & P_MOD) 1991*7c478bd9Sstevel@tonic-gate return (1); 1992*7c478bd9Sstevel@tonic-gate pp = pp->p_next; 1993*7c478bd9Sstevel@tonic-gate npf--; 1994*7c478bd9Sstevel@tonic-gate } 1995*7c478bd9Sstevel@tonic-gate 1996*7c478bd9Sstevel@tonic-gate return (0); 1997*7c478bd9Sstevel@tonic-gate } 1998*7c478bd9Sstevel@tonic-gate 1999*7c478bd9Sstevel@tonic-gate /* 2000*7c478bd9Sstevel@tonic-gate * bioinit(9F) - initialize a buffer structure 2001*7c478bd9Sstevel@tonic-gate */ 2002*7c478bd9Sstevel@tonic-gate void 2003*7c478bd9Sstevel@tonic-gate bioinit(struct buf *bp) 2004*7c478bd9Sstevel@tonic-gate { 2005*7c478bd9Sstevel@tonic-gate bzero(bp, sizeof (struct buf)); 2006*7c478bd9Sstevel@tonic-gate sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL); 2007*7c478bd9Sstevel@tonic-gate sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL); 2008*7c478bd9Sstevel@tonic-gate bp->b_offset = -1; 2009*7c478bd9Sstevel@tonic-gate } 2010*7c478bd9Sstevel@tonic-gate 2011*7c478bd9Sstevel@tonic-gate /* 2012*7c478bd9Sstevel@tonic-gate * biofini(9F) - uninitialize a buffer structure 2013*7c478bd9Sstevel@tonic-gate */ 2014*7c478bd9Sstevel@tonic-gate void 2015*7c478bd9Sstevel@tonic-gate biofini(struct buf *bp) 2016*7c478bd9Sstevel@tonic-gate { 
2017*7c478bd9Sstevel@tonic-gate sema_destroy(&bp->b_io); 2018*7c478bd9Sstevel@tonic-gate sema_destroy(&bp->b_sem); 2019*7c478bd9Sstevel@tonic-gate } 2020*7c478bd9Sstevel@tonic-gate 2021*7c478bd9Sstevel@tonic-gate /* 2022*7c478bd9Sstevel@tonic-gate * bioclone(9F) - clone a buffer 2023*7c478bd9Sstevel@tonic-gate */ 2024*7c478bd9Sstevel@tonic-gate struct buf * 2025*7c478bd9Sstevel@tonic-gate bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno, 2026*7c478bd9Sstevel@tonic-gate int (*iodone)(struct buf *), struct buf *bp_mem, int sleep) 2027*7c478bd9Sstevel@tonic-gate { 2028*7c478bd9Sstevel@tonic-gate struct buf *bufp; 2029*7c478bd9Sstevel@tonic-gate 2030*7c478bd9Sstevel@tonic-gate ASSERT(bp); 2031*7c478bd9Sstevel@tonic-gate if (bp_mem == NULL) { 2032*7c478bd9Sstevel@tonic-gate bufp = kmem_alloc(sizeof (struct buf), sleep); 2033*7c478bd9Sstevel@tonic-gate if (bufp == NULL) { 2034*7c478bd9Sstevel@tonic-gate return (NULL); 2035*7c478bd9Sstevel@tonic-gate } 2036*7c478bd9Sstevel@tonic-gate bioinit(bufp); 2037*7c478bd9Sstevel@tonic-gate } else { 2038*7c478bd9Sstevel@tonic-gate bufp = bp_mem; 2039*7c478bd9Sstevel@tonic-gate bioreset(bufp); 2040*7c478bd9Sstevel@tonic-gate } 2041*7c478bd9Sstevel@tonic-gate 2042*7c478bd9Sstevel@tonic-gate #define BUF_CLONE_FLAGS (B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\ 2043*7c478bd9Sstevel@tonic-gate B_ABRWRITE) 2044*7c478bd9Sstevel@tonic-gate 2045*7c478bd9Sstevel@tonic-gate /* 2046*7c478bd9Sstevel@tonic-gate * the cloned buffer does not inherit the B_REMAPPED flag. A separate 2047*7c478bd9Sstevel@tonic-gate * bp_mapin(9F) has to be done to get a kernel mapping. 
2048*7c478bd9Sstevel@tonic-gate */ 2049*7c478bd9Sstevel@tonic-gate bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS) | B_BUSY; 2050*7c478bd9Sstevel@tonic-gate bufp->b_bcount = len; 2051*7c478bd9Sstevel@tonic-gate bufp->b_blkno = blkno; 2052*7c478bd9Sstevel@tonic-gate bufp->b_iodone = iodone; 2053*7c478bd9Sstevel@tonic-gate bufp->b_proc = bp->b_proc; 2054*7c478bd9Sstevel@tonic-gate bufp->b_edev = dev; 2055*7c478bd9Sstevel@tonic-gate bufp->b_file = bp->b_file; 2056*7c478bd9Sstevel@tonic-gate bufp->b_offset = bp->b_offset; 2057*7c478bd9Sstevel@tonic-gate 2058*7c478bd9Sstevel@tonic-gate if (bp->b_flags & B_SHADOW) { 2059*7c478bd9Sstevel@tonic-gate ASSERT(bp->b_shadow); 2060*7c478bd9Sstevel@tonic-gate ASSERT(bp->b_flags & B_PHYS); 2061*7c478bd9Sstevel@tonic-gate 2062*7c478bd9Sstevel@tonic-gate bufp->b_shadow = bp->b_shadow + 2063*7c478bd9Sstevel@tonic-gate btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off); 2064*7c478bd9Sstevel@tonic-gate bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off); 2065*7c478bd9Sstevel@tonic-gate } else { 2066*7c478bd9Sstevel@tonic-gate if (bp->b_flags & B_PAGEIO) { 2067*7c478bd9Sstevel@tonic-gate struct page *pp; 2068*7c478bd9Sstevel@tonic-gate off_t o; 2069*7c478bd9Sstevel@tonic-gate int i; 2070*7c478bd9Sstevel@tonic-gate 2071*7c478bd9Sstevel@tonic-gate pp = bp->b_pages; 2072*7c478bd9Sstevel@tonic-gate o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off; 2073*7c478bd9Sstevel@tonic-gate for (i = btop(o); i > 0; i--) { 2074*7c478bd9Sstevel@tonic-gate pp = pp->p_next; 2075*7c478bd9Sstevel@tonic-gate } 2076*7c478bd9Sstevel@tonic-gate bufp->b_pages = pp; 2077*7c478bd9Sstevel@tonic-gate bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET); 2078*7c478bd9Sstevel@tonic-gate } else { 2079*7c478bd9Sstevel@tonic-gate bufp->b_un.b_addr = 2080*7c478bd9Sstevel@tonic-gate (caddr_t)((uintptr_t)bp->b_un.b_addr + off); 2081*7c478bd9Sstevel@tonic-gate if (bp->b_flags & B_REMAPPED) 2082*7c478bd9Sstevel@tonic-gate bufp->b_proc = NULL; 
2083*7c478bd9Sstevel@tonic-gate } 2084*7c478bd9Sstevel@tonic-gate } 2085*7c478bd9Sstevel@tonic-gate return (bufp); 2086*7c478bd9Sstevel@tonic-gate } 2087