xref: /illumos-gate/usr/src/uts/common/os/bio.c (revision 7c478bd9)
1*7c478bd9Sstevel@tonic-gate /*
2*7c478bd9Sstevel@tonic-gate  * CDDL HEADER START
3*7c478bd9Sstevel@tonic-gate  *
4*7c478bd9Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
5*7c478bd9Sstevel@tonic-gate  * Common Development and Distribution License, Version 1.0 only
6*7c478bd9Sstevel@tonic-gate  * (the "License").  You may not use this file except in compliance
7*7c478bd9Sstevel@tonic-gate  * with the License.
8*7c478bd9Sstevel@tonic-gate  *
9*7c478bd9Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10*7c478bd9Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
11*7c478bd9Sstevel@tonic-gate  * See the License for the specific language governing permissions
12*7c478bd9Sstevel@tonic-gate  * and limitations under the License.
13*7c478bd9Sstevel@tonic-gate  *
14*7c478bd9Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
15*7c478bd9Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16*7c478bd9Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
17*7c478bd9Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
18*7c478bd9Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
19*7c478bd9Sstevel@tonic-gate  *
20*7c478bd9Sstevel@tonic-gate  * CDDL HEADER END
21*7c478bd9Sstevel@tonic-gate  */
22*7c478bd9Sstevel@tonic-gate /*
23*7c478bd9Sstevel@tonic-gate  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24*7c478bd9Sstevel@tonic-gate  * Use is subject to license terms.
25*7c478bd9Sstevel@tonic-gate  */
26*7c478bd9Sstevel@tonic-gate 
27*7c478bd9Sstevel@tonic-gate /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28*7c478bd9Sstevel@tonic-gate /*	  All Rights Reserved  	*/
29*7c478bd9Sstevel@tonic-gate 
30*7c478bd9Sstevel@tonic-gate /*
31*7c478bd9Sstevel@tonic-gate  * University Copyright- Copyright (c) 1982, 1986, 1988
32*7c478bd9Sstevel@tonic-gate  * The Regents of the University of California
33*7c478bd9Sstevel@tonic-gate  * All Rights Reserved
34*7c478bd9Sstevel@tonic-gate  *
35*7c478bd9Sstevel@tonic-gate  * University Acknowledgment- Portions of this document are derived from
36*7c478bd9Sstevel@tonic-gate  * software developed by the University of California, Berkeley, and its
37*7c478bd9Sstevel@tonic-gate  * contributors.
38*7c478bd9Sstevel@tonic-gate  */
39*7c478bd9Sstevel@tonic-gate 
40*7c478bd9Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
41*7c478bd9Sstevel@tonic-gate 
42*7c478bd9Sstevel@tonic-gate #include <sys/types.h>
43*7c478bd9Sstevel@tonic-gate #include <sys/t_lock.h>
44*7c478bd9Sstevel@tonic-gate #include <sys/sysmacros.h>
45*7c478bd9Sstevel@tonic-gate #include <sys/conf.h>
46*7c478bd9Sstevel@tonic-gate #include <sys/cpuvar.h>
47*7c478bd9Sstevel@tonic-gate #include <sys/errno.h>
48*7c478bd9Sstevel@tonic-gate #include <sys/debug.h>
49*7c478bd9Sstevel@tonic-gate #include <sys/buf.h>
50*7c478bd9Sstevel@tonic-gate #include <sys/var.h>
51*7c478bd9Sstevel@tonic-gate #include <sys/vnode.h>
52*7c478bd9Sstevel@tonic-gate #include <sys/bitmap.h>
53*7c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h>
54*7c478bd9Sstevel@tonic-gate #include <sys/kmem.h>
55*7c478bd9Sstevel@tonic-gate #include <sys/vmem.h>
56*7c478bd9Sstevel@tonic-gate #include <sys/atomic.h>
57*7c478bd9Sstevel@tonic-gate #include <vm/seg_kmem.h>
58*7c478bd9Sstevel@tonic-gate #include <vm/page.h>
59*7c478bd9Sstevel@tonic-gate #include <vm/pvn.h>
60*7c478bd9Sstevel@tonic-gate #include <sys/vtrace.h>
61*7c478bd9Sstevel@tonic-gate #include <sys/tnf_probe.h>
62*7c478bd9Sstevel@tonic-gate #include <sys/fs/ufs_inode.h>
63*7c478bd9Sstevel@tonic-gate #include <sys/fs/ufs_bio.h>
64*7c478bd9Sstevel@tonic-gate #include <sys/fs/ufs_log.h>
65*7c478bd9Sstevel@tonic-gate #include <sys/systm.h>
66*7c478bd9Sstevel@tonic-gate #include <sys/vfs.h>
67*7c478bd9Sstevel@tonic-gate #include <sys/sdt.h>
68*7c478bd9Sstevel@tonic-gate 
69*7c478bd9Sstevel@tonic-gate /* Locks */
70*7c478bd9Sstevel@tonic-gate static	kmutex_t	blist_lock;	/* protects b_list */
71*7c478bd9Sstevel@tonic-gate static	kmutex_t	bhdr_lock;	/* protects the bhdrlist */
72*7c478bd9Sstevel@tonic-gate static	kmutex_t	bfree_lock;	/* protects the bfreelist structure */
73*7c478bd9Sstevel@tonic-gate 
74*7c478bd9Sstevel@tonic-gate struct hbuf	*hbuf;			/* Hash buckets */
75*7c478bd9Sstevel@tonic-gate struct dwbuf	*dwbuf;			/* Delayed write buckets */
76*7c478bd9Sstevel@tonic-gate static struct buf *bhdrlist;		/* buf header free list */
77*7c478bd9Sstevel@tonic-gate static int 	nbuf;			/* number of buffer headers allocated */
78*7c478bd9Sstevel@tonic-gate 
79*7c478bd9Sstevel@tonic-gate static int	lastindex;		/* Reference point on where to start */
80*7c478bd9Sstevel@tonic-gate 					/* when looking for free buffers */
81*7c478bd9Sstevel@tonic-gate 
82*7c478bd9Sstevel@tonic-gate #define	bio_bhash(dev, bn)	(hash2ints((dev), (int)(bn)) & v.v_hmask)
83*7c478bd9Sstevel@tonic-gate #define	EMPTY_LIST	((struct buf *)-1)
84*7c478bd9Sstevel@tonic-gate 
85*7c478bd9Sstevel@tonic-gate static kcondvar_t	bio_mem_cv; 	/* Condition variables */
86*7c478bd9Sstevel@tonic-gate static kcondvar_t	bio_flushinval_cv;
87*7c478bd9Sstevel@tonic-gate static int	bio_doingflush;		/* flush in progress */
88*7c478bd9Sstevel@tonic-gate static int	bio_doinginval;		/* inval in progress */
89*7c478bd9Sstevel@tonic-gate static int	bio_flinv_cv_wanted;	/* someone waiting for cv */
90*7c478bd9Sstevel@tonic-gate 
91*7c478bd9Sstevel@tonic-gate /*
92*7c478bd9Sstevel@tonic-gate  * Statistics on the buffer cache
93*7c478bd9Sstevel@tonic-gate  */
94*7c478bd9Sstevel@tonic-gate struct biostats biostats = {
95*7c478bd9Sstevel@tonic-gate 	{ "buffer_cache_lookups",		KSTAT_DATA_UINT32 },
96*7c478bd9Sstevel@tonic-gate 	{ "buffer_cache_hits",			KSTAT_DATA_UINT32 },
97*7c478bd9Sstevel@tonic-gate 	{ "new_buffer_requests",		KSTAT_DATA_UINT32 },
98*7c478bd9Sstevel@tonic-gate 	{ "waits_for_buffer_allocs",		KSTAT_DATA_UINT32 },
99*7c478bd9Sstevel@tonic-gate 	{ "buffers_locked_by_someone",		KSTAT_DATA_UINT32 },
100*7c478bd9Sstevel@tonic-gate 	{ "duplicate_buffers_found",		KSTAT_DATA_UINT32 }
101*7c478bd9Sstevel@tonic-gate };
102*7c478bd9Sstevel@tonic-gate 
103*7c478bd9Sstevel@tonic-gate /*
104*7c478bd9Sstevel@tonic-gate  * kstat data
105*7c478bd9Sstevel@tonic-gate  */
106*7c478bd9Sstevel@tonic-gate kstat_named_t	*biostats_ptr = (kstat_named_t *)&biostats;
107*7c478bd9Sstevel@tonic-gate uint_t		biostats_ndata = (uint_t)(sizeof (biostats) /
108*7c478bd9Sstevel@tonic-gate 					sizeof (kstat_named_t));
109*7c478bd9Sstevel@tonic-gate 
110*7c478bd9Sstevel@tonic-gate /*
111*7c478bd9Sstevel@tonic-gate  * Statistics on ufs buffer cache
112*7c478bd9Sstevel@tonic-gate  * Not protected by locks
113*7c478bd9Sstevel@tonic-gate  */
114*7c478bd9Sstevel@tonic-gate struct ufsbiostats ub = {
115*7c478bd9Sstevel@tonic-gate 	{ "breads",			KSTAT_DATA_UINT32 },
116*7c478bd9Sstevel@tonic-gate 	{ "bwrites",			KSTAT_DATA_UINT32 },
117*7c478bd9Sstevel@tonic-gate 	{ "fbiwrites",			KSTAT_DATA_UINT32 },
118*7c478bd9Sstevel@tonic-gate 	{ "getpages",			KSTAT_DATA_UINT32 },
119*7c478bd9Sstevel@tonic-gate 	{ "getras",			KSTAT_DATA_UINT32 },
120*7c478bd9Sstevel@tonic-gate 	{ "putsyncs",			KSTAT_DATA_UINT32 },
121*7c478bd9Sstevel@tonic-gate 	{ "putasyncs",			KSTAT_DATA_UINT32 },
122*7c478bd9Sstevel@tonic-gate 	{ "putpageios",			KSTAT_DATA_UINT32 },
123*7c478bd9Sstevel@tonic-gate };
124*7c478bd9Sstevel@tonic-gate 
125*7c478bd9Sstevel@tonic-gate /*
126*7c478bd9Sstevel@tonic-gate  * more UFS Logging eccentricities...
127*7c478bd9Sstevel@tonic-gate  *
128*7c478bd9Sstevel@tonic-gate  * required since "#pragma weak ..." doesn't work in reverse order.
129*7c478bd9Sstevel@tonic-gate  * i.e.:  genunix (bio.c) is loaded before the ufs modules and pointers
130*7c478bd9Sstevel@tonic-gate  *        to ufs routines don't get plugged into bio.c calls so
131*7c478bd9Sstevel@tonic-gate  *        we initialize it when setting up the "lufsops" table
132*7c478bd9Sstevel@tonic-gate  *        in "lufs.c:_init()"
133*7c478bd9Sstevel@tonic-gate  */
134*7c478bd9Sstevel@tonic-gate void (*bio_lufs_strategy)(void *, buf_t *);
135*7c478bd9Sstevel@tonic-gate void (*bio_snapshot_strategy)(void *, buf_t *);
136*7c478bd9Sstevel@tonic-gate 
137*7c478bd9Sstevel@tonic-gate 
138*7c478bd9Sstevel@tonic-gate /* Private routines */
139*7c478bd9Sstevel@tonic-gate static struct buf	*bio_getfreeblk(long);
140*7c478bd9Sstevel@tonic-gate static void 		bio_mem_get(long);
141*7c478bd9Sstevel@tonic-gate static void		bio_bhdr_free(struct buf *);
142*7c478bd9Sstevel@tonic-gate static struct buf	*bio_bhdr_alloc(void);
143*7c478bd9Sstevel@tonic-gate static void		bio_recycle(int, long);
144*7c478bd9Sstevel@tonic-gate static void 		bio_pageio_done(struct buf *);
145*7c478bd9Sstevel@tonic-gate static int 		bio_incore(dev_t, daddr_t);
146*7c478bd9Sstevel@tonic-gate 
147*7c478bd9Sstevel@tonic-gate /*
148*7c478bd9Sstevel@tonic-gate  * Buffer cache constants
149*7c478bd9Sstevel@tonic-gate  */
150*7c478bd9Sstevel@tonic-gate #define	BIO_BUF_PERCENT	(100/2)		/* default: 2% of memory */
151*7c478bd9Sstevel@tonic-gate #define	BIO_MAX_PERCENT	(100/20)	/* max is 20% of real memory */
152*7c478bd9Sstevel@tonic-gate #define	BIO_BHDR_POOL	100		/* Default bhdr pool size */
153*7c478bd9Sstevel@tonic-gate #define	BIO_MIN_HDR	10		/* Minimum number of buffer headers */
154*7c478bd9Sstevel@tonic-gate #define	BIO_MIN_HWM	(BIO_MIN_HDR * MAXBSIZE / 1024)
155*7c478bd9Sstevel@tonic-gate #define	BIO_HASHLEN	4		/* Target length of hash chains */
156*7c478bd9Sstevel@tonic-gate 
157*7c478bd9Sstevel@tonic-gate 
158*7c478bd9Sstevel@tonic-gate /* Flags for bio_recycle() */
159*7c478bd9Sstevel@tonic-gate #define	BIO_HEADER	0x01
160*7c478bd9Sstevel@tonic-gate #define	BIO_MEM		0x02
161*7c478bd9Sstevel@tonic-gate 
162*7c478bd9Sstevel@tonic-gate extern	int bufhwm;		/* User tunable - high water mark for mem  */
163*7c478bd9Sstevel@tonic-gate extern	int bufhwm_pct;		/* ditto - given in % of physmem  */
164*7c478bd9Sstevel@tonic-gate 
165*7c478bd9Sstevel@tonic-gate /*
166*7c478bd9Sstevel@tonic-gate  * The following routines allocate and free
167*7c478bd9Sstevel@tonic-gate  * buffers with various side effects.  In general the
168*7c478bd9Sstevel@tonic-gate  * arguments to an allocate routine are a device and
 * a block number, and the value is a pointer
 * to the buffer header; the buffer returned is locked with a
171*7c478bd9Sstevel@tonic-gate  * binary semaphore so that no one else can touch it. If the block was
172*7c478bd9Sstevel@tonic-gate  * already in core, no I/O need be done; if it is
173*7c478bd9Sstevel@tonic-gate  * already locked, the process waits until it becomes free.
174*7c478bd9Sstevel@tonic-gate  * The following routines allocate a buffer:
175*7c478bd9Sstevel@tonic-gate  *	getblk
176*7c478bd9Sstevel@tonic-gate  *	bread/BREAD
177*7c478bd9Sstevel@tonic-gate  *	breada
178*7c478bd9Sstevel@tonic-gate  * Eventually the buffer must be released, possibly with the
179*7c478bd9Sstevel@tonic-gate  * side effect of writing it out, by using one of
180*7c478bd9Sstevel@tonic-gate  *	bwrite/BWRITE/brwrite
181*7c478bd9Sstevel@tonic-gate  *	bdwrite/bdrwrite
182*7c478bd9Sstevel@tonic-gate  *	bawrite
183*7c478bd9Sstevel@tonic-gate  *	brelse
184*7c478bd9Sstevel@tonic-gate  *
185*7c478bd9Sstevel@tonic-gate  * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
186*7c478bd9Sstevel@tonic-gate  * Instead, a binary semaphore, b_sem is used to gain exclusive access to
187*7c478bd9Sstevel@tonic-gate  * a buffer and a binary semaphore, b_io is used for I/O synchronization.
188*7c478bd9Sstevel@tonic-gate  * B_DONE is still used to denote a buffer with I/O complete on it.
189*7c478bd9Sstevel@tonic-gate  *
 * The bfreelist.b_bcount field is computed every time fsflush runs.  It
 * should not be used where a very accurate count of the free buffers is
 * needed.
193*7c478bd9Sstevel@tonic-gate  */
194*7c478bd9Sstevel@tonic-gate 
195*7c478bd9Sstevel@tonic-gate /*
196*7c478bd9Sstevel@tonic-gate  * Read in (if necessary) the block and return a buffer pointer.
197*7c478bd9Sstevel@tonic-gate  *
198*7c478bd9Sstevel@tonic-gate  * This interface is provided for binary compatibility.  Using
199*7c478bd9Sstevel@tonic-gate  * BREAD() directly avoids the extra function call overhead invoked
200*7c478bd9Sstevel@tonic-gate  * by calling this routine.
201*7c478bd9Sstevel@tonic-gate  */
202*7c478bd9Sstevel@tonic-gate struct buf *
203*7c478bd9Sstevel@tonic-gate bread(dev_t dev, daddr_t blkno, long bsize)
204*7c478bd9Sstevel@tonic-gate {
205*7c478bd9Sstevel@tonic-gate 	return (BREAD(dev, blkno, bsize));
206*7c478bd9Sstevel@tonic-gate }
207*7c478bd9Sstevel@tonic-gate 
208*7c478bd9Sstevel@tonic-gate /*
209*7c478bd9Sstevel@tonic-gate  * Common code for reading a buffer with various options
210*7c478bd9Sstevel@tonic-gate  *
211*7c478bd9Sstevel@tonic-gate  * Read in (if necessary) the block and return a buffer pointer.
212*7c478bd9Sstevel@tonic-gate  */
struct buf *
bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
{
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
	struct buf *bp;
	klwp_t *lwp = ttolwp(curthread);

	CPU_STATS_ADD_K(sys, lread, 1);
	bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1);
	/* B_DONE means the buffer already holds valid data: cache hit. */
	if (bp->b_flags & B_DONE)
		return (bp);
	bp->b_flags |= B_READ;
	ASSERT(bp->b_bcount == bsize);
	/*
	 * Dispatch the read.  The order matters: with a ufsvfs present,
	 * the logging strategy takes precedence over the snapshot
	 * strategy, which takes precedence over the plain device path.
	 */
	if (ufsvfsp == NULL) {					/* !ufs */
		(void) bdev_strategy(bp);
	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
							/* ufs && logging */
		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
							/* ufs && snapshots */
		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
	} else {
		/* record the time (lbolt ticks) of this device I/O */
		ufsvfsp->vfs_iotstamp = lbolt;
		ub.ub_breads.value.ul++;		/* ufs && !logging */
		(void) bdev_strategy(bp);
	}
	if (lwp != NULL)
		lwp->lwp_ru.inblock++;
	CPU_STATS_ADD_K(sys, bread, 1);
	/* Synchronous interface: wait for the I/O before returning. */
	(void) biowait(bp);
	return (bp);
}
245*7c478bd9Sstevel@tonic-gate 
246*7c478bd9Sstevel@tonic-gate /*
247*7c478bd9Sstevel@tonic-gate  * Read in the block, like bread, but also start I/O on the
248*7c478bd9Sstevel@tonic-gate  * read-ahead block (which is not allocated to the caller).
249*7c478bd9Sstevel@tonic-gate  */
250*7c478bd9Sstevel@tonic-gate struct buf *
251*7c478bd9Sstevel@tonic-gate breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
252*7c478bd9Sstevel@tonic-gate {
253*7c478bd9Sstevel@tonic-gate 	struct buf *bp, *rabp;
254*7c478bd9Sstevel@tonic-gate 	klwp_t *lwp = ttolwp(curthread);
255*7c478bd9Sstevel@tonic-gate 
256*7c478bd9Sstevel@tonic-gate 	bp = NULL;
257*7c478bd9Sstevel@tonic-gate 	if (!bio_incore(dev, blkno)) {
258*7c478bd9Sstevel@tonic-gate 		CPU_STATS_ADD_K(sys, lread, 1);
259*7c478bd9Sstevel@tonic-gate 		bp = GETBLK(dev, blkno, bsize);
260*7c478bd9Sstevel@tonic-gate 		if ((bp->b_flags & B_DONE) == 0) {
261*7c478bd9Sstevel@tonic-gate 			bp->b_flags |= B_READ;
262*7c478bd9Sstevel@tonic-gate 			bp->b_bcount = bsize;
263*7c478bd9Sstevel@tonic-gate 			(void) bdev_strategy(bp);
264*7c478bd9Sstevel@tonic-gate 			if (lwp != NULL)
265*7c478bd9Sstevel@tonic-gate 				lwp->lwp_ru.inblock++;
266*7c478bd9Sstevel@tonic-gate 			CPU_STATS_ADD_K(sys, bread, 1);
267*7c478bd9Sstevel@tonic-gate 		}
268*7c478bd9Sstevel@tonic-gate 	}
269*7c478bd9Sstevel@tonic-gate 	if (rablkno && bfreelist.b_bcount > 1 &&
270*7c478bd9Sstevel@tonic-gate 	    !bio_incore(dev, rablkno)) {
271*7c478bd9Sstevel@tonic-gate 		rabp = GETBLK(dev, rablkno, bsize);
272*7c478bd9Sstevel@tonic-gate 		if (rabp->b_flags & B_DONE)
273*7c478bd9Sstevel@tonic-gate 			brelse(rabp);
274*7c478bd9Sstevel@tonic-gate 		else {
275*7c478bd9Sstevel@tonic-gate 			rabp->b_flags |= B_READ|B_ASYNC;
276*7c478bd9Sstevel@tonic-gate 			rabp->b_bcount = bsize;
277*7c478bd9Sstevel@tonic-gate 			(void) bdev_strategy(rabp);
278*7c478bd9Sstevel@tonic-gate 			if (lwp != NULL)
279*7c478bd9Sstevel@tonic-gate 				lwp->lwp_ru.inblock++;
280*7c478bd9Sstevel@tonic-gate 			CPU_STATS_ADD_K(sys, bread, 1);
281*7c478bd9Sstevel@tonic-gate 		}
282*7c478bd9Sstevel@tonic-gate 	}
283*7c478bd9Sstevel@tonic-gate 	if (bp == NULL)
284*7c478bd9Sstevel@tonic-gate 		return (BREAD(dev, blkno, bsize));
285*7c478bd9Sstevel@tonic-gate 	(void) biowait(bp);
286*7c478bd9Sstevel@tonic-gate 	return (bp);
287*7c478bd9Sstevel@tonic-gate }
288*7c478bd9Sstevel@tonic-gate 
289*7c478bd9Sstevel@tonic-gate /*
290*7c478bd9Sstevel@tonic-gate  * Common code for writing a buffer with various options.
291*7c478bd9Sstevel@tonic-gate  *
292*7c478bd9Sstevel@tonic-gate  * force_wait  - wait for write completion regardless of B_ASYNC flag
293*7c478bd9Sstevel@tonic-gate  * do_relse    - release the buffer when we are done
294*7c478bd9Sstevel@tonic-gate  * clear_flags - flags to clear from the buffer
295*7c478bd9Sstevel@tonic-gate  */
void
bwrite_common(void *arg, struct buf *bp, int force_wait,
				int do_relse, int clear_flags)
{
	register int do_wait;
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
	int flag;
	klwp_t *lwp = ttolwp(curthread);
	struct cpu *cpup;

	ASSERT(SEMA_HELD(&bp->b_sem));
	/* Sample the flags before clearing; B_ASYNC decides do_wait below. */
	flag = bp->b_flags;
	bp->b_flags &= ~clear_flags;
	if (lwp != NULL)
		lwp->lwp_ru.oublock++;
	/*
	 * CPU_STATS_ENTER_K() disables preemption, so the CPU pointer
	 * fetched below remains valid for all the ADDQ updates.
	 */
	CPU_STATS_ENTER_K();
	cpup = CPU;		/* get pointer AFTER preemption is disabled */
	CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
	CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
	do_wait = ((flag & B_ASYNC) == 0 || force_wait);
	if (do_wait == 0)
		CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
	CPU_STATS_EXIT_K();
	/*
	 * Dispatch the write: raw device when there is no ufsvfs;
	 * otherwise the logging strategy, then the snapshot strategy,
	 * then the plain device strategy, in that order of preference.
	 */
	if (ufsvfsp == NULL) {
		(void) bdev_strategy(bp);
	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
							/* ufs && logging */
		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
							/* ufs && snapshots */
		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
	} else {
		ub.ub_bwrites.value.ul++;		/* ufs && !logging */
		(void) bdev_strategy(bp);
	}
	if (do_wait) {
		(void) biowait(bp);
		if (do_relse) {
			brelse(bp);
		}
	}
}
338*7c478bd9Sstevel@tonic-gate 
339*7c478bd9Sstevel@tonic-gate /*
340*7c478bd9Sstevel@tonic-gate  * Write the buffer, waiting for completion (unless B_ASYNC is set).
341*7c478bd9Sstevel@tonic-gate  * Then release the buffer.
342*7c478bd9Sstevel@tonic-gate  * This interface is provided for binary compatibility.  Using
343*7c478bd9Sstevel@tonic-gate  * BWRITE() directly avoids the extra function call overhead invoked
344*7c478bd9Sstevel@tonic-gate  * by calling this routine.
345*7c478bd9Sstevel@tonic-gate  */
void
bwrite(struct buf *bp)
{
	/* All of the work happens in the BWRITE() macro. */
	BWRITE(bp);
}
351*7c478bd9Sstevel@tonic-gate 
352*7c478bd9Sstevel@tonic-gate /*
353*7c478bd9Sstevel@tonic-gate  * Write the buffer, waiting for completion.
354*7c478bd9Sstevel@tonic-gate  * But don't release the buffer afterwards.
355*7c478bd9Sstevel@tonic-gate  * This interface is provided for binary compatibility.  Using
356*7c478bd9Sstevel@tonic-gate  * BWRITE2() directly avoids the extra function call overhead.
357*7c478bd9Sstevel@tonic-gate  */
void
bwrite2(struct buf *bp)
{
	/* All of the work happens in the BWRITE2() macro. */
	BWRITE2(bp);
}
363*7c478bd9Sstevel@tonic-gate 
364*7c478bd9Sstevel@tonic-gate /*
365*7c478bd9Sstevel@tonic-gate  * Release the buffer, marking it so that if it is grabbed
366*7c478bd9Sstevel@tonic-gate  * for another purpose it will be written out before being
367*7c478bd9Sstevel@tonic-gate  * given up (e.g. when writing a partial block where it is
368*7c478bd9Sstevel@tonic-gate  * assumed that another write for the same block will soon follow).
369*7c478bd9Sstevel@tonic-gate  * Also save the time that the block is first marked as delayed
370*7c478bd9Sstevel@tonic-gate  * so that it will be written in a reasonable time.
371*7c478bd9Sstevel@tonic-gate  */
372*7c478bd9Sstevel@tonic-gate void
373*7c478bd9Sstevel@tonic-gate bdwrite(struct buf *bp)
374*7c478bd9Sstevel@tonic-gate {
375*7c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
376*7c478bd9Sstevel@tonic-gate 	CPU_STATS_ADD_K(sys, lwrite, 1);
377*7c478bd9Sstevel@tonic-gate 	if ((bp->b_flags & B_DELWRI) == 0)
378*7c478bd9Sstevel@tonic-gate 		bp->b_start = lbolt;
379*7c478bd9Sstevel@tonic-gate 	/*
380*7c478bd9Sstevel@tonic-gate 	 * B_DONE allows others to use the buffer, B_DELWRI causes the
381*7c478bd9Sstevel@tonic-gate 	 * buffer to be written before being reused, and setting b_resid
382*7c478bd9Sstevel@tonic-gate 	 * to zero says the buffer is complete.
383*7c478bd9Sstevel@tonic-gate 	 */
384*7c478bd9Sstevel@tonic-gate 	bp->b_flags |= B_DELWRI | B_DONE;
385*7c478bd9Sstevel@tonic-gate 	bp->b_resid = 0;
386*7c478bd9Sstevel@tonic-gate 	brelse(bp);
387*7c478bd9Sstevel@tonic-gate }
388*7c478bd9Sstevel@tonic-gate 
389*7c478bd9Sstevel@tonic-gate /*
390*7c478bd9Sstevel@tonic-gate  * Release the buffer, start I/O on it, but don't wait for completion.
391*7c478bd9Sstevel@tonic-gate  */
392*7c478bd9Sstevel@tonic-gate void
393*7c478bd9Sstevel@tonic-gate bawrite(struct buf *bp)
394*7c478bd9Sstevel@tonic-gate {
395*7c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
396*7c478bd9Sstevel@tonic-gate 
397*7c478bd9Sstevel@tonic-gate 	/* Use bfreelist.b_bcount as a weird-ass heuristic */
398*7c478bd9Sstevel@tonic-gate 	if (bfreelist.b_bcount > 4)
399*7c478bd9Sstevel@tonic-gate 		bp->b_flags |= B_ASYNC;
400*7c478bd9Sstevel@tonic-gate 	BWRITE(bp);
401*7c478bd9Sstevel@tonic-gate }
402*7c478bd9Sstevel@tonic-gate 
403*7c478bd9Sstevel@tonic-gate /*
404*7c478bd9Sstevel@tonic-gate  * Release the buffer, with no I/O implied.
405*7c478bd9Sstevel@tonic-gate  */
void
brelse(struct buf *bp)
{
	struct buf	**backp;
	uint_t		index;
	kmutex_t	*hmp;
	struct	buf	*dp;
	struct	hbuf	*hp;


	ASSERT(SEMA_HELD(&bp->b_sem));

	/*
	 * Clear the retry write flag if the buffer was written without
	 * error.  The presence of B_DELWRI means the buffer has not yet
	 * been written and the presence of B_ERROR means that an error
	 * is still occurring.
	 */
	if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
		bp->b_flags &= ~B_RETRYWRI;
	}

	/* Check for anomalous conditions */
	if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
		if (bp->b_flags & B_NOCACHE) {
			/* Don't add to the freelist. Destroy it now */
			kmem_free(bp->b_un.b_addr, bp->b_bufsize);
			sema_destroy(&bp->b_sem);
			sema_destroy(&bp->b_io);
			kmem_free(bp, sizeof (struct buf));
			return;
		}
		/*
		 * If a write failed and we are supposed to retry write,
		 * don't toss the buffer.  Keep it around and mark it
		 * delayed write in the hopes that it will eventually
		 * get flushed (and still keep the system running.)
		 */
		if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
			bp->b_flags |= B_DELWRI;
			/* keep fsflush from trying continuously to flush */
			bp->b_start = lbolt;
		} else
			bp->b_flags |= B_AGE|B_STALE;
		bp->b_flags &= ~B_ERROR;
		bp->b_error = 0;
	}

	/*
	 * If delayed write is set then put it on the delayed
	 * write list instead of the free buffer list.
	 */
	index = bio_bhash(bp->b_edev, bp->b_blkno);
	hmp   = &hbuf[index].b_lock;

	mutex_enter(hmp);
	hp = &hbuf[index];
	dp = (struct buf *)hp;

	/*
	 * Make sure that the number of entries on this list are
	 * Zero <= count <= total # buffers
	 */
	ASSERT(hp->b_length >= 0);
	ASSERT(hp->b_length < nbuf);

	hp->b_length++;		/* We are adding this buffer */

	if (bp->b_flags & B_DELWRI) {
		/*
		 * This buffer goes on the delayed write buffer list
		 */
		dp = (struct buf *)&dwbuf[index];
	}
	ASSERT(bp->b_bufsize > 0);
	ASSERT(bp->b_bcount > 0);
	ASSERT(bp->b_un.b_addr != NULL);

	/*
	 * Insert into the doubly-linked av_forw/av_back list headed
	 * by dp: B_AGE buffers go at the head (av_forw side), all
	 * others at the tail (av_back side).
	 */
	if (bp->b_flags & B_AGE) {
		backp = &dp->av_forw;
		(*backp)->av_back = bp;
		bp->av_forw = *backp;
		*backp = bp;
		bp->av_back = dp;
	} else {
		backp = &dp->av_back;
		(*backp)->av_forw = bp;
		bp->av_back = *backp;
		*backp = bp;
		bp->av_forw = dp;
	}
	mutex_exit(hmp);

	if (bfreelist.b_flags & B_WANTED) {
		/*
		 * Should come here very very rarely.
		 * Recheck B_WANTED under bfree_lock (double-checked
		 * test) before waking up waiters for buffer memory.
		 */
		mutex_enter(&bfree_lock);
		if (bfreelist.b_flags & B_WANTED) {
			bfreelist.b_flags &= ~B_WANTED;
			cv_broadcast(&bio_mem_cv);
		}
		mutex_exit(&bfree_lock);
	}

	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
	/*
	 * Don't let anyone get the buffer off the freelist before we
	 * release our hold on it.
	 */
	sema_v(&bp->b_sem);
}
518*7c478bd9Sstevel@tonic-gate 
519*7c478bd9Sstevel@tonic-gate /*
520*7c478bd9Sstevel@tonic-gate  * Return a count of the number of B_BUSY buffers in the system
521*7c478bd9Sstevel@tonic-gate  * Can only be used as a good estimate.  If 'cleanit' is set,
522*7c478bd9Sstevel@tonic-gate  * try to flush all bufs.
523*7c478bd9Sstevel@tonic-gate  */
524*7c478bd9Sstevel@tonic-gate int
525*7c478bd9Sstevel@tonic-gate bio_busy(int cleanit)
526*7c478bd9Sstevel@tonic-gate {
527*7c478bd9Sstevel@tonic-gate 	struct buf *bp, *dp;
528*7c478bd9Sstevel@tonic-gate 	int busy = 0;
529*7c478bd9Sstevel@tonic-gate 	int i;
530*7c478bd9Sstevel@tonic-gate 	kmutex_t *hmp;
531*7c478bd9Sstevel@tonic-gate 
532*7c478bd9Sstevel@tonic-gate 	for (i = 0; i < v.v_hbuf; i++) {
533*7c478bd9Sstevel@tonic-gate 		vfs_syncprogress();
534*7c478bd9Sstevel@tonic-gate 		dp = (struct buf *)&hbuf[i];
535*7c478bd9Sstevel@tonic-gate 		hmp = &hbuf[i].b_lock;
536*7c478bd9Sstevel@tonic-gate 
537*7c478bd9Sstevel@tonic-gate 		mutex_enter(hmp);
538*7c478bd9Sstevel@tonic-gate 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
539*7c478bd9Sstevel@tonic-gate 			if (bp->b_flags & B_BUSY)
540*7c478bd9Sstevel@tonic-gate 				busy++;
541*7c478bd9Sstevel@tonic-gate 		}
542*7c478bd9Sstevel@tonic-gate 		mutex_exit(hmp);
543*7c478bd9Sstevel@tonic-gate 	}
544*7c478bd9Sstevel@tonic-gate 
545*7c478bd9Sstevel@tonic-gate 	if (cleanit && busy != 0) {
546*7c478bd9Sstevel@tonic-gate 		bflush(NODEV);
547*7c478bd9Sstevel@tonic-gate 	}
548*7c478bd9Sstevel@tonic-gate 
549*7c478bd9Sstevel@tonic-gate 	return (busy);
550*7c478bd9Sstevel@tonic-gate }
551*7c478bd9Sstevel@tonic-gate 
552*7c478bd9Sstevel@tonic-gate /*
553*7c478bd9Sstevel@tonic-gate  * this interface is provided for binary compatibility.
554*7c478bd9Sstevel@tonic-gate  *
555*7c478bd9Sstevel@tonic-gate  * Assign a buffer for the given block.  If the appropriate
556*7c478bd9Sstevel@tonic-gate  * block is already associated, return it; otherwise search
557*7c478bd9Sstevel@tonic-gate  * for the oldest non-busy buffer and reassign it.
558*7c478bd9Sstevel@tonic-gate  */
559*7c478bd9Sstevel@tonic-gate struct buf *
560*7c478bd9Sstevel@tonic-gate getblk(dev_t dev, daddr_t blkno, long bsize)
561*7c478bd9Sstevel@tonic-gate {
562*7c478bd9Sstevel@tonic-gate 	return (getblk_common(/* ufsvfsp */ NULL, dev,
563*7c478bd9Sstevel@tonic-gate 			blkno, bsize, /* errflg */ 0));
564*7c478bd9Sstevel@tonic-gate }
565*7c478bd9Sstevel@tonic-gate 
566*7c478bd9Sstevel@tonic-gate /*
567*7c478bd9Sstevel@tonic-gate  * Assign a buffer for the given block.  If the appropriate
568*7c478bd9Sstevel@tonic-gate  * block is already associated, return it; otherwise search
569*7c478bd9Sstevel@tonic-gate  * for the oldest non-busy buffer and reassign it.
570*7c478bd9Sstevel@tonic-gate  */
571*7c478bd9Sstevel@tonic-gate struct buf *
572*7c478bd9Sstevel@tonic-gate getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, int errflg)
573*7c478bd9Sstevel@tonic-gate {
574*7c478bd9Sstevel@tonic-gate 	ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
575*7c478bd9Sstevel@tonic-gate 	struct buf *bp;
576*7c478bd9Sstevel@tonic-gate 	struct buf *dp;
577*7c478bd9Sstevel@tonic-gate 	struct buf *nbp = NULL;
578*7c478bd9Sstevel@tonic-gate 	struct buf *errbp;
579*7c478bd9Sstevel@tonic-gate 	uint_t		index;
580*7c478bd9Sstevel@tonic-gate 	kmutex_t	*hmp;
581*7c478bd9Sstevel@tonic-gate 	struct	hbuf	*hp;
582*7c478bd9Sstevel@tonic-gate 
583*7c478bd9Sstevel@tonic-gate 	if (getmajor(dev) >= devcnt)
584*7c478bd9Sstevel@tonic-gate 		cmn_err(CE_PANIC, "blkdev");
585*7c478bd9Sstevel@tonic-gate 
586*7c478bd9Sstevel@tonic-gate 	biostats.bio_lookup.value.ui32++;
587*7c478bd9Sstevel@tonic-gate 
588*7c478bd9Sstevel@tonic-gate 	index = bio_bhash(dev, blkno);
589*7c478bd9Sstevel@tonic-gate 	hp    = &hbuf[index];
590*7c478bd9Sstevel@tonic-gate 	dp    = (struct buf *)hp;
591*7c478bd9Sstevel@tonic-gate 	hmp   = &hp->b_lock;
592*7c478bd9Sstevel@tonic-gate 
593*7c478bd9Sstevel@tonic-gate 	mutex_enter(hmp);
594*7c478bd9Sstevel@tonic-gate loop:
595*7c478bd9Sstevel@tonic-gate 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
596*7c478bd9Sstevel@tonic-gate 		if (bp->b_blkno != blkno || bp->b_edev != dev ||
597*7c478bd9Sstevel@tonic-gate 		    (bp->b_flags & B_STALE))
598*7c478bd9Sstevel@tonic-gate 			continue;
599*7c478bd9Sstevel@tonic-gate 		/*
600*7c478bd9Sstevel@tonic-gate 		 * Avoid holding the hash lock in the event that
601*7c478bd9Sstevel@tonic-gate 		 * the buffer is locked by someone. Since the hash chain
602*7c478bd9Sstevel@tonic-gate 		 * may change when we drop the hash lock
603*7c478bd9Sstevel@tonic-gate 		 * we have to start at the beginning of the chain if the
604*7c478bd9Sstevel@tonic-gate 		 * buffer identity/contents aren't valid.
605*7c478bd9Sstevel@tonic-gate 		 */
606*7c478bd9Sstevel@tonic-gate 		if (!sema_tryp(&bp->b_sem)) {
607*7c478bd9Sstevel@tonic-gate 			biostats.bio_bufbusy.value.ui32++;
608*7c478bd9Sstevel@tonic-gate 			mutex_exit(hmp);
609*7c478bd9Sstevel@tonic-gate 			/*
610*7c478bd9Sstevel@tonic-gate 			 * OK, we are dealing with a busy buffer.
611*7c478bd9Sstevel@tonic-gate 			 * In the case that we are panicking and we
612*7c478bd9Sstevel@tonic-gate 			 * got called from bread(), we have some chance
613*7c478bd9Sstevel@tonic-gate 			 * for error recovery. So better bail out from
614*7c478bd9Sstevel@tonic-gate 			 * here since sema_p() won't block. If we got
615*7c478bd9Sstevel@tonic-gate 			 * called directly from ufs routines, there is
616*7c478bd9Sstevel@tonic-gate 			 * no way to report an error yet.
617*7c478bd9Sstevel@tonic-gate 			 */
618*7c478bd9Sstevel@tonic-gate 			if (panicstr && errflg)
619*7c478bd9Sstevel@tonic-gate 				goto errout;
620*7c478bd9Sstevel@tonic-gate 			/*
621*7c478bd9Sstevel@tonic-gate 			 * For the following line of code to work
622*7c478bd9Sstevel@tonic-gate 			 * correctly never kmem_free the buffer "header".
623*7c478bd9Sstevel@tonic-gate 			 */
624*7c478bd9Sstevel@tonic-gate 			sema_p(&bp->b_sem);
625*7c478bd9Sstevel@tonic-gate 			if (bp->b_blkno != blkno || bp->b_edev != dev ||
626*7c478bd9Sstevel@tonic-gate 			    (bp->b_flags & B_STALE)) {
627*7c478bd9Sstevel@tonic-gate 				sema_v(&bp->b_sem);
628*7c478bd9Sstevel@tonic-gate 				mutex_enter(hmp);
629*7c478bd9Sstevel@tonic-gate 				goto loop;	/* start over */
630*7c478bd9Sstevel@tonic-gate 			}
631*7c478bd9Sstevel@tonic-gate 			mutex_enter(hmp);
632*7c478bd9Sstevel@tonic-gate 		}
633*7c478bd9Sstevel@tonic-gate 		/* Found */
634*7c478bd9Sstevel@tonic-gate 		biostats.bio_hit.value.ui32++;
635*7c478bd9Sstevel@tonic-gate 		bp->b_flags &= ~B_AGE;
636*7c478bd9Sstevel@tonic-gate 
637*7c478bd9Sstevel@tonic-gate 		/*
638*7c478bd9Sstevel@tonic-gate 		 * Yank it off the free/delayed write lists
639*7c478bd9Sstevel@tonic-gate 		 */
640*7c478bd9Sstevel@tonic-gate 		hp->b_length--;
641*7c478bd9Sstevel@tonic-gate 		notavail(bp);
642*7c478bd9Sstevel@tonic-gate 		mutex_exit(hmp);
643*7c478bd9Sstevel@tonic-gate 
644*7c478bd9Sstevel@tonic-gate 		ASSERT((bp->b_flags & B_NOCACHE) == NULL);
645*7c478bd9Sstevel@tonic-gate 
646*7c478bd9Sstevel@tonic-gate 		if (nbp == NULL) {
647*7c478bd9Sstevel@tonic-gate 			/*
648*7c478bd9Sstevel@tonic-gate 			 * Make the common path short.
649*7c478bd9Sstevel@tonic-gate 			 */
650*7c478bd9Sstevel@tonic-gate 			ASSERT(SEMA_HELD(&bp->b_sem));
651*7c478bd9Sstevel@tonic-gate 			return (bp);
652*7c478bd9Sstevel@tonic-gate 		}
653*7c478bd9Sstevel@tonic-gate 
654*7c478bd9Sstevel@tonic-gate 		biostats.bio_bufdup.value.ui32++;
655*7c478bd9Sstevel@tonic-gate 
656*7c478bd9Sstevel@tonic-gate 		/*
657*7c478bd9Sstevel@tonic-gate 		 * The buffer must have entered during the lock upgrade
658*7c478bd9Sstevel@tonic-gate 		 * so free the new buffer we allocated and return the
659*7c478bd9Sstevel@tonic-gate 		 * found buffer.
660*7c478bd9Sstevel@tonic-gate 		 */
661*7c478bd9Sstevel@tonic-gate 		kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
662*7c478bd9Sstevel@tonic-gate 		nbp->b_un.b_addr = NULL;
663*7c478bd9Sstevel@tonic-gate 
664*7c478bd9Sstevel@tonic-gate 		/*
665*7c478bd9Sstevel@tonic-gate 		 * Account for the memory
666*7c478bd9Sstevel@tonic-gate 		 */
667*7c478bd9Sstevel@tonic-gate 		mutex_enter(&bfree_lock);
668*7c478bd9Sstevel@tonic-gate 		bfreelist.b_bufsize += nbp->b_bufsize;
669*7c478bd9Sstevel@tonic-gate 		mutex_exit(&bfree_lock);
670*7c478bd9Sstevel@tonic-gate 
671*7c478bd9Sstevel@tonic-gate 		/*
672*7c478bd9Sstevel@tonic-gate 		 * Destroy buf identity, and place on avail list
673*7c478bd9Sstevel@tonic-gate 		 */
674*7c478bd9Sstevel@tonic-gate 		nbp->b_dev = (o_dev_t)NODEV;
675*7c478bd9Sstevel@tonic-gate 		nbp->b_edev = NODEV;
676*7c478bd9Sstevel@tonic-gate 		nbp->b_flags = 0;
677*7c478bd9Sstevel@tonic-gate 		nbp->b_file = NULL;
678*7c478bd9Sstevel@tonic-gate 		nbp->b_offset = -1;
679*7c478bd9Sstevel@tonic-gate 
680*7c478bd9Sstevel@tonic-gate 		sema_v(&nbp->b_sem);
681*7c478bd9Sstevel@tonic-gate 		bio_bhdr_free(nbp);
682*7c478bd9Sstevel@tonic-gate 
683*7c478bd9Sstevel@tonic-gate 		ASSERT(SEMA_HELD(&bp->b_sem));
684*7c478bd9Sstevel@tonic-gate 		return (bp);
685*7c478bd9Sstevel@tonic-gate 	}
686*7c478bd9Sstevel@tonic-gate 
687*7c478bd9Sstevel@tonic-gate 	/*
688*7c478bd9Sstevel@tonic-gate 	 * bio_getfreeblk may block so check the hash chain again.
689*7c478bd9Sstevel@tonic-gate 	 */
690*7c478bd9Sstevel@tonic-gate 	if (nbp == NULL) {
691*7c478bd9Sstevel@tonic-gate 		mutex_exit(hmp);
692*7c478bd9Sstevel@tonic-gate 		nbp = bio_getfreeblk(bsize);
693*7c478bd9Sstevel@tonic-gate 		mutex_enter(hmp);
694*7c478bd9Sstevel@tonic-gate 		goto loop;
695*7c478bd9Sstevel@tonic-gate 	}
696*7c478bd9Sstevel@tonic-gate 
697*7c478bd9Sstevel@tonic-gate 	/*
698*7c478bd9Sstevel@tonic-gate 	 * New buffer. Assign nbp and stick it on the hash.
699*7c478bd9Sstevel@tonic-gate 	 */
700*7c478bd9Sstevel@tonic-gate 	nbp->b_flags = B_BUSY;
701*7c478bd9Sstevel@tonic-gate 	nbp->b_edev = dev;
702*7c478bd9Sstevel@tonic-gate 	nbp->b_dev = (o_dev_t)cmpdev(dev);
703*7c478bd9Sstevel@tonic-gate 	nbp->b_blkno = blkno;
704*7c478bd9Sstevel@tonic-gate 	nbp->b_iodone = NULL;
705*7c478bd9Sstevel@tonic-gate 	nbp->b_bcount = bsize;
706*7c478bd9Sstevel@tonic-gate 	/*
707*7c478bd9Sstevel@tonic-gate 	 * If we are given a ufsvfsp and the vfs_root field is NULL
708*7c478bd9Sstevel@tonic-gate 	 * then this must be I/O for a superblock.  A superblock's
709*7c478bd9Sstevel@tonic-gate 	 * buffer is set up in mountfs() and there is no root vnode
710*7c478bd9Sstevel@tonic-gate 	 * at that point.
711*7c478bd9Sstevel@tonic-gate 	 */
712*7c478bd9Sstevel@tonic-gate 	if (ufsvfsp && ufsvfsp->vfs_root) {
713*7c478bd9Sstevel@tonic-gate 		nbp->b_vp = ufsvfsp->vfs_root;
714*7c478bd9Sstevel@tonic-gate 	} else {
715*7c478bd9Sstevel@tonic-gate 		nbp->b_vp = NULL;
716*7c478bd9Sstevel@tonic-gate 	}
717*7c478bd9Sstevel@tonic-gate 
718*7c478bd9Sstevel@tonic-gate 	ASSERT((nbp->b_flags & B_NOCACHE) == NULL);
719*7c478bd9Sstevel@tonic-gate 
720*7c478bd9Sstevel@tonic-gate 	binshash(nbp, dp);
721*7c478bd9Sstevel@tonic-gate 	mutex_exit(hmp);
722*7c478bd9Sstevel@tonic-gate 
723*7c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&nbp->b_sem));
724*7c478bd9Sstevel@tonic-gate 
725*7c478bd9Sstevel@tonic-gate 	return (nbp);
726*7c478bd9Sstevel@tonic-gate 
727*7c478bd9Sstevel@tonic-gate 
728*7c478bd9Sstevel@tonic-gate 	/*
729*7c478bd9Sstevel@tonic-gate 	 * Come here in case of an internal error. At this point we couldn't
730*7c478bd9Sstevel@tonic-gate 	 * get a buffer, but he have to return one. Hence we allocate some
731*7c478bd9Sstevel@tonic-gate 	 * kind of error reply buffer on the fly. This buffer is marked as
732*7c478bd9Sstevel@tonic-gate 	 * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
733*7c478bd9Sstevel@tonic-gate 	 *	- B_ERROR will indicate error to the caller.
734*7c478bd9Sstevel@tonic-gate 	 *	- B_DONE will prevent us from reading the buffer from
735*7c478bd9Sstevel@tonic-gate 	 *	  the device.
736*7c478bd9Sstevel@tonic-gate 	 *	- B_NOCACHE will cause that this buffer gets free'd in
737*7c478bd9Sstevel@tonic-gate 	 *	  brelse().
738*7c478bd9Sstevel@tonic-gate 	 */
739*7c478bd9Sstevel@tonic-gate 
740*7c478bd9Sstevel@tonic-gate errout:
741*7c478bd9Sstevel@tonic-gate 	errbp = geteblk();
742*7c478bd9Sstevel@tonic-gate 	sema_p(&errbp->b_sem);
743*7c478bd9Sstevel@tonic-gate 	errbp->b_flags &= ~B_BUSY;
744*7c478bd9Sstevel@tonic-gate 	errbp->b_flags |= (B_ERROR | B_DONE);
745*7c478bd9Sstevel@tonic-gate 	return (errbp);
746*7c478bd9Sstevel@tonic-gate }
747*7c478bd9Sstevel@tonic-gate 
748*7c478bd9Sstevel@tonic-gate /*
749*7c478bd9Sstevel@tonic-gate  * Get an empty block, not assigned to any particular device.
750*7c478bd9Sstevel@tonic-gate  * Returns a locked buffer that is not on any hash or free list.
751*7c478bd9Sstevel@tonic-gate  */
752*7c478bd9Sstevel@tonic-gate struct buf *
753*7c478bd9Sstevel@tonic-gate ngeteblk(long bsize)
754*7c478bd9Sstevel@tonic-gate {
755*7c478bd9Sstevel@tonic-gate 	struct buf *bp;
756*7c478bd9Sstevel@tonic-gate 
757*7c478bd9Sstevel@tonic-gate 	bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
758*7c478bd9Sstevel@tonic-gate 	bioinit(bp);
759*7c478bd9Sstevel@tonic-gate 	bp->av_forw = bp->av_back = NULL;
760*7c478bd9Sstevel@tonic-gate 	bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
761*7c478bd9Sstevel@tonic-gate 	bp->b_bufsize = bsize;
762*7c478bd9Sstevel@tonic-gate 	bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
763*7c478bd9Sstevel@tonic-gate 	bp->b_dev = (o_dev_t)NODEV;
764*7c478bd9Sstevel@tonic-gate 	bp->b_edev = NODEV;
765*7c478bd9Sstevel@tonic-gate 	bp->b_lblkno = 0;
766*7c478bd9Sstevel@tonic-gate 	bp->b_bcount = bsize;
767*7c478bd9Sstevel@tonic-gate 	bp->b_iodone = NULL;
768*7c478bd9Sstevel@tonic-gate 	return (bp);
769*7c478bd9Sstevel@tonic-gate }
770*7c478bd9Sstevel@tonic-gate 
771*7c478bd9Sstevel@tonic-gate /*
772*7c478bd9Sstevel@tonic-gate  * Interface of geteblk() is kept intact to maintain driver compatibility.
773*7c478bd9Sstevel@tonic-gate  * Use ngeteblk() to allocate block size other than 1 KB.
774*7c478bd9Sstevel@tonic-gate  */
775*7c478bd9Sstevel@tonic-gate struct buf *
776*7c478bd9Sstevel@tonic-gate geteblk(void)
777*7c478bd9Sstevel@tonic-gate {
778*7c478bd9Sstevel@tonic-gate 	return (ngeteblk((long)1024));
779*7c478bd9Sstevel@tonic-gate }
780*7c478bd9Sstevel@tonic-gate 
781*7c478bd9Sstevel@tonic-gate /*
782*7c478bd9Sstevel@tonic-gate  * Return a buffer w/o sleeping
783*7c478bd9Sstevel@tonic-gate  */
784*7c478bd9Sstevel@tonic-gate struct buf *
785*7c478bd9Sstevel@tonic-gate trygetblk(dev_t dev, daddr_t blkno)
786*7c478bd9Sstevel@tonic-gate {
787*7c478bd9Sstevel@tonic-gate 	struct buf	*bp;
788*7c478bd9Sstevel@tonic-gate 	struct buf	*dp;
789*7c478bd9Sstevel@tonic-gate 	struct hbuf	*hp;
790*7c478bd9Sstevel@tonic-gate 	kmutex_t	*hmp;
791*7c478bd9Sstevel@tonic-gate 	uint_t		index;
792*7c478bd9Sstevel@tonic-gate 
793*7c478bd9Sstevel@tonic-gate 	index = bio_bhash(dev, blkno);
794*7c478bd9Sstevel@tonic-gate 	hp = &hbuf[index];
795*7c478bd9Sstevel@tonic-gate 	hmp = &hp->b_lock;
796*7c478bd9Sstevel@tonic-gate 
797*7c478bd9Sstevel@tonic-gate 	if (!mutex_tryenter(hmp))
798*7c478bd9Sstevel@tonic-gate 		return (NULL);
799*7c478bd9Sstevel@tonic-gate 
800*7c478bd9Sstevel@tonic-gate 	dp = (struct buf *)hp;
801*7c478bd9Sstevel@tonic-gate 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
802*7c478bd9Sstevel@tonic-gate 		if (bp->b_blkno != blkno || bp->b_edev != dev ||
803*7c478bd9Sstevel@tonic-gate 		    (bp->b_flags & B_STALE))
804*7c478bd9Sstevel@tonic-gate 			continue;
805*7c478bd9Sstevel@tonic-gate 		/*
806*7c478bd9Sstevel@tonic-gate 		 * Get access to a valid buffer without sleeping
807*7c478bd9Sstevel@tonic-gate 		 */
808*7c478bd9Sstevel@tonic-gate 		if (sema_tryp(&bp->b_sem)) {
809*7c478bd9Sstevel@tonic-gate 			if (bp->b_flags & B_DONE) {
810*7c478bd9Sstevel@tonic-gate 				hp->b_length--;
811*7c478bd9Sstevel@tonic-gate 				notavail(bp);
812*7c478bd9Sstevel@tonic-gate 				mutex_exit(hmp);
813*7c478bd9Sstevel@tonic-gate 				return (bp);
814*7c478bd9Sstevel@tonic-gate 			} else {
815*7c478bd9Sstevel@tonic-gate 				sema_v(&bp->b_sem);
816*7c478bd9Sstevel@tonic-gate 				break;
817*7c478bd9Sstevel@tonic-gate 			}
818*7c478bd9Sstevel@tonic-gate 		}
819*7c478bd9Sstevel@tonic-gate 		break;
820*7c478bd9Sstevel@tonic-gate 	}
821*7c478bd9Sstevel@tonic-gate 	mutex_exit(hmp);
822*7c478bd9Sstevel@tonic-gate 	return (NULL);
823*7c478bd9Sstevel@tonic-gate }
824*7c478bd9Sstevel@tonic-gate 
825*7c478bd9Sstevel@tonic-gate /*
826*7c478bd9Sstevel@tonic-gate  * Wait for I/O completion on the buffer; return errors
827*7c478bd9Sstevel@tonic-gate  * to the user.
828*7c478bd9Sstevel@tonic-gate  */
829*7c478bd9Sstevel@tonic-gate int
830*7c478bd9Sstevel@tonic-gate iowait(struct buf *bp)
831*7c478bd9Sstevel@tonic-gate {
832*7c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
833*7c478bd9Sstevel@tonic-gate 	return (biowait(bp));
834*7c478bd9Sstevel@tonic-gate }
835*7c478bd9Sstevel@tonic-gate 
836*7c478bd9Sstevel@tonic-gate /*
837*7c478bd9Sstevel@tonic-gate  * Mark I/O complete on a buffer, release it if I/O is asynchronous,
838*7c478bd9Sstevel@tonic-gate  * and wake up anyone waiting for it.
839*7c478bd9Sstevel@tonic-gate  */
840*7c478bd9Sstevel@tonic-gate void
841*7c478bd9Sstevel@tonic-gate iodone(struct buf *bp)
842*7c478bd9Sstevel@tonic-gate {
843*7c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
844*7c478bd9Sstevel@tonic-gate 	(void) biodone(bp);
845*7c478bd9Sstevel@tonic-gate }
846*7c478bd9Sstevel@tonic-gate 
847*7c478bd9Sstevel@tonic-gate /*
848*7c478bd9Sstevel@tonic-gate  * Zero the core associated with a buffer.
849*7c478bd9Sstevel@tonic-gate  */
850*7c478bd9Sstevel@tonic-gate void
851*7c478bd9Sstevel@tonic-gate clrbuf(struct buf *bp)
852*7c478bd9Sstevel@tonic-gate {
853*7c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
854*7c478bd9Sstevel@tonic-gate 	bzero(bp->b_un.b_addr, bp->b_bcount);
855*7c478bd9Sstevel@tonic-gate 	bp->b_resid = 0;
856*7c478bd9Sstevel@tonic-gate }
857*7c478bd9Sstevel@tonic-gate 
858*7c478bd9Sstevel@tonic-gate 
859*7c478bd9Sstevel@tonic-gate /*
860*7c478bd9Sstevel@tonic-gate  * Make sure all write-behind blocks on dev (or NODEV for all)
861*7c478bd9Sstevel@tonic-gate  * are flushed out.
862*7c478bd9Sstevel@tonic-gate  */
863*7c478bd9Sstevel@tonic-gate void
864*7c478bd9Sstevel@tonic-gate bflush(dev_t dev)
865*7c478bd9Sstevel@tonic-gate {
866*7c478bd9Sstevel@tonic-gate 	struct buf *bp, *dp;
867*7c478bd9Sstevel@tonic-gate 	struct hbuf *hp;
868*7c478bd9Sstevel@tonic-gate 	struct buf *delwri_list = EMPTY_LIST;
869*7c478bd9Sstevel@tonic-gate 	int i, index;
870*7c478bd9Sstevel@tonic-gate 	kmutex_t *hmp;
871*7c478bd9Sstevel@tonic-gate 
872*7c478bd9Sstevel@tonic-gate 	mutex_enter(&blist_lock);
873*7c478bd9Sstevel@tonic-gate 	/*
874*7c478bd9Sstevel@tonic-gate 	 * Wait for any invalidates or flushes ahead of us to finish.
875*7c478bd9Sstevel@tonic-gate 	 * We really could split blist_lock up per device for better
876*7c478bd9Sstevel@tonic-gate 	 * parallelism here.
877*7c478bd9Sstevel@tonic-gate 	 */
878*7c478bd9Sstevel@tonic-gate 	while (bio_doinginval || bio_doingflush) {
879*7c478bd9Sstevel@tonic-gate 		bio_flinv_cv_wanted = 1;
880*7c478bd9Sstevel@tonic-gate 		cv_wait(&bio_flushinval_cv, &blist_lock);
881*7c478bd9Sstevel@tonic-gate 	}
882*7c478bd9Sstevel@tonic-gate 	bio_doingflush++;
883*7c478bd9Sstevel@tonic-gate 	/*
884*7c478bd9Sstevel@tonic-gate 	 * Gather all B_DELWRI buffer for device.
885*7c478bd9Sstevel@tonic-gate 	 * Lock ordering is b_sem > hash lock (brelse).
886*7c478bd9Sstevel@tonic-gate 	 * Since we are finding the buffer via the delayed write list,
887*7c478bd9Sstevel@tonic-gate 	 * it may be busy and we would block trying to get the
888*7c478bd9Sstevel@tonic-gate 	 * b_sem lock while holding hash lock. So transfer all the
889*7c478bd9Sstevel@tonic-gate 	 * candidates on the delwri_list and then drop the hash locks.
890*7c478bd9Sstevel@tonic-gate 	 */
891*7c478bd9Sstevel@tonic-gate 	for (i = 0; i < v.v_hbuf; i++) {
892*7c478bd9Sstevel@tonic-gate 		vfs_syncprogress();
893*7c478bd9Sstevel@tonic-gate 		hmp = &hbuf[i].b_lock;
894*7c478bd9Sstevel@tonic-gate 		dp = (struct buf *)&dwbuf[i];
895*7c478bd9Sstevel@tonic-gate 		mutex_enter(hmp);
896*7c478bd9Sstevel@tonic-gate 		for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
897*7c478bd9Sstevel@tonic-gate 			if (dev == NODEV || bp->b_edev == dev) {
898*7c478bd9Sstevel@tonic-gate 				if (bp->b_list == NULL) {
899*7c478bd9Sstevel@tonic-gate 					bp->b_list = delwri_list;
900*7c478bd9Sstevel@tonic-gate 					delwri_list = bp;
901*7c478bd9Sstevel@tonic-gate 				}
902*7c478bd9Sstevel@tonic-gate 			}
903*7c478bd9Sstevel@tonic-gate 		}
904*7c478bd9Sstevel@tonic-gate 		mutex_exit(hmp);
905*7c478bd9Sstevel@tonic-gate 	}
906*7c478bd9Sstevel@tonic-gate 	mutex_exit(&blist_lock);
907*7c478bd9Sstevel@tonic-gate 
908*7c478bd9Sstevel@tonic-gate 	/*
909*7c478bd9Sstevel@tonic-gate 	 * Now that the hash locks have been dropped grab the semaphores
910*7c478bd9Sstevel@tonic-gate 	 * and write back all the buffers that have B_DELWRI set.
911*7c478bd9Sstevel@tonic-gate 	 */
912*7c478bd9Sstevel@tonic-gate 	while (delwri_list != EMPTY_LIST) {
913*7c478bd9Sstevel@tonic-gate 		vfs_syncprogress();
914*7c478bd9Sstevel@tonic-gate 		bp = delwri_list;
915*7c478bd9Sstevel@tonic-gate 
916*7c478bd9Sstevel@tonic-gate 		sema_p(&bp->b_sem);	/* may block */
917*7c478bd9Sstevel@tonic-gate 		if ((dev != bp->b_edev && dev != NODEV) ||
918*7c478bd9Sstevel@tonic-gate 		    (panicstr && bp->b_flags & B_BUSY)) {
919*7c478bd9Sstevel@tonic-gate 			sema_v(&bp->b_sem);
920*7c478bd9Sstevel@tonic-gate 			delwri_list = bp->b_list;
921*7c478bd9Sstevel@tonic-gate 			bp->b_list = NULL;
922*7c478bd9Sstevel@tonic-gate 			continue;	/* No longer a candidate */
923*7c478bd9Sstevel@tonic-gate 		}
924*7c478bd9Sstevel@tonic-gate 		if (bp->b_flags & B_DELWRI) {
925*7c478bd9Sstevel@tonic-gate 			index = bio_bhash(bp->b_edev, bp->b_blkno);
926*7c478bd9Sstevel@tonic-gate 			hp = &hbuf[index];
927*7c478bd9Sstevel@tonic-gate 			hmp = &hp->b_lock;
928*7c478bd9Sstevel@tonic-gate 			dp = (struct buf *)hp;
929*7c478bd9Sstevel@tonic-gate 
930*7c478bd9Sstevel@tonic-gate 			bp->b_flags |= B_ASYNC;
931*7c478bd9Sstevel@tonic-gate 			mutex_enter(hmp);
932*7c478bd9Sstevel@tonic-gate 			hp->b_length--;
933*7c478bd9Sstevel@tonic-gate 			notavail(bp);
934*7c478bd9Sstevel@tonic-gate 			mutex_exit(hmp);
935*7c478bd9Sstevel@tonic-gate 			if (bp->b_vp == NULL) {		/* !ufs */
936*7c478bd9Sstevel@tonic-gate 				BWRITE(bp);
937*7c478bd9Sstevel@tonic-gate 			} else {			/* ufs */
938*7c478bd9Sstevel@tonic-gate 				UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
939*7c478bd9Sstevel@tonic-gate 			}
940*7c478bd9Sstevel@tonic-gate 		} else {
941*7c478bd9Sstevel@tonic-gate 			sema_v(&bp->b_sem);
942*7c478bd9Sstevel@tonic-gate 		}
943*7c478bd9Sstevel@tonic-gate 		delwri_list = bp->b_list;
944*7c478bd9Sstevel@tonic-gate 		bp->b_list = NULL;
945*7c478bd9Sstevel@tonic-gate 	}
946*7c478bd9Sstevel@tonic-gate 	mutex_enter(&blist_lock);
947*7c478bd9Sstevel@tonic-gate 	bio_doingflush--;
948*7c478bd9Sstevel@tonic-gate 	if (bio_flinv_cv_wanted) {
949*7c478bd9Sstevel@tonic-gate 		bio_flinv_cv_wanted = 0;
950*7c478bd9Sstevel@tonic-gate 		cv_broadcast(&bio_flushinval_cv);
951*7c478bd9Sstevel@tonic-gate 	}
952*7c478bd9Sstevel@tonic-gate 	mutex_exit(&blist_lock);
953*7c478bd9Sstevel@tonic-gate }
954*7c478bd9Sstevel@tonic-gate 
955*7c478bd9Sstevel@tonic-gate /*
956*7c478bd9Sstevel@tonic-gate  * Ensure that a specified block is up-to-date on disk.
957*7c478bd9Sstevel@tonic-gate  */
958*7c478bd9Sstevel@tonic-gate void
959*7c478bd9Sstevel@tonic-gate blkflush(dev_t dev, daddr_t blkno)
960*7c478bd9Sstevel@tonic-gate {
961*7c478bd9Sstevel@tonic-gate 	struct buf *bp, *dp;
962*7c478bd9Sstevel@tonic-gate 	struct hbuf *hp;
963*7c478bd9Sstevel@tonic-gate 	struct buf *sbp = NULL;
964*7c478bd9Sstevel@tonic-gate 	uint_t index;
965*7c478bd9Sstevel@tonic-gate 	kmutex_t *hmp;
966*7c478bd9Sstevel@tonic-gate 
967*7c478bd9Sstevel@tonic-gate 	index = bio_bhash(dev, blkno);
968*7c478bd9Sstevel@tonic-gate 	hp    = &hbuf[index];
969*7c478bd9Sstevel@tonic-gate 	dp    = (struct buf *)hp;
970*7c478bd9Sstevel@tonic-gate 	hmp   = &hp->b_lock;
971*7c478bd9Sstevel@tonic-gate 
972*7c478bd9Sstevel@tonic-gate 	/*
973*7c478bd9Sstevel@tonic-gate 	 * Identify the buffer in the cache belonging to
974*7c478bd9Sstevel@tonic-gate 	 * this device and blkno (if any).
975*7c478bd9Sstevel@tonic-gate 	 */
976*7c478bd9Sstevel@tonic-gate 	mutex_enter(hmp);
977*7c478bd9Sstevel@tonic-gate 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
978*7c478bd9Sstevel@tonic-gate 		if (bp->b_blkno != blkno || bp->b_edev != dev ||
979*7c478bd9Sstevel@tonic-gate 		    (bp->b_flags & B_STALE))
980*7c478bd9Sstevel@tonic-gate 			continue;
981*7c478bd9Sstevel@tonic-gate 		sbp = bp;
982*7c478bd9Sstevel@tonic-gate 		break;
983*7c478bd9Sstevel@tonic-gate 	}
984*7c478bd9Sstevel@tonic-gate 	mutex_exit(hmp);
985*7c478bd9Sstevel@tonic-gate 	if (sbp == NULL)
986*7c478bd9Sstevel@tonic-gate 		return;
987*7c478bd9Sstevel@tonic-gate 	/*
988*7c478bd9Sstevel@tonic-gate 	 * Now check the buffer we have identified and
989*7c478bd9Sstevel@tonic-gate 	 * make sure it still belongs to the device and is B_DELWRI
990*7c478bd9Sstevel@tonic-gate 	 */
991*7c478bd9Sstevel@tonic-gate 	sema_p(&sbp->b_sem);
992*7c478bd9Sstevel@tonic-gate 	if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
993*7c478bd9Sstevel@tonic-gate 	    (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
994*7c478bd9Sstevel@tonic-gate 		mutex_enter(hmp);
995*7c478bd9Sstevel@tonic-gate 		hp->b_length--;
996*7c478bd9Sstevel@tonic-gate 		notavail(sbp);
997*7c478bd9Sstevel@tonic-gate 		mutex_exit(hmp);
998*7c478bd9Sstevel@tonic-gate 		/*
999*7c478bd9Sstevel@tonic-gate 		 * XXX - There is nothing to guarantee a synchronous
1000*7c478bd9Sstevel@tonic-gate 		 * write here if the B_ASYNC flag is set.  This needs
1001*7c478bd9Sstevel@tonic-gate 		 * some investigation.
1002*7c478bd9Sstevel@tonic-gate 		 */
1003*7c478bd9Sstevel@tonic-gate 		if (sbp->b_vp == NULL) {		/* !ufs */
1004*7c478bd9Sstevel@tonic-gate 			BWRITE(sbp);	/* synchronous write */
1005*7c478bd9Sstevel@tonic-gate 		} else {				/* ufs */
1006*7c478bd9Sstevel@tonic-gate 			UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
1007*7c478bd9Sstevel@tonic-gate 		}
1008*7c478bd9Sstevel@tonic-gate 	} else {
1009*7c478bd9Sstevel@tonic-gate 		sema_v(&sbp->b_sem);
1010*7c478bd9Sstevel@tonic-gate 	}
1011*7c478bd9Sstevel@tonic-gate }
1012*7c478bd9Sstevel@tonic-gate 
1013*7c478bd9Sstevel@tonic-gate /*
1014*7c478bd9Sstevel@tonic-gate  * Same as binval, except can force-invalidate delayed-write buffers
1015*7c478bd9Sstevel@tonic-gate  * (which are not be already flushed because of device errors).  Also
1016*7c478bd9Sstevel@tonic-gate  * makes sure that the retry write flag is cleared.
1017*7c478bd9Sstevel@tonic-gate  */
1018*7c478bd9Sstevel@tonic-gate int
1019*7c478bd9Sstevel@tonic-gate bfinval(dev_t dev, int force)
1020*7c478bd9Sstevel@tonic-gate {
1021*7c478bd9Sstevel@tonic-gate 	struct buf *dp;
1022*7c478bd9Sstevel@tonic-gate 	struct buf *bp;
1023*7c478bd9Sstevel@tonic-gate 	struct buf *binval_list = EMPTY_LIST;
1024*7c478bd9Sstevel@tonic-gate 	int i, error = 0;
1025*7c478bd9Sstevel@tonic-gate 	kmutex_t *hmp;
1026*7c478bd9Sstevel@tonic-gate 	uint_t index;
1027*7c478bd9Sstevel@tonic-gate 	struct buf **backp;
1028*7c478bd9Sstevel@tonic-gate 
1029*7c478bd9Sstevel@tonic-gate 	mutex_enter(&blist_lock);
1030*7c478bd9Sstevel@tonic-gate 	/*
1031*7c478bd9Sstevel@tonic-gate 	 * Wait for any flushes ahead of us to finish, it's ok to
1032*7c478bd9Sstevel@tonic-gate 	 * do invalidates in parallel.
1033*7c478bd9Sstevel@tonic-gate 	 */
1034*7c478bd9Sstevel@tonic-gate 	while (bio_doingflush) {
1035*7c478bd9Sstevel@tonic-gate 		bio_flinv_cv_wanted = 1;
1036*7c478bd9Sstevel@tonic-gate 		cv_wait(&bio_flushinval_cv, &blist_lock);
1037*7c478bd9Sstevel@tonic-gate 	}
1038*7c478bd9Sstevel@tonic-gate 	bio_doinginval++;
1039*7c478bd9Sstevel@tonic-gate 
1040*7c478bd9Sstevel@tonic-gate 	/* Gather bp's */
1041*7c478bd9Sstevel@tonic-gate 	for (i = 0; i < v.v_hbuf; i++) {
1042*7c478bd9Sstevel@tonic-gate 		dp = (struct buf *)&hbuf[i];
1043*7c478bd9Sstevel@tonic-gate 		hmp = &hbuf[i].b_lock;
1044*7c478bd9Sstevel@tonic-gate 
1045*7c478bd9Sstevel@tonic-gate 		mutex_enter(hmp);
1046*7c478bd9Sstevel@tonic-gate 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1047*7c478bd9Sstevel@tonic-gate 			if (bp->b_edev == dev) {
1048*7c478bd9Sstevel@tonic-gate 				if (bp->b_list == NULL) {
1049*7c478bd9Sstevel@tonic-gate 					bp->b_list = binval_list;
1050*7c478bd9Sstevel@tonic-gate 					binval_list = bp;
1051*7c478bd9Sstevel@tonic-gate 				}
1052*7c478bd9Sstevel@tonic-gate 			}
1053*7c478bd9Sstevel@tonic-gate 		}
1054*7c478bd9Sstevel@tonic-gate 		mutex_exit(hmp);
1055*7c478bd9Sstevel@tonic-gate 	}
1056*7c478bd9Sstevel@tonic-gate 	mutex_exit(&blist_lock);
1057*7c478bd9Sstevel@tonic-gate 
1058*7c478bd9Sstevel@tonic-gate 	/* Invalidate all bp's found */
1059*7c478bd9Sstevel@tonic-gate 	while (binval_list != EMPTY_LIST) {
1060*7c478bd9Sstevel@tonic-gate 		bp = binval_list;
1061*7c478bd9Sstevel@tonic-gate 
1062*7c478bd9Sstevel@tonic-gate 		sema_p(&bp->b_sem);
1063*7c478bd9Sstevel@tonic-gate 		if (bp->b_edev == dev) {
1064*7c478bd9Sstevel@tonic-gate 			if (force && (bp->b_flags & B_DELWRI)) {
1065*7c478bd9Sstevel@tonic-gate 				/* clear B_DELWRI, move to non-dw freelist */
1066*7c478bd9Sstevel@tonic-gate 				index = bio_bhash(bp->b_edev, bp->b_blkno);
1067*7c478bd9Sstevel@tonic-gate 				hmp = &hbuf[index].b_lock;
1068*7c478bd9Sstevel@tonic-gate 				dp = (struct buf *)&hbuf[index];
1069*7c478bd9Sstevel@tonic-gate 				mutex_enter(hmp);
1070*7c478bd9Sstevel@tonic-gate 
1071*7c478bd9Sstevel@tonic-gate 				/* remove from delayed write freelist */
1072*7c478bd9Sstevel@tonic-gate 				notavail(bp);
1073*7c478bd9Sstevel@tonic-gate 
1074*7c478bd9Sstevel@tonic-gate 				/* add to B_AGE side of non-dw freelist */
1075*7c478bd9Sstevel@tonic-gate 				backp = &dp->av_forw;
1076*7c478bd9Sstevel@tonic-gate 				(*backp)->av_back = bp;
1077*7c478bd9Sstevel@tonic-gate 				bp->av_forw = *backp;
1078*7c478bd9Sstevel@tonic-gate 				*backp = bp;
1079*7c478bd9Sstevel@tonic-gate 				bp->av_back = dp;
1080*7c478bd9Sstevel@tonic-gate 
1081*7c478bd9Sstevel@tonic-gate 				/*
1082*7c478bd9Sstevel@tonic-gate 				 * make sure write retries and busy are cleared
1083*7c478bd9Sstevel@tonic-gate 				 */
1084*7c478bd9Sstevel@tonic-gate 				bp->b_flags &=
1085*7c478bd9Sstevel@tonic-gate 				    ~(B_BUSY | B_DELWRI | B_RETRYWRI);
1086*7c478bd9Sstevel@tonic-gate 				mutex_exit(hmp);
1087*7c478bd9Sstevel@tonic-gate 			}
1088*7c478bd9Sstevel@tonic-gate 			if ((bp->b_flags & B_DELWRI) == 0)
1089*7c478bd9Sstevel@tonic-gate 				bp->b_flags |= B_STALE|B_AGE;
1090*7c478bd9Sstevel@tonic-gate 			else
1091*7c478bd9Sstevel@tonic-gate 				error = EIO;
1092*7c478bd9Sstevel@tonic-gate 		}
1093*7c478bd9Sstevel@tonic-gate 		sema_v(&bp->b_sem);
1094*7c478bd9Sstevel@tonic-gate 		binval_list = bp->b_list;
1095*7c478bd9Sstevel@tonic-gate 		bp->b_list = NULL;
1096*7c478bd9Sstevel@tonic-gate 	}
1097*7c478bd9Sstevel@tonic-gate 	mutex_enter(&blist_lock);
1098*7c478bd9Sstevel@tonic-gate 	bio_doinginval--;
1099*7c478bd9Sstevel@tonic-gate 	if (bio_flinv_cv_wanted) {
1100*7c478bd9Sstevel@tonic-gate 		cv_broadcast(&bio_flushinval_cv);
1101*7c478bd9Sstevel@tonic-gate 		bio_flinv_cv_wanted = 0;
1102*7c478bd9Sstevel@tonic-gate 	}
1103*7c478bd9Sstevel@tonic-gate 	mutex_exit(&blist_lock);
1104*7c478bd9Sstevel@tonic-gate 	return (error);
1105*7c478bd9Sstevel@tonic-gate }
1106*7c478bd9Sstevel@tonic-gate 
1107*7c478bd9Sstevel@tonic-gate /*
1108*7c478bd9Sstevel@tonic-gate  * If possible, invalidate blocks for a dev on demand
1109*7c478bd9Sstevel@tonic-gate  */
1110*7c478bd9Sstevel@tonic-gate void
1111*7c478bd9Sstevel@tonic-gate binval(dev_t dev)
1112*7c478bd9Sstevel@tonic-gate {
1113*7c478bd9Sstevel@tonic-gate 	(void) bfinval(dev, 0);
1114*7c478bd9Sstevel@tonic-gate }
1115*7c478bd9Sstevel@tonic-gate 
1116*7c478bd9Sstevel@tonic-gate /*
1117*7c478bd9Sstevel@tonic-gate  * Initialize the buffer I/O system by freeing
1118*7c478bd9Sstevel@tonic-gate  * all buffers and setting all device hash buffer lists to empty.
1119*7c478bd9Sstevel@tonic-gate  */
1120*7c478bd9Sstevel@tonic-gate void
1121*7c478bd9Sstevel@tonic-gate binit(void)
1122*7c478bd9Sstevel@tonic-gate {
1123*7c478bd9Sstevel@tonic-gate 	struct buf *bp;
1124*7c478bd9Sstevel@tonic-gate 	unsigned int i, pct;
1125*7c478bd9Sstevel@tonic-gate 	ulong_t	bio_max_hwm, bio_default_hwm;
1126*7c478bd9Sstevel@tonic-gate 
1127*7c478bd9Sstevel@tonic-gate 	/*
1128*7c478bd9Sstevel@tonic-gate 	 * Maximum/Default values for bufhwm are set to the smallest of:
1129*7c478bd9Sstevel@tonic-gate 	 *	- BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
1130*7c478bd9Sstevel@tonic-gate 	 *	- 1/4 of kernel virtual memory
1131*7c478bd9Sstevel@tonic-gate 	 *	- INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
1132*7c478bd9Sstevel@tonic-gate 	 * Additionally, in order to allow simple tuning by percentage of
1133*7c478bd9Sstevel@tonic-gate 	 * physical memory, bufhwm_pct is used to calculate the default if
1134*7c478bd9Sstevel@tonic-gate 	 * the value of this tunable is between 0 and BIO_MAX_PERCENT.
1135*7c478bd9Sstevel@tonic-gate 	 *
1136*7c478bd9Sstevel@tonic-gate 	 * Since the unit for v.v_bufhwm is kilobytes, this allows for
1137*7c478bd9Sstevel@tonic-gate 	 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
1138*7c478bd9Sstevel@tonic-gate 	 */
1139*7c478bd9Sstevel@tonic-gate 	bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
1140*7c478bd9Sstevel@tonic-gate 	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1141*7c478bd9Sstevel@tonic-gate 	bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);
1142*7c478bd9Sstevel@tonic-gate 
1143*7c478bd9Sstevel@tonic-gate 	pct = BIO_BUF_PERCENT;
1144*7c478bd9Sstevel@tonic-gate 	if (bufhwm_pct != 0 &&
1145*7c478bd9Sstevel@tonic-gate 	    ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
1146*7c478bd9Sstevel@tonic-gate 		pct = BIO_BUF_PERCENT;
1147*7c478bd9Sstevel@tonic-gate 		/*
1148*7c478bd9Sstevel@tonic-gate 		 * Invalid user specified value, emit a warning.
1149*7c478bd9Sstevel@tonic-gate 		 */
1150*7c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of \
1151*7c478bd9Sstevel@tonic-gate 			range(1..%d). Using %d as default.",
1152*7c478bd9Sstevel@tonic-gate 			bufhwm_pct,
1153*7c478bd9Sstevel@tonic-gate 			100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
1154*7c478bd9Sstevel@tonic-gate 	}
1155*7c478bd9Sstevel@tonic-gate 
1156*7c478bd9Sstevel@tonic-gate 	bio_default_hwm = MIN(physmem / pct,
1157*7c478bd9Sstevel@tonic-gate 	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1158*7c478bd9Sstevel@tonic-gate 	bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);
1159*7c478bd9Sstevel@tonic-gate 
1160*7c478bd9Sstevel@tonic-gate 	if ((v.v_bufhwm = bufhwm) == 0)
1161*7c478bd9Sstevel@tonic-gate 		v.v_bufhwm = bio_default_hwm;
1162*7c478bd9Sstevel@tonic-gate 
1163*7c478bd9Sstevel@tonic-gate 	if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
1164*7c478bd9Sstevel@tonic-gate 		v.v_bufhwm = (int)bio_max_hwm;
1165*7c478bd9Sstevel@tonic-gate 		/*
1166*7c478bd9Sstevel@tonic-gate 		 * Invalid user specified value, emit a warning.
1167*7c478bd9Sstevel@tonic-gate 		 */
1168*7c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN,
1169*7c478bd9Sstevel@tonic-gate 			"binit: bufhwm(%d) out \
1170*7c478bd9Sstevel@tonic-gate 			of range(%d..%lu). Using %lu as default",
1171*7c478bd9Sstevel@tonic-gate 			bufhwm,
1172*7c478bd9Sstevel@tonic-gate 			BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
1173*7c478bd9Sstevel@tonic-gate 	}
1174*7c478bd9Sstevel@tonic-gate 
1175*7c478bd9Sstevel@tonic-gate 	/*
1176*7c478bd9Sstevel@tonic-gate 	 * Determine the number of hash buckets. Default is to
1177*7c478bd9Sstevel@tonic-gate 	 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
1178*7c478bd9Sstevel@tonic-gate 	 * Round up number to the next power of 2.
1179*7c478bd9Sstevel@tonic-gate 	 */
1180*7c478bd9Sstevel@tonic-gate 	v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
1181*7c478bd9Sstevel@tonic-gate 	    BIO_HASHLEN);
1182*7c478bd9Sstevel@tonic-gate 	v.v_hmask = v.v_hbuf - 1;
1183*7c478bd9Sstevel@tonic-gate 	v.v_buf = BIO_BHDR_POOL;
1184*7c478bd9Sstevel@tonic-gate 
1185*7c478bd9Sstevel@tonic-gate 	hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);
1186*7c478bd9Sstevel@tonic-gate 
1187*7c478bd9Sstevel@tonic-gate 	dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);
1188*7c478bd9Sstevel@tonic-gate 
1189*7c478bd9Sstevel@tonic-gate 	bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;
1190*7c478bd9Sstevel@tonic-gate 	bp = &bfreelist;
1191*7c478bd9Sstevel@tonic-gate 	bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;
1192*7c478bd9Sstevel@tonic-gate 
1193*7c478bd9Sstevel@tonic-gate 	for (i = 0; i < v.v_hbuf; i++) {
1194*7c478bd9Sstevel@tonic-gate 		hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
1195*7c478bd9Sstevel@tonic-gate 		hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];
1196*7c478bd9Sstevel@tonic-gate 
1197*7c478bd9Sstevel@tonic-gate 		/*
1198*7c478bd9Sstevel@tonic-gate 		 * Initialize the delayed write buffer list.
1199*7c478bd9Sstevel@tonic-gate 		 */
1200*7c478bd9Sstevel@tonic-gate 		dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
1201*7c478bd9Sstevel@tonic-gate 		dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
1202*7c478bd9Sstevel@tonic-gate 	}
1203*7c478bd9Sstevel@tonic-gate }
1204*7c478bd9Sstevel@tonic-gate 
1205*7c478bd9Sstevel@tonic-gate /*
1206*7c478bd9Sstevel@tonic-gate  * Wait for I/O completion on the buffer; return error code.
1207*7c478bd9Sstevel@tonic-gate  * If bp was for synchronous I/O, bp is invalid and associated
1208*7c478bd9Sstevel@tonic-gate  * resources are freed on return.
1209*7c478bd9Sstevel@tonic-gate  */
1210*7c478bd9Sstevel@tonic-gate int
1211*7c478bd9Sstevel@tonic-gate biowait(struct buf *bp)
1212*7c478bd9Sstevel@tonic-gate {
1213*7c478bd9Sstevel@tonic-gate 	int error = 0;
1214*7c478bd9Sstevel@tonic-gate 	struct cpu *cpup;
1215*7c478bd9Sstevel@tonic-gate 
1216*7c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
1217*7c478bd9Sstevel@tonic-gate 
1218*7c478bd9Sstevel@tonic-gate 	cpup = CPU;
1219*7c478bd9Sstevel@tonic-gate 	atomic_add_64(&cpup->cpu_stats.sys.iowait, 1);
1220*7c478bd9Sstevel@tonic-gate 	DTRACE_IO1(wait__start, struct buf *, bp);
1221*7c478bd9Sstevel@tonic-gate 
1222*7c478bd9Sstevel@tonic-gate 	/*
1223*7c478bd9Sstevel@tonic-gate 	 * In case of panic, busy wait for completion
1224*7c478bd9Sstevel@tonic-gate 	 */
1225*7c478bd9Sstevel@tonic-gate 	if (panicstr) {
1226*7c478bd9Sstevel@tonic-gate 		while ((bp->b_flags & B_DONE) == 0)
1227*7c478bd9Sstevel@tonic-gate 			drv_usecwait(10);
1228*7c478bd9Sstevel@tonic-gate 	} else
1229*7c478bd9Sstevel@tonic-gate 		sema_p(&bp->b_io);
1230*7c478bd9Sstevel@tonic-gate 
1231*7c478bd9Sstevel@tonic-gate 	DTRACE_IO1(wait__done, struct buf *, bp);
1232*7c478bd9Sstevel@tonic-gate 	atomic_add_64(&cpup->cpu_stats.sys.iowait, -1);
1233*7c478bd9Sstevel@tonic-gate 
1234*7c478bd9Sstevel@tonic-gate 	error = geterror(bp);
1235*7c478bd9Sstevel@tonic-gate 	if ((bp->b_flags & B_ASYNC) == 0) {
1236*7c478bd9Sstevel@tonic-gate 		if (bp->b_flags & B_REMAPPED)
1237*7c478bd9Sstevel@tonic-gate 			bp_mapout(bp);
1238*7c478bd9Sstevel@tonic-gate 	}
1239*7c478bd9Sstevel@tonic-gate 	return (error);
1240*7c478bd9Sstevel@tonic-gate }
1241*7c478bd9Sstevel@tonic-gate 
1242*7c478bd9Sstevel@tonic-gate static void
1243*7c478bd9Sstevel@tonic-gate biodone_tnf_probe(struct buf *bp)
1244*7c478bd9Sstevel@tonic-gate {
1245*7c478bd9Sstevel@tonic-gate 	/* Kernel probe */
1246*7c478bd9Sstevel@tonic-gate 	TNF_PROBE_3(biodone, "io blockio", /* CSTYLED */,
1247*7c478bd9Sstevel@tonic-gate 		tnf_device,	device,		bp->b_edev,
1248*7c478bd9Sstevel@tonic-gate 		tnf_diskaddr,	block,		bp->b_lblkno,
1249*7c478bd9Sstevel@tonic-gate 		tnf_opaque,	buf,		bp);
1250*7c478bd9Sstevel@tonic-gate }
1251*7c478bd9Sstevel@tonic-gate 
1252*7c478bd9Sstevel@tonic-gate /*
1253*7c478bd9Sstevel@tonic-gate  * Mark I/O complete on a buffer, release it if I/O is asynchronous,
1254*7c478bd9Sstevel@tonic-gate  * and wake up anyone waiting for it.
1255*7c478bd9Sstevel@tonic-gate  */
1256*7c478bd9Sstevel@tonic-gate void
1257*7c478bd9Sstevel@tonic-gate biodone(struct buf *bp)
1258*7c478bd9Sstevel@tonic-gate {
1259*7c478bd9Sstevel@tonic-gate 	if (bp->b_flags & B_STARTED) {
1260*7c478bd9Sstevel@tonic-gate 		DTRACE_IO1(done, struct buf *, bp);
1261*7c478bd9Sstevel@tonic-gate 		bp->b_flags &= ~B_STARTED;
1262*7c478bd9Sstevel@tonic-gate 	}
1263*7c478bd9Sstevel@tonic-gate 
1264*7c478bd9Sstevel@tonic-gate 	/*
1265*7c478bd9Sstevel@tonic-gate 	 * Call the TNF probe here instead of the inline code
1266*7c478bd9Sstevel@tonic-gate 	 * to force our compiler to use the tail call optimization.
1267*7c478bd9Sstevel@tonic-gate 	 */
1268*7c478bd9Sstevel@tonic-gate 	biodone_tnf_probe(bp);
1269*7c478bd9Sstevel@tonic-gate 
1270*7c478bd9Sstevel@tonic-gate 	if (bp->b_iodone != NULL) {
1271*7c478bd9Sstevel@tonic-gate 		(*(bp->b_iodone))(bp);
1272*7c478bd9Sstevel@tonic-gate 		return;
1273*7c478bd9Sstevel@tonic-gate 	}
1274*7c478bd9Sstevel@tonic-gate 	ASSERT((bp->b_flags & B_DONE) == 0);
1275*7c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
1276*7c478bd9Sstevel@tonic-gate 	bp->b_flags |= B_DONE;
1277*7c478bd9Sstevel@tonic-gate 	if (bp->b_flags & B_ASYNC) {
1278*7c478bd9Sstevel@tonic-gate 		if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
1279*7c478bd9Sstevel@tonic-gate 			bio_pageio_done(bp);
1280*7c478bd9Sstevel@tonic-gate 		else
1281*7c478bd9Sstevel@tonic-gate 			brelse(bp);	/* release bp to freelist */
1282*7c478bd9Sstevel@tonic-gate 	} else {
1283*7c478bd9Sstevel@tonic-gate 		sema_v(&bp->b_io);
1284*7c478bd9Sstevel@tonic-gate 	}
1285*7c478bd9Sstevel@tonic-gate }
1286*7c478bd9Sstevel@tonic-gate 
1287*7c478bd9Sstevel@tonic-gate /*
1288*7c478bd9Sstevel@tonic-gate  * Pick up the device's error number and pass it to the user;
1289*7c478bd9Sstevel@tonic-gate  * if there is an error but the number is 0 set a generalized code.
1290*7c478bd9Sstevel@tonic-gate  */
1291*7c478bd9Sstevel@tonic-gate int
1292*7c478bd9Sstevel@tonic-gate geterror(struct buf *bp)
1293*7c478bd9Sstevel@tonic-gate {
1294*7c478bd9Sstevel@tonic-gate 	int error = 0;
1295*7c478bd9Sstevel@tonic-gate 
1296*7c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
1297*7c478bd9Sstevel@tonic-gate 	if (bp->b_flags & B_ERROR) {
1298*7c478bd9Sstevel@tonic-gate 		error = bp->b_error;
1299*7c478bd9Sstevel@tonic-gate 		if (!error)
1300*7c478bd9Sstevel@tonic-gate 			error = EIO;
1301*7c478bd9Sstevel@tonic-gate 	}
1302*7c478bd9Sstevel@tonic-gate 	return (error);
1303*7c478bd9Sstevel@tonic-gate }
1304*7c478bd9Sstevel@tonic-gate 
1305*7c478bd9Sstevel@tonic-gate /*
1306*7c478bd9Sstevel@tonic-gate  * Support for pageio buffers.
1307*7c478bd9Sstevel@tonic-gate  *
1308*7c478bd9Sstevel@tonic-gate  * This stuff should be generalized to provide a generalized bp
1309*7c478bd9Sstevel@tonic-gate  * header facility that can be used for things other than pageio.
1310*7c478bd9Sstevel@tonic-gate  */
1311*7c478bd9Sstevel@tonic-gate 
1312*7c478bd9Sstevel@tonic-gate /*
1313*7c478bd9Sstevel@tonic-gate  * Allocate and initialize a buf struct for use with pageio.
1314*7c478bd9Sstevel@tonic-gate  */
1315*7c478bd9Sstevel@tonic-gate struct buf *
1316*7c478bd9Sstevel@tonic-gate pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
1317*7c478bd9Sstevel@tonic-gate {
1318*7c478bd9Sstevel@tonic-gate 	struct buf *bp;
1319*7c478bd9Sstevel@tonic-gate 	struct cpu *cpup;
1320*7c478bd9Sstevel@tonic-gate 
1321*7c478bd9Sstevel@tonic-gate 	if (flags & B_READ) {
1322*7c478bd9Sstevel@tonic-gate 		CPU_STATS_ENTER_K();
1323*7c478bd9Sstevel@tonic-gate 		cpup = CPU;	/* get pointer AFTER preemption is disabled */
1324*7c478bd9Sstevel@tonic-gate 		CPU_STATS_ADDQ(cpup, vm, pgin, 1);
1325*7c478bd9Sstevel@tonic-gate 		CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));
1326*7c478bd9Sstevel@tonic-gate 		if ((flags & B_ASYNC) == 0) {
1327*7c478bd9Sstevel@tonic-gate 			klwp_t *lwp = ttolwp(curthread);
1328*7c478bd9Sstevel@tonic-gate 			if (lwp != NULL)
1329*7c478bd9Sstevel@tonic-gate 				lwp->lwp_ru.majflt++;
1330*7c478bd9Sstevel@tonic-gate 			CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
1331*7c478bd9Sstevel@tonic-gate 			/* Kernel probe */
1332*7c478bd9Sstevel@tonic-gate 			TNF_PROBE_2(major_fault, "vm pagefault", /* CSTYLED */,
1333*7c478bd9Sstevel@tonic-gate 				tnf_opaque,	vnode,		pp->p_vnode,
1334*7c478bd9Sstevel@tonic-gate 				tnf_offset,	offset,		pp->p_offset);
1335*7c478bd9Sstevel@tonic-gate 		}
1336*7c478bd9Sstevel@tonic-gate 		/*
1337*7c478bd9Sstevel@tonic-gate 		 * Update statistics for pages being paged in
1338*7c478bd9Sstevel@tonic-gate 		 */
1339*7c478bd9Sstevel@tonic-gate 		if (pp != NULL && pp->p_vnode != NULL) {
1340*7c478bd9Sstevel@tonic-gate 			if (IS_SWAPFSVP(pp->p_vnode)) {
1341*7c478bd9Sstevel@tonic-gate 				CPU_STATS_ADDQ(cpup, vm, anonpgin,
1342*7c478bd9Sstevel@tonic-gate 						btopr(len));
1343*7c478bd9Sstevel@tonic-gate 			} else {
1344*7c478bd9Sstevel@tonic-gate 				if (pp->p_vnode->v_flag & VVMEXEC) {
1345*7c478bd9Sstevel@tonic-gate 					CPU_STATS_ADDQ(cpup, vm, execpgin,
1346*7c478bd9Sstevel@tonic-gate 							btopr(len));
1347*7c478bd9Sstevel@tonic-gate 				} else {
1348*7c478bd9Sstevel@tonic-gate 					CPU_STATS_ADDQ(cpup, vm, fspgin,
1349*7c478bd9Sstevel@tonic-gate 							btopr(len));
1350*7c478bd9Sstevel@tonic-gate 				}
1351*7c478bd9Sstevel@tonic-gate 			}
1352*7c478bd9Sstevel@tonic-gate 		}
1353*7c478bd9Sstevel@tonic-gate 		CPU_STATS_EXIT_K();
1354*7c478bd9Sstevel@tonic-gate 		TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
1355*7c478bd9Sstevel@tonic-gate 		    "page_ws_in:pp %p", pp);
1356*7c478bd9Sstevel@tonic-gate 		/* Kernel probe */
1357*7c478bd9Sstevel@tonic-gate 		TNF_PROBE_3(pagein, "vm pageio io", /* CSTYLED */,
1358*7c478bd9Sstevel@tonic-gate 			tnf_opaque,	vnode,		pp->p_vnode,
1359*7c478bd9Sstevel@tonic-gate 			tnf_offset,	offset,		pp->p_offset,
1360*7c478bd9Sstevel@tonic-gate 			tnf_size,	size,		len);
1361*7c478bd9Sstevel@tonic-gate 	}
1362*7c478bd9Sstevel@tonic-gate 
1363*7c478bd9Sstevel@tonic-gate 	bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
1364*7c478bd9Sstevel@tonic-gate 	bp->b_bcount = len;
1365*7c478bd9Sstevel@tonic-gate 	bp->b_bufsize = len;
1366*7c478bd9Sstevel@tonic-gate 	bp->b_pages = pp;
1367*7c478bd9Sstevel@tonic-gate 	bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
1368*7c478bd9Sstevel@tonic-gate 	bp->b_offset = -1;
1369*7c478bd9Sstevel@tonic-gate 	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
1370*7c478bd9Sstevel@tonic-gate 
1371*7c478bd9Sstevel@tonic-gate 	/* Initialize bp->b_sem in "locked" state */
1372*7c478bd9Sstevel@tonic-gate 	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
1373*7c478bd9Sstevel@tonic-gate 
1374*7c478bd9Sstevel@tonic-gate 	VN_HOLD(vp);
1375*7c478bd9Sstevel@tonic-gate 	bp->b_vp = vp;
1376*7c478bd9Sstevel@tonic-gate 	THREAD_KPRI_RELEASE_N(btopr(len)); /* release kpri from page_locks */
1377*7c478bd9Sstevel@tonic-gate 
1378*7c478bd9Sstevel@tonic-gate 	/*
1379*7c478bd9Sstevel@tonic-gate 	 * Caller sets dev & blkno and can adjust
1380*7c478bd9Sstevel@tonic-gate 	 * b_addr for page offset and can use bp_mapin
1381*7c478bd9Sstevel@tonic-gate 	 * to make pages kernel addressable.
1382*7c478bd9Sstevel@tonic-gate 	 */
1383*7c478bd9Sstevel@tonic-gate 	return (bp);
1384*7c478bd9Sstevel@tonic-gate }
1385*7c478bd9Sstevel@tonic-gate 
1386*7c478bd9Sstevel@tonic-gate void
1387*7c478bd9Sstevel@tonic-gate pageio_done(struct buf *bp)
1388*7c478bd9Sstevel@tonic-gate {
1389*7c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
1390*7c478bd9Sstevel@tonic-gate 	if (bp->b_flags & B_REMAPPED)
1391*7c478bd9Sstevel@tonic-gate 		bp_mapout(bp);
1392*7c478bd9Sstevel@tonic-gate 	VN_RELE(bp->b_vp);
1393*7c478bd9Sstevel@tonic-gate 	bp->b_vp = NULL;
1394*7c478bd9Sstevel@tonic-gate 	ASSERT((bp->b_flags & B_NOCACHE) != 0);
1395*7c478bd9Sstevel@tonic-gate 
1396*7c478bd9Sstevel@tonic-gate 	/* A sema_v(bp->b_sem) is implied if we are destroying it */
1397*7c478bd9Sstevel@tonic-gate 	sema_destroy(&bp->b_sem);
1398*7c478bd9Sstevel@tonic-gate 	sema_destroy(&bp->b_io);
1399*7c478bd9Sstevel@tonic-gate 	kmem_free(bp, sizeof (struct buf));
1400*7c478bd9Sstevel@tonic-gate }
1401*7c478bd9Sstevel@tonic-gate 
1402*7c478bd9Sstevel@tonic-gate /*
1403*7c478bd9Sstevel@tonic-gate  * Check to see whether the buffers, except the one pointed by sbp,
1404*7c478bd9Sstevel@tonic-gate  * associated with the device are busy.
1405*7c478bd9Sstevel@tonic-gate  * NOTE: This expensive operation shall be improved together with ufs_icheck().
1406*7c478bd9Sstevel@tonic-gate  */
1407*7c478bd9Sstevel@tonic-gate int
1408*7c478bd9Sstevel@tonic-gate bcheck(dev_t dev, struct buf *sbp)
1409*7c478bd9Sstevel@tonic-gate {
1410*7c478bd9Sstevel@tonic-gate 	struct buf	*bp;
1411*7c478bd9Sstevel@tonic-gate 	struct buf	*dp;
1412*7c478bd9Sstevel@tonic-gate 	int i;
1413*7c478bd9Sstevel@tonic-gate 	kmutex_t *hmp;
1414*7c478bd9Sstevel@tonic-gate 
1415*7c478bd9Sstevel@tonic-gate 	/*
1416*7c478bd9Sstevel@tonic-gate 	 * check for busy bufs for this filesystem
1417*7c478bd9Sstevel@tonic-gate 	 */
1418*7c478bd9Sstevel@tonic-gate 	for (i = 0; i < v.v_hbuf; i++) {
1419*7c478bd9Sstevel@tonic-gate 		dp = (struct buf *)&hbuf[i];
1420*7c478bd9Sstevel@tonic-gate 		hmp = &hbuf[i].b_lock;
1421*7c478bd9Sstevel@tonic-gate 
1422*7c478bd9Sstevel@tonic-gate 		mutex_enter(hmp);
1423*7c478bd9Sstevel@tonic-gate 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1424*7c478bd9Sstevel@tonic-gate 			/*
1425*7c478bd9Sstevel@tonic-gate 			 * if buf is busy or dirty, then filesystem is busy
1426*7c478bd9Sstevel@tonic-gate 			 */
1427*7c478bd9Sstevel@tonic-gate 			if ((bp->b_edev == dev) &&
1428*7c478bd9Sstevel@tonic-gate 			    ((bp->b_flags & B_STALE) == 0) &&
1429*7c478bd9Sstevel@tonic-gate 			    (bp->b_flags & (B_DELWRI|B_BUSY)) &&
1430*7c478bd9Sstevel@tonic-gate 			    (bp != sbp)) {
1431*7c478bd9Sstevel@tonic-gate 				mutex_exit(hmp);
1432*7c478bd9Sstevel@tonic-gate 				return (1);
1433*7c478bd9Sstevel@tonic-gate 			}
1434*7c478bd9Sstevel@tonic-gate 		}
1435*7c478bd9Sstevel@tonic-gate 		mutex_exit(hmp);
1436*7c478bd9Sstevel@tonic-gate 	}
1437*7c478bd9Sstevel@tonic-gate 	return (0);
1438*7c478bd9Sstevel@tonic-gate }
1439*7c478bd9Sstevel@tonic-gate 
1440*7c478bd9Sstevel@tonic-gate /*
1441*7c478bd9Sstevel@tonic-gate  * Hash two 32 bit entities.
1442*7c478bd9Sstevel@tonic-gate  */
1443*7c478bd9Sstevel@tonic-gate int
1444*7c478bd9Sstevel@tonic-gate hash2ints(int x, int y)
1445*7c478bd9Sstevel@tonic-gate {
1446*7c478bd9Sstevel@tonic-gate 	int hash = 0;
1447*7c478bd9Sstevel@tonic-gate 
1448*7c478bd9Sstevel@tonic-gate 	hash = x - 1;
1449*7c478bd9Sstevel@tonic-gate 	hash = ((hash * 7) + (x >> 8)) - 1;
1450*7c478bd9Sstevel@tonic-gate 	hash = ((hash * 7) + (x >> 16)) - 1;
1451*7c478bd9Sstevel@tonic-gate 	hash = ((hash * 7) + (x >> 24)) - 1;
1452*7c478bd9Sstevel@tonic-gate 	hash = ((hash * 7) + y) - 1;
1453*7c478bd9Sstevel@tonic-gate 	hash = ((hash * 7) + (y >> 8)) - 1;
1454*7c478bd9Sstevel@tonic-gate 	hash = ((hash * 7) + (y >> 16)) - 1;
1455*7c478bd9Sstevel@tonic-gate 	hash = ((hash * 7) + (y >> 24)) - 1;
1456*7c478bd9Sstevel@tonic-gate 
1457*7c478bd9Sstevel@tonic-gate 	return (hash);
1458*7c478bd9Sstevel@tonic-gate }
1459*7c478bd9Sstevel@tonic-gate 
1460*7c478bd9Sstevel@tonic-gate 
1461*7c478bd9Sstevel@tonic-gate /*
1462*7c478bd9Sstevel@tonic-gate  * Return a new buffer struct.
1463*7c478bd9Sstevel@tonic-gate  *	Create a new buffer if we haven't gone over our high water
1464*7c478bd9Sstevel@tonic-gate  *	mark for memory, otherwise try to get one off the freelist.
1465*7c478bd9Sstevel@tonic-gate  *
1466*7c478bd9Sstevel@tonic-gate  * Returns a locked buf that has no id and is not on any hash or free
1467*7c478bd9Sstevel@tonic-gate  * list.
1468*7c478bd9Sstevel@tonic-gate  */
1469*7c478bd9Sstevel@tonic-gate static struct buf *
1470*7c478bd9Sstevel@tonic-gate bio_getfreeblk(long bsize)
1471*7c478bd9Sstevel@tonic-gate {
1472*7c478bd9Sstevel@tonic-gate 	struct buf *bp, *dp;
1473*7c478bd9Sstevel@tonic-gate 	struct hbuf *hp;
1474*7c478bd9Sstevel@tonic-gate 	kmutex_t	*hmp;
1475*7c478bd9Sstevel@tonic-gate 	uint_t		start, end;
1476*7c478bd9Sstevel@tonic-gate 
1477*7c478bd9Sstevel@tonic-gate 	/*
1478*7c478bd9Sstevel@tonic-gate 	 * mutex_enter(&bfree_lock);
1479*7c478bd9Sstevel@tonic-gate 	 * bfreelist.b_bufsize represents the amount of memory
1480*7c478bd9Sstevel@tonic-gate 	 * mutex_exit(&bfree_lock); protect ref to bfreelist
1481*7c478bd9Sstevel@tonic-gate 	 * we are allowed to allocate in the cache before we hit our hwm.
1482*7c478bd9Sstevel@tonic-gate 	 */
1483*7c478bd9Sstevel@tonic-gate 	bio_mem_get(bsize);	/* Account for our memory request */
1484*7c478bd9Sstevel@tonic-gate 
1485*7c478bd9Sstevel@tonic-gate again:
1486*7c478bd9Sstevel@tonic-gate 	bp = bio_bhdr_alloc();	/* Get a buf hdr */
1487*7c478bd9Sstevel@tonic-gate 	sema_p(&bp->b_sem);	/* Should never fail */
1488*7c478bd9Sstevel@tonic-gate 
1489*7c478bd9Sstevel@tonic-gate 	ASSERT(bp->b_un.b_addr == NULL);
1490*7c478bd9Sstevel@tonic-gate 	bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
1491*7c478bd9Sstevel@tonic-gate 	if (bp->b_un.b_addr != NULL) {
1492*7c478bd9Sstevel@tonic-gate 		/*
1493*7c478bd9Sstevel@tonic-gate 		 * Make the common path short
1494*7c478bd9Sstevel@tonic-gate 		 */
1495*7c478bd9Sstevel@tonic-gate 		bp->b_bufsize = bsize;
1496*7c478bd9Sstevel@tonic-gate 		ASSERT(SEMA_HELD(&bp->b_sem));
1497*7c478bd9Sstevel@tonic-gate 		return (bp);
1498*7c478bd9Sstevel@tonic-gate 	} else {
1499*7c478bd9Sstevel@tonic-gate 		struct buf *save;
1500*7c478bd9Sstevel@tonic-gate 
1501*7c478bd9Sstevel@tonic-gate 		save = bp;	/* Save bp we allocated */
1502*7c478bd9Sstevel@tonic-gate 		start = end = lastindex;
1503*7c478bd9Sstevel@tonic-gate 
1504*7c478bd9Sstevel@tonic-gate 		biostats.bio_bufwant.value.ui32++;
1505*7c478bd9Sstevel@tonic-gate 
1506*7c478bd9Sstevel@tonic-gate 		/*
1507*7c478bd9Sstevel@tonic-gate 		 * Memory isn't available from the system now. Scan
1508*7c478bd9Sstevel@tonic-gate 		 * the hash buckets till enough space is found.
1509*7c478bd9Sstevel@tonic-gate 		 */
1510*7c478bd9Sstevel@tonic-gate 		do {
1511*7c478bd9Sstevel@tonic-gate 			hp = &hbuf[start];
1512*7c478bd9Sstevel@tonic-gate 			hmp = &hp->b_lock;
1513*7c478bd9Sstevel@tonic-gate 			dp = (struct buf *)hp;
1514*7c478bd9Sstevel@tonic-gate 
1515*7c478bd9Sstevel@tonic-gate 			mutex_enter(hmp);
1516*7c478bd9Sstevel@tonic-gate 			bp = dp->av_forw;
1517*7c478bd9Sstevel@tonic-gate 
1518*7c478bd9Sstevel@tonic-gate 			while (bp != dp) {
1519*7c478bd9Sstevel@tonic-gate 
1520*7c478bd9Sstevel@tonic-gate 				ASSERT(bp != NULL);
1521*7c478bd9Sstevel@tonic-gate 
1522*7c478bd9Sstevel@tonic-gate 				if (!sema_tryp(&bp->b_sem)) {
1523*7c478bd9Sstevel@tonic-gate 					bp = bp->av_forw;
1524*7c478bd9Sstevel@tonic-gate 					continue;
1525*7c478bd9Sstevel@tonic-gate 				}
1526*7c478bd9Sstevel@tonic-gate 
1527*7c478bd9Sstevel@tonic-gate 				/*
1528*7c478bd9Sstevel@tonic-gate 				 * Since we are going down the freelist
1529*7c478bd9Sstevel@tonic-gate 				 * associated with this hash bucket the
1530*7c478bd9Sstevel@tonic-gate 				 * B_DELWRI flag should not be set.
1531*7c478bd9Sstevel@tonic-gate 				 */
1532*7c478bd9Sstevel@tonic-gate 				ASSERT(!(bp->b_flags & B_DELWRI));
1533*7c478bd9Sstevel@tonic-gate 
1534*7c478bd9Sstevel@tonic-gate 				if (bp->b_bufsize == bsize) {
1535*7c478bd9Sstevel@tonic-gate 					hp->b_length--;
1536*7c478bd9Sstevel@tonic-gate 					notavail(bp);
1537*7c478bd9Sstevel@tonic-gate 					bremhash(bp);
1538*7c478bd9Sstevel@tonic-gate 					mutex_exit(hmp);
1539*7c478bd9Sstevel@tonic-gate 
1540*7c478bd9Sstevel@tonic-gate 					/*
1541*7c478bd9Sstevel@tonic-gate 					 * Didn't kmem_alloc any more, so don't
1542*7c478bd9Sstevel@tonic-gate 					 * count it twice.
1543*7c478bd9Sstevel@tonic-gate 					 */
1544*7c478bd9Sstevel@tonic-gate 					mutex_enter(&bfree_lock);
1545*7c478bd9Sstevel@tonic-gate 					bfreelist.b_bufsize += bsize;
1546*7c478bd9Sstevel@tonic-gate 					mutex_exit(&bfree_lock);
1547*7c478bd9Sstevel@tonic-gate 
1548*7c478bd9Sstevel@tonic-gate 					/*
1549*7c478bd9Sstevel@tonic-gate 					 * Update the lastindex value.
1550*7c478bd9Sstevel@tonic-gate 					 */
1551*7c478bd9Sstevel@tonic-gate 					lastindex = start;
1552*7c478bd9Sstevel@tonic-gate 
1553*7c478bd9Sstevel@tonic-gate 					/*
1554*7c478bd9Sstevel@tonic-gate 					 * Put our saved bp back on the list
1555*7c478bd9Sstevel@tonic-gate 					 */
1556*7c478bd9Sstevel@tonic-gate 					sema_v(&save->b_sem);
1557*7c478bd9Sstevel@tonic-gate 					bio_bhdr_free(save);
1558*7c478bd9Sstevel@tonic-gate 					ASSERT(SEMA_HELD(&bp->b_sem));
1559*7c478bd9Sstevel@tonic-gate 					return (bp);
1560*7c478bd9Sstevel@tonic-gate 				}
1561*7c478bd9Sstevel@tonic-gate 				sema_v(&bp->b_sem);
1562*7c478bd9Sstevel@tonic-gate 				bp = bp->av_forw;
1563*7c478bd9Sstevel@tonic-gate 			}
1564*7c478bd9Sstevel@tonic-gate 			mutex_exit(hmp);
1565*7c478bd9Sstevel@tonic-gate 			start = ((start + 1) % v.v_hbuf);
1566*7c478bd9Sstevel@tonic-gate 		} while (start != end);
1567*7c478bd9Sstevel@tonic-gate 
1568*7c478bd9Sstevel@tonic-gate 		biostats.bio_bufwait.value.ui32++;
1569*7c478bd9Sstevel@tonic-gate 		bp = save;		/* Use original bp */
1570*7c478bd9Sstevel@tonic-gate 		bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
1571*7c478bd9Sstevel@tonic-gate 	}
1572*7c478bd9Sstevel@tonic-gate 
1573*7c478bd9Sstevel@tonic-gate 	bp->b_bufsize = bsize;
1574*7c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
1575*7c478bd9Sstevel@tonic-gate 	return (bp);
1576*7c478bd9Sstevel@tonic-gate }
1577*7c478bd9Sstevel@tonic-gate 
1578*7c478bd9Sstevel@tonic-gate /*
1579*7c478bd9Sstevel@tonic-gate  * Allocate a buffer header. If none currently available, allocate
1580*7c478bd9Sstevel@tonic-gate  * a new pool.
1581*7c478bd9Sstevel@tonic-gate  */
1582*7c478bd9Sstevel@tonic-gate static struct buf *
1583*7c478bd9Sstevel@tonic-gate bio_bhdr_alloc(void)
1584*7c478bd9Sstevel@tonic-gate {
1585*7c478bd9Sstevel@tonic-gate 	struct buf *dp, *sdp;
1586*7c478bd9Sstevel@tonic-gate 	struct buf *bp;
1587*7c478bd9Sstevel@tonic-gate 	int i;
1588*7c478bd9Sstevel@tonic-gate 
1589*7c478bd9Sstevel@tonic-gate 	for (;;) {
1590*7c478bd9Sstevel@tonic-gate 		mutex_enter(&bhdr_lock);
1591*7c478bd9Sstevel@tonic-gate 		if (bhdrlist != NULL) {
1592*7c478bd9Sstevel@tonic-gate 			bp = bhdrlist;
1593*7c478bd9Sstevel@tonic-gate 			bhdrlist = bp->av_forw;
1594*7c478bd9Sstevel@tonic-gate 			mutex_exit(&bhdr_lock);
1595*7c478bd9Sstevel@tonic-gate 			bp->av_forw = NULL;
1596*7c478bd9Sstevel@tonic-gate 			return (bp);
1597*7c478bd9Sstevel@tonic-gate 		}
1598*7c478bd9Sstevel@tonic-gate 		mutex_exit(&bhdr_lock);
1599*7c478bd9Sstevel@tonic-gate 
1600*7c478bd9Sstevel@tonic-gate 		/*
1601*7c478bd9Sstevel@tonic-gate 		 * Need to allocate a new pool. If the system is currently
1602*7c478bd9Sstevel@tonic-gate 		 * out of memory, then try freeing things on the freelist.
1603*7c478bd9Sstevel@tonic-gate 		 */
1604*7c478bd9Sstevel@tonic-gate 		dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
1605*7c478bd9Sstevel@tonic-gate 		if (dp == NULL) {
1606*7c478bd9Sstevel@tonic-gate 			/*
1607*7c478bd9Sstevel@tonic-gate 			 * System can't give us a pool of headers, try
1608*7c478bd9Sstevel@tonic-gate 			 * recycling from the free lists.
1609*7c478bd9Sstevel@tonic-gate 			 */
1610*7c478bd9Sstevel@tonic-gate 			bio_recycle(BIO_HEADER, 0);
1611*7c478bd9Sstevel@tonic-gate 		} else {
1612*7c478bd9Sstevel@tonic-gate 			sdp = dp;
1613*7c478bd9Sstevel@tonic-gate 			for (i = 0; i < v.v_buf; i++, dp++) {
1614*7c478bd9Sstevel@tonic-gate 				/*
1615*7c478bd9Sstevel@tonic-gate 				 * The next two lines are needed since NODEV
1616*7c478bd9Sstevel@tonic-gate 				 * is -1 and not NULL
1617*7c478bd9Sstevel@tonic-gate 				 */
1618*7c478bd9Sstevel@tonic-gate 				dp->b_dev = (o_dev_t)NODEV;
1619*7c478bd9Sstevel@tonic-gate 				dp->b_edev = NODEV;
1620*7c478bd9Sstevel@tonic-gate 				dp->av_forw = dp + 1;
1621*7c478bd9Sstevel@tonic-gate 				sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
1622*7c478bd9Sstevel@tonic-gate 				    NULL);
1623*7c478bd9Sstevel@tonic-gate 				sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
1624*7c478bd9Sstevel@tonic-gate 				    NULL);
1625*7c478bd9Sstevel@tonic-gate 				dp->b_offset = -1;
1626*7c478bd9Sstevel@tonic-gate 			}
1627*7c478bd9Sstevel@tonic-gate 			mutex_enter(&bhdr_lock);
1628*7c478bd9Sstevel@tonic-gate 			(--dp)->av_forw = bhdrlist;	/* Fix last pointer */
1629*7c478bd9Sstevel@tonic-gate 			bhdrlist = sdp;
1630*7c478bd9Sstevel@tonic-gate 			nbuf += v.v_buf;
1631*7c478bd9Sstevel@tonic-gate 			bp = bhdrlist;
1632*7c478bd9Sstevel@tonic-gate 			bhdrlist = bp->av_forw;
1633*7c478bd9Sstevel@tonic-gate 			mutex_exit(&bhdr_lock);
1634*7c478bd9Sstevel@tonic-gate 
1635*7c478bd9Sstevel@tonic-gate 			bp->av_forw = NULL;
1636*7c478bd9Sstevel@tonic-gate 			return (bp);
1637*7c478bd9Sstevel@tonic-gate 		}
1638*7c478bd9Sstevel@tonic-gate 	}
1639*7c478bd9Sstevel@tonic-gate }
1640*7c478bd9Sstevel@tonic-gate 
1641*7c478bd9Sstevel@tonic-gate static  void
1642*7c478bd9Sstevel@tonic-gate bio_bhdr_free(struct buf *bp)
1643*7c478bd9Sstevel@tonic-gate {
1644*7c478bd9Sstevel@tonic-gate 	ASSERT(bp->b_back == NULL);
1645*7c478bd9Sstevel@tonic-gate 	ASSERT(bp->b_forw == NULL);
1646*7c478bd9Sstevel@tonic-gate 	ASSERT(bp->av_back == NULL);
1647*7c478bd9Sstevel@tonic-gate 	ASSERT(bp->av_forw == NULL);
1648*7c478bd9Sstevel@tonic-gate 	ASSERT(bp->b_un.b_addr == NULL);
1649*7c478bd9Sstevel@tonic-gate 	ASSERT(bp->b_dev == (o_dev_t)NODEV);
1650*7c478bd9Sstevel@tonic-gate 	ASSERT(bp->b_edev == NODEV);
1651*7c478bd9Sstevel@tonic-gate 	ASSERT(bp->b_flags == 0);
1652*7c478bd9Sstevel@tonic-gate 
1653*7c478bd9Sstevel@tonic-gate 	mutex_enter(&bhdr_lock);
1654*7c478bd9Sstevel@tonic-gate 	bp->av_forw = bhdrlist;
1655*7c478bd9Sstevel@tonic-gate 	bhdrlist = bp;
1656*7c478bd9Sstevel@tonic-gate 	mutex_exit(&bhdr_lock);
1657*7c478bd9Sstevel@tonic-gate }
1658*7c478bd9Sstevel@tonic-gate 
1659*7c478bd9Sstevel@tonic-gate /*
1660*7c478bd9Sstevel@tonic-gate  * If we haven't gone over the high water mark, it's o.k. to
1661*7c478bd9Sstevel@tonic-gate  * allocate more buffer space, otherwise recycle buffers
1662*7c478bd9Sstevel@tonic-gate  * from the freelist until enough memory is free for a bsize request.
1663*7c478bd9Sstevel@tonic-gate  *
1664*7c478bd9Sstevel@tonic-gate  * We account for this memory, even though
1665*7c478bd9Sstevel@tonic-gate  * we don't allocate it here.
1666*7c478bd9Sstevel@tonic-gate  */
1667*7c478bd9Sstevel@tonic-gate static void
1668*7c478bd9Sstevel@tonic-gate bio_mem_get(long bsize)
1669*7c478bd9Sstevel@tonic-gate {
1670*7c478bd9Sstevel@tonic-gate 	mutex_enter(&bfree_lock);
1671*7c478bd9Sstevel@tonic-gate 	if (bfreelist.b_bufsize > bsize) {
1672*7c478bd9Sstevel@tonic-gate 		bfreelist.b_bufsize -= bsize;
1673*7c478bd9Sstevel@tonic-gate 		mutex_exit(&bfree_lock);
1674*7c478bd9Sstevel@tonic-gate 		return;
1675*7c478bd9Sstevel@tonic-gate 	}
1676*7c478bd9Sstevel@tonic-gate 	mutex_exit(&bfree_lock);
1677*7c478bd9Sstevel@tonic-gate 	bio_recycle(BIO_MEM, bsize);
1678*7c478bd9Sstevel@tonic-gate }
1679*7c478bd9Sstevel@tonic-gate 
1680*7c478bd9Sstevel@tonic-gate /*
1681*7c478bd9Sstevel@tonic-gate  * flush a list of delayed write buffers.
1682*7c478bd9Sstevel@tonic-gate  * (currently used only by bio_recycle below.)
1683*7c478bd9Sstevel@tonic-gate  */
1684*7c478bd9Sstevel@tonic-gate static void
1685*7c478bd9Sstevel@tonic-gate bio_flushlist(struct buf *delwri_list)
1686*7c478bd9Sstevel@tonic-gate {
1687*7c478bd9Sstevel@tonic-gate 	struct buf *bp;
1688*7c478bd9Sstevel@tonic-gate 
1689*7c478bd9Sstevel@tonic-gate 	while (delwri_list != EMPTY_LIST) {
1690*7c478bd9Sstevel@tonic-gate 		bp = delwri_list;
1691*7c478bd9Sstevel@tonic-gate 		bp->b_flags |= B_AGE | B_ASYNC;
1692*7c478bd9Sstevel@tonic-gate 		if (bp->b_vp == NULL) {		/* !ufs */
1693*7c478bd9Sstevel@tonic-gate 			BWRITE(bp);
1694*7c478bd9Sstevel@tonic-gate 		} else {			/* ufs */
1695*7c478bd9Sstevel@tonic-gate 			UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
1696*7c478bd9Sstevel@tonic-gate 		}
1697*7c478bd9Sstevel@tonic-gate 		delwri_list = bp->b_list;
1698*7c478bd9Sstevel@tonic-gate 		bp->b_list = NULL;
1699*7c478bd9Sstevel@tonic-gate 	}
1700*7c478bd9Sstevel@tonic-gate }
1701*7c478bd9Sstevel@tonic-gate 
/*
 * Start recycling buffers on the freelist for one of 2 reasons:
 *	- we need a buffer header (want == BIO_HEADER)
 *	- we need to free up memory (want == BIO_MEM, bsize bytes)
 * Once started we continue to recycle buffers until the B_AGE
 * buffers are gone.
 */
static void
bio_recycle(int want, long bsize)
{
	struct buf *bp, *dp, *dwp, *nbp;
	struct hbuf *hp;
	int	found = 0;
	kmutex_t	*hmp;
	int		start, end;
	struct buf *delwri_list = EMPTY_LIST;

	/*
	 * Recycle buffers.  Sweep the hash buckets round-robin starting
	 * from lastindex, so successive calls spread the recycling work
	 * across all buckets instead of hammering bucket 0.
	 */
top:
	start = end = lastindex;
	do {
		hp = &hbuf[start];
		hmp = &hp->b_lock;
		dp = (struct buf *)hp;

		mutex_enter(hmp);
		bp = dp->av_forw;

		while (bp != dp) {

			ASSERT(bp != NULL);

			/* Skip buffers that are currently busy. */
			if (!sema_tryp(&bp->b_sem)) {
				bp = bp->av_forw;
				continue;
			}
			/*
			 * Do we really want to nuke all of the B_AGE stuff??
			 * Once the request has been satisfied (found) we only
			 * keep going to drain the remaining B_AGE buffers.
			 */
			if ((bp->b_flags & B_AGE) == 0 && found) {
				sema_v(&bp->b_sem);
				mutex_exit(hmp);
				lastindex = start;
				return;	/* All done */
			}

			ASSERT(MUTEX_HELD(&hp->b_lock));
			ASSERT(!(bp->b_flags & B_DELWRI));
			hp->b_length--;
			notavail(bp);

			/*
			 * Remove bhdr from cache, free up memory,
			 * and add the hdr to the freelist.
			 */
			bremhash(bp);
			mutex_exit(hmp);

			if (bp->b_bufsize) {
				kmem_free(bp->b_un.b_addr, bp->b_bufsize);
				bp->b_un.b_addr = NULL;
				/* Credit the freed bytes back to the pool. */
				mutex_enter(&bfree_lock);
				bfreelist.b_bufsize += bp->b_bufsize;
				mutex_exit(&bfree_lock);
			}

			bp->b_dev = (o_dev_t)NODEV;
			bp->b_edev = NODEV;
			bp->b_flags = 0;
			sema_v(&bp->b_sem);
			bio_bhdr_free(bp);
			if (want == BIO_HEADER) {
				found = 1;
			} else {
				ASSERT(want == BIO_MEM);
				if (!found && bfreelist.b_bufsize >= bsize) {
					/*
					 * Account for the memory we want;
					 * recheck under bfree_lock since the
					 * first test was done unlocked.
					 */
					mutex_enter(&bfree_lock);
					if (bfreelist.b_bufsize >= bsize) {
						bfreelist.b_bufsize -= bsize;
						found = 1;
					}
					mutex_exit(&bfree_lock);
				}
			}

			/*
			 * Since we dropped hmp the chain may have changed;
			 * start from the beginning.
			 */
			mutex_enter(hmp);
			bp = dp->av_forw;
		}
		mutex_exit(hmp);

		/*
		 * Look at the delayed write list.
		 * First gather into a private list, then write them.
		 * bio_doingflush tells bio_flushinval waiters that a flush
		 * is in progress on some bucket.
		 */
		dwp = (struct buf *)&dwbuf[start];
		mutex_enter(&blist_lock);
		bio_doingflush++;
		mutex_enter(hmp);
		for (bp = dwp->av_forw; bp != dwp; bp = nbp) {

			ASSERT(bp != NULL);
			nbp = bp->av_forw;

			if (!sema_tryp(&bp->b_sem))
				continue;
			ASSERT(bp->b_flags & B_DELWRI);
			/*
			 * Do we really want to nuke all of the B_AGE stuff??
			 */

			if ((bp->b_flags & B_AGE) == 0 && found) {
				sema_v(&bp->b_sem);
				mutex_exit(hmp);
				lastindex = start;
				mutex_exit(&blist_lock);
				/* Flush what we gathered before bailing. */
				bio_flushlist(delwri_list);
				mutex_enter(&blist_lock);
				bio_doingflush--;
				if (bio_flinv_cv_wanted) {
					bio_flinv_cv_wanted = 0;
					cv_broadcast(&bio_flushinval_cv);
				}
				mutex_exit(&blist_lock);
				return; /* All done */
			}

			/*
			 * If the buffer is already on a flush or
			 * invalidate list then just skip it.
			 */
			if (bp->b_list != NULL) {
				sema_v(&bp->b_sem);
				continue;
			}
			/*
			 * We are still on the same bucket.
			 */
			hp->b_length--;
			notavail(bp);
			bp->b_list = delwri_list;
			delwri_list = bp;
		}
		mutex_exit(hmp);
		mutex_exit(&blist_lock);
		/* Write the gathered delayed-write buffers (drops no locks). */
		bio_flushlist(delwri_list);
		delwri_list = EMPTY_LIST;
		mutex_enter(&blist_lock);
		bio_doingflush--;
		if (bio_flinv_cv_wanted) {
			bio_flinv_cv_wanted = 0;
			cv_broadcast(&bio_flushinval_cv);
		}
		mutex_exit(&blist_lock);
		start = (start + 1) % v.v_hbuf;

	} while (start != end);

	if (found)
		return;

	/*
	 * Free lists exhausted and we haven't satisfied the request.
	 * Wait here for more entries to be added to freelist.
	 * Because this might have just happened, make it timed.
	 */
	mutex_enter(&bfree_lock);
	bfreelist.b_flags |= B_WANTED;
	(void) cv_timedwait(&bio_mem_cv, &bfree_lock, lbolt+hz);
	mutex_exit(&bfree_lock);
	goto top;
}
1880*7c478bd9Sstevel@tonic-gate 
1881*7c478bd9Sstevel@tonic-gate /*
1882*7c478bd9Sstevel@tonic-gate  * See if the block is associated with some buffer
1883*7c478bd9Sstevel@tonic-gate  * (mainly to avoid getting hung up on a wait in breada).
1884*7c478bd9Sstevel@tonic-gate  */
1885*7c478bd9Sstevel@tonic-gate static int
1886*7c478bd9Sstevel@tonic-gate bio_incore(dev_t dev, daddr_t blkno)
1887*7c478bd9Sstevel@tonic-gate {
1888*7c478bd9Sstevel@tonic-gate 	struct buf *bp;
1889*7c478bd9Sstevel@tonic-gate 	struct buf *dp;
1890*7c478bd9Sstevel@tonic-gate 	uint_t index;
1891*7c478bd9Sstevel@tonic-gate 	kmutex_t *hmp;
1892*7c478bd9Sstevel@tonic-gate 
1893*7c478bd9Sstevel@tonic-gate 	index = bio_bhash(dev, blkno);
1894*7c478bd9Sstevel@tonic-gate 	dp = (struct buf *)&hbuf[index];
1895*7c478bd9Sstevel@tonic-gate 	hmp = &hbuf[index].b_lock;
1896*7c478bd9Sstevel@tonic-gate 
1897*7c478bd9Sstevel@tonic-gate 	mutex_enter(hmp);
1898*7c478bd9Sstevel@tonic-gate 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1899*7c478bd9Sstevel@tonic-gate 		if (bp->b_blkno == blkno && bp->b_edev == dev &&
1900*7c478bd9Sstevel@tonic-gate 		    (bp->b_flags & B_STALE) == 0) {
1901*7c478bd9Sstevel@tonic-gate 			mutex_exit(hmp);
1902*7c478bd9Sstevel@tonic-gate 			return (1);
1903*7c478bd9Sstevel@tonic-gate 		}
1904*7c478bd9Sstevel@tonic-gate 	}
1905*7c478bd9Sstevel@tonic-gate 	mutex_exit(hmp);
1906*7c478bd9Sstevel@tonic-gate 	return (0);
1907*7c478bd9Sstevel@tonic-gate }
1908*7c478bd9Sstevel@tonic-gate 
1909*7c478bd9Sstevel@tonic-gate static void
1910*7c478bd9Sstevel@tonic-gate bio_pageio_done(struct buf *bp)
1911*7c478bd9Sstevel@tonic-gate {
1912*7c478bd9Sstevel@tonic-gate 	if (bp->b_flags & B_PAGEIO) {
1913*7c478bd9Sstevel@tonic-gate 
1914*7c478bd9Sstevel@tonic-gate 		if (bp->b_flags & B_REMAPPED)
1915*7c478bd9Sstevel@tonic-gate 			bp_mapout(bp);
1916*7c478bd9Sstevel@tonic-gate 
1917*7c478bd9Sstevel@tonic-gate 		if (bp->b_flags & B_READ)
1918*7c478bd9Sstevel@tonic-gate 			pvn_read_done(bp->b_pages, bp->b_flags);
1919*7c478bd9Sstevel@tonic-gate 		else
1920*7c478bd9Sstevel@tonic-gate 			pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags);
1921*7c478bd9Sstevel@tonic-gate 		pageio_done(bp);
1922*7c478bd9Sstevel@tonic-gate 	} else {
1923*7c478bd9Sstevel@tonic-gate 		ASSERT(bp->b_flags & B_REMAPPED);
1924*7c478bd9Sstevel@tonic-gate 		bp_mapout(bp);
1925*7c478bd9Sstevel@tonic-gate 		brelse(bp);
1926*7c478bd9Sstevel@tonic-gate 	}
1927*7c478bd9Sstevel@tonic-gate }
1928*7c478bd9Sstevel@tonic-gate 
1929*7c478bd9Sstevel@tonic-gate /*
1930*7c478bd9Sstevel@tonic-gate  * bioerror(9F) - indicate error in buffer header
1931*7c478bd9Sstevel@tonic-gate  * If 'error' is zero, remove the error indication.
1932*7c478bd9Sstevel@tonic-gate  */
1933*7c478bd9Sstevel@tonic-gate void
1934*7c478bd9Sstevel@tonic-gate bioerror(struct buf *bp, int error)
1935*7c478bd9Sstevel@tonic-gate {
1936*7c478bd9Sstevel@tonic-gate 	ASSERT(bp != NULL);
1937*7c478bd9Sstevel@tonic-gate 	ASSERT(error >= 0);
1938*7c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
1939*7c478bd9Sstevel@tonic-gate 
1940*7c478bd9Sstevel@tonic-gate 	if (error != 0) {
1941*7c478bd9Sstevel@tonic-gate 		bp->b_flags |= B_ERROR;
1942*7c478bd9Sstevel@tonic-gate 	} else {
1943*7c478bd9Sstevel@tonic-gate 		bp->b_flags &= ~B_ERROR;
1944*7c478bd9Sstevel@tonic-gate 	}
1945*7c478bd9Sstevel@tonic-gate 	bp->b_error = error;
1946*7c478bd9Sstevel@tonic-gate }
1947*7c478bd9Sstevel@tonic-gate 
/*
 * bioreset(9F) - reuse a private buffer header after I/O is complete
 * Tears down and re-initializes bp's semaphores and zeroes the header,
 * leaving it in the same state bioinit(9F) produces.
 */
void
bioreset(struct buf *bp)
{
	ASSERT(bp != NULL);

	/* Order matters: destroy the old semaphores before re-creating. */
	biofini(bp);
	bioinit(bp);
}
1959*7c478bd9Sstevel@tonic-gate 
/*
 * biosize(9F) - return size of a buffer header
 * Lets drivers allocate private buf headers without depending on
 * the size of struct buf at compile time.
 */
size_t
biosize(void)
{
	return (sizeof (struct buf));
}
1968*7c478bd9Sstevel@tonic-gate 
1969*7c478bd9Sstevel@tonic-gate /*
1970*7c478bd9Sstevel@tonic-gate  * biomodified(9F) - check if buffer is modified
1971*7c478bd9Sstevel@tonic-gate  */
1972*7c478bd9Sstevel@tonic-gate int
1973*7c478bd9Sstevel@tonic-gate biomodified(struct buf *bp)
1974*7c478bd9Sstevel@tonic-gate {
1975*7c478bd9Sstevel@tonic-gate 	int npf;
1976*7c478bd9Sstevel@tonic-gate 	int ppattr;
1977*7c478bd9Sstevel@tonic-gate 	struct page *pp;
1978*7c478bd9Sstevel@tonic-gate 
1979*7c478bd9Sstevel@tonic-gate 	ASSERT(bp != NULL);
1980*7c478bd9Sstevel@tonic-gate 
1981*7c478bd9Sstevel@tonic-gate 	if ((bp->b_flags & B_PAGEIO) == 0) {
1982*7c478bd9Sstevel@tonic-gate 		return (-1);
1983*7c478bd9Sstevel@tonic-gate 	}
1984*7c478bd9Sstevel@tonic-gate 	pp = bp->b_pages;
1985*7c478bd9Sstevel@tonic-gate 	npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET));
1986*7c478bd9Sstevel@tonic-gate 
1987*7c478bd9Sstevel@tonic-gate 	while (npf > 0) {
1988*7c478bd9Sstevel@tonic-gate 		ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
1989*7c478bd9Sstevel@tonic-gate 				HAT_SYNC_STOPON_MOD);
1990*7c478bd9Sstevel@tonic-gate 		if (ppattr & P_MOD)
1991*7c478bd9Sstevel@tonic-gate 			return (1);
1992*7c478bd9Sstevel@tonic-gate 		pp = pp->p_next;
1993*7c478bd9Sstevel@tonic-gate 		npf--;
1994*7c478bd9Sstevel@tonic-gate 	}
1995*7c478bd9Sstevel@tonic-gate 
1996*7c478bd9Sstevel@tonic-gate 	return (0);
1997*7c478bd9Sstevel@tonic-gate }
1998*7c478bd9Sstevel@tonic-gate 
/*
 * bioinit(9F) - initialize a buffer structure
 * Zeroes the header, creates the ownership (b_sem) and I/O-completion
 * (b_io) semaphores, and marks b_offset as unset (-1).
 */
void
bioinit(struct buf *bp)
{
	bzero(bp, sizeof (struct buf));
	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
	bp->b_offset = -1;
}
2010*7c478bd9Sstevel@tonic-gate 
/*
 * biofini(9F) - uninitialize a buffer structure
 * Destroys the semaphores created by bioinit(9F); reverse order of
 * initialization.
 */
void
biofini(struct buf *bp)
{
	sema_destroy(&bp->b_io);
	sema_destroy(&bp->b_sem);
}
2020*7c478bd9Sstevel@tonic-gate 
/*
 * bioclone(9F) - clone a buffer
 * Builds a buffer describing a sub-range [off, off+len) of bp's transfer,
 * targeted at (dev, blkno) with completion routine 'iodone'.  If bp_mem
 * is NULL a header is allocated with kmem_alloc(sleep); otherwise bp_mem
 * is reset and reused.  Returns the clone, or NULL only when allocation
 * fails (i.e. with a no-sleep allocation).
 */
struct buf *
bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno,
    int (*iodone)(struct buf *), struct buf *bp_mem, int sleep)
{
	struct buf *bufp;

	ASSERT(bp);
	if (bp_mem == NULL) {
		bufp = kmem_alloc(sizeof (struct buf), sleep);
		if (bufp == NULL) {
			return (NULL);
		}
		bioinit(bufp);
	} else {
		bufp = bp_mem;
		bioreset(bufp);
	}

#define	BUF_CLONE_FLAGS	(B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
	B_ABRWRITE)

	/*
	 * the cloned buffer does not inherit the B_REMAPPED flag. A separate
	 * bp_mapin(9F) has to be done to get a kernel mapping.
	 */
	bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS)  | B_BUSY;
	bufp->b_bcount = len;
	bufp->b_blkno = blkno;
	bufp->b_iodone = iodone;
	bufp->b_proc = bp->b_proc;
	bufp->b_edev = dev;
	bufp->b_file = bp->b_file;
	bufp->b_offset = bp->b_offset;

	if (bp->b_flags & B_SHADOW) {
		ASSERT(bp->b_shadow);
		ASSERT(bp->b_flags & B_PHYS);

		/*
		 * Advance the shadow page-frame list by the number of whole
		 * pages covered by the parent's page offset plus 'off'.
		 */
		bufp->b_shadow = bp->b_shadow +
			btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off);
		bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
	} else {
		if (bp->b_flags & B_PAGEIO) {
			struct page *pp;
			off_t o;
			int i;

			/*
			 * Walk the parent's page list to the page containing
			 * byte 'off'; b_addr for a paged buffer holds only
			 * the offset within the first page.
			 */
			pp = bp->b_pages;
			o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off;
			for (i = btop(o); i > 0; i--) {
				pp = pp->p_next;
			}
			bufp->b_pages = pp;
			bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET);
		} else {
			bufp->b_un.b_addr =
				(caddr_t)((uintptr_t)bp->b_un.b_addr + off);
			/*
			 * NOTE(review): clearing b_proc here presumably
			 * prevents as_fault on a kernel-remapped parent's
			 * address space — confirm against bp_mapin(9F).
			 */
			if (bp->b_flags & B_REMAPPED)
				bufp->b_proc = NULL;
		}
	}
	return (bufp);
}
2087