xref: /illumos-gate/usr/src/uts/common/os/bio.c (revision 48bbca81)
17c478bd9Sstevel@tonic-gate /*
27c478bd9Sstevel@tonic-gate  * CDDL HEADER START
37c478bd9Sstevel@tonic-gate  *
47c478bd9Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
56f84fed5Scth  * Common Development and Distribution License (the "License").
66f84fed5Scth  * You may not use this file except in compliance with the License.
77c478bd9Sstevel@tonic-gate  *
87c478bd9Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bd9Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
107c478bd9Sstevel@tonic-gate  * See the License for the specific language governing permissions
117c478bd9Sstevel@tonic-gate  * and limitations under the License.
127c478bd9Sstevel@tonic-gate  *
137c478bd9Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
147c478bd9Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bd9Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
167c478bd9Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
177c478bd9Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bd9Sstevel@tonic-gate  *
197c478bd9Sstevel@tonic-gate  * CDDL HEADER END
207c478bd9Sstevel@tonic-gate  */
217c478bd9Sstevel@tonic-gate /*
22d3d50737SRafael Vanoni  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
237c478bd9Sstevel@tonic-gate  * Use is subject to license terms.
249468939eSJerry Jelinek  * Copyright 2011 Joyent, Inc.  All rights reserved.
257c478bd9Sstevel@tonic-gate  */
267c478bd9Sstevel@tonic-gate 
273f11de9dSSara Hartse /*
283f11de9dSSara Hartse  * Copyright (c) 2016 by Delphix. All rights reserved.
293f11de9dSSara Hartse  */
303f11de9dSSara Hartse 
317c478bd9Sstevel@tonic-gate /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
327c478bd9Sstevel@tonic-gate /*	  All Rights Reserved  	*/
337c478bd9Sstevel@tonic-gate 
347c478bd9Sstevel@tonic-gate /*
357c478bd9Sstevel@tonic-gate  * University Copyright- Copyright (c) 1982, 1986, 1988
367c478bd9Sstevel@tonic-gate  * The Regents of the University of California
377c478bd9Sstevel@tonic-gate  * All Rights Reserved
387c478bd9Sstevel@tonic-gate  *
397c478bd9Sstevel@tonic-gate  * University Acknowledgment- Portions of this document are derived from
407c478bd9Sstevel@tonic-gate  * software developed by the University of California, Berkeley, and its
417c478bd9Sstevel@tonic-gate  * contributors.
427c478bd9Sstevel@tonic-gate  */
437c478bd9Sstevel@tonic-gate 
447c478bd9Sstevel@tonic-gate #include <sys/types.h>
457c478bd9Sstevel@tonic-gate #include <sys/t_lock.h>
467c478bd9Sstevel@tonic-gate #include <sys/sysmacros.h>
477c478bd9Sstevel@tonic-gate #include <sys/conf.h>
487c478bd9Sstevel@tonic-gate #include <sys/cpuvar.h>
497c478bd9Sstevel@tonic-gate #include <sys/errno.h>
507c478bd9Sstevel@tonic-gate #include <sys/debug.h>
517c478bd9Sstevel@tonic-gate #include <sys/buf.h>
527c478bd9Sstevel@tonic-gate #include <sys/var.h>
537c478bd9Sstevel@tonic-gate #include <sys/vnode.h>
547c478bd9Sstevel@tonic-gate #include <sys/bitmap.h>
557c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h>
567c478bd9Sstevel@tonic-gate #include <sys/kmem.h>
577c478bd9Sstevel@tonic-gate #include <sys/vmem.h>
587c478bd9Sstevel@tonic-gate #include <sys/atomic.h>
597c478bd9Sstevel@tonic-gate #include <vm/seg_kmem.h>
607c478bd9Sstevel@tonic-gate #include <vm/page.h>
617c478bd9Sstevel@tonic-gate #include <vm/pvn.h>
627c478bd9Sstevel@tonic-gate #include <sys/vtrace.h>
637c478bd9Sstevel@tonic-gate #include <sys/tnf_probe.h>
647c478bd9Sstevel@tonic-gate #include <sys/fs/ufs_inode.h>
657c478bd9Sstevel@tonic-gate #include <sys/fs/ufs_bio.h>
667c478bd9Sstevel@tonic-gate #include <sys/fs/ufs_log.h>
677c478bd9Sstevel@tonic-gate #include <sys/systm.h>
687c478bd9Sstevel@tonic-gate #include <sys/vfs.h>
697c478bd9Sstevel@tonic-gate #include <sys/sdt.h>
707c478bd9Sstevel@tonic-gate 
717c478bd9Sstevel@tonic-gate /* Locks */
727c478bd9Sstevel@tonic-gate static	kmutex_t	blist_lock;	/* protects b_list */
737c478bd9Sstevel@tonic-gate static	kmutex_t	bhdr_lock;	/* protects the bhdrlist */
747c478bd9Sstevel@tonic-gate static	kmutex_t	bfree_lock;	/* protects the bfreelist structure */
757c478bd9Sstevel@tonic-gate 
767c478bd9Sstevel@tonic-gate struct hbuf	*hbuf;			/* Hash buckets */
777c478bd9Sstevel@tonic-gate struct dwbuf	*dwbuf;			/* Delayed write buckets */
787c478bd9Sstevel@tonic-gate static struct buf *bhdrlist;		/* buf header free list */
797c478bd9Sstevel@tonic-gate static int 	nbuf;			/* number of buffer headers allocated */
807c478bd9Sstevel@tonic-gate 
817c478bd9Sstevel@tonic-gate static int	lastindex;		/* Reference point on where to start */
827c478bd9Sstevel@tonic-gate 					/* when looking for free buffers */
837c478bd9Sstevel@tonic-gate 
/* Hash a (device, block number) pair into an index for hbuf[]/dwbuf[]. */
#define	bio_bhash(dev, bn)	(hash2ints((dev), (int)(bn)) & v.v_hmask)

/* Marker value for an empty list; its users lie outside this excerpt. */
#define	EMPTY_LIST		((struct buf *)-1)
867c478bd9Sstevel@tonic-gate 
877c478bd9Sstevel@tonic-gate static kcondvar_t	bio_mem_cv; 	/* Condition variables */
887c478bd9Sstevel@tonic-gate static kcondvar_t	bio_flushinval_cv;
897c478bd9Sstevel@tonic-gate static int	bio_doingflush;		/* flush in progress */
907c478bd9Sstevel@tonic-gate static int	bio_doinginval;		/* inval in progress */
917c478bd9Sstevel@tonic-gate static int	bio_flinv_cv_wanted;	/* someone waiting for cv */
927c478bd9Sstevel@tonic-gate 
937c478bd9Sstevel@tonic-gate /*
947c478bd9Sstevel@tonic-gate  * Statistics on the buffer cache
957c478bd9Sstevel@tonic-gate  */
967c478bd9Sstevel@tonic-gate struct biostats biostats = {
977c478bd9Sstevel@tonic-gate 	{ "buffer_cache_lookups",		KSTAT_DATA_UINT32 },
987c478bd9Sstevel@tonic-gate 	{ "buffer_cache_hits",			KSTAT_DATA_UINT32 },
997c478bd9Sstevel@tonic-gate 	{ "new_buffer_requests",		KSTAT_DATA_UINT32 },
1007c478bd9Sstevel@tonic-gate 	{ "waits_for_buffer_allocs",		KSTAT_DATA_UINT32 },
1017c478bd9Sstevel@tonic-gate 	{ "buffers_locked_by_someone",		KSTAT_DATA_UINT32 },
1027c478bd9Sstevel@tonic-gate 	{ "duplicate_buffers_found",		KSTAT_DATA_UINT32 }
1037c478bd9Sstevel@tonic-gate };
1047c478bd9Sstevel@tonic-gate 
1057c478bd9Sstevel@tonic-gate /*
1067c478bd9Sstevel@tonic-gate  * kstat data
1077c478bd9Sstevel@tonic-gate  */
1087c478bd9Sstevel@tonic-gate kstat_named_t	*biostats_ptr = (kstat_named_t *)&biostats;
1097c478bd9Sstevel@tonic-gate uint_t		biostats_ndata = (uint_t)(sizeof (biostats) /
1107c478bd9Sstevel@tonic-gate 					sizeof (kstat_named_t));
1117c478bd9Sstevel@tonic-gate 
1127c478bd9Sstevel@tonic-gate /*
1137c478bd9Sstevel@tonic-gate  * Statistics on ufs buffer cache
1147c478bd9Sstevel@tonic-gate  * Not protected by locks
1157c478bd9Sstevel@tonic-gate  */
1167c478bd9Sstevel@tonic-gate struct ufsbiostats ub = {
1177c478bd9Sstevel@tonic-gate 	{ "breads",			KSTAT_DATA_UINT32 },
1187c478bd9Sstevel@tonic-gate 	{ "bwrites",			KSTAT_DATA_UINT32 },
1197c478bd9Sstevel@tonic-gate 	{ "fbiwrites",			KSTAT_DATA_UINT32 },
1207c478bd9Sstevel@tonic-gate 	{ "getpages",			KSTAT_DATA_UINT32 },
1217c478bd9Sstevel@tonic-gate 	{ "getras",			KSTAT_DATA_UINT32 },
1227c478bd9Sstevel@tonic-gate 	{ "putsyncs",			KSTAT_DATA_UINT32 },
1237c478bd9Sstevel@tonic-gate 	{ "putasyncs",			KSTAT_DATA_UINT32 },
1247c478bd9Sstevel@tonic-gate 	{ "putpageios",			KSTAT_DATA_UINT32 },
1257c478bd9Sstevel@tonic-gate };
1267c478bd9Sstevel@tonic-gate 
1277c478bd9Sstevel@tonic-gate /*
1287c478bd9Sstevel@tonic-gate  * more UFS Logging eccentricities...
1297c478bd9Sstevel@tonic-gate  *
1307c478bd9Sstevel@tonic-gate  * required since "#pragma weak ..." doesn't work in reverse order.
1317c478bd9Sstevel@tonic-gate  * i.e.:  genunix (bio.c) is loaded before the ufs modules and pointers
1327c478bd9Sstevel@tonic-gate  *        to ufs routines don't get plugged into bio.c calls so
1337c478bd9Sstevel@tonic-gate  *        we initialize it when setting up the "lufsops" table
1347c478bd9Sstevel@tonic-gate  *        in "lufs.c:_init()"
1357c478bd9Sstevel@tonic-gate  */
1367c478bd9Sstevel@tonic-gate void (*bio_lufs_strategy)(void *, buf_t *);
1377c478bd9Sstevel@tonic-gate void (*bio_snapshot_strategy)(void *, buf_t *);
1387c478bd9Sstevel@tonic-gate 
1397c478bd9Sstevel@tonic-gate 
/*
 * Private routines
 */
static struct buf	*bio_getfreeblk(long);
static struct buf	*bio_bhdr_alloc(void);
static void		bio_bhdr_free(struct buf *);
static void		bio_mem_get(long);
static void		bio_recycle(int, long);
static void		bio_pageio_done(struct buf *);
static int		bio_incore(dev_t, daddr_t);
1487c478bd9Sstevel@tonic-gate 
/*
 * Buffer cache constants
 *
 * The two *_PERCENT values are written as divisors: 100/2 is 50, i.e.
 * one fiftieth (2%), and 100/20 is 5, i.e. one fifth (20%).
 */
#define	BIO_BUF_PERCENT	(100 / 2)	/* default: 2% of memory */
#define	BIO_MAX_PERCENT	(100 / 20)	/* max is 20% of real memory */
#define	BIO_BHDR_POOL	100		/* default bhdr pool size */
#define	BIO_MIN_HDR	10		/* minimum number of buffer headers */
#define	BIO_MIN_HWM	(BIO_MIN_HDR * MAXBSIZE / 1024)
#define	BIO_HASHLEN	4		/* target length of hash chains */

1587c478bd9Sstevel@tonic-gate 
1597c478bd9Sstevel@tonic-gate 
/*
 * Flags for bio_recycle()
 */
#define	BIO_HEADER	0x01
#define	BIO_MEM		0x02

/*
 * User tunables, defined elsewhere
 */
extern int	bufhwm;		/* high water mark for mem */
extern int	bufhwm_pct;	/* ditto - given in % of physmem */
1667c478bd9Sstevel@tonic-gate 
1677c478bd9Sstevel@tonic-gate /*
1687c478bd9Sstevel@tonic-gate  * The following routines allocate and free
1697c478bd9Sstevel@tonic-gate  * buffers with various side effects.  In general the
1707c478bd9Sstevel@tonic-gate  * arguments to an allocate routine are a device and
 * a block number, and the value is a pointer to
 * the buffer header; the buffer returned is locked with a
1737c478bd9Sstevel@tonic-gate  * binary semaphore so that no one else can touch it. If the block was
1747c478bd9Sstevel@tonic-gate  * already in core, no I/O need be done; if it is
1757c478bd9Sstevel@tonic-gate  * already locked, the process waits until it becomes free.
1767c478bd9Sstevel@tonic-gate  * The following routines allocate a buffer:
1777c478bd9Sstevel@tonic-gate  *	getblk
1787c478bd9Sstevel@tonic-gate  *	bread/BREAD
1797c478bd9Sstevel@tonic-gate  *	breada
1807c478bd9Sstevel@tonic-gate  * Eventually the buffer must be released, possibly with the
1817c478bd9Sstevel@tonic-gate  * side effect of writing it out, by using one of
1827c478bd9Sstevel@tonic-gate  *	bwrite/BWRITE/brwrite
1837c478bd9Sstevel@tonic-gate  *	bdwrite/bdrwrite
1847c478bd9Sstevel@tonic-gate  *	bawrite
1857c478bd9Sstevel@tonic-gate  *	brelse
1867c478bd9Sstevel@tonic-gate  *
1877c478bd9Sstevel@tonic-gate  * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
1887c478bd9Sstevel@tonic-gate  * Instead, a binary semaphore, b_sem is used to gain exclusive access to
1897c478bd9Sstevel@tonic-gate  * a buffer and a binary semaphore, b_io is used for I/O synchronization.
1907c478bd9Sstevel@tonic-gate  * B_DONE is still used to denote a buffer with I/O complete on it.
1917c478bd9Sstevel@tonic-gate  *
 * The bfreelist.b_bcount field is recomputed every time fsflush runs. It
 * should not be used where a very accurate count of the free buffers is
 * needed.
1957c478bd9Sstevel@tonic-gate  */
1967c478bd9Sstevel@tonic-gate 
1977c478bd9Sstevel@tonic-gate /*
1987c478bd9Sstevel@tonic-gate  * Read in (if necessary) the block and return a buffer pointer.
1997c478bd9Sstevel@tonic-gate  *
2007c478bd9Sstevel@tonic-gate  * This interface is provided for binary compatibility.  Using
2017c478bd9Sstevel@tonic-gate  * BREAD() directly avoids the extra function call overhead invoked
2027c478bd9Sstevel@tonic-gate  * by calling this routine.
2037c478bd9Sstevel@tonic-gate  */
2047c478bd9Sstevel@tonic-gate struct buf *
2057c478bd9Sstevel@tonic-gate bread(dev_t dev, daddr_t blkno, long bsize)
2067c478bd9Sstevel@tonic-gate {
2077c478bd9Sstevel@tonic-gate 	return (BREAD(dev, blkno, bsize));
2087c478bd9Sstevel@tonic-gate }
2097c478bd9Sstevel@tonic-gate 
2107c478bd9Sstevel@tonic-gate /*
2117c478bd9Sstevel@tonic-gate  * Common code for reading a buffer with various options
2127c478bd9Sstevel@tonic-gate  *
2137c478bd9Sstevel@tonic-gate  * Read in (if necessary) the block and return a buffer pointer.
2147c478bd9Sstevel@tonic-gate  */
2157c478bd9Sstevel@tonic-gate struct buf *
2167c478bd9Sstevel@tonic-gate bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
2177c478bd9Sstevel@tonic-gate {
2187c478bd9Sstevel@tonic-gate 	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
2197c478bd9Sstevel@tonic-gate 	struct buf *bp;
2207c478bd9Sstevel@tonic-gate 	klwp_t *lwp = ttolwp(curthread);
2217c478bd9Sstevel@tonic-gate 
2227c478bd9Sstevel@tonic-gate 	CPU_STATS_ADD_K(sys, lread, 1);
2237c478bd9Sstevel@tonic-gate 	bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1);
2247c478bd9Sstevel@tonic-gate 	if (bp->b_flags & B_DONE)
2257c478bd9Sstevel@tonic-gate 		return (bp);
2267c478bd9Sstevel@tonic-gate 	bp->b_flags |= B_READ;
2277c478bd9Sstevel@tonic-gate 	ASSERT(bp->b_bcount == bsize);
2287c478bd9Sstevel@tonic-gate 	if (ufsvfsp == NULL) {					/* !ufs */
2297c478bd9Sstevel@tonic-gate 		(void) bdev_strategy(bp);
2307c478bd9Sstevel@tonic-gate 	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
2317c478bd9Sstevel@tonic-gate 							/* ufs && logging */
2327c478bd9Sstevel@tonic-gate 		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
2337c478bd9Sstevel@tonic-gate 	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
2347c478bd9Sstevel@tonic-gate 							/* ufs && snapshots */
2357c478bd9Sstevel@tonic-gate 		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
2367c478bd9Sstevel@tonic-gate 	} else {
237d3d50737SRafael Vanoni 		ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
2387c478bd9Sstevel@tonic-gate 		ub.ub_breads.value.ul++;		/* ufs && !logging */
2397c478bd9Sstevel@tonic-gate 		(void) bdev_strategy(bp);
2407c478bd9Sstevel@tonic-gate 	}
2417c478bd9Sstevel@tonic-gate 	if (lwp != NULL)
2427c478bd9Sstevel@tonic-gate 		lwp->lwp_ru.inblock++;
2437c478bd9Sstevel@tonic-gate 	CPU_STATS_ADD_K(sys, bread, 1);
2447c478bd9Sstevel@tonic-gate 	(void) biowait(bp);
2457c478bd9Sstevel@tonic-gate 	return (bp);
2467c478bd9Sstevel@tonic-gate }
2477c478bd9Sstevel@tonic-gate 
2487c478bd9Sstevel@tonic-gate /*
2497c478bd9Sstevel@tonic-gate  * Read in the block, like bread, but also start I/O on the
2507c478bd9Sstevel@tonic-gate  * read-ahead block (which is not allocated to the caller).
2517c478bd9Sstevel@tonic-gate  */
2527c478bd9Sstevel@tonic-gate struct buf *
2537c478bd9Sstevel@tonic-gate breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
2547c478bd9Sstevel@tonic-gate {
2557c478bd9Sstevel@tonic-gate 	struct buf *bp, *rabp;
2567c478bd9Sstevel@tonic-gate 	klwp_t *lwp = ttolwp(curthread);
2577c478bd9Sstevel@tonic-gate 
2587c478bd9Sstevel@tonic-gate 	bp = NULL;
2597c478bd9Sstevel@tonic-gate 	if (!bio_incore(dev, blkno)) {
2607c478bd9Sstevel@tonic-gate 		CPU_STATS_ADD_K(sys, lread, 1);
2617c478bd9Sstevel@tonic-gate 		bp = GETBLK(dev, blkno, bsize);
2627c478bd9Sstevel@tonic-gate 		if ((bp->b_flags & B_DONE) == 0) {
2637c478bd9Sstevel@tonic-gate 			bp->b_flags |= B_READ;
2647c478bd9Sstevel@tonic-gate 			bp->b_bcount = bsize;
2657c478bd9Sstevel@tonic-gate 			(void) bdev_strategy(bp);
2667c478bd9Sstevel@tonic-gate 			if (lwp != NULL)
2677c478bd9Sstevel@tonic-gate 				lwp->lwp_ru.inblock++;
2687c478bd9Sstevel@tonic-gate 			CPU_STATS_ADD_K(sys, bread, 1);
2697c478bd9Sstevel@tonic-gate 		}
2707c478bd9Sstevel@tonic-gate 	}
2717c478bd9Sstevel@tonic-gate 	if (rablkno && bfreelist.b_bcount > 1 &&
2727c478bd9Sstevel@tonic-gate 	    !bio_incore(dev, rablkno)) {
2737c478bd9Sstevel@tonic-gate 		rabp = GETBLK(dev, rablkno, bsize);
2747c478bd9Sstevel@tonic-gate 		if (rabp->b_flags & B_DONE)
2757c478bd9Sstevel@tonic-gate 			brelse(rabp);
2767c478bd9Sstevel@tonic-gate 		else {
2777c478bd9Sstevel@tonic-gate 			rabp->b_flags |= B_READ|B_ASYNC;
2787c478bd9Sstevel@tonic-gate 			rabp->b_bcount = bsize;
2797c478bd9Sstevel@tonic-gate 			(void) bdev_strategy(rabp);
2807c478bd9Sstevel@tonic-gate 			if (lwp != NULL)
2817c478bd9Sstevel@tonic-gate 				lwp->lwp_ru.inblock++;
2827c478bd9Sstevel@tonic-gate 			CPU_STATS_ADD_K(sys, bread, 1);
2837c478bd9Sstevel@tonic-gate 		}
2847c478bd9Sstevel@tonic-gate 	}
2857c478bd9Sstevel@tonic-gate 	if (bp == NULL)
2867c478bd9Sstevel@tonic-gate 		return (BREAD(dev, blkno, bsize));
2877c478bd9Sstevel@tonic-gate 	(void) biowait(bp);
2887c478bd9Sstevel@tonic-gate 	return (bp);
2897c478bd9Sstevel@tonic-gate }
2907c478bd9Sstevel@tonic-gate 
2917c478bd9Sstevel@tonic-gate /*
2927c478bd9Sstevel@tonic-gate  * Common code for writing a buffer with various options.
2937c478bd9Sstevel@tonic-gate  *
2947c478bd9Sstevel@tonic-gate  * force_wait  - wait for write completion regardless of B_ASYNC flag
2957c478bd9Sstevel@tonic-gate  * do_relse    - release the buffer when we are done
2967c478bd9Sstevel@tonic-gate  * clear_flags - flags to clear from the buffer
2977c478bd9Sstevel@tonic-gate  */
2987c478bd9Sstevel@tonic-gate void
2997c478bd9Sstevel@tonic-gate bwrite_common(void *arg, struct buf *bp, int force_wait,
3003f11de9dSSara Hartse     int do_relse, int clear_flags)
3017c478bd9Sstevel@tonic-gate {
3027c478bd9Sstevel@tonic-gate 	register int do_wait;
3037c478bd9Sstevel@tonic-gate 	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
3047c478bd9Sstevel@tonic-gate 	int flag;
3057c478bd9Sstevel@tonic-gate 	klwp_t *lwp = ttolwp(curthread);
3067c478bd9Sstevel@tonic-gate 	struct cpu *cpup;
3077c478bd9Sstevel@tonic-gate 
3087c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
3097c478bd9Sstevel@tonic-gate 	flag = bp->b_flags;
3107c478bd9Sstevel@tonic-gate 	bp->b_flags &= ~clear_flags;
3117c478bd9Sstevel@tonic-gate 	if (lwp != NULL)
3127c478bd9Sstevel@tonic-gate 		lwp->lwp_ru.oublock++;
3137c478bd9Sstevel@tonic-gate 	CPU_STATS_ENTER_K();
3147c478bd9Sstevel@tonic-gate 	cpup = CPU;		/* get pointer AFTER preemption is disabled */
3157c478bd9Sstevel@tonic-gate 	CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
3167c478bd9Sstevel@tonic-gate 	CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
3177c478bd9Sstevel@tonic-gate 	do_wait = ((flag & B_ASYNC) == 0 || force_wait);
3187c478bd9Sstevel@tonic-gate 	if (do_wait == 0)
3197c478bd9Sstevel@tonic-gate 		CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
3207c478bd9Sstevel@tonic-gate 	CPU_STATS_EXIT_K();
3217c478bd9Sstevel@tonic-gate 	if (ufsvfsp == NULL) {
3227c478bd9Sstevel@tonic-gate 		(void) bdev_strategy(bp);
3237c478bd9Sstevel@tonic-gate 	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
3247c478bd9Sstevel@tonic-gate 							/* ufs && logging */
3257c478bd9Sstevel@tonic-gate 		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
3267c478bd9Sstevel@tonic-gate 	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
3277c478bd9Sstevel@tonic-gate 							/* ufs && snapshots */
3287c478bd9Sstevel@tonic-gate 		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
3297c478bd9Sstevel@tonic-gate 	} else {
3307c478bd9Sstevel@tonic-gate 		ub.ub_bwrites.value.ul++;		/* ufs && !logging */
3317c478bd9Sstevel@tonic-gate 		(void) bdev_strategy(bp);
3327c478bd9Sstevel@tonic-gate 	}
3337c478bd9Sstevel@tonic-gate 	if (do_wait) {
3347c478bd9Sstevel@tonic-gate 		(void) biowait(bp);
3357c478bd9Sstevel@tonic-gate 		if (do_relse) {
3367c478bd9Sstevel@tonic-gate 			brelse(bp);
3377c478bd9Sstevel@tonic-gate 		}
3387c478bd9Sstevel@tonic-gate 	}
3397c478bd9Sstevel@tonic-gate }
3407c478bd9Sstevel@tonic-gate 
/*
 * Write the buffer, waiting for completion (unless B_ASYNC is set),
 * and then release it.
 *
 * Provided for binary compatibility: this is nothing more than a
 * function wrapper around BWRITE().  Using BWRITE() directly avoids the
 * extra function call overhead of this routine.
 */
void
bwrite(struct buf *bp)
{
	BWRITE(bp);
}
3537c478bd9Sstevel@tonic-gate 
/*
 * Write the buffer and wait for completion, but do NOT release the
 * buffer afterwards.
 *
 * Provided for binary compatibility: this is nothing more than a
 * function wrapper around BWRITE2().  Using BWRITE2() directly avoids
 * the extra function call overhead.
 */
void
bwrite2(struct buf *bp)
{
	BWRITE2(bp);
}
3657c478bd9Sstevel@tonic-gate 
3667c478bd9Sstevel@tonic-gate /*
3677c478bd9Sstevel@tonic-gate  * Release the buffer, marking it so that if it is grabbed
3687c478bd9Sstevel@tonic-gate  * for another purpose it will be written out before being
3697c478bd9Sstevel@tonic-gate  * given up (e.g. when writing a partial block where it is
3707c478bd9Sstevel@tonic-gate  * assumed that another write for the same block will soon follow).
3717c478bd9Sstevel@tonic-gate  * Also save the time that the block is first marked as delayed
3727c478bd9Sstevel@tonic-gate  * so that it will be written in a reasonable time.
3737c478bd9Sstevel@tonic-gate  */
3747c478bd9Sstevel@tonic-gate void
3757c478bd9Sstevel@tonic-gate bdwrite(struct buf *bp)
3767c478bd9Sstevel@tonic-gate {
3777c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
3787c478bd9Sstevel@tonic-gate 	CPU_STATS_ADD_K(sys, lwrite, 1);
3797c478bd9Sstevel@tonic-gate 	if ((bp->b_flags & B_DELWRI) == 0)
380d3d50737SRafael Vanoni 		bp->b_start = ddi_get_lbolt();
3817c478bd9Sstevel@tonic-gate 	/*
3827c478bd9Sstevel@tonic-gate 	 * B_DONE allows others to use the buffer, B_DELWRI causes the
3837c478bd9Sstevel@tonic-gate 	 * buffer to be written before being reused, and setting b_resid
3847c478bd9Sstevel@tonic-gate 	 * to zero says the buffer is complete.
3857c478bd9Sstevel@tonic-gate 	 */
3867c478bd9Sstevel@tonic-gate 	bp->b_flags |= B_DELWRI | B_DONE;
3877c478bd9Sstevel@tonic-gate 	bp->b_resid = 0;
3887c478bd9Sstevel@tonic-gate 	brelse(bp);
3897c478bd9Sstevel@tonic-gate }
3907c478bd9Sstevel@tonic-gate 
3917c478bd9Sstevel@tonic-gate /*
3927c478bd9Sstevel@tonic-gate  * Release the buffer, start I/O on it, but don't wait for completion.
3937c478bd9Sstevel@tonic-gate  */
3947c478bd9Sstevel@tonic-gate void
3957c478bd9Sstevel@tonic-gate bawrite(struct buf *bp)
3967c478bd9Sstevel@tonic-gate {
3977c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
3987c478bd9Sstevel@tonic-gate 
3997c478bd9Sstevel@tonic-gate 	/* Use bfreelist.b_bcount as a weird-ass heuristic */
4007c478bd9Sstevel@tonic-gate 	if (bfreelist.b_bcount > 4)
4017c478bd9Sstevel@tonic-gate 		bp->b_flags |= B_ASYNC;
4027c478bd9Sstevel@tonic-gate 	BWRITE(bp);
4037c478bd9Sstevel@tonic-gate }
4047c478bd9Sstevel@tonic-gate 
4057c478bd9Sstevel@tonic-gate /*
4067c478bd9Sstevel@tonic-gate  * Release the buffer, with no I/O implied.
4077c478bd9Sstevel@tonic-gate  */
4087c478bd9Sstevel@tonic-gate void
4097c478bd9Sstevel@tonic-gate brelse(struct buf *bp)
4107c478bd9Sstevel@tonic-gate {
4117c478bd9Sstevel@tonic-gate 	struct buf	**backp;
4127c478bd9Sstevel@tonic-gate 	uint_t		index;
4137c478bd9Sstevel@tonic-gate 	kmutex_t	*hmp;
4147c478bd9Sstevel@tonic-gate 	struct	buf	*dp;
4157c478bd9Sstevel@tonic-gate 	struct	hbuf	*hp;
4167c478bd9Sstevel@tonic-gate 
4177c478bd9Sstevel@tonic-gate 
4187c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
4197c478bd9Sstevel@tonic-gate 
4207c478bd9Sstevel@tonic-gate 	/*
4217c478bd9Sstevel@tonic-gate 	 * Clear the retry write flag if the buffer was written without
4227c478bd9Sstevel@tonic-gate 	 * error.  The presence of B_DELWRI means the buffer has not yet
4237c478bd9Sstevel@tonic-gate 	 * been written and the presence of B_ERROR means that an error
4247c478bd9Sstevel@tonic-gate 	 * is still occurring.
4257c478bd9Sstevel@tonic-gate 	 */
4267c478bd9Sstevel@tonic-gate 	if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
4277c478bd9Sstevel@tonic-gate 		bp->b_flags &= ~B_RETRYWRI;
4287c478bd9Sstevel@tonic-gate 	}
4297c478bd9Sstevel@tonic-gate 
4307c478bd9Sstevel@tonic-gate 	/* Check for anomalous conditions */
4317c478bd9Sstevel@tonic-gate 	if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
4327c478bd9Sstevel@tonic-gate 		if (bp->b_flags & B_NOCACHE) {
4337c478bd9Sstevel@tonic-gate 			/* Don't add to the freelist. Destroy it now */
4347c478bd9Sstevel@tonic-gate 			kmem_free(bp->b_un.b_addr, bp->b_bufsize);
4357c478bd9Sstevel@tonic-gate 			sema_destroy(&bp->b_sem);
4367c478bd9Sstevel@tonic-gate 			sema_destroy(&bp->b_io);
4377c478bd9Sstevel@tonic-gate 			kmem_free(bp, sizeof (struct buf));
4387c478bd9Sstevel@tonic-gate 			return;
4397c478bd9Sstevel@tonic-gate 		}
4407c478bd9Sstevel@tonic-gate 		/*
4417c478bd9Sstevel@tonic-gate 		 * If a write failed and we are supposed to retry write,
4427c478bd9Sstevel@tonic-gate 		 * don't toss the buffer.  Keep it around and mark it
4437c478bd9Sstevel@tonic-gate 		 * delayed write in the hopes that it will eventually
4447c478bd9Sstevel@tonic-gate 		 * get flushed (and still keep the system running.)
4457c478bd9Sstevel@tonic-gate 		 */
4467c478bd9Sstevel@tonic-gate 		if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
4477c478bd9Sstevel@tonic-gate 			bp->b_flags |= B_DELWRI;
4487c478bd9Sstevel@tonic-gate 			/* keep fsflush from trying continuously to flush */
449d3d50737SRafael Vanoni 			bp->b_start = ddi_get_lbolt();
4507c478bd9Sstevel@tonic-gate 		} else
4517c478bd9Sstevel@tonic-gate 			bp->b_flags |= B_AGE|B_STALE;
4527c478bd9Sstevel@tonic-gate 		bp->b_flags &= ~B_ERROR;
4537c478bd9Sstevel@tonic-gate 		bp->b_error = 0;
4547c478bd9Sstevel@tonic-gate 	}
4557c478bd9Sstevel@tonic-gate 
4567c478bd9Sstevel@tonic-gate 	/*
4577c478bd9Sstevel@tonic-gate 	 * If delayed write is set then put in on the delayed
4587c478bd9Sstevel@tonic-gate 	 * write list instead of the free buffer list.
4597c478bd9Sstevel@tonic-gate 	 */
4607c478bd9Sstevel@tonic-gate 	index = bio_bhash(bp->b_edev, bp->b_blkno);
4617c478bd9Sstevel@tonic-gate 	hmp   = &hbuf[index].b_lock;
4627c478bd9Sstevel@tonic-gate 
4637c478bd9Sstevel@tonic-gate 	mutex_enter(hmp);
4647c478bd9Sstevel@tonic-gate 	hp = &hbuf[index];
4657c478bd9Sstevel@tonic-gate 	dp = (struct buf *)hp;
4667c478bd9Sstevel@tonic-gate 
4677c478bd9Sstevel@tonic-gate 	/*
4687c478bd9Sstevel@tonic-gate 	 * Make sure that the number of entries on this list are
4697c478bd9Sstevel@tonic-gate 	 * Zero <= count <= total # buffers
4707c478bd9Sstevel@tonic-gate 	 */
4717c478bd9Sstevel@tonic-gate 	ASSERT(hp->b_length >= 0);
4727c478bd9Sstevel@tonic-gate 	ASSERT(hp->b_length < nbuf);
4737c478bd9Sstevel@tonic-gate 
4747c478bd9Sstevel@tonic-gate 	hp->b_length++;		/* We are adding this buffer */
4757c478bd9Sstevel@tonic-gate 
4767c478bd9Sstevel@tonic-gate 	if (bp->b_flags & B_DELWRI) {
4777c478bd9Sstevel@tonic-gate 		/*
4787c478bd9Sstevel@tonic-gate 		 * This buffer goes on the delayed write buffer list
4797c478bd9Sstevel@tonic-gate 		 */
4807c478bd9Sstevel@tonic-gate 		dp = (struct buf *)&dwbuf[index];
4817c478bd9Sstevel@tonic-gate 	}
4827c478bd9Sstevel@tonic-gate 	ASSERT(bp->b_bufsize > 0);
4837c478bd9Sstevel@tonic-gate 	ASSERT(bp->b_bcount > 0);
4847c478bd9Sstevel@tonic-gate 	ASSERT(bp->b_un.b_addr != NULL);
4857c478bd9Sstevel@tonic-gate 
4867c478bd9Sstevel@tonic-gate 	if (bp->b_flags & B_AGE) {
4877c478bd9Sstevel@tonic-gate 		backp = &dp->av_forw;
4887c478bd9Sstevel@tonic-gate 		(*backp)->av_back = bp;
4897c478bd9Sstevel@tonic-gate 		bp->av_forw = *backp;
4907c478bd9Sstevel@tonic-gate 		*backp = bp;
4917c478bd9Sstevel@tonic-gate 		bp->av_back = dp;
4927c478bd9Sstevel@tonic-gate 	} else {
4937c478bd9Sstevel@tonic-gate 		backp = &dp->av_back;
4947c478bd9Sstevel@tonic-gate 		(*backp)->av_forw = bp;
4957c478bd9Sstevel@tonic-gate 		bp->av_back = *backp;
4967c478bd9Sstevel@tonic-gate 		*backp = bp;
4977c478bd9Sstevel@tonic-gate 		bp->av_forw = dp;
4987c478bd9Sstevel@tonic-gate 	}
4997c478bd9Sstevel@tonic-gate 	mutex_exit(hmp);
5007c478bd9Sstevel@tonic-gate 
5017c478bd9Sstevel@tonic-gate 	if (bfreelist.b_flags & B_WANTED) {
5027c478bd9Sstevel@tonic-gate 		/*
5037c478bd9Sstevel@tonic-gate 		 * Should come here very very rarely.
5047c478bd9Sstevel@tonic-gate 		 */
5057c478bd9Sstevel@tonic-gate 		mutex_enter(&bfree_lock);
5067c478bd9Sstevel@tonic-gate 		if (bfreelist.b_flags & B_WANTED) {
5077c478bd9Sstevel@tonic-gate 			bfreelist.b_flags &= ~B_WANTED;
5087c478bd9Sstevel@tonic-gate 			cv_broadcast(&bio_mem_cv);
5097c478bd9Sstevel@tonic-gate 		}
5107c478bd9Sstevel@tonic-gate 		mutex_exit(&bfree_lock);
5117c478bd9Sstevel@tonic-gate 	}
5127c478bd9Sstevel@tonic-gate 
5137c478bd9Sstevel@tonic-gate 	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
5147c478bd9Sstevel@tonic-gate 	/*
5157c478bd9Sstevel@tonic-gate 	 * Don't let anyone get the buffer off the freelist before we
5167c478bd9Sstevel@tonic-gate 	 * release our hold on it.
5177c478bd9Sstevel@tonic-gate 	 */
5187c478bd9Sstevel@tonic-gate 	sema_v(&bp->b_sem);
5197c478bd9Sstevel@tonic-gate }
5207c478bd9Sstevel@tonic-gate 
5217c478bd9Sstevel@tonic-gate /*
5227c478bd9Sstevel@tonic-gate  * Return a count of the number of B_BUSY buffers in the system
5237c478bd9Sstevel@tonic-gate  * Can only be used as a good estimate.  If 'cleanit' is set,
5247c478bd9Sstevel@tonic-gate  * try to flush all bufs.
5257c478bd9Sstevel@tonic-gate  */
5267c478bd9Sstevel@tonic-gate int
5277c478bd9Sstevel@tonic-gate bio_busy(int cleanit)
5287c478bd9Sstevel@tonic-gate {
5297c478bd9Sstevel@tonic-gate 	struct buf *bp, *dp;
5307c478bd9Sstevel@tonic-gate 	int busy = 0;
5317c478bd9Sstevel@tonic-gate 	int i;
5327c478bd9Sstevel@tonic-gate 	kmutex_t *hmp;
5337c478bd9Sstevel@tonic-gate 
5347c478bd9Sstevel@tonic-gate 	for (i = 0; i < v.v_hbuf; i++) {
5357c478bd9Sstevel@tonic-gate 		dp = (struct buf *)&hbuf[i];
5367c478bd9Sstevel@tonic-gate 		hmp = &hbuf[i].b_lock;
5377c478bd9Sstevel@tonic-gate 
5387c478bd9Sstevel@tonic-gate 		mutex_enter(hmp);
5397c478bd9Sstevel@tonic-gate 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
5407c478bd9Sstevel@tonic-gate 			if (bp->b_flags & B_BUSY)
5417c478bd9Sstevel@tonic-gate 				busy++;
5427c478bd9Sstevel@tonic-gate 		}
5437c478bd9Sstevel@tonic-gate 		mutex_exit(hmp);
5447c478bd9Sstevel@tonic-gate 	}
5457c478bd9Sstevel@tonic-gate 
5467c478bd9Sstevel@tonic-gate 	if (cleanit && busy != 0) {
5477c478bd9Sstevel@tonic-gate 		bflush(NODEV);
5487c478bd9Sstevel@tonic-gate 	}
5497c478bd9Sstevel@tonic-gate 
5507c478bd9Sstevel@tonic-gate 	return (busy);
5517c478bd9Sstevel@tonic-gate }
5527c478bd9Sstevel@tonic-gate 
5537c478bd9Sstevel@tonic-gate /*
5547c478bd9Sstevel@tonic-gate  * this interface is provided for binary compatibility.
5557c478bd9Sstevel@tonic-gate  *
5567c478bd9Sstevel@tonic-gate  * Assign a buffer for the given block.  If the appropriate
5577c478bd9Sstevel@tonic-gate  * block is already associated, return it; otherwise search
5587c478bd9Sstevel@tonic-gate  * for the oldest non-busy buffer and reassign it.
5597c478bd9Sstevel@tonic-gate  */
5607c478bd9Sstevel@tonic-gate struct buf *
5617c478bd9Sstevel@tonic-gate getblk(dev_t dev, daddr_t blkno, long bsize)
5627c478bd9Sstevel@tonic-gate {
5637c478bd9Sstevel@tonic-gate 	return (getblk_common(/* ufsvfsp */ NULL, dev,
564d3d50737SRafael Vanoni 	    blkno, bsize, /* errflg */ 0));
5657c478bd9Sstevel@tonic-gate }
5667c478bd9Sstevel@tonic-gate 
5677c478bd9Sstevel@tonic-gate /*
5687c478bd9Sstevel@tonic-gate  * Assign a buffer for the given block.  If the appropriate
5697c478bd9Sstevel@tonic-gate  * block is already associated, return it; otherwise search
5707c478bd9Sstevel@tonic-gate  * for the oldest non-busy buffer and reassign it.
5717c478bd9Sstevel@tonic-gate  */
5727c478bd9Sstevel@tonic-gate struct buf *
5737c478bd9Sstevel@tonic-gate getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, int errflg)
5747c478bd9Sstevel@tonic-gate {
5757c478bd9Sstevel@tonic-gate 	ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
5767c478bd9Sstevel@tonic-gate 	struct buf *bp;
5777c478bd9Sstevel@tonic-gate 	struct buf *dp;
5787c478bd9Sstevel@tonic-gate 	struct buf *nbp = NULL;
5797c478bd9Sstevel@tonic-gate 	struct buf *errbp;
5807c478bd9Sstevel@tonic-gate 	uint_t		index;
5817c478bd9Sstevel@tonic-gate 	kmutex_t	*hmp;
5827c478bd9Sstevel@tonic-gate 	struct	hbuf	*hp;
5837c478bd9Sstevel@tonic-gate 
5847c478bd9Sstevel@tonic-gate 	if (getmajor(dev) >= devcnt)
5857c478bd9Sstevel@tonic-gate 		cmn_err(CE_PANIC, "blkdev");
5867c478bd9Sstevel@tonic-gate 
5877c478bd9Sstevel@tonic-gate 	biostats.bio_lookup.value.ui32++;
5887c478bd9Sstevel@tonic-gate 
5897c478bd9Sstevel@tonic-gate 	index = bio_bhash(dev, blkno);
5907c478bd9Sstevel@tonic-gate 	hp    = &hbuf[index];
5917c478bd9Sstevel@tonic-gate 	dp    = (struct buf *)hp;
5927c478bd9Sstevel@tonic-gate 	hmp   = &hp->b_lock;
5937c478bd9Sstevel@tonic-gate 
5947c478bd9Sstevel@tonic-gate 	mutex_enter(hmp);
5957c478bd9Sstevel@tonic-gate loop:
5967c478bd9Sstevel@tonic-gate 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
5977c478bd9Sstevel@tonic-gate 		if (bp->b_blkno != blkno || bp->b_edev != dev ||
5987c478bd9Sstevel@tonic-gate 		    (bp->b_flags & B_STALE))
5997c478bd9Sstevel@tonic-gate 			continue;
6007c478bd9Sstevel@tonic-gate 		/*
6017c478bd9Sstevel@tonic-gate 		 * Avoid holding the hash lock in the event that
6027c478bd9Sstevel@tonic-gate 		 * the buffer is locked by someone. Since the hash chain
6037c478bd9Sstevel@tonic-gate 		 * may change when we drop the hash lock
6047c478bd9Sstevel@tonic-gate 		 * we have to start at the beginning of the chain if the
6057c478bd9Sstevel@tonic-gate 		 * buffer identity/contents aren't valid.
6067c478bd9Sstevel@tonic-gate 		 */
6077c478bd9Sstevel@tonic-gate 		if (!sema_tryp(&bp->b_sem)) {
6087c478bd9Sstevel@tonic-gate 			biostats.bio_bufbusy.value.ui32++;
6097c478bd9Sstevel@tonic-gate 			mutex_exit(hmp);
6107c478bd9Sstevel@tonic-gate 			/*
6117c478bd9Sstevel@tonic-gate 			 * OK, we are dealing with a busy buffer.
6127c478bd9Sstevel@tonic-gate 			 * In the case that we are panicking and we
6137c478bd9Sstevel@tonic-gate 			 * got called from bread(), we have some chance
6147c478bd9Sstevel@tonic-gate 			 * for error recovery. So better bail out from
6157c478bd9Sstevel@tonic-gate 			 * here since sema_p() won't block. If we got
6167c478bd9Sstevel@tonic-gate 			 * called directly from ufs routines, there is
6177c478bd9Sstevel@tonic-gate 			 * no way to report an error yet.
6187c478bd9Sstevel@tonic-gate 			 */
6197c478bd9Sstevel@tonic-gate 			if (panicstr && errflg)
6207c478bd9Sstevel@tonic-gate 				goto errout;
6217c478bd9Sstevel@tonic-gate 			/*
6227c478bd9Sstevel@tonic-gate 			 * For the following line of code to work
6237c478bd9Sstevel@tonic-gate 			 * correctly never kmem_free the buffer "header".
6247c478bd9Sstevel@tonic-gate 			 */
6257c478bd9Sstevel@tonic-gate 			sema_p(&bp->b_sem);
6267c478bd9Sstevel@tonic-gate 			if (bp->b_blkno != blkno || bp->b_edev != dev ||
6277c478bd9Sstevel@tonic-gate 			    (bp->b_flags & B_STALE)) {
6287c478bd9Sstevel@tonic-gate 				sema_v(&bp->b_sem);
6297c478bd9Sstevel@tonic-gate 				mutex_enter(hmp);
6307c478bd9Sstevel@tonic-gate 				goto loop;	/* start over */
6317c478bd9Sstevel@tonic-gate 			}
6327c478bd9Sstevel@tonic-gate 			mutex_enter(hmp);
6337c478bd9Sstevel@tonic-gate 		}
6347c478bd9Sstevel@tonic-gate 		/* Found */
6357c478bd9Sstevel@tonic-gate 		biostats.bio_hit.value.ui32++;
6367c478bd9Sstevel@tonic-gate 		bp->b_flags &= ~B_AGE;
6377c478bd9Sstevel@tonic-gate 
6387c478bd9Sstevel@tonic-gate 		/*
6397c478bd9Sstevel@tonic-gate 		 * Yank it off the free/delayed write lists
6407c478bd9Sstevel@tonic-gate 		 */
6417c478bd9Sstevel@tonic-gate 		hp->b_length--;
6427c478bd9Sstevel@tonic-gate 		notavail(bp);
6437c478bd9Sstevel@tonic-gate 		mutex_exit(hmp);
6447c478bd9Sstevel@tonic-gate 
6457c478bd9Sstevel@tonic-gate 		ASSERT((bp->b_flags & B_NOCACHE) == NULL);
6467c478bd9Sstevel@tonic-gate 
6477c478bd9Sstevel@tonic-gate 		if (nbp == NULL) {
6487c478bd9Sstevel@tonic-gate 			/*
6497c478bd9Sstevel@tonic-gate 			 * Make the common path short.
6507c478bd9Sstevel@tonic-gate 			 */
6517c478bd9Sstevel@tonic-gate 			ASSERT(SEMA_HELD(&bp->b_sem));
6527c478bd9Sstevel@tonic-gate 			return (bp);
6537c478bd9Sstevel@tonic-gate 		}
6547c478bd9Sstevel@tonic-gate 
6557c478bd9Sstevel@tonic-gate 		biostats.bio_bufdup.value.ui32++;
6567c478bd9Sstevel@tonic-gate 
6577c478bd9Sstevel@tonic-gate 		/*
6587c478bd9Sstevel@tonic-gate 		 * The buffer must have entered during the lock upgrade
6597c478bd9Sstevel@tonic-gate 		 * so free the new buffer we allocated and return the
6607c478bd9Sstevel@tonic-gate 		 * found buffer.
6617c478bd9Sstevel@tonic-gate 		 */
6627c478bd9Sstevel@tonic-gate 		kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
6637c478bd9Sstevel@tonic-gate 		nbp->b_un.b_addr = NULL;
6647c478bd9Sstevel@tonic-gate 
6657c478bd9Sstevel@tonic-gate 		/*
6667c478bd9Sstevel@tonic-gate 		 * Account for the memory
6677c478bd9Sstevel@tonic-gate 		 */
6687c478bd9Sstevel@tonic-gate 		mutex_enter(&bfree_lock);
6697c478bd9Sstevel@tonic-gate 		bfreelist.b_bufsize += nbp->b_bufsize;
6707c478bd9Sstevel@tonic-gate 		mutex_exit(&bfree_lock);
6717c478bd9Sstevel@tonic-gate 
6727c478bd9Sstevel@tonic-gate 		/*
6737c478bd9Sstevel@tonic-gate 		 * Destroy buf identity, and place on avail list
6747c478bd9Sstevel@tonic-gate 		 */
6757c478bd9Sstevel@tonic-gate 		nbp->b_dev = (o_dev_t)NODEV;
6767c478bd9Sstevel@tonic-gate 		nbp->b_edev = NODEV;
6777c478bd9Sstevel@tonic-gate 		nbp->b_flags = 0;
6787c478bd9Sstevel@tonic-gate 		nbp->b_file = NULL;
6797c478bd9Sstevel@tonic-gate 		nbp->b_offset = -1;
6807c478bd9Sstevel@tonic-gate 
6817c478bd9Sstevel@tonic-gate 		sema_v(&nbp->b_sem);
6827c478bd9Sstevel@tonic-gate 		bio_bhdr_free(nbp);
6837c478bd9Sstevel@tonic-gate 
6847c478bd9Sstevel@tonic-gate 		ASSERT(SEMA_HELD(&bp->b_sem));
6857c478bd9Sstevel@tonic-gate 		return (bp);
6867c478bd9Sstevel@tonic-gate 	}
6877c478bd9Sstevel@tonic-gate 
6887c478bd9Sstevel@tonic-gate 	/*
6897c478bd9Sstevel@tonic-gate 	 * bio_getfreeblk may block so check the hash chain again.
6907c478bd9Sstevel@tonic-gate 	 */
6917c478bd9Sstevel@tonic-gate 	if (nbp == NULL) {
6927c478bd9Sstevel@tonic-gate 		mutex_exit(hmp);
6937c478bd9Sstevel@tonic-gate 		nbp = bio_getfreeblk(bsize);
6947c478bd9Sstevel@tonic-gate 		mutex_enter(hmp);
6957c478bd9Sstevel@tonic-gate 		goto loop;
6967c478bd9Sstevel@tonic-gate 	}
6977c478bd9Sstevel@tonic-gate 
6987c478bd9Sstevel@tonic-gate 	/*
6997c478bd9Sstevel@tonic-gate 	 * New buffer. Assign nbp and stick it on the hash.
7007c478bd9Sstevel@tonic-gate 	 */
7017c478bd9Sstevel@tonic-gate 	nbp->b_flags = B_BUSY;
7027c478bd9Sstevel@tonic-gate 	nbp->b_edev = dev;
7037c478bd9Sstevel@tonic-gate 	nbp->b_dev = (o_dev_t)cmpdev(dev);
7047c478bd9Sstevel@tonic-gate 	nbp->b_blkno = blkno;
7057c478bd9Sstevel@tonic-gate 	nbp->b_iodone = NULL;
7067c478bd9Sstevel@tonic-gate 	nbp->b_bcount = bsize;
7077c478bd9Sstevel@tonic-gate 	/*
7087c478bd9Sstevel@tonic-gate 	 * If we are given a ufsvfsp and the vfs_root field is NULL
7097c478bd9Sstevel@tonic-gate 	 * then this must be I/O for a superblock.  A superblock's
7107c478bd9Sstevel@tonic-gate 	 * buffer is set up in mountfs() and there is no root vnode
7117c478bd9Sstevel@tonic-gate 	 * at that point.
7127c478bd9Sstevel@tonic-gate 	 */
7137c478bd9Sstevel@tonic-gate 	if (ufsvfsp && ufsvfsp->vfs_root) {
7147c478bd9Sstevel@tonic-gate 		nbp->b_vp = ufsvfsp->vfs_root;
7157c478bd9Sstevel@tonic-gate 	} else {
7167c478bd9Sstevel@tonic-gate 		nbp->b_vp = NULL;
7177c478bd9Sstevel@tonic-gate 	}
7187c478bd9Sstevel@tonic-gate 
7197c478bd9Sstevel@tonic-gate 	ASSERT((nbp->b_flags & B_NOCACHE) == NULL);
7207c478bd9Sstevel@tonic-gate 
7217c478bd9Sstevel@tonic-gate 	binshash(nbp, dp);
7227c478bd9Sstevel@tonic-gate 	mutex_exit(hmp);
7237c478bd9Sstevel@tonic-gate 
7247c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&nbp->b_sem));
7257c478bd9Sstevel@tonic-gate 
7267c478bd9Sstevel@tonic-gate 	return (nbp);
7277c478bd9Sstevel@tonic-gate 
7287c478bd9Sstevel@tonic-gate 
7297c478bd9Sstevel@tonic-gate 	/*
7307c478bd9Sstevel@tonic-gate 	 * Come here in case of an internal error. At this point we couldn't
731*48bbca81SDaniel Hoffman 	 * get a buffer, but we have to return one. Hence we allocate some
7327c478bd9Sstevel@tonic-gate 	 * kind of error reply buffer on the fly. This buffer is marked as
7337c478bd9Sstevel@tonic-gate 	 * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
7347c478bd9Sstevel@tonic-gate 	 *	- B_ERROR will indicate error to the caller.
7357c478bd9Sstevel@tonic-gate 	 *	- B_DONE will prevent us from reading the buffer from
7367c478bd9Sstevel@tonic-gate 	 *	  the device.
7377c478bd9Sstevel@tonic-gate 	 *	- B_NOCACHE will cause that this buffer gets free'd in
7387c478bd9Sstevel@tonic-gate 	 *	  brelse().
7397c478bd9Sstevel@tonic-gate 	 */
7407c478bd9Sstevel@tonic-gate 
7417c478bd9Sstevel@tonic-gate errout:
7427c478bd9Sstevel@tonic-gate 	errbp = geteblk();
7437c478bd9Sstevel@tonic-gate 	sema_p(&errbp->b_sem);
7447c478bd9Sstevel@tonic-gate 	errbp->b_flags &= ~B_BUSY;
7457c478bd9Sstevel@tonic-gate 	errbp->b_flags |= (B_ERROR | B_DONE);
7467c478bd9Sstevel@tonic-gate 	return (errbp);
7477c478bd9Sstevel@tonic-gate }
7487c478bd9Sstevel@tonic-gate 
7497c478bd9Sstevel@tonic-gate /*
7507c478bd9Sstevel@tonic-gate  * Get an empty block, not assigned to any particular device.
7517c478bd9Sstevel@tonic-gate  * Returns a locked buffer that is not on any hash or free list.
7527c478bd9Sstevel@tonic-gate  */
7537c478bd9Sstevel@tonic-gate struct buf *
7547c478bd9Sstevel@tonic-gate ngeteblk(long bsize)
7557c478bd9Sstevel@tonic-gate {
7567c478bd9Sstevel@tonic-gate 	struct buf *bp;
7577c478bd9Sstevel@tonic-gate 
7587c478bd9Sstevel@tonic-gate 	bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
7597c478bd9Sstevel@tonic-gate 	bioinit(bp);
7607c478bd9Sstevel@tonic-gate 	bp->av_forw = bp->av_back = NULL;
7617c478bd9Sstevel@tonic-gate 	bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
7627c478bd9Sstevel@tonic-gate 	bp->b_bufsize = bsize;
7637c478bd9Sstevel@tonic-gate 	bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
7647c478bd9Sstevel@tonic-gate 	bp->b_dev = (o_dev_t)NODEV;
7657c478bd9Sstevel@tonic-gate 	bp->b_edev = NODEV;
7667c478bd9Sstevel@tonic-gate 	bp->b_lblkno = 0;
7677c478bd9Sstevel@tonic-gate 	bp->b_bcount = bsize;
7687c478bd9Sstevel@tonic-gate 	bp->b_iodone = NULL;
7697c478bd9Sstevel@tonic-gate 	return (bp);
7707c478bd9Sstevel@tonic-gate }
7717c478bd9Sstevel@tonic-gate 
/*
 * The geteblk() interface is left untouched for the sake of driver
 * compatibility; it always hands out a 1 KB block.  Callers that need
 * any other block size should use ngeteblk() instead.
 */
struct buf *
geteblk(void)
{
	long bsize = 1024;

	return (ngeteblk(bsize));
}
7817c478bd9Sstevel@tonic-gate 
7827c478bd9Sstevel@tonic-gate /*
7837c478bd9Sstevel@tonic-gate  * Return a buffer w/o sleeping
7847c478bd9Sstevel@tonic-gate  */
7857c478bd9Sstevel@tonic-gate struct buf *
7867c478bd9Sstevel@tonic-gate trygetblk(dev_t dev, daddr_t blkno)
7877c478bd9Sstevel@tonic-gate {
7887c478bd9Sstevel@tonic-gate 	struct buf	*bp;
7897c478bd9Sstevel@tonic-gate 	struct buf	*dp;
7907c478bd9Sstevel@tonic-gate 	struct hbuf	*hp;
7917c478bd9Sstevel@tonic-gate 	kmutex_t	*hmp;
7927c478bd9Sstevel@tonic-gate 	uint_t		index;
7937c478bd9Sstevel@tonic-gate 
7947c478bd9Sstevel@tonic-gate 	index = bio_bhash(dev, blkno);
7957c478bd9Sstevel@tonic-gate 	hp = &hbuf[index];
7967c478bd9Sstevel@tonic-gate 	hmp = &hp->b_lock;
7977c478bd9Sstevel@tonic-gate 
7987c478bd9Sstevel@tonic-gate 	if (!mutex_tryenter(hmp))
7997c478bd9Sstevel@tonic-gate 		return (NULL);
8007c478bd9Sstevel@tonic-gate 
8017c478bd9Sstevel@tonic-gate 	dp = (struct buf *)hp;
8027c478bd9Sstevel@tonic-gate 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
8037c478bd9Sstevel@tonic-gate 		if (bp->b_blkno != blkno || bp->b_edev != dev ||
8047c478bd9Sstevel@tonic-gate 		    (bp->b_flags & B_STALE))
8057c478bd9Sstevel@tonic-gate 			continue;
8067c478bd9Sstevel@tonic-gate 		/*
8077c478bd9Sstevel@tonic-gate 		 * Get access to a valid buffer without sleeping
8087c478bd9Sstevel@tonic-gate 		 */
8097c478bd9Sstevel@tonic-gate 		if (sema_tryp(&bp->b_sem)) {
8107c478bd9Sstevel@tonic-gate 			if (bp->b_flags & B_DONE) {
8117c478bd9Sstevel@tonic-gate 				hp->b_length--;
8127c478bd9Sstevel@tonic-gate 				notavail(bp);
8137c478bd9Sstevel@tonic-gate 				mutex_exit(hmp);
8147c478bd9Sstevel@tonic-gate 				return (bp);
8157c478bd9Sstevel@tonic-gate 			} else {
8167c478bd9Sstevel@tonic-gate 				sema_v(&bp->b_sem);
8177c478bd9Sstevel@tonic-gate 				break;
8187c478bd9Sstevel@tonic-gate 			}
8197c478bd9Sstevel@tonic-gate 		}
8207c478bd9Sstevel@tonic-gate 		break;
8217c478bd9Sstevel@tonic-gate 	}
8227c478bd9Sstevel@tonic-gate 	mutex_exit(hmp);
8237c478bd9Sstevel@tonic-gate 	return (NULL);
8247c478bd9Sstevel@tonic-gate }
8257c478bd9Sstevel@tonic-gate 
8267c478bd9Sstevel@tonic-gate /*
8277c478bd9Sstevel@tonic-gate  * Wait for I/O completion on the buffer; return errors
8287c478bd9Sstevel@tonic-gate  * to the user.
8297c478bd9Sstevel@tonic-gate  */
8307c478bd9Sstevel@tonic-gate int
8317c478bd9Sstevel@tonic-gate iowait(struct buf *bp)
8327c478bd9Sstevel@tonic-gate {
8337c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
8347c478bd9Sstevel@tonic-gate 	return (biowait(bp));
8357c478bd9Sstevel@tonic-gate }
8367c478bd9Sstevel@tonic-gate 
8377c478bd9Sstevel@tonic-gate /*
8387c478bd9Sstevel@tonic-gate  * Mark I/O complete on a buffer, release it if I/O is asynchronous,
8397c478bd9Sstevel@tonic-gate  * and wake up anyone waiting for it.
8407c478bd9Sstevel@tonic-gate  */
8417c478bd9Sstevel@tonic-gate void
8427c478bd9Sstevel@tonic-gate iodone(struct buf *bp)
8437c478bd9Sstevel@tonic-gate {
8447c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
8457c478bd9Sstevel@tonic-gate 	(void) biodone(bp);
8467c478bd9Sstevel@tonic-gate }
8477c478bd9Sstevel@tonic-gate 
8487c478bd9Sstevel@tonic-gate /*
8497c478bd9Sstevel@tonic-gate  * Zero the core associated with a buffer.
8507c478bd9Sstevel@tonic-gate  */
8517c478bd9Sstevel@tonic-gate void
8527c478bd9Sstevel@tonic-gate clrbuf(struct buf *bp)
8537c478bd9Sstevel@tonic-gate {
8547c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
8557c478bd9Sstevel@tonic-gate 	bzero(bp->b_un.b_addr, bp->b_bcount);
8567c478bd9Sstevel@tonic-gate 	bp->b_resid = 0;
8577c478bd9Sstevel@tonic-gate }
8587c478bd9Sstevel@tonic-gate 
8597c478bd9Sstevel@tonic-gate 
8607c478bd9Sstevel@tonic-gate /*
8617c478bd9Sstevel@tonic-gate  * Make sure all write-behind blocks on dev (or NODEV for all)
8627c478bd9Sstevel@tonic-gate  * are flushed out.
8637c478bd9Sstevel@tonic-gate  */
8647c478bd9Sstevel@tonic-gate void
8657c478bd9Sstevel@tonic-gate bflush(dev_t dev)
8667c478bd9Sstevel@tonic-gate {
8677c478bd9Sstevel@tonic-gate 	struct buf *bp, *dp;
8687c478bd9Sstevel@tonic-gate 	struct hbuf *hp;
8697c478bd9Sstevel@tonic-gate 	struct buf *delwri_list = EMPTY_LIST;
8707c478bd9Sstevel@tonic-gate 	int i, index;
8717c478bd9Sstevel@tonic-gate 	kmutex_t *hmp;
8727c478bd9Sstevel@tonic-gate 
8737c478bd9Sstevel@tonic-gate 	mutex_enter(&blist_lock);
8747c478bd9Sstevel@tonic-gate 	/*
8757c478bd9Sstevel@tonic-gate 	 * Wait for any invalidates or flushes ahead of us to finish.
8767c478bd9Sstevel@tonic-gate 	 * We really could split blist_lock up per device for better
8777c478bd9Sstevel@tonic-gate 	 * parallelism here.
8787c478bd9Sstevel@tonic-gate 	 */
8797c478bd9Sstevel@tonic-gate 	while (bio_doinginval || bio_doingflush) {
8807c478bd9Sstevel@tonic-gate 		bio_flinv_cv_wanted = 1;
8817c478bd9Sstevel@tonic-gate 		cv_wait(&bio_flushinval_cv, &blist_lock);
8827c478bd9Sstevel@tonic-gate 	}
8837c478bd9Sstevel@tonic-gate 	bio_doingflush++;
8847c478bd9Sstevel@tonic-gate 	/*
8857c478bd9Sstevel@tonic-gate 	 * Gather all B_DELWRI buffer for device.
8867c478bd9Sstevel@tonic-gate 	 * Lock ordering is b_sem > hash lock (brelse).
8877c478bd9Sstevel@tonic-gate 	 * Since we are finding the buffer via the delayed write list,
8887c478bd9Sstevel@tonic-gate 	 * it may be busy and we would block trying to get the
8897c478bd9Sstevel@tonic-gate 	 * b_sem lock while holding hash lock. So transfer all the
8907c478bd9Sstevel@tonic-gate 	 * candidates on the delwri_list and then drop the hash locks.
8917c478bd9Sstevel@tonic-gate 	 */
8927c478bd9Sstevel@tonic-gate 	for (i = 0; i < v.v_hbuf; i++) {
8937c478bd9Sstevel@tonic-gate 		hmp = &hbuf[i].b_lock;
8947c478bd9Sstevel@tonic-gate 		dp = (struct buf *)&dwbuf[i];
8957c478bd9Sstevel@tonic-gate 		mutex_enter(hmp);
8967c478bd9Sstevel@tonic-gate 		for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
8977c478bd9Sstevel@tonic-gate 			if (dev == NODEV || bp->b_edev == dev) {
8987c478bd9Sstevel@tonic-gate 				if (bp->b_list == NULL) {
8997c478bd9Sstevel@tonic-gate 					bp->b_list = delwri_list;
9007c478bd9Sstevel@tonic-gate 					delwri_list = bp;
9017c478bd9Sstevel@tonic-gate 				}
9027c478bd9Sstevel@tonic-gate 			}
9037c478bd9Sstevel@tonic-gate 		}
9047c478bd9Sstevel@tonic-gate 		mutex_exit(hmp);
9057c478bd9Sstevel@tonic-gate 	}
9067c478bd9Sstevel@tonic-gate 	mutex_exit(&blist_lock);
9077c478bd9Sstevel@tonic-gate 
9087c478bd9Sstevel@tonic-gate 	/*
9097c478bd9Sstevel@tonic-gate 	 * Now that the hash locks have been dropped grab the semaphores
9107c478bd9Sstevel@tonic-gate 	 * and write back all the buffers that have B_DELWRI set.
9117c478bd9Sstevel@tonic-gate 	 */
9127c478bd9Sstevel@tonic-gate 	while (delwri_list != EMPTY_LIST) {
9137c478bd9Sstevel@tonic-gate 		bp = delwri_list;
9147c478bd9Sstevel@tonic-gate 
9157c478bd9Sstevel@tonic-gate 		sema_p(&bp->b_sem);	/* may block */
9167c478bd9Sstevel@tonic-gate 		if ((dev != bp->b_edev && dev != NODEV) ||
9177c478bd9Sstevel@tonic-gate 		    (panicstr && bp->b_flags & B_BUSY)) {
9187c478bd9Sstevel@tonic-gate 			sema_v(&bp->b_sem);
9197c478bd9Sstevel@tonic-gate 			delwri_list = bp->b_list;
9207c478bd9Sstevel@tonic-gate 			bp->b_list = NULL;
9217c478bd9Sstevel@tonic-gate 			continue;	/* No longer a candidate */
9227c478bd9Sstevel@tonic-gate 		}
9237c478bd9Sstevel@tonic-gate 		if (bp->b_flags & B_DELWRI) {
9247c478bd9Sstevel@tonic-gate 			index = bio_bhash(bp->b_edev, bp->b_blkno);
9257c478bd9Sstevel@tonic-gate 			hp = &hbuf[index];
9267c478bd9Sstevel@tonic-gate 			hmp = &hp->b_lock;
9277c478bd9Sstevel@tonic-gate 			dp = (struct buf *)hp;
9287c478bd9Sstevel@tonic-gate 
9297c478bd9Sstevel@tonic-gate 			bp->b_flags |= B_ASYNC;
9307c478bd9Sstevel@tonic-gate 			mutex_enter(hmp);
9317c478bd9Sstevel@tonic-gate 			hp->b_length--;
9327c478bd9Sstevel@tonic-gate 			notavail(bp);
9337c478bd9Sstevel@tonic-gate 			mutex_exit(hmp);
9347c478bd9Sstevel@tonic-gate 			if (bp->b_vp == NULL) {		/* !ufs */
9357c478bd9Sstevel@tonic-gate 				BWRITE(bp);
9367c478bd9Sstevel@tonic-gate 			} else {			/* ufs */
9377c478bd9Sstevel@tonic-gate 				UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
9387c478bd9Sstevel@tonic-gate 			}
9397c478bd9Sstevel@tonic-gate 		} else {
9407c478bd9Sstevel@tonic-gate 			sema_v(&bp->b_sem);
9417c478bd9Sstevel@tonic-gate 		}
9427c478bd9Sstevel@tonic-gate 		delwri_list = bp->b_list;
9437c478bd9Sstevel@tonic-gate 		bp->b_list = NULL;
9447c478bd9Sstevel@tonic-gate 	}
9457c478bd9Sstevel@tonic-gate 	mutex_enter(&blist_lock);
9467c478bd9Sstevel@tonic-gate 	bio_doingflush--;
9477c478bd9Sstevel@tonic-gate 	if (bio_flinv_cv_wanted) {
9487c478bd9Sstevel@tonic-gate 		bio_flinv_cv_wanted = 0;
9497c478bd9Sstevel@tonic-gate 		cv_broadcast(&bio_flushinval_cv);
9507c478bd9Sstevel@tonic-gate 	}
9517c478bd9Sstevel@tonic-gate 	mutex_exit(&blist_lock);
9527c478bd9Sstevel@tonic-gate }
9537c478bd9Sstevel@tonic-gate 
9547c478bd9Sstevel@tonic-gate /*
9557c478bd9Sstevel@tonic-gate  * Ensure that a specified block is up-to-date on disk.
9567c478bd9Sstevel@tonic-gate  */
9577c478bd9Sstevel@tonic-gate void
9587c478bd9Sstevel@tonic-gate blkflush(dev_t dev, daddr_t blkno)
9597c478bd9Sstevel@tonic-gate {
9607c478bd9Sstevel@tonic-gate 	struct buf *bp, *dp;
9617c478bd9Sstevel@tonic-gate 	struct hbuf *hp;
9627c478bd9Sstevel@tonic-gate 	struct buf *sbp = NULL;
9637c478bd9Sstevel@tonic-gate 	uint_t index;
9647c478bd9Sstevel@tonic-gate 	kmutex_t *hmp;
9657c478bd9Sstevel@tonic-gate 
9667c478bd9Sstevel@tonic-gate 	index = bio_bhash(dev, blkno);
9677c478bd9Sstevel@tonic-gate 	hp    = &hbuf[index];
9687c478bd9Sstevel@tonic-gate 	dp    = (struct buf *)hp;
9697c478bd9Sstevel@tonic-gate 	hmp   = &hp->b_lock;
9707c478bd9Sstevel@tonic-gate 
9717c478bd9Sstevel@tonic-gate 	/*
9727c478bd9Sstevel@tonic-gate 	 * Identify the buffer in the cache belonging to
9737c478bd9Sstevel@tonic-gate 	 * this device and blkno (if any).
9747c478bd9Sstevel@tonic-gate 	 */
9757c478bd9Sstevel@tonic-gate 	mutex_enter(hmp);
9767c478bd9Sstevel@tonic-gate 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
9777c478bd9Sstevel@tonic-gate 		if (bp->b_blkno != blkno || bp->b_edev != dev ||
9787c478bd9Sstevel@tonic-gate 		    (bp->b_flags & B_STALE))
9797c478bd9Sstevel@tonic-gate 			continue;
9807c478bd9Sstevel@tonic-gate 		sbp = bp;
9817c478bd9Sstevel@tonic-gate 		break;
9827c478bd9Sstevel@tonic-gate 	}
9837c478bd9Sstevel@tonic-gate 	mutex_exit(hmp);
9847c478bd9Sstevel@tonic-gate 	if (sbp == NULL)
9857c478bd9Sstevel@tonic-gate 		return;
9867c478bd9Sstevel@tonic-gate 	/*
9877c478bd9Sstevel@tonic-gate 	 * Now check the buffer we have identified and
9887c478bd9Sstevel@tonic-gate 	 * make sure it still belongs to the device and is B_DELWRI
9897c478bd9Sstevel@tonic-gate 	 */
9907c478bd9Sstevel@tonic-gate 	sema_p(&sbp->b_sem);
9917c478bd9Sstevel@tonic-gate 	if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
9927c478bd9Sstevel@tonic-gate 	    (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
9937c478bd9Sstevel@tonic-gate 		mutex_enter(hmp);
9947c478bd9Sstevel@tonic-gate 		hp->b_length--;
9957c478bd9Sstevel@tonic-gate 		notavail(sbp);
9967c478bd9Sstevel@tonic-gate 		mutex_exit(hmp);
9977c478bd9Sstevel@tonic-gate 		/*
9987c478bd9Sstevel@tonic-gate 		 * XXX - There is nothing to guarantee a synchronous
9997c478bd9Sstevel@tonic-gate 		 * write here if the B_ASYNC flag is set.  This needs
10007c478bd9Sstevel@tonic-gate 		 * some investigation.
10017c478bd9Sstevel@tonic-gate 		 */
10027c478bd9Sstevel@tonic-gate 		if (sbp->b_vp == NULL) {		/* !ufs */
10037c478bd9Sstevel@tonic-gate 			BWRITE(sbp);	/* synchronous write */
10047c478bd9Sstevel@tonic-gate 		} else {				/* ufs */
10057c478bd9Sstevel@tonic-gate 			UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
10067c478bd9Sstevel@tonic-gate 		}
10077c478bd9Sstevel@tonic-gate 	} else {
10087c478bd9Sstevel@tonic-gate 		sema_v(&sbp->b_sem);
10097c478bd9Sstevel@tonic-gate 	}
10107c478bd9Sstevel@tonic-gate }
10117c478bd9Sstevel@tonic-gate 
10127c478bd9Sstevel@tonic-gate /*
10137c478bd9Sstevel@tonic-gate  * Same as binval, except can force-invalidate delayed-write buffers
10147c478bd9Sstevel@tonic-gate  * (which are not be already flushed because of device errors).  Also
10157c478bd9Sstevel@tonic-gate  * makes sure that the retry write flag is cleared.
10167c478bd9Sstevel@tonic-gate  */
10177c478bd9Sstevel@tonic-gate int
10187c478bd9Sstevel@tonic-gate bfinval(dev_t dev, int force)
10197c478bd9Sstevel@tonic-gate {
10207c478bd9Sstevel@tonic-gate 	struct buf *dp;
10217c478bd9Sstevel@tonic-gate 	struct buf *bp;
10227c478bd9Sstevel@tonic-gate 	struct buf *binval_list = EMPTY_LIST;
10237c478bd9Sstevel@tonic-gate 	int i, error = 0;
10247c478bd9Sstevel@tonic-gate 	kmutex_t *hmp;
10257c478bd9Sstevel@tonic-gate 	uint_t index;
10267c478bd9Sstevel@tonic-gate 	struct buf **backp;
10277c478bd9Sstevel@tonic-gate 
10287c478bd9Sstevel@tonic-gate 	mutex_enter(&blist_lock);
10297c478bd9Sstevel@tonic-gate 	/*
10307c478bd9Sstevel@tonic-gate 	 * Wait for any flushes ahead of us to finish, it's ok to
10317c478bd9Sstevel@tonic-gate 	 * do invalidates in parallel.
10327c478bd9Sstevel@tonic-gate 	 */
10337c478bd9Sstevel@tonic-gate 	while (bio_doingflush) {
10347c478bd9Sstevel@tonic-gate 		bio_flinv_cv_wanted = 1;
10357c478bd9Sstevel@tonic-gate 		cv_wait(&bio_flushinval_cv, &blist_lock);
10367c478bd9Sstevel@tonic-gate 	}
10377c478bd9Sstevel@tonic-gate 	bio_doinginval++;
10387c478bd9Sstevel@tonic-gate 
10397c478bd9Sstevel@tonic-gate 	/* Gather bp's */
10407c478bd9Sstevel@tonic-gate 	for (i = 0; i < v.v_hbuf; i++) {
10417c478bd9Sstevel@tonic-gate 		dp = (struct buf *)&hbuf[i];
10427c478bd9Sstevel@tonic-gate 		hmp = &hbuf[i].b_lock;
10437c478bd9Sstevel@tonic-gate 
10447c478bd9Sstevel@tonic-gate 		mutex_enter(hmp);
10457c478bd9Sstevel@tonic-gate 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
10467c478bd9Sstevel@tonic-gate 			if (bp->b_edev == dev) {
10477c478bd9Sstevel@tonic-gate 				if (bp->b_list == NULL) {
10487c478bd9Sstevel@tonic-gate 					bp->b_list = binval_list;
10497c478bd9Sstevel@tonic-gate 					binval_list = bp;
10507c478bd9Sstevel@tonic-gate 				}
10517c478bd9Sstevel@tonic-gate 			}
10527c478bd9Sstevel@tonic-gate 		}
10537c478bd9Sstevel@tonic-gate 		mutex_exit(hmp);
10547c478bd9Sstevel@tonic-gate 	}
10557c478bd9Sstevel@tonic-gate 	mutex_exit(&blist_lock);
10567c478bd9Sstevel@tonic-gate 
10577c478bd9Sstevel@tonic-gate 	/* Invalidate all bp's found */
10587c478bd9Sstevel@tonic-gate 	while (binval_list != EMPTY_LIST) {
10597c478bd9Sstevel@tonic-gate 		bp = binval_list;
10607c478bd9Sstevel@tonic-gate 
10617c478bd9Sstevel@tonic-gate 		sema_p(&bp->b_sem);
10627c478bd9Sstevel@tonic-gate 		if (bp->b_edev == dev) {
10637c478bd9Sstevel@tonic-gate 			if (force && (bp->b_flags & B_DELWRI)) {
10647c478bd9Sstevel@tonic-gate 				/* clear B_DELWRI, move to non-dw freelist */
10657c478bd9Sstevel@tonic-gate 				index = bio_bhash(bp->b_edev, bp->b_blkno);
10667c478bd9Sstevel@tonic-gate 				hmp = &hbuf[index].b_lock;
10677c478bd9Sstevel@tonic-gate 				dp = (struct buf *)&hbuf[index];
10687c478bd9Sstevel@tonic-gate 				mutex_enter(hmp);
10697c478bd9Sstevel@tonic-gate 
10707c478bd9Sstevel@tonic-gate 				/* remove from delayed write freelist */
10717c478bd9Sstevel@tonic-gate 				notavail(bp);
10727c478bd9Sstevel@tonic-gate 
10737c478bd9Sstevel@tonic-gate 				/* add to B_AGE side of non-dw freelist */
10747c478bd9Sstevel@tonic-gate 				backp = &dp->av_forw;
10757c478bd9Sstevel@tonic-gate 				(*backp)->av_back = bp;
10767c478bd9Sstevel@tonic-gate 				bp->av_forw = *backp;
10777c478bd9Sstevel@tonic-gate 				*backp = bp;
10787c478bd9Sstevel@tonic-gate 				bp->av_back = dp;
10797c478bd9Sstevel@tonic-gate 
10807c478bd9Sstevel@tonic-gate 				/*
10817c478bd9Sstevel@tonic-gate 				 * make sure write retries and busy are cleared
10827c478bd9Sstevel@tonic-gate 				 */
10837c478bd9Sstevel@tonic-gate 				bp->b_flags &=
10847c478bd9Sstevel@tonic-gate 				    ~(B_BUSY | B_DELWRI | B_RETRYWRI);
10857c478bd9Sstevel@tonic-gate 				mutex_exit(hmp);
10867c478bd9Sstevel@tonic-gate 			}
10877c478bd9Sstevel@tonic-gate 			if ((bp->b_flags & B_DELWRI) == 0)
10887c478bd9Sstevel@tonic-gate 				bp->b_flags |= B_STALE|B_AGE;
10897c478bd9Sstevel@tonic-gate 			else
10907c478bd9Sstevel@tonic-gate 				error = EIO;
10917c478bd9Sstevel@tonic-gate 		}
10927c478bd9Sstevel@tonic-gate 		sema_v(&bp->b_sem);
10937c478bd9Sstevel@tonic-gate 		binval_list = bp->b_list;
10947c478bd9Sstevel@tonic-gate 		bp->b_list = NULL;
10957c478bd9Sstevel@tonic-gate 	}
10967c478bd9Sstevel@tonic-gate 	mutex_enter(&blist_lock);
10977c478bd9Sstevel@tonic-gate 	bio_doinginval--;
10987c478bd9Sstevel@tonic-gate 	if (bio_flinv_cv_wanted) {
10997c478bd9Sstevel@tonic-gate 		cv_broadcast(&bio_flushinval_cv);
11007c478bd9Sstevel@tonic-gate 		bio_flinv_cv_wanted = 0;
11017c478bd9Sstevel@tonic-gate 	}
11027c478bd9Sstevel@tonic-gate 	mutex_exit(&blist_lock);
11037c478bd9Sstevel@tonic-gate 	return (error);
11047c478bd9Sstevel@tonic-gate }
11057c478bd9Sstevel@tonic-gate 
11067c478bd9Sstevel@tonic-gate /*
11077c478bd9Sstevel@tonic-gate  * If possible, invalidate blocks for a dev on demand
11087c478bd9Sstevel@tonic-gate  */
11097c478bd9Sstevel@tonic-gate void
11107c478bd9Sstevel@tonic-gate binval(dev_t dev)
11117c478bd9Sstevel@tonic-gate {
11127c478bd9Sstevel@tonic-gate 	(void) bfinval(dev, 0);
11137c478bd9Sstevel@tonic-gate }
11147c478bd9Sstevel@tonic-gate 
11157c478bd9Sstevel@tonic-gate /*
11167c478bd9Sstevel@tonic-gate  * Initialize the buffer I/O system by freeing
11177c478bd9Sstevel@tonic-gate  * all buffers and setting all device hash buffer lists to empty.
11187c478bd9Sstevel@tonic-gate  */
11197c478bd9Sstevel@tonic-gate void
11207c478bd9Sstevel@tonic-gate binit(void)
11217c478bd9Sstevel@tonic-gate {
11227c478bd9Sstevel@tonic-gate 	struct buf *bp;
11237c478bd9Sstevel@tonic-gate 	unsigned int i, pct;
11247c478bd9Sstevel@tonic-gate 	ulong_t	bio_max_hwm, bio_default_hwm;
11257c478bd9Sstevel@tonic-gate 
11267c478bd9Sstevel@tonic-gate 	/*
11277c478bd9Sstevel@tonic-gate 	 * Maximum/Default values for bufhwm are set to the smallest of:
11287c478bd9Sstevel@tonic-gate 	 *	- BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
11297c478bd9Sstevel@tonic-gate 	 *	- 1/4 of kernel virtual memory
11307c478bd9Sstevel@tonic-gate 	 *	- INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
11317c478bd9Sstevel@tonic-gate 	 * Additionally, in order to allow simple tuning by percentage of
11327c478bd9Sstevel@tonic-gate 	 * physical memory, bufhwm_pct is used to calculate the default if
11337c478bd9Sstevel@tonic-gate 	 * the value of this tunable is between 0 and BIO_MAX_PERCENT.
11347c478bd9Sstevel@tonic-gate 	 *
11357c478bd9Sstevel@tonic-gate 	 * Since the unit for v.v_bufhwm is kilobytes, this allows for
11367c478bd9Sstevel@tonic-gate 	 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
11377c478bd9Sstevel@tonic-gate 	 */
11387c478bd9Sstevel@tonic-gate 	bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
11397c478bd9Sstevel@tonic-gate 	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
11407c478bd9Sstevel@tonic-gate 	bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);
11417c478bd9Sstevel@tonic-gate 
11427c478bd9Sstevel@tonic-gate 	pct = BIO_BUF_PERCENT;
11437c478bd9Sstevel@tonic-gate 	if (bufhwm_pct != 0 &&
11447c478bd9Sstevel@tonic-gate 	    ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
11457c478bd9Sstevel@tonic-gate 		pct = BIO_BUF_PERCENT;
11467c478bd9Sstevel@tonic-gate 		/*
11477c478bd9Sstevel@tonic-gate 		 * Invalid user specified value, emit a warning.
11487c478bd9Sstevel@tonic-gate 		 */
11497c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of \
1150d3d50737SRafael Vanoni 		    range(1..%d). Using %d as default.",
1151d3d50737SRafael Vanoni 		    bufhwm_pct,
1152d3d50737SRafael Vanoni 		    100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
11537c478bd9Sstevel@tonic-gate 	}
11547c478bd9Sstevel@tonic-gate 
11557c478bd9Sstevel@tonic-gate 	bio_default_hwm = MIN(physmem / pct,
11567c478bd9Sstevel@tonic-gate 	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
11577c478bd9Sstevel@tonic-gate 	bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);
11587c478bd9Sstevel@tonic-gate 
11597c478bd9Sstevel@tonic-gate 	if ((v.v_bufhwm = bufhwm) == 0)
11607c478bd9Sstevel@tonic-gate 		v.v_bufhwm = bio_default_hwm;
11617c478bd9Sstevel@tonic-gate 
11627c478bd9Sstevel@tonic-gate 	if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
11637c478bd9Sstevel@tonic-gate 		v.v_bufhwm = (int)bio_max_hwm;
11647c478bd9Sstevel@tonic-gate 		/*
11657c478bd9Sstevel@tonic-gate 		 * Invalid user specified value, emit a warning.
11667c478bd9Sstevel@tonic-gate 		 */
11677c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN,
1168d3d50737SRafael Vanoni 		    "binit: bufhwm(%d) out \
1169d3d50737SRafael Vanoni 		    of range(%d..%lu). Using %lu as default",
1170d3d50737SRafael Vanoni 		    bufhwm,
1171d3d50737SRafael Vanoni 		    BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
11727c478bd9Sstevel@tonic-gate 	}
11737c478bd9Sstevel@tonic-gate 
11747c478bd9Sstevel@tonic-gate 	/*
11757c478bd9Sstevel@tonic-gate 	 * Determine the number of hash buckets. Default is to
11767c478bd9Sstevel@tonic-gate 	 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
11777c478bd9Sstevel@tonic-gate 	 * Round up number to the next power of 2.
11787c478bd9Sstevel@tonic-gate 	 */
11797c478bd9Sstevel@tonic-gate 	v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
11807c478bd9Sstevel@tonic-gate 	    BIO_HASHLEN);
11817c478bd9Sstevel@tonic-gate 	v.v_hmask = v.v_hbuf - 1;
11827c478bd9Sstevel@tonic-gate 	v.v_buf = BIO_BHDR_POOL;
11837c478bd9Sstevel@tonic-gate 
11847c478bd9Sstevel@tonic-gate 	hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);
11857c478bd9Sstevel@tonic-gate 
11867c478bd9Sstevel@tonic-gate 	dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);
11877c478bd9Sstevel@tonic-gate 
11887c478bd9Sstevel@tonic-gate 	bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;
11897c478bd9Sstevel@tonic-gate 	bp = &bfreelist;
11907c478bd9Sstevel@tonic-gate 	bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;
11917c478bd9Sstevel@tonic-gate 
11927c478bd9Sstevel@tonic-gate 	for (i = 0; i < v.v_hbuf; i++) {
11937c478bd9Sstevel@tonic-gate 		hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
11947c478bd9Sstevel@tonic-gate 		hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];
11957c478bd9Sstevel@tonic-gate 
11967c478bd9Sstevel@tonic-gate 		/*
11977c478bd9Sstevel@tonic-gate 		 * Initialize the delayed write buffer list.
11987c478bd9Sstevel@tonic-gate 		 */
11997c478bd9Sstevel@tonic-gate 		dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
12007c478bd9Sstevel@tonic-gate 		dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
12017c478bd9Sstevel@tonic-gate 	}
12027c478bd9Sstevel@tonic-gate }
12037c478bd9Sstevel@tonic-gate 
12047c478bd9Sstevel@tonic-gate /*
12057c478bd9Sstevel@tonic-gate  * Wait for I/O completion on the buffer; return error code.
12067c478bd9Sstevel@tonic-gate  * If bp was for synchronous I/O, bp is invalid and associated
12077c478bd9Sstevel@tonic-gate  * resources are freed on return.
12087c478bd9Sstevel@tonic-gate  */
12097c478bd9Sstevel@tonic-gate int
12107c478bd9Sstevel@tonic-gate biowait(struct buf *bp)
12117c478bd9Sstevel@tonic-gate {
12127c478bd9Sstevel@tonic-gate 	int error = 0;
12137c478bd9Sstevel@tonic-gate 	struct cpu *cpup;
12147c478bd9Sstevel@tonic-gate 
12157c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
12167c478bd9Sstevel@tonic-gate 
12177c478bd9Sstevel@tonic-gate 	cpup = CPU;
12181a5e258fSJosef 'Jeff' Sipek 	atomic_inc_64(&cpup->cpu_stats.sys.iowait);
12197c478bd9Sstevel@tonic-gate 	DTRACE_IO1(wait__start, struct buf *, bp);
12207c478bd9Sstevel@tonic-gate 
12217c478bd9Sstevel@tonic-gate 	/*
12227c478bd9Sstevel@tonic-gate 	 * In case of panic, busy wait for completion
12237c478bd9Sstevel@tonic-gate 	 */
12247c478bd9Sstevel@tonic-gate 	if (panicstr) {
12257c478bd9Sstevel@tonic-gate 		while ((bp->b_flags & B_DONE) == 0)
12267c478bd9Sstevel@tonic-gate 			drv_usecwait(10);
12277c478bd9Sstevel@tonic-gate 	} else
12287c478bd9Sstevel@tonic-gate 		sema_p(&bp->b_io);
12297c478bd9Sstevel@tonic-gate 
12307c478bd9Sstevel@tonic-gate 	DTRACE_IO1(wait__done, struct buf *, bp);
12311a5e258fSJosef 'Jeff' Sipek 	atomic_dec_64(&cpup->cpu_stats.sys.iowait);
12327c478bd9Sstevel@tonic-gate 
12337c478bd9Sstevel@tonic-gate 	error = geterror(bp);
12347c478bd9Sstevel@tonic-gate 	if ((bp->b_flags & B_ASYNC) == 0) {
12357c478bd9Sstevel@tonic-gate 		if (bp->b_flags & B_REMAPPED)
12367c478bd9Sstevel@tonic-gate 			bp_mapout(bp);
12377c478bd9Sstevel@tonic-gate 	}
12387c478bd9Sstevel@tonic-gate 	return (error);
12397c478bd9Sstevel@tonic-gate }
12407c478bd9Sstevel@tonic-gate 
12417c478bd9Sstevel@tonic-gate static void
12427c478bd9Sstevel@tonic-gate biodone_tnf_probe(struct buf *bp)
12437c478bd9Sstevel@tonic-gate {
12447c478bd9Sstevel@tonic-gate 	/* Kernel probe */
12457c478bd9Sstevel@tonic-gate 	TNF_PROBE_3(biodone, "io blockio", /* CSTYLED */,
1246d3d50737SRafael Vanoni 	    tnf_device,		device,		bp->b_edev,
1247d3d50737SRafael Vanoni 	    tnf_diskaddr,	block,		bp->b_lblkno,
1248d3d50737SRafael Vanoni 	    tnf_opaque,		buf,		bp);
12497c478bd9Sstevel@tonic-gate }
12507c478bd9Sstevel@tonic-gate 
12517c478bd9Sstevel@tonic-gate /*
12527c478bd9Sstevel@tonic-gate  * Mark I/O complete on a buffer, release it if I/O is asynchronous,
12537c478bd9Sstevel@tonic-gate  * and wake up anyone waiting for it.
12547c478bd9Sstevel@tonic-gate  */
12557c478bd9Sstevel@tonic-gate void
12567c478bd9Sstevel@tonic-gate biodone(struct buf *bp)
12577c478bd9Sstevel@tonic-gate {
12587c478bd9Sstevel@tonic-gate 	if (bp->b_flags & B_STARTED) {
12597c478bd9Sstevel@tonic-gate 		DTRACE_IO1(done, struct buf *, bp);
12607c478bd9Sstevel@tonic-gate 		bp->b_flags &= ~B_STARTED;
12617c478bd9Sstevel@tonic-gate 	}
12627c478bd9Sstevel@tonic-gate 
12637c478bd9Sstevel@tonic-gate 	/*
12647c478bd9Sstevel@tonic-gate 	 * Call the TNF probe here instead of the inline code
12657c478bd9Sstevel@tonic-gate 	 * to force our compiler to use the tail call optimization.
12667c478bd9Sstevel@tonic-gate 	 */
12677c478bd9Sstevel@tonic-gate 	biodone_tnf_probe(bp);
12687c478bd9Sstevel@tonic-gate 
12697c478bd9Sstevel@tonic-gate 	if (bp->b_iodone != NULL) {
12707c478bd9Sstevel@tonic-gate 		(*(bp->b_iodone))(bp);
12717c478bd9Sstevel@tonic-gate 		return;
12727c478bd9Sstevel@tonic-gate 	}
12737c478bd9Sstevel@tonic-gate 	ASSERT((bp->b_flags & B_DONE) == 0);
12747c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
12757c478bd9Sstevel@tonic-gate 	bp->b_flags |= B_DONE;
12767c478bd9Sstevel@tonic-gate 	if (bp->b_flags & B_ASYNC) {
12777c478bd9Sstevel@tonic-gate 		if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
12787c478bd9Sstevel@tonic-gate 			bio_pageio_done(bp);
12797c478bd9Sstevel@tonic-gate 		else
12807c478bd9Sstevel@tonic-gate 			brelse(bp);	/* release bp to freelist */
12817c478bd9Sstevel@tonic-gate 	} else {
12827c478bd9Sstevel@tonic-gate 		sema_v(&bp->b_io);
12837c478bd9Sstevel@tonic-gate 	}
12847c478bd9Sstevel@tonic-gate }
12857c478bd9Sstevel@tonic-gate 
12867c478bd9Sstevel@tonic-gate /*
12877c478bd9Sstevel@tonic-gate  * Pick up the device's error number and pass it to the user;
12887c478bd9Sstevel@tonic-gate  * if there is an error but the number is 0 set a generalized code.
12897c478bd9Sstevel@tonic-gate  */
12907c478bd9Sstevel@tonic-gate int
12917c478bd9Sstevel@tonic-gate geterror(struct buf *bp)
12927c478bd9Sstevel@tonic-gate {
12937c478bd9Sstevel@tonic-gate 	int error = 0;
12947c478bd9Sstevel@tonic-gate 
12957c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
12967c478bd9Sstevel@tonic-gate 	if (bp->b_flags & B_ERROR) {
12977c478bd9Sstevel@tonic-gate 		error = bp->b_error;
12987c478bd9Sstevel@tonic-gate 		if (!error)
12997c478bd9Sstevel@tonic-gate 			error = EIO;
13007c478bd9Sstevel@tonic-gate 	}
13017c478bd9Sstevel@tonic-gate 	return (error);
13027c478bd9Sstevel@tonic-gate }
13037c478bd9Sstevel@tonic-gate 
13047c478bd9Sstevel@tonic-gate /*
13057c478bd9Sstevel@tonic-gate  * Support for pageio buffers.
13067c478bd9Sstevel@tonic-gate  *
13077c478bd9Sstevel@tonic-gate  * This stuff should be generalized to provide a generalized bp
13087c478bd9Sstevel@tonic-gate  * header facility that can be used for things other than pageio.
13097c478bd9Sstevel@tonic-gate  */
13107c478bd9Sstevel@tonic-gate 
13117c478bd9Sstevel@tonic-gate /*
13127c478bd9Sstevel@tonic-gate  * Allocate and initialize a buf struct for use with pageio.
13137c478bd9Sstevel@tonic-gate  */
13147c478bd9Sstevel@tonic-gate struct buf *
13157c478bd9Sstevel@tonic-gate pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
13167c478bd9Sstevel@tonic-gate {
13177c478bd9Sstevel@tonic-gate 	struct buf *bp;
13187c478bd9Sstevel@tonic-gate 	struct cpu *cpup;
13197c478bd9Sstevel@tonic-gate 
13207c478bd9Sstevel@tonic-gate 	if (flags & B_READ) {
13217c478bd9Sstevel@tonic-gate 		CPU_STATS_ENTER_K();
13227c478bd9Sstevel@tonic-gate 		cpup = CPU;	/* get pointer AFTER preemption is disabled */
13237c478bd9Sstevel@tonic-gate 		CPU_STATS_ADDQ(cpup, vm, pgin, 1);
13247c478bd9Sstevel@tonic-gate 		CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));
13259468939eSJerry Jelinek 
13269468939eSJerry Jelinek 		atomic_add_64(&curzone->zone_pgpgin, btopr(len));
13279468939eSJerry Jelinek 
13287c478bd9Sstevel@tonic-gate 		if ((flags & B_ASYNC) == 0) {
13297c478bd9Sstevel@tonic-gate 			klwp_t *lwp = ttolwp(curthread);
13307c478bd9Sstevel@tonic-gate 			if (lwp != NULL)
13317c478bd9Sstevel@tonic-gate 				lwp->lwp_ru.majflt++;
13327c478bd9Sstevel@tonic-gate 			CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
13337c478bd9Sstevel@tonic-gate 			/* Kernel probe */
13347c478bd9Sstevel@tonic-gate 			TNF_PROBE_2(major_fault, "vm pagefault", /* CSTYLED */,
1335d3d50737SRafael Vanoni 			    tnf_opaque,		vnode,		pp->p_vnode,
1336d3d50737SRafael Vanoni 			    tnf_offset,		offset,		pp->p_offset);
13377c478bd9Sstevel@tonic-gate 		}
13387c478bd9Sstevel@tonic-gate 		/*
13397c478bd9Sstevel@tonic-gate 		 * Update statistics for pages being paged in
13407c478bd9Sstevel@tonic-gate 		 */
13417c478bd9Sstevel@tonic-gate 		if (pp != NULL && pp->p_vnode != NULL) {
13427c478bd9Sstevel@tonic-gate 			if (IS_SWAPFSVP(pp->p_vnode)) {
1343d3d50737SRafael Vanoni 				CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len));
13449468939eSJerry Jelinek 				atomic_add_64(&curzone->zone_anonpgin,
13459468939eSJerry Jelinek 				    btopr(len));
13467c478bd9Sstevel@tonic-gate 			} else {
13477c478bd9Sstevel@tonic-gate 				if (pp->p_vnode->v_flag & VVMEXEC) {
13487c478bd9Sstevel@tonic-gate 					CPU_STATS_ADDQ(cpup, vm, execpgin,
1349d3d50737SRafael Vanoni 					    btopr(len));
13509468939eSJerry Jelinek 					atomic_add_64(&curzone->zone_execpgin,
13519468939eSJerry Jelinek 					    btopr(len));
13527c478bd9Sstevel@tonic-gate 				} else {
13537c478bd9Sstevel@tonic-gate 					CPU_STATS_ADDQ(cpup, vm, fspgin,
1354d3d50737SRafael Vanoni 					    btopr(len));
13559468939eSJerry Jelinek 					atomic_add_64(&curzone->zone_fspgin,
13569468939eSJerry Jelinek 					    btopr(len));
13577c478bd9Sstevel@tonic-gate 				}
13587c478bd9Sstevel@tonic-gate 			}
13597c478bd9Sstevel@tonic-gate 		}
13607c478bd9Sstevel@tonic-gate 		CPU_STATS_EXIT_K();
13617c478bd9Sstevel@tonic-gate 		TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
13627c478bd9Sstevel@tonic-gate 		    "page_ws_in:pp %p", pp);
13637c478bd9Sstevel@tonic-gate 		/* Kernel probe */
13647c478bd9Sstevel@tonic-gate 		TNF_PROBE_3(pagein, "vm pageio io", /* CSTYLED */,
1365d3d50737SRafael Vanoni 		    tnf_opaque,	vnode,	pp->p_vnode,
1366d3d50737SRafael Vanoni 		    tnf_offset,	offset,	pp->p_offset,
1367d3d50737SRafael Vanoni 		    tnf_size,	size,	len);
13687c478bd9Sstevel@tonic-gate 	}
13697c478bd9Sstevel@tonic-gate 
13707c478bd9Sstevel@tonic-gate 	bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
13717c478bd9Sstevel@tonic-gate 	bp->b_bcount = len;
13727c478bd9Sstevel@tonic-gate 	bp->b_bufsize = len;
13737c478bd9Sstevel@tonic-gate 	bp->b_pages = pp;
13747c478bd9Sstevel@tonic-gate 	bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
13757c478bd9Sstevel@tonic-gate 	bp->b_offset = -1;
13767c478bd9Sstevel@tonic-gate 	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
13777c478bd9Sstevel@tonic-gate 
13787c478bd9Sstevel@tonic-gate 	/* Initialize bp->b_sem in "locked" state */
13797c478bd9Sstevel@tonic-gate 	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
13807c478bd9Sstevel@tonic-gate 
13817c478bd9Sstevel@tonic-gate 	VN_HOLD(vp);
13827c478bd9Sstevel@tonic-gate 	bp->b_vp = vp;
13837c478bd9Sstevel@tonic-gate 	THREAD_KPRI_RELEASE_N(btopr(len)); /* release kpri from page_locks */
13847c478bd9Sstevel@tonic-gate 
13857c478bd9Sstevel@tonic-gate 	/*
13867c478bd9Sstevel@tonic-gate 	 * Caller sets dev & blkno and can adjust
13877c478bd9Sstevel@tonic-gate 	 * b_addr for page offset and can use bp_mapin
13887c478bd9Sstevel@tonic-gate 	 * to make pages kernel addressable.
13897c478bd9Sstevel@tonic-gate 	 */
13907c478bd9Sstevel@tonic-gate 	return (bp);
13917c478bd9Sstevel@tonic-gate }
13927c478bd9Sstevel@tonic-gate 
13937c478bd9Sstevel@tonic-gate void
13947c478bd9Sstevel@tonic-gate pageio_done(struct buf *bp)
13957c478bd9Sstevel@tonic-gate {
13967c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
13977c478bd9Sstevel@tonic-gate 	if (bp->b_flags & B_REMAPPED)
13987c478bd9Sstevel@tonic-gate 		bp_mapout(bp);
13997c478bd9Sstevel@tonic-gate 	VN_RELE(bp->b_vp);
14007c478bd9Sstevel@tonic-gate 	bp->b_vp = NULL;
14017c478bd9Sstevel@tonic-gate 	ASSERT((bp->b_flags & B_NOCACHE) != 0);
14027c478bd9Sstevel@tonic-gate 
14037c478bd9Sstevel@tonic-gate 	/* A sema_v(bp->b_sem) is implied if we are destroying it */
14047c478bd9Sstevel@tonic-gate 	sema_destroy(&bp->b_sem);
14057c478bd9Sstevel@tonic-gate 	sema_destroy(&bp->b_io);
14067c478bd9Sstevel@tonic-gate 	kmem_free(bp, sizeof (struct buf));
14077c478bd9Sstevel@tonic-gate }
14087c478bd9Sstevel@tonic-gate 
14097c478bd9Sstevel@tonic-gate /*
14107c478bd9Sstevel@tonic-gate  * Check to see whether the buffers, except the one pointed by sbp,
14117c478bd9Sstevel@tonic-gate  * associated with the device are busy.
14127c478bd9Sstevel@tonic-gate  * NOTE: This expensive operation shall be improved together with ufs_icheck().
14137c478bd9Sstevel@tonic-gate  */
14147c478bd9Sstevel@tonic-gate int
14157c478bd9Sstevel@tonic-gate bcheck(dev_t dev, struct buf *sbp)
14167c478bd9Sstevel@tonic-gate {
14177c478bd9Sstevel@tonic-gate 	struct buf	*bp;
14187c478bd9Sstevel@tonic-gate 	struct buf	*dp;
14197c478bd9Sstevel@tonic-gate 	int i;
14207c478bd9Sstevel@tonic-gate 	kmutex_t *hmp;
14217c478bd9Sstevel@tonic-gate 
14227c478bd9Sstevel@tonic-gate 	/*
14237c478bd9Sstevel@tonic-gate 	 * check for busy bufs for this filesystem
14247c478bd9Sstevel@tonic-gate 	 */
14257c478bd9Sstevel@tonic-gate 	for (i = 0; i < v.v_hbuf; i++) {
14267c478bd9Sstevel@tonic-gate 		dp = (struct buf *)&hbuf[i];
14277c478bd9Sstevel@tonic-gate 		hmp = &hbuf[i].b_lock;
14287c478bd9Sstevel@tonic-gate 
14297c478bd9Sstevel@tonic-gate 		mutex_enter(hmp);
14307c478bd9Sstevel@tonic-gate 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
14317c478bd9Sstevel@tonic-gate 			/*
14327c478bd9Sstevel@tonic-gate 			 * if buf is busy or dirty, then filesystem is busy
14337c478bd9Sstevel@tonic-gate 			 */
14347c478bd9Sstevel@tonic-gate 			if ((bp->b_edev == dev) &&
14357c478bd9Sstevel@tonic-gate 			    ((bp->b_flags & B_STALE) == 0) &&
14367c478bd9Sstevel@tonic-gate 			    (bp->b_flags & (B_DELWRI|B_BUSY)) &&
14377c478bd9Sstevel@tonic-gate 			    (bp != sbp)) {
14387c478bd9Sstevel@tonic-gate 				mutex_exit(hmp);
14397c478bd9Sstevel@tonic-gate 				return (1);
14407c478bd9Sstevel@tonic-gate 			}
14417c478bd9Sstevel@tonic-gate 		}
14427c478bd9Sstevel@tonic-gate 		mutex_exit(hmp);
14437c478bd9Sstevel@tonic-gate 	}
14447c478bd9Sstevel@tonic-gate 	return (0);
14457c478bd9Sstevel@tonic-gate }
14467c478bd9Sstevel@tonic-gate 
/*
 * Hash two 32 bit entities.
 *
 * Starting from x - 1, the accumulator is repeatedly updated as
 * (acc * 7) + term - 1, where the terms are, in order:
 * x >> 8, x >> 16, x >> 24, y, y >> 8, y >> 16, y >> 24.
 */
int
hash2ints(int x, int y)
{
	int acc = x - 1;
	int shift;

	/* x itself seeded the accumulator, so start with x >> 8. */
	for (shift = 8; shift <= 24; shift += 8)
		acc = ((acc * 7) + (x >> shift)) - 1;

	/* All four views of y, beginning with y unshifted. */
	for (shift = 0; shift <= 24; shift += 8)
		acc = ((acc * 7) + (y >> shift)) - 1;

	return (acc);
}
14667c478bd9Sstevel@tonic-gate 
14677c478bd9Sstevel@tonic-gate 
14687c478bd9Sstevel@tonic-gate /*
14697c478bd9Sstevel@tonic-gate  * Return a new buffer struct.
14707c478bd9Sstevel@tonic-gate  *	Create a new buffer if we haven't gone over our high water
14717c478bd9Sstevel@tonic-gate  *	mark for memory, otherwise try to get one off the freelist.
14727c478bd9Sstevel@tonic-gate  *
14737c478bd9Sstevel@tonic-gate  * Returns a locked buf that has no id and is not on any hash or free
14747c478bd9Sstevel@tonic-gate  * list.
14757c478bd9Sstevel@tonic-gate  */
14767c478bd9Sstevel@tonic-gate static struct buf *
14777c478bd9Sstevel@tonic-gate bio_getfreeblk(long bsize)
14787c478bd9Sstevel@tonic-gate {
14797c478bd9Sstevel@tonic-gate 	struct buf *bp, *dp;
14807c478bd9Sstevel@tonic-gate 	struct hbuf *hp;
14817c478bd9Sstevel@tonic-gate 	kmutex_t	*hmp;
14827c478bd9Sstevel@tonic-gate 	uint_t		start, end;
14837c478bd9Sstevel@tonic-gate 
14847c478bd9Sstevel@tonic-gate 	/*
14857c478bd9Sstevel@tonic-gate 	 * mutex_enter(&bfree_lock);
14867c478bd9Sstevel@tonic-gate 	 * bfreelist.b_bufsize represents the amount of memory
14877c478bd9Sstevel@tonic-gate 	 * mutex_exit(&bfree_lock); protect ref to bfreelist
14887c478bd9Sstevel@tonic-gate 	 * we are allowed to allocate in the cache before we hit our hwm.
14897c478bd9Sstevel@tonic-gate 	 */
14907c478bd9Sstevel@tonic-gate 	bio_mem_get(bsize);	/* Account for our memory request */
14917c478bd9Sstevel@tonic-gate 
14927c478bd9Sstevel@tonic-gate again:
14937c478bd9Sstevel@tonic-gate 	bp = bio_bhdr_alloc();	/* Get a buf hdr */
14947c478bd9Sstevel@tonic-gate 	sema_p(&bp->b_sem);	/* Should never fail */
14957c478bd9Sstevel@tonic-gate 
14967c478bd9Sstevel@tonic-gate 	ASSERT(bp->b_un.b_addr == NULL);
14977c478bd9Sstevel@tonic-gate 	bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
14987c478bd9Sstevel@tonic-gate 	if (bp->b_un.b_addr != NULL) {
14997c478bd9Sstevel@tonic-gate 		/*
15007c478bd9Sstevel@tonic-gate 		 * Make the common path short
15017c478bd9Sstevel@tonic-gate 		 */
15027c478bd9Sstevel@tonic-gate 		bp->b_bufsize = bsize;
15037c478bd9Sstevel@tonic-gate 		ASSERT(SEMA_HELD(&bp->b_sem));
15047c478bd9Sstevel@tonic-gate 		return (bp);
15057c478bd9Sstevel@tonic-gate 	} else {
15067c478bd9Sstevel@tonic-gate 		struct buf *save;
15077c478bd9Sstevel@tonic-gate 
15087c478bd9Sstevel@tonic-gate 		save = bp;	/* Save bp we allocated */
15097c478bd9Sstevel@tonic-gate 		start = end = lastindex;
15107c478bd9Sstevel@tonic-gate 
15117c478bd9Sstevel@tonic-gate 		biostats.bio_bufwant.value.ui32++;
15127c478bd9Sstevel@tonic-gate 
15137c478bd9Sstevel@tonic-gate 		/*
15147c478bd9Sstevel@tonic-gate 		 * Memory isn't available from the system now. Scan
15157c478bd9Sstevel@tonic-gate 		 * the hash buckets till enough space is found.
15167c478bd9Sstevel@tonic-gate 		 */
15177c478bd9Sstevel@tonic-gate 		do {
15187c478bd9Sstevel@tonic-gate 			hp = &hbuf[start];
15197c478bd9Sstevel@tonic-gate 			hmp = &hp->b_lock;
15207c478bd9Sstevel@tonic-gate 			dp = (struct buf *)hp;
15217c478bd9Sstevel@tonic-gate 
15227c478bd9Sstevel@tonic-gate 			mutex_enter(hmp);
15237c478bd9Sstevel@tonic-gate 			bp = dp->av_forw;
15247c478bd9Sstevel@tonic-gate 
15257c478bd9Sstevel@tonic-gate 			while (bp != dp) {
15267c478bd9Sstevel@tonic-gate 
15277c478bd9Sstevel@tonic-gate 				ASSERT(bp != NULL);
15287c478bd9Sstevel@tonic-gate 
15297c478bd9Sstevel@tonic-gate 				if (!sema_tryp(&bp->b_sem)) {
15307c478bd9Sstevel@tonic-gate 					bp = bp->av_forw;
15317c478bd9Sstevel@tonic-gate 					continue;
15327c478bd9Sstevel@tonic-gate 				}
15337c478bd9Sstevel@tonic-gate 
15347c478bd9Sstevel@tonic-gate 				/*
15357c478bd9Sstevel@tonic-gate 				 * Since we are going down the freelist
15367c478bd9Sstevel@tonic-gate 				 * associated with this hash bucket the
15377c478bd9Sstevel@tonic-gate 				 * B_DELWRI flag should not be set.
15387c478bd9Sstevel@tonic-gate 				 */
15397c478bd9Sstevel@tonic-gate 				ASSERT(!(bp->b_flags & B_DELWRI));
15407c478bd9Sstevel@tonic-gate 
15417c478bd9Sstevel@tonic-gate 				if (bp->b_bufsize == bsize) {
15427c478bd9Sstevel@tonic-gate 					hp->b_length--;
15437c478bd9Sstevel@tonic-gate 					notavail(bp);
15447c478bd9Sstevel@tonic-gate 					bremhash(bp);
15457c478bd9Sstevel@tonic-gate 					mutex_exit(hmp);
15467c478bd9Sstevel@tonic-gate 
15477c478bd9Sstevel@tonic-gate 					/*
15487c478bd9Sstevel@tonic-gate 					 * Didn't kmem_alloc any more, so don't
15497c478bd9Sstevel@tonic-gate 					 * count it twice.
15507c478bd9Sstevel@tonic-gate 					 */
15517c478bd9Sstevel@tonic-gate 					mutex_enter(&bfree_lock);
15527c478bd9Sstevel@tonic-gate 					bfreelist.b_bufsize += bsize;
15537c478bd9Sstevel@tonic-gate 					mutex_exit(&bfree_lock);
15547c478bd9Sstevel@tonic-gate 
15557c478bd9Sstevel@tonic-gate 					/*
15567c478bd9Sstevel@tonic-gate 					 * Update the lastindex value.
15577c478bd9Sstevel@tonic-gate 					 */
15587c478bd9Sstevel@tonic-gate 					lastindex = start;
15597c478bd9Sstevel@tonic-gate 
15607c478bd9Sstevel@tonic-gate 					/*
15617c478bd9Sstevel@tonic-gate 					 * Put our saved bp back on the list
15627c478bd9Sstevel@tonic-gate 					 */
15637c478bd9Sstevel@tonic-gate 					sema_v(&save->b_sem);
15647c478bd9Sstevel@tonic-gate 					bio_bhdr_free(save);
15657c478bd9Sstevel@tonic-gate 					ASSERT(SEMA_HELD(&bp->b_sem));
15667c478bd9Sstevel@tonic-gate 					return (bp);
15677c478bd9Sstevel@tonic-gate 				}
15687c478bd9Sstevel@tonic-gate 				sema_v(&bp->b_sem);
15697c478bd9Sstevel@tonic-gate 				bp = bp->av_forw;
15707c478bd9Sstevel@tonic-gate 			}
15717c478bd9Sstevel@tonic-gate 			mutex_exit(hmp);
15727c478bd9Sstevel@tonic-gate 			start = ((start + 1) % v.v_hbuf);
15737c478bd9Sstevel@tonic-gate 		} while (start != end);
15747c478bd9Sstevel@tonic-gate 
15757c478bd9Sstevel@tonic-gate 		biostats.bio_bufwait.value.ui32++;
15767c478bd9Sstevel@tonic-gate 		bp = save;		/* Use original bp */
15777c478bd9Sstevel@tonic-gate 		bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
15787c478bd9Sstevel@tonic-gate 	}
15797c478bd9Sstevel@tonic-gate 
15807c478bd9Sstevel@tonic-gate 	bp->b_bufsize = bsize;
15817c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
15827c478bd9Sstevel@tonic-gate 	return (bp);
15837c478bd9Sstevel@tonic-gate }
15847c478bd9Sstevel@tonic-gate 
15857c478bd9Sstevel@tonic-gate /*
15867c478bd9Sstevel@tonic-gate  * Allocate a buffer header. If none currently available, allocate
15877c478bd9Sstevel@tonic-gate  * a new pool.
15887c478bd9Sstevel@tonic-gate  */
15897c478bd9Sstevel@tonic-gate static struct buf *
15907c478bd9Sstevel@tonic-gate bio_bhdr_alloc(void)
15917c478bd9Sstevel@tonic-gate {
15927c478bd9Sstevel@tonic-gate 	struct buf *dp, *sdp;
15937c478bd9Sstevel@tonic-gate 	struct buf *bp;
15947c478bd9Sstevel@tonic-gate 	int i;
15957c478bd9Sstevel@tonic-gate 
15967c478bd9Sstevel@tonic-gate 	for (;;) {
15977c478bd9Sstevel@tonic-gate 		mutex_enter(&bhdr_lock);
15987c478bd9Sstevel@tonic-gate 		if (bhdrlist != NULL) {
15997c478bd9Sstevel@tonic-gate 			bp = bhdrlist;
16007c478bd9Sstevel@tonic-gate 			bhdrlist = bp->av_forw;
16017c478bd9Sstevel@tonic-gate 			mutex_exit(&bhdr_lock);
16027c478bd9Sstevel@tonic-gate 			bp->av_forw = NULL;
16037c478bd9Sstevel@tonic-gate 			return (bp);
16047c478bd9Sstevel@tonic-gate 		}
16057c478bd9Sstevel@tonic-gate 		mutex_exit(&bhdr_lock);
16067c478bd9Sstevel@tonic-gate 
16077c478bd9Sstevel@tonic-gate 		/*
16087c478bd9Sstevel@tonic-gate 		 * Need to allocate a new pool. If the system is currently
16097c478bd9Sstevel@tonic-gate 		 * out of memory, then try freeing things on the freelist.
16107c478bd9Sstevel@tonic-gate 		 */
16117c478bd9Sstevel@tonic-gate 		dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
16127c478bd9Sstevel@tonic-gate 		if (dp == NULL) {
16137c478bd9Sstevel@tonic-gate 			/*
16147c478bd9Sstevel@tonic-gate 			 * System can't give us a pool of headers, try
16157c478bd9Sstevel@tonic-gate 			 * recycling from the free lists.
16167c478bd9Sstevel@tonic-gate 			 */
16177c478bd9Sstevel@tonic-gate 			bio_recycle(BIO_HEADER, 0);
16187c478bd9Sstevel@tonic-gate 		} else {
16197c478bd9Sstevel@tonic-gate 			sdp = dp;
16207c478bd9Sstevel@tonic-gate 			for (i = 0; i < v.v_buf; i++, dp++) {
16217c478bd9Sstevel@tonic-gate 				/*
16227c478bd9Sstevel@tonic-gate 				 * The next two lines are needed since NODEV
16237c478bd9Sstevel@tonic-gate 				 * is -1 and not NULL
16247c478bd9Sstevel@tonic-gate 				 */
16257c478bd9Sstevel@tonic-gate 				dp->b_dev = (o_dev_t)NODEV;
16267c478bd9Sstevel@tonic-gate 				dp->b_edev = NODEV;
16277c478bd9Sstevel@tonic-gate 				dp->av_forw = dp + 1;
16287c478bd9Sstevel@tonic-gate 				sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
16297c478bd9Sstevel@tonic-gate 				    NULL);
16307c478bd9Sstevel@tonic-gate 				sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
16317c478bd9Sstevel@tonic-gate 				    NULL);
16327c478bd9Sstevel@tonic-gate 				dp->b_offset = -1;
16337c478bd9Sstevel@tonic-gate 			}
16347c478bd9Sstevel@tonic-gate 			mutex_enter(&bhdr_lock);
16357c478bd9Sstevel@tonic-gate 			(--dp)->av_forw = bhdrlist;	/* Fix last pointer */
16367c478bd9Sstevel@tonic-gate 			bhdrlist = sdp;
16377c478bd9Sstevel@tonic-gate 			nbuf += v.v_buf;
16387c478bd9Sstevel@tonic-gate 			bp = bhdrlist;
16397c478bd9Sstevel@tonic-gate 			bhdrlist = bp->av_forw;
16407c478bd9Sstevel@tonic-gate 			mutex_exit(&bhdr_lock);
16417c478bd9Sstevel@tonic-gate 
16427c478bd9Sstevel@tonic-gate 			bp->av_forw = NULL;
16437c478bd9Sstevel@tonic-gate 			return (bp);
16447c478bd9Sstevel@tonic-gate 		}
16457c478bd9Sstevel@tonic-gate 	}
16467c478bd9Sstevel@tonic-gate }
16477c478bd9Sstevel@tonic-gate 
16487c478bd9Sstevel@tonic-gate static  void
16497c478bd9Sstevel@tonic-gate bio_bhdr_free(struct buf *bp)
16507c478bd9Sstevel@tonic-gate {
16517c478bd9Sstevel@tonic-gate 	ASSERT(bp->b_back == NULL);
16527c478bd9Sstevel@tonic-gate 	ASSERT(bp->b_forw == NULL);
16537c478bd9Sstevel@tonic-gate 	ASSERT(bp->av_back == NULL);
16547c478bd9Sstevel@tonic-gate 	ASSERT(bp->av_forw == NULL);
16557c478bd9Sstevel@tonic-gate 	ASSERT(bp->b_un.b_addr == NULL);
16567c478bd9Sstevel@tonic-gate 	ASSERT(bp->b_dev == (o_dev_t)NODEV);
16577c478bd9Sstevel@tonic-gate 	ASSERT(bp->b_edev == NODEV);
16587c478bd9Sstevel@tonic-gate 	ASSERT(bp->b_flags == 0);
16597c478bd9Sstevel@tonic-gate 
16607c478bd9Sstevel@tonic-gate 	mutex_enter(&bhdr_lock);
16617c478bd9Sstevel@tonic-gate 	bp->av_forw = bhdrlist;
16627c478bd9Sstevel@tonic-gate 	bhdrlist = bp;
16637c478bd9Sstevel@tonic-gate 	mutex_exit(&bhdr_lock);
16647c478bd9Sstevel@tonic-gate }
16657c478bd9Sstevel@tonic-gate 
16667c478bd9Sstevel@tonic-gate /*
16677c478bd9Sstevel@tonic-gate  * If we haven't gone over the high water mark, it's o.k. to
16687c478bd9Sstevel@tonic-gate  * allocate more buffer space, otherwise recycle buffers
16697c478bd9Sstevel@tonic-gate  * from the freelist until enough memory is free for a bsize request.
16707c478bd9Sstevel@tonic-gate  *
16717c478bd9Sstevel@tonic-gate  * We account for this memory, even though
16727c478bd9Sstevel@tonic-gate  * we don't allocate it here.
16737c478bd9Sstevel@tonic-gate  */
16747c478bd9Sstevel@tonic-gate static void
16757c478bd9Sstevel@tonic-gate bio_mem_get(long bsize)
16767c478bd9Sstevel@tonic-gate {
16777c478bd9Sstevel@tonic-gate 	mutex_enter(&bfree_lock);
16787c478bd9Sstevel@tonic-gate 	if (bfreelist.b_bufsize > bsize) {
16797c478bd9Sstevel@tonic-gate 		bfreelist.b_bufsize -= bsize;
16807c478bd9Sstevel@tonic-gate 		mutex_exit(&bfree_lock);
16817c478bd9Sstevel@tonic-gate 		return;
16827c478bd9Sstevel@tonic-gate 	}
16837c478bd9Sstevel@tonic-gate 	mutex_exit(&bfree_lock);
16847c478bd9Sstevel@tonic-gate 	bio_recycle(BIO_MEM, bsize);
16857c478bd9Sstevel@tonic-gate }
16867c478bd9Sstevel@tonic-gate 
16877c478bd9Sstevel@tonic-gate /*
16887c478bd9Sstevel@tonic-gate  * flush a list of delayed write buffers.
16897c478bd9Sstevel@tonic-gate  * (currently used only by bio_recycle below.)
16907c478bd9Sstevel@tonic-gate  */
16917c478bd9Sstevel@tonic-gate static void
16927c478bd9Sstevel@tonic-gate bio_flushlist(struct buf *delwri_list)
16937c478bd9Sstevel@tonic-gate {
16947c478bd9Sstevel@tonic-gate 	struct buf *bp;
16957c478bd9Sstevel@tonic-gate 
16967c478bd9Sstevel@tonic-gate 	while (delwri_list != EMPTY_LIST) {
16977c478bd9Sstevel@tonic-gate 		bp = delwri_list;
16987c478bd9Sstevel@tonic-gate 		bp->b_flags |= B_AGE | B_ASYNC;
16997c478bd9Sstevel@tonic-gate 		if (bp->b_vp == NULL) {		/* !ufs */
17007c478bd9Sstevel@tonic-gate 			BWRITE(bp);
17017c478bd9Sstevel@tonic-gate 		} else {			/* ufs */
17027c478bd9Sstevel@tonic-gate 			UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
17037c478bd9Sstevel@tonic-gate 		}
17047c478bd9Sstevel@tonic-gate 		delwri_list = bp->b_list;
17057c478bd9Sstevel@tonic-gate 		bp->b_list = NULL;
17067c478bd9Sstevel@tonic-gate 	}
17077c478bd9Sstevel@tonic-gate }
17087c478bd9Sstevel@tonic-gate 
17097c478bd9Sstevel@tonic-gate /*
17107c478bd9Sstevel@tonic-gate  * Start recycling buffers on the freelist for one of 2 reasons:
17117c478bd9Sstevel@tonic-gate  *	- we need a buffer header
17127c478bd9Sstevel@tonic-gate  *	- we need to free up memory
17137c478bd9Sstevel@tonic-gate  * Once started we continue to recycle buffers until the B_AGE
17147c478bd9Sstevel@tonic-gate  * buffers are gone.
17157c478bd9Sstevel@tonic-gate  */
17167c478bd9Sstevel@tonic-gate static void
17177c478bd9Sstevel@tonic-gate bio_recycle(int want, long bsize)
17187c478bd9Sstevel@tonic-gate {
17197c478bd9Sstevel@tonic-gate 	struct buf *bp, *dp, *dwp, *nbp;
17207c478bd9Sstevel@tonic-gate 	struct hbuf *hp;
17217c478bd9Sstevel@tonic-gate 	int	found = 0;
17227c478bd9Sstevel@tonic-gate 	kmutex_t	*hmp;
17237c478bd9Sstevel@tonic-gate 	int		start, end;
17247c478bd9Sstevel@tonic-gate 	struct buf *delwri_list = EMPTY_LIST;
17257c478bd9Sstevel@tonic-gate 
17267c478bd9Sstevel@tonic-gate 	/*
17277c478bd9Sstevel@tonic-gate 	 * Recycle buffers.
17287c478bd9Sstevel@tonic-gate 	 */
17297c478bd9Sstevel@tonic-gate top:
17307c478bd9Sstevel@tonic-gate 	start = end = lastindex;
17317c478bd9Sstevel@tonic-gate 	do {
17327c478bd9Sstevel@tonic-gate 		hp = &hbuf[start];
17337c478bd9Sstevel@tonic-gate 		hmp = &hp->b_lock;
17347c478bd9Sstevel@tonic-gate 		dp = (struct buf *)hp;
17357c478bd9Sstevel@tonic-gate 
17367c478bd9Sstevel@tonic-gate 		mutex_enter(hmp);
17377c478bd9Sstevel@tonic-gate 		bp = dp->av_forw;
17387c478bd9Sstevel@tonic-gate 
17397c478bd9Sstevel@tonic-gate 		while (bp != dp) {
17407c478bd9Sstevel@tonic-gate 
17417c478bd9Sstevel@tonic-gate 			ASSERT(bp != NULL);
17427c478bd9Sstevel@tonic-gate 
17437c478bd9Sstevel@tonic-gate 			if (!sema_tryp(&bp->b_sem)) {
17447c478bd9Sstevel@tonic-gate 				bp = bp->av_forw;
17457c478bd9Sstevel@tonic-gate 				continue;
17467c478bd9Sstevel@tonic-gate 			}
17477c478bd9Sstevel@tonic-gate 			/*
17487c478bd9Sstevel@tonic-gate 			 * Do we really want to nuke all of the B_AGE stuff??
17497c478bd9Sstevel@tonic-gate 			 */
17507c478bd9Sstevel@tonic-gate 			if ((bp->b_flags & B_AGE) == 0 && found) {
17517c478bd9Sstevel@tonic-gate 				sema_v(&bp->b_sem);
17527c478bd9Sstevel@tonic-gate 				mutex_exit(hmp);
17537c478bd9Sstevel@tonic-gate 				lastindex = start;
17547c478bd9Sstevel@tonic-gate 				return;	/* All done */
17557c478bd9Sstevel@tonic-gate 			}
17567c478bd9Sstevel@tonic-gate 
17577c478bd9Sstevel@tonic-gate 			ASSERT(MUTEX_HELD(&hp->b_lock));
17587c478bd9Sstevel@tonic-gate 			ASSERT(!(bp->b_flags & B_DELWRI));
17597c478bd9Sstevel@tonic-gate 			hp->b_length--;
17607c478bd9Sstevel@tonic-gate 			notavail(bp);
17617c478bd9Sstevel@tonic-gate 
17627c478bd9Sstevel@tonic-gate 			/*
17637c478bd9Sstevel@tonic-gate 			 * Remove bhdr from cache, free up memory,
17647c478bd9Sstevel@tonic-gate 			 * and add the hdr to the freelist.
17657c478bd9Sstevel@tonic-gate 			 */
17667c478bd9Sstevel@tonic-gate 			bremhash(bp);
17677c478bd9Sstevel@tonic-gate 			mutex_exit(hmp);
17687c478bd9Sstevel@tonic-gate 
17697c478bd9Sstevel@tonic-gate 			if (bp->b_bufsize) {
17707c478bd9Sstevel@tonic-gate 				kmem_free(bp->b_un.b_addr, bp->b_bufsize);
17717c478bd9Sstevel@tonic-gate 				bp->b_un.b_addr = NULL;
17727c478bd9Sstevel@tonic-gate 				mutex_enter(&bfree_lock);
17737c478bd9Sstevel@tonic-gate 				bfreelist.b_bufsize += bp->b_bufsize;
17747c478bd9Sstevel@tonic-gate 				mutex_exit(&bfree_lock);
17757c478bd9Sstevel@tonic-gate 			}
17767c478bd9Sstevel@tonic-gate 
17777c478bd9Sstevel@tonic-gate 			bp->b_dev = (o_dev_t)NODEV;
17787c478bd9Sstevel@tonic-gate 			bp->b_edev = NODEV;
17797c478bd9Sstevel@tonic-gate 			bp->b_flags = 0;
17807c478bd9Sstevel@tonic-gate 			sema_v(&bp->b_sem);
17817c478bd9Sstevel@tonic-gate 			bio_bhdr_free(bp);
17827c478bd9Sstevel@tonic-gate 			if (want == BIO_HEADER) {
17837c478bd9Sstevel@tonic-gate 				found = 1;
17847c478bd9Sstevel@tonic-gate 			} else {
17857c478bd9Sstevel@tonic-gate 				ASSERT(want == BIO_MEM);
17867c478bd9Sstevel@tonic-gate 				if (!found && bfreelist.b_bufsize >= bsize) {
17877c478bd9Sstevel@tonic-gate 					/* Account for the memory we want */
17887c478bd9Sstevel@tonic-gate 					mutex_enter(&bfree_lock);
17897c478bd9Sstevel@tonic-gate 					if (bfreelist.b_bufsize >= bsize) {
17907c478bd9Sstevel@tonic-gate 						bfreelist.b_bufsize -= bsize;
17917c478bd9Sstevel@tonic-gate 						found = 1;
17927c478bd9Sstevel@tonic-gate 					}
17937c478bd9Sstevel@tonic-gate 					mutex_exit(&bfree_lock);
17947c478bd9Sstevel@tonic-gate 				}
17957c478bd9Sstevel@tonic-gate 			}
17967c478bd9Sstevel@tonic-gate 
17977c478bd9Sstevel@tonic-gate 			/*
17987c478bd9Sstevel@tonic-gate 			 * Since we dropped hmp, start from the
17997c478bd9Sstevel@tonic-gate 			 * beginning.
18007c478bd9Sstevel@tonic-gate 			 */
18017c478bd9Sstevel@tonic-gate 			mutex_enter(hmp);
18027c478bd9Sstevel@tonic-gate 			bp = dp->av_forw;
18037c478bd9Sstevel@tonic-gate 		}
18047c478bd9Sstevel@tonic-gate 		mutex_exit(hmp);
18057c478bd9Sstevel@tonic-gate 
18067c478bd9Sstevel@tonic-gate 		/*
18077c478bd9Sstevel@tonic-gate 		 * Look at the delayed write list.
18087c478bd9Sstevel@tonic-gate 		 * First gather into a private list, then write them.
18097c478bd9Sstevel@tonic-gate 		 */
18107c478bd9Sstevel@tonic-gate 		dwp = (struct buf *)&dwbuf[start];
18117c478bd9Sstevel@tonic-gate 		mutex_enter(&blist_lock);
18127c478bd9Sstevel@tonic-gate 		bio_doingflush++;
18137c478bd9Sstevel@tonic-gate 		mutex_enter(hmp);
18147c478bd9Sstevel@tonic-gate 		for (bp = dwp->av_forw; bp != dwp; bp = nbp) {
18157c478bd9Sstevel@tonic-gate 
18167c478bd9Sstevel@tonic-gate 			ASSERT(bp != NULL);
18177c478bd9Sstevel@tonic-gate 			nbp = bp->av_forw;
18187c478bd9Sstevel@tonic-gate 
18197c478bd9Sstevel@tonic-gate 			if (!sema_tryp(&bp->b_sem))
18207c478bd9Sstevel@tonic-gate 				continue;
18217c478bd9Sstevel@tonic-gate 			ASSERT(bp->b_flags & B_DELWRI);
18227c478bd9Sstevel@tonic-gate 			/*
18237c478bd9Sstevel@tonic-gate 			 * Do we really want to nuke all of the B_AGE stuff??
18247c478bd9Sstevel@tonic-gate 			 */
18257c478bd9Sstevel@tonic-gate 
18267c478bd9Sstevel@tonic-gate 			if ((bp->b_flags & B_AGE) == 0 && found) {
18277c478bd9Sstevel@tonic-gate 				sema_v(&bp->b_sem);
18287c478bd9Sstevel@tonic-gate 				mutex_exit(hmp);
18297c478bd9Sstevel@tonic-gate 				lastindex = start;
18307c478bd9Sstevel@tonic-gate 				mutex_exit(&blist_lock);
18317c478bd9Sstevel@tonic-gate 				bio_flushlist(delwri_list);
18327c478bd9Sstevel@tonic-gate 				mutex_enter(&blist_lock);
18337c478bd9Sstevel@tonic-gate 				bio_doingflush--;
18347c478bd9Sstevel@tonic-gate 				if (bio_flinv_cv_wanted) {
18357c478bd9Sstevel@tonic-gate 					bio_flinv_cv_wanted = 0;
18367c478bd9Sstevel@tonic-gate 					cv_broadcast(&bio_flushinval_cv);
18377c478bd9Sstevel@tonic-gate 				}
18387c478bd9Sstevel@tonic-gate 				mutex_exit(&blist_lock);
18397c478bd9Sstevel@tonic-gate 				return; /* All done */
18407c478bd9Sstevel@tonic-gate 			}
18417c478bd9Sstevel@tonic-gate 
18427c478bd9Sstevel@tonic-gate 			/*
18437c478bd9Sstevel@tonic-gate 			 * If the buffer is already on a flush or
18447c478bd9Sstevel@tonic-gate 			 * invalidate list then just skip it.
18457c478bd9Sstevel@tonic-gate 			 */
18467c478bd9Sstevel@tonic-gate 			if (bp->b_list != NULL) {
18477c478bd9Sstevel@tonic-gate 				sema_v(&bp->b_sem);
18487c478bd9Sstevel@tonic-gate 				continue;
18497c478bd9Sstevel@tonic-gate 			}
18507c478bd9Sstevel@tonic-gate 			/*
18517c478bd9Sstevel@tonic-gate 			 * We are still on the same bucket.
18527c478bd9Sstevel@tonic-gate 			 */
18537c478bd9Sstevel@tonic-gate 			hp->b_length--;
18547c478bd9Sstevel@tonic-gate 			notavail(bp);
18557c478bd9Sstevel@tonic-gate 			bp->b_list = delwri_list;
18567c478bd9Sstevel@tonic-gate 			delwri_list = bp;
18577c478bd9Sstevel@tonic-gate 		}
18587c478bd9Sstevel@tonic-gate 		mutex_exit(hmp);
18597c478bd9Sstevel@tonic-gate 		mutex_exit(&blist_lock);
18607c478bd9Sstevel@tonic-gate 		bio_flushlist(delwri_list);
18617c478bd9Sstevel@tonic-gate 		delwri_list = EMPTY_LIST;
18627c478bd9Sstevel@tonic-gate 		mutex_enter(&blist_lock);
18637c478bd9Sstevel@tonic-gate 		bio_doingflush--;
18647c478bd9Sstevel@tonic-gate 		if (bio_flinv_cv_wanted) {
18657c478bd9Sstevel@tonic-gate 			bio_flinv_cv_wanted = 0;
18667c478bd9Sstevel@tonic-gate 			cv_broadcast(&bio_flushinval_cv);
18677c478bd9Sstevel@tonic-gate 		}
18687c478bd9Sstevel@tonic-gate 		mutex_exit(&blist_lock);
18697c478bd9Sstevel@tonic-gate 		start = (start + 1) % v.v_hbuf;
18707c478bd9Sstevel@tonic-gate 
18717c478bd9Sstevel@tonic-gate 	} while (start != end);
18727c478bd9Sstevel@tonic-gate 
18737c478bd9Sstevel@tonic-gate 	if (found)
18747c478bd9Sstevel@tonic-gate 		return;
18757c478bd9Sstevel@tonic-gate 
18767c478bd9Sstevel@tonic-gate 	/*
18777c478bd9Sstevel@tonic-gate 	 * Free lists exhausted and we haven't satisfied the request.
18787c478bd9Sstevel@tonic-gate 	 * Wait here for more entries to be added to freelist.
18797c478bd9Sstevel@tonic-gate 	 * Because this might have just happened, make it timed.
18807c478bd9Sstevel@tonic-gate 	 */
18817c478bd9Sstevel@tonic-gate 	mutex_enter(&bfree_lock);
18827c478bd9Sstevel@tonic-gate 	bfreelist.b_flags |= B_WANTED;
1883d3d50737SRafael Vanoni 	(void) cv_reltimedwait(&bio_mem_cv, &bfree_lock, hz, TR_CLOCK_TICK);
18847c478bd9Sstevel@tonic-gate 	mutex_exit(&bfree_lock);
18857c478bd9Sstevel@tonic-gate 	goto top;
18867c478bd9Sstevel@tonic-gate }
18877c478bd9Sstevel@tonic-gate 
18887c478bd9Sstevel@tonic-gate /*
18897c478bd9Sstevel@tonic-gate  * See if the block is associated with some buffer
18907c478bd9Sstevel@tonic-gate  * (mainly to avoid getting hung up on a wait in breada).
18917c478bd9Sstevel@tonic-gate  */
18927c478bd9Sstevel@tonic-gate static int
18937c478bd9Sstevel@tonic-gate bio_incore(dev_t dev, daddr_t blkno)
18947c478bd9Sstevel@tonic-gate {
18957c478bd9Sstevel@tonic-gate 	struct buf *bp;
18967c478bd9Sstevel@tonic-gate 	struct buf *dp;
18977c478bd9Sstevel@tonic-gate 	uint_t index;
18987c478bd9Sstevel@tonic-gate 	kmutex_t *hmp;
18997c478bd9Sstevel@tonic-gate 
19007c478bd9Sstevel@tonic-gate 	index = bio_bhash(dev, blkno);
19017c478bd9Sstevel@tonic-gate 	dp = (struct buf *)&hbuf[index];
19027c478bd9Sstevel@tonic-gate 	hmp = &hbuf[index].b_lock;
19037c478bd9Sstevel@tonic-gate 
19047c478bd9Sstevel@tonic-gate 	mutex_enter(hmp);
19057c478bd9Sstevel@tonic-gate 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
19067c478bd9Sstevel@tonic-gate 		if (bp->b_blkno == blkno && bp->b_edev == dev &&
19077c478bd9Sstevel@tonic-gate 		    (bp->b_flags & B_STALE) == 0) {
19087c478bd9Sstevel@tonic-gate 			mutex_exit(hmp);
19097c478bd9Sstevel@tonic-gate 			return (1);
19107c478bd9Sstevel@tonic-gate 		}
19117c478bd9Sstevel@tonic-gate 	}
19127c478bd9Sstevel@tonic-gate 	mutex_exit(hmp);
19137c478bd9Sstevel@tonic-gate 	return (0);
19147c478bd9Sstevel@tonic-gate }
19157c478bd9Sstevel@tonic-gate 
19167c478bd9Sstevel@tonic-gate static void
19177c478bd9Sstevel@tonic-gate bio_pageio_done(struct buf *bp)
19187c478bd9Sstevel@tonic-gate {
19197c478bd9Sstevel@tonic-gate 	if (bp->b_flags & B_PAGEIO) {
19207c478bd9Sstevel@tonic-gate 
19217c478bd9Sstevel@tonic-gate 		if (bp->b_flags & B_REMAPPED)
19227c478bd9Sstevel@tonic-gate 			bp_mapout(bp);
19237c478bd9Sstevel@tonic-gate 
19247c478bd9Sstevel@tonic-gate 		if (bp->b_flags & B_READ)
19257c478bd9Sstevel@tonic-gate 			pvn_read_done(bp->b_pages, bp->b_flags);
19267c478bd9Sstevel@tonic-gate 		else
19277c478bd9Sstevel@tonic-gate 			pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags);
19287c478bd9Sstevel@tonic-gate 		pageio_done(bp);
19297c478bd9Sstevel@tonic-gate 	} else {
19307c478bd9Sstevel@tonic-gate 		ASSERT(bp->b_flags & B_REMAPPED);
19317c478bd9Sstevel@tonic-gate 		bp_mapout(bp);
19327c478bd9Sstevel@tonic-gate 		brelse(bp);
19337c478bd9Sstevel@tonic-gate 	}
19347c478bd9Sstevel@tonic-gate }
19357c478bd9Sstevel@tonic-gate 
19367c478bd9Sstevel@tonic-gate /*
19377c478bd9Sstevel@tonic-gate  * bioerror(9F) - indicate error in buffer header
19387c478bd9Sstevel@tonic-gate  * If 'error' is zero, remove the error indication.
19397c478bd9Sstevel@tonic-gate  */
19407c478bd9Sstevel@tonic-gate void
19417c478bd9Sstevel@tonic-gate bioerror(struct buf *bp, int error)
19427c478bd9Sstevel@tonic-gate {
19437c478bd9Sstevel@tonic-gate 	ASSERT(bp != NULL);
19447c478bd9Sstevel@tonic-gate 	ASSERT(error >= 0);
19457c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
19467c478bd9Sstevel@tonic-gate 
19477c478bd9Sstevel@tonic-gate 	if (error != 0) {
19487c478bd9Sstevel@tonic-gate 		bp->b_flags |= B_ERROR;
19497c478bd9Sstevel@tonic-gate 	} else {
19507c478bd9Sstevel@tonic-gate 		bp->b_flags &= ~B_ERROR;
19517c478bd9Sstevel@tonic-gate 	}
19527c478bd9Sstevel@tonic-gate 	bp->b_error = error;
19537c478bd9Sstevel@tonic-gate }
19547c478bd9Sstevel@tonic-gate 
19557c478bd9Sstevel@tonic-gate /*
19567c478bd9Sstevel@tonic-gate  * bioreset(9F) - reuse a private buffer header after I/O is complete
19577c478bd9Sstevel@tonic-gate  */
19587c478bd9Sstevel@tonic-gate void
19597c478bd9Sstevel@tonic-gate bioreset(struct buf *bp)
19607c478bd9Sstevel@tonic-gate {
19617c478bd9Sstevel@tonic-gate 	ASSERT(bp != NULL);
19627c478bd9Sstevel@tonic-gate 
19637c478bd9Sstevel@tonic-gate 	biofini(bp);
19647c478bd9Sstevel@tonic-gate 	bioinit(bp);
19657c478bd9Sstevel@tonic-gate }
19667c478bd9Sstevel@tonic-gate 
19677c478bd9Sstevel@tonic-gate /*
19687c478bd9Sstevel@tonic-gate  * biosize(9F) - return size of a buffer header
19697c478bd9Sstevel@tonic-gate  */
19707c478bd9Sstevel@tonic-gate size_t
19717c478bd9Sstevel@tonic-gate biosize(void)
19727c478bd9Sstevel@tonic-gate {
19737c478bd9Sstevel@tonic-gate 	return (sizeof (struct buf));
19747c478bd9Sstevel@tonic-gate }
19757c478bd9Sstevel@tonic-gate 
19767c478bd9Sstevel@tonic-gate /*
19777c478bd9Sstevel@tonic-gate  * biomodified(9F) - check if buffer is modified
19787c478bd9Sstevel@tonic-gate  */
19797c478bd9Sstevel@tonic-gate int
19807c478bd9Sstevel@tonic-gate biomodified(struct buf *bp)
19817c478bd9Sstevel@tonic-gate {
19827c478bd9Sstevel@tonic-gate 	int npf;
19837c478bd9Sstevel@tonic-gate 	int ppattr;
19847c478bd9Sstevel@tonic-gate 	struct page *pp;
19857c478bd9Sstevel@tonic-gate 
19867c478bd9Sstevel@tonic-gate 	ASSERT(bp != NULL);
19877c478bd9Sstevel@tonic-gate 
19887c478bd9Sstevel@tonic-gate 	if ((bp->b_flags & B_PAGEIO) == 0) {
19897c478bd9Sstevel@tonic-gate 		return (-1);
19907c478bd9Sstevel@tonic-gate 	}
19917c478bd9Sstevel@tonic-gate 	pp = bp->b_pages;
19927c478bd9Sstevel@tonic-gate 	npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET));
19937c478bd9Sstevel@tonic-gate 
19947c478bd9Sstevel@tonic-gate 	while (npf > 0) {
19957c478bd9Sstevel@tonic-gate 		ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
1996d3d50737SRafael Vanoni 		    HAT_SYNC_STOPON_MOD);
19977c478bd9Sstevel@tonic-gate 		if (ppattr & P_MOD)
19987c478bd9Sstevel@tonic-gate 			return (1);
19997c478bd9Sstevel@tonic-gate 		pp = pp->p_next;
20007c478bd9Sstevel@tonic-gate 		npf--;
20017c478bd9Sstevel@tonic-gate 	}
20027c478bd9Sstevel@tonic-gate 
20037c478bd9Sstevel@tonic-gate 	return (0);
20047c478bd9Sstevel@tonic-gate }
20057c478bd9Sstevel@tonic-gate 
20067c478bd9Sstevel@tonic-gate /*
20077c478bd9Sstevel@tonic-gate  * bioinit(9F) - initialize a buffer structure
20087c478bd9Sstevel@tonic-gate  */
20097c478bd9Sstevel@tonic-gate void
20107c478bd9Sstevel@tonic-gate bioinit(struct buf *bp)
20117c478bd9Sstevel@tonic-gate {
20127c478bd9Sstevel@tonic-gate 	bzero(bp, sizeof (struct buf));
20137c478bd9Sstevel@tonic-gate 	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
20147c478bd9Sstevel@tonic-gate 	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
20157c478bd9Sstevel@tonic-gate 	bp->b_offset = -1;
20167c478bd9Sstevel@tonic-gate }
20177c478bd9Sstevel@tonic-gate 
20187c478bd9Sstevel@tonic-gate /*
20197c478bd9Sstevel@tonic-gate  * biofini(9F) - uninitialize a buffer structure
20207c478bd9Sstevel@tonic-gate  */
20217c478bd9Sstevel@tonic-gate void
20227c478bd9Sstevel@tonic-gate biofini(struct buf *bp)
20237c478bd9Sstevel@tonic-gate {
20247c478bd9Sstevel@tonic-gate 	sema_destroy(&bp->b_io);
20257c478bd9Sstevel@tonic-gate 	sema_destroy(&bp->b_sem);
20267c478bd9Sstevel@tonic-gate }
20277c478bd9Sstevel@tonic-gate 
20287c478bd9Sstevel@tonic-gate /*
20297c478bd9Sstevel@tonic-gate  * bioclone(9F) - clone a buffer
20307c478bd9Sstevel@tonic-gate  */
20317c478bd9Sstevel@tonic-gate struct buf *
20327c478bd9Sstevel@tonic-gate bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno,
20337c478bd9Sstevel@tonic-gate     int (*iodone)(struct buf *), struct buf *bp_mem, int sleep)
20347c478bd9Sstevel@tonic-gate {
20357c478bd9Sstevel@tonic-gate 	struct buf *bufp;
20367c478bd9Sstevel@tonic-gate 
20377c478bd9Sstevel@tonic-gate 	ASSERT(bp);
20387c478bd9Sstevel@tonic-gate 	if (bp_mem == NULL) {
20397c478bd9Sstevel@tonic-gate 		bufp = kmem_alloc(sizeof (struct buf), sleep);
20407c478bd9Sstevel@tonic-gate 		if (bufp == NULL) {
20417c478bd9Sstevel@tonic-gate 			return (NULL);
20427c478bd9Sstevel@tonic-gate 		}
20437c478bd9Sstevel@tonic-gate 		bioinit(bufp);
20447c478bd9Sstevel@tonic-gate 	} else {
20457c478bd9Sstevel@tonic-gate 		bufp = bp_mem;
20467c478bd9Sstevel@tonic-gate 		bioreset(bufp);
20477c478bd9Sstevel@tonic-gate 	}
20487c478bd9Sstevel@tonic-gate 
20497c478bd9Sstevel@tonic-gate #define	BUF_CLONE_FLAGS	(B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
20507c478bd9Sstevel@tonic-gate 	B_ABRWRITE)
20517c478bd9Sstevel@tonic-gate 
20527c478bd9Sstevel@tonic-gate 	/*
20536f84fed5Scth 	 * The cloned buffer does not inherit the B_REMAPPED flag.
20547c478bd9Sstevel@tonic-gate 	 */
20557c478bd9Sstevel@tonic-gate 	bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS)  | B_BUSY;
20567c478bd9Sstevel@tonic-gate 	bufp->b_bcount = len;
20577c478bd9Sstevel@tonic-gate 	bufp->b_blkno = blkno;
20587c478bd9Sstevel@tonic-gate 	bufp->b_iodone = iodone;
20597c478bd9Sstevel@tonic-gate 	bufp->b_proc = bp->b_proc;
20607c478bd9Sstevel@tonic-gate 	bufp->b_edev = dev;
20617c478bd9Sstevel@tonic-gate 	bufp->b_file = bp->b_file;
20627c478bd9Sstevel@tonic-gate 	bufp->b_offset = bp->b_offset;
20637c478bd9Sstevel@tonic-gate 
20647c478bd9Sstevel@tonic-gate 	if (bp->b_flags & B_SHADOW) {
20657c478bd9Sstevel@tonic-gate 		ASSERT(bp->b_shadow);
20667c478bd9Sstevel@tonic-gate 		ASSERT(bp->b_flags & B_PHYS);
20677c478bd9Sstevel@tonic-gate 
20687c478bd9Sstevel@tonic-gate 		bufp->b_shadow = bp->b_shadow +
2069d3d50737SRafael Vanoni 		    btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off);
20707c478bd9Sstevel@tonic-gate 		bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
20716f84fed5Scth 		if (bp->b_flags & B_REMAPPED)
20726f84fed5Scth 			bufp->b_proc = NULL;
20737c478bd9Sstevel@tonic-gate 	} else {
20747c478bd9Sstevel@tonic-gate 		if (bp->b_flags & B_PAGEIO) {
20757c478bd9Sstevel@tonic-gate 			struct page *pp;
20767c478bd9Sstevel@tonic-gate 			off_t o;
20777c478bd9Sstevel@tonic-gate 			int i;
20787c478bd9Sstevel@tonic-gate 
20797c478bd9Sstevel@tonic-gate 			pp = bp->b_pages;
20807c478bd9Sstevel@tonic-gate 			o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off;
20817c478bd9Sstevel@tonic-gate 			for (i = btop(o); i > 0; i--) {
20827c478bd9Sstevel@tonic-gate 				pp = pp->p_next;
20837c478bd9Sstevel@tonic-gate 			}
20847c478bd9Sstevel@tonic-gate 			bufp->b_pages = pp;
20857c478bd9Sstevel@tonic-gate 			bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET);
20867c478bd9Sstevel@tonic-gate 		} else {
20877c478bd9Sstevel@tonic-gate 			bufp->b_un.b_addr =
2088d3d50737SRafael Vanoni 			    (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
20897c478bd9Sstevel@tonic-gate 			if (bp->b_flags & B_REMAPPED)
20907c478bd9Sstevel@tonic-gate 				bufp->b_proc = NULL;
20917c478bd9Sstevel@tonic-gate 		}
20927c478bd9Sstevel@tonic-gate 	}
20937c478bd9Sstevel@tonic-gate 	return (bufp);
20947c478bd9Sstevel@tonic-gate }
2095