xref: /illumos-gate/usr/src/uts/common/os/bio.c (revision 6a0b1217)
17c478bd9Sstevel@tonic-gate /*
27c478bd9Sstevel@tonic-gate  * CDDL HEADER START
37c478bd9Sstevel@tonic-gate  *
47c478bd9Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
56f84fed5Scth  * Common Development and Distribution License (the "License").
66f84fed5Scth  * You may not use this file except in compliance with the License.
77c478bd9Sstevel@tonic-gate  *
87c478bd9Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bd9Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
107c478bd9Sstevel@tonic-gate  * See the License for the specific language governing permissions
117c478bd9Sstevel@tonic-gate  * and limitations under the License.
127c478bd9Sstevel@tonic-gate  *
137c478bd9Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
147c478bd9Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bd9Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
167c478bd9Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
177c478bd9Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bd9Sstevel@tonic-gate  *
197c478bd9Sstevel@tonic-gate  * CDDL HEADER END
207c478bd9Sstevel@tonic-gate  */
217c478bd9Sstevel@tonic-gate /*
22d3d50737SRafael Vanoni  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
237c478bd9Sstevel@tonic-gate  * Use is subject to license terms.
24*6a0b1217SPatrick Mooney  * Copyright 2019 Joyent, Inc.
257c478bd9Sstevel@tonic-gate  */
267c478bd9Sstevel@tonic-gate 
273f11de9dSSara Hartse /*
283f11de9dSSara Hartse  * Copyright (c) 2016 by Delphix. All rights reserved.
293f11de9dSSara Hartse  */
303f11de9dSSara Hartse 
317c478bd9Sstevel@tonic-gate /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
32a8b4e7c7SToomas Soome /*	  All Rights Reserved	*/
337c478bd9Sstevel@tonic-gate 
347c478bd9Sstevel@tonic-gate /*
357c478bd9Sstevel@tonic-gate  * University Copyright- Copyright (c) 1982, 1986, 1988
367c478bd9Sstevel@tonic-gate  * The Regents of the University of California
377c478bd9Sstevel@tonic-gate  * All Rights Reserved
387c478bd9Sstevel@tonic-gate  *
397c478bd9Sstevel@tonic-gate  * University Acknowledgment- Portions of this document are derived from
407c478bd9Sstevel@tonic-gate  * software developed by the University of California, Berkeley, and its
417c478bd9Sstevel@tonic-gate  * contributors.
427c478bd9Sstevel@tonic-gate  */
437c478bd9Sstevel@tonic-gate 
447c478bd9Sstevel@tonic-gate #include <sys/types.h>
457c478bd9Sstevel@tonic-gate #include <sys/t_lock.h>
467c478bd9Sstevel@tonic-gate #include <sys/sysmacros.h>
477c478bd9Sstevel@tonic-gate #include <sys/conf.h>
487c478bd9Sstevel@tonic-gate #include <sys/cpuvar.h>
497c478bd9Sstevel@tonic-gate #include <sys/errno.h>
507c478bd9Sstevel@tonic-gate #include <sys/debug.h>
517c478bd9Sstevel@tonic-gate #include <sys/buf.h>
527c478bd9Sstevel@tonic-gate #include <sys/var.h>
537c478bd9Sstevel@tonic-gate #include <sys/vnode.h>
547c478bd9Sstevel@tonic-gate #include <sys/bitmap.h>
557c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h>
567c478bd9Sstevel@tonic-gate #include <sys/kmem.h>
577c478bd9Sstevel@tonic-gate #include <sys/vmem.h>
587c478bd9Sstevel@tonic-gate #include <sys/atomic.h>
597c478bd9Sstevel@tonic-gate #include <vm/seg_kmem.h>
607c478bd9Sstevel@tonic-gate #include <vm/page.h>
617c478bd9Sstevel@tonic-gate #include <vm/pvn.h>
627c478bd9Sstevel@tonic-gate #include <sys/vtrace.h>
637c478bd9Sstevel@tonic-gate #include <sys/tnf_probe.h>
647c478bd9Sstevel@tonic-gate #include <sys/fs/ufs_inode.h>
657c478bd9Sstevel@tonic-gate #include <sys/fs/ufs_bio.h>
667c478bd9Sstevel@tonic-gate #include <sys/fs/ufs_log.h>
677c478bd9Sstevel@tonic-gate #include <sys/systm.h>
687c478bd9Sstevel@tonic-gate #include <sys/vfs.h>
697c478bd9Sstevel@tonic-gate #include <sys/sdt.h>
707c478bd9Sstevel@tonic-gate 
/* Locks */
static	kmutex_t	blist_lock;	/* protects b_list */
static	kmutex_t	bhdr_lock;	/* protects the bhdrlist */
static	kmutex_t	bfree_lock;	/* protects the bfreelist structure */

struct hbuf	*hbuf;			/* Hash buckets */
struct dwbuf	*dwbuf;			/* Delayed write buckets */
static struct buf *bhdrlist;		/* buf header free list */
static int	nbuf;			/* number of buffer headers allocated */

static int	lastindex;		/* Reference point on where to start */
					/* when looking for free buffers */

/* Hash a (device, block number) pair into a hash-bucket index */
#define	bio_bhash(dev, bn)	(hash2ints((dev), (int)(bn)) & v.v_hmask)
#define	EMPTY_LIST	((struct buf *)-1)	/* sentinel: no list entries */

static kcondvar_t	bio_mem_cv;	/* Condition variables */
static kcondvar_t	bio_flushinval_cv;
static int	bio_doingflush;		/* flush in progress */
static int	bio_doinginval;		/* inval in progress */
static int	bio_flinv_cv_wanted;	/* someone waiting for cv */
927c478bd9Sstevel@tonic-gate 
937c478bd9Sstevel@tonic-gate /*
947c478bd9Sstevel@tonic-gate  * Statistics on the buffer cache
957c478bd9Sstevel@tonic-gate  */
struct biostats biostats = {
	{ "buffer_cache_lookups",		KSTAT_DATA_UINT32 },
	{ "buffer_cache_hits",			KSTAT_DATA_UINT32 },
	{ "new_buffer_requests",		KSTAT_DATA_UINT32 },
	{ "waits_for_buffer_allocs",		KSTAT_DATA_UINT32 },
	{ "buffers_locked_by_someone",		KSTAT_DATA_UINT32 },
	{ "duplicate_buffers_found",		KSTAT_DATA_UINT32 }
};

/*
 * kstat data: expose the biostats counters above as an array of
 * kstat_named_t entries for the kstat framework.
 */
kstat_named_t	*biostats_ptr = (kstat_named_t *)&biostats;
uint_t		biostats_ndata = (uint_t)(sizeof (biostats) /
					sizeof (kstat_named_t));
1117c478bd9Sstevel@tonic-gate 
1127c478bd9Sstevel@tonic-gate /*
1137c478bd9Sstevel@tonic-gate  * Statistics on ufs buffer cache
1147c478bd9Sstevel@tonic-gate  * Not protected by locks
1157c478bd9Sstevel@tonic-gate  */
/* Per-operation UFS buffer-cache counters; incremented without locking. */
struct ufsbiostats ub = {
	{ "breads",			KSTAT_DATA_UINT32 },
	{ "bwrites",			KSTAT_DATA_UINT32 },
	{ "fbiwrites",			KSTAT_DATA_UINT32 },
	{ "getpages",			KSTAT_DATA_UINT32 },
	{ "getras",			KSTAT_DATA_UINT32 },
	{ "putsyncs",			KSTAT_DATA_UINT32 },
	{ "putasyncs",			KSTAT_DATA_UINT32 },
	{ "putpageios",			KSTAT_DATA_UINT32 },
};
1267c478bd9Sstevel@tonic-gate 
1277c478bd9Sstevel@tonic-gate /*
1287c478bd9Sstevel@tonic-gate  * more UFS Logging eccentricities...
1297c478bd9Sstevel@tonic-gate  *
1307c478bd9Sstevel@tonic-gate  * required since "#pragma weak ..." doesn't work in reverse order.
1317c478bd9Sstevel@tonic-gate  * i.e.:  genunix (bio.c) is loaded before the ufs modules and pointers
1327c478bd9Sstevel@tonic-gate  *        to ufs routines don't get plugged into bio.c calls so
1337c478bd9Sstevel@tonic-gate  *        we initialize it when setting up the "lufsops" table
1347c478bd9Sstevel@tonic-gate  *        in "lufs.c:_init()"
1357c478bd9Sstevel@tonic-gate  */
/* Strategy hooks filled in by lufs.c:_init() (see comment above). */
void (*bio_lufs_strategy)(void *, buf_t *);
void (*bio_snapshot_strategy)(void *, buf_t *);


/* Private routines */
static struct buf	*bio_getfreeblk(long);
static void		bio_mem_get(long);
static void		bio_bhdr_free(struct buf *);
static struct buf	*bio_bhdr_alloc(void);
static void		bio_recycle(int, long);
static void		bio_pageio_done(struct buf *);
static int		bio_incore(dev_t, daddr_t);
1487c478bd9Sstevel@tonic-gate 
1497c478bd9Sstevel@tonic-gate /*
1507c478bd9Sstevel@tonic-gate  * Buffer cache constants
1517c478bd9Sstevel@tonic-gate  */
#define	BIO_BUF_PERCENT	(100/2)		/* default: 2% of memory */
#define	BIO_MAX_PERCENT	(100/20)	/* max is 20% of real memory */
#define	BIO_BHDR_POOL	100		/* Default bhdr pool size */
#define	BIO_MIN_HDR	10		/* Minimum number of buffer headers */
#define	BIO_MIN_HWM	(BIO_MIN_HDR * MAXBSIZE / 1024)
#define	BIO_HASHLEN	4		/* Target length of hash chains */


/* Flags for bio_recycle() */
#define	BIO_HEADER	0x01
#define	BIO_MEM		0x02

extern	int bufhwm;		/* User tunable - high water mark for mem  */
extern	int bufhwm_pct;		/* ditto - given in % of physmem  */
1667c478bd9Sstevel@tonic-gate 
1677c478bd9Sstevel@tonic-gate /*
1687c478bd9Sstevel@tonic-gate  * The following routines allocate and free
1697c478bd9Sstevel@tonic-gate  * buffers with various side effects.  In general the
1707c478bd9Sstevel@tonic-gate  * arguments to an allocate routine are a device and
 * a block number, and the value is a pointer
 * to the buffer header; the buffer returned is locked with a
1737c478bd9Sstevel@tonic-gate  * binary semaphore so that no one else can touch it. If the block was
1747c478bd9Sstevel@tonic-gate  * already in core, no I/O need be done; if it is
1757c478bd9Sstevel@tonic-gate  * already locked, the process waits until it becomes free.
1767c478bd9Sstevel@tonic-gate  * The following routines allocate a buffer:
1777c478bd9Sstevel@tonic-gate  *	getblk
1787c478bd9Sstevel@tonic-gate  *	bread/BREAD
1797c478bd9Sstevel@tonic-gate  *	breada
1807c478bd9Sstevel@tonic-gate  * Eventually the buffer must be released, possibly with the
1817c478bd9Sstevel@tonic-gate  * side effect of writing it out, by using one of
1827c478bd9Sstevel@tonic-gate  *	bwrite/BWRITE/brwrite
1837c478bd9Sstevel@tonic-gate  *	bdwrite/bdrwrite
1847c478bd9Sstevel@tonic-gate  *	bawrite
1857c478bd9Sstevel@tonic-gate  *	brelse
1867c478bd9Sstevel@tonic-gate  *
1877c478bd9Sstevel@tonic-gate  * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
1887c478bd9Sstevel@tonic-gate  * Instead, a binary semaphore, b_sem is used to gain exclusive access to
1897c478bd9Sstevel@tonic-gate  * a buffer and a binary semaphore, b_io is used for I/O synchronization.
1907c478bd9Sstevel@tonic-gate  * B_DONE is still used to denote a buffer with I/O complete on it.
1917c478bd9Sstevel@tonic-gate  *
 * The bfreelist.b_bcount field is recomputed every time fsflush runs. It
 * should not be used where a very accurate count of the free buffers is
1947c478bd9Sstevel@tonic-gate  * needed.
1957c478bd9Sstevel@tonic-gate  */
1967c478bd9Sstevel@tonic-gate 
1977c478bd9Sstevel@tonic-gate /*
1987c478bd9Sstevel@tonic-gate  * Read in (if necessary) the block and return a buffer pointer.
1997c478bd9Sstevel@tonic-gate  *
2007c478bd9Sstevel@tonic-gate  * This interface is provided for binary compatibility.  Using
2017c478bd9Sstevel@tonic-gate  * BREAD() directly avoids the extra function call overhead invoked
2027c478bd9Sstevel@tonic-gate  * by calling this routine.
2037c478bd9Sstevel@tonic-gate  */
struct buf *
bread(dev_t dev, daddr_t blkno, long bsize)
{
	/* Thin wrapper kept for binary compatibility; BREAD() does the work. */
	return (BREAD(dev, blkno, bsize));
}
2097c478bd9Sstevel@tonic-gate 
2107c478bd9Sstevel@tonic-gate /*
2117c478bd9Sstevel@tonic-gate  * Common code for reading a buffer with various options
2127c478bd9Sstevel@tonic-gate  *
2137c478bd9Sstevel@tonic-gate  * Read in (if necessary) the block and return a buffer pointer.
2147c478bd9Sstevel@tonic-gate  */
struct buf *
bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
{
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;	/* NULL for !ufs */
	struct buf *bp;
	klwp_t *lwp = ttolwp(curthread);	/* NULL for kernel threads */

	CPU_STATS_ADD_K(sys, lread, 1);
	/* errflg == 1: ask getblk_common for error-tolerant behavior */
	bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1);
	if (bp->b_flags & B_DONE)
		return (bp);	/* cache hit: contents already valid */
	bp->b_flags |= B_READ;
	ASSERT(bp->b_bcount == bsize);
	/*
	 * Route the read: straight to the device for non-ufs callers,
	 * through the log or snapshot strategy hooks for ufs when the
	 * corresponding module has plugged them in.
	 */
	if (ufsvfsp == NULL) {					/* !ufs */
		(void) bdev_strategy(bp);
	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
							/* ufs && logging */
		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
							/* ufs && snapshots */
		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
	} else {
		ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
		ub.ub_breads.value.ul++;		/* ufs && !logging */
		(void) bdev_strategy(bp);
	}
	if (lwp != NULL)
		lwp->lwp_ru.inblock++;	/* charge block input to the lwp */
	CPU_STATS_ADD_K(sys, bread, 1);
	(void) biowait(bp);	/* synchronous: wait for I/O completion */
	return (bp);
}
2477c478bd9Sstevel@tonic-gate 
2487c478bd9Sstevel@tonic-gate /*
2497c478bd9Sstevel@tonic-gate  * Read in the block, like bread, but also start I/O on the
2507c478bd9Sstevel@tonic-gate  * read-ahead block (which is not allocated to the caller).
2517c478bd9Sstevel@tonic-gate  */
struct buf *
breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
{
	struct buf *bp, *rabp;
	klwp_t *lwp = ttolwp(curthread);

	bp = NULL;
	/* Start I/O on the requested block only when it isn't cached. */
	if (!bio_incore(dev, blkno)) {
		CPU_STATS_ADD_K(sys, lread, 1);
		bp = GETBLK(dev, blkno, bsize);
		if ((bp->b_flags & B_DONE) == 0) {
			bp->b_flags |= B_READ;
			bp->b_bcount = bsize;
			(void) bdev_strategy(bp);
			if (lwp != NULL)
				lwp->lwp_ru.inblock++;
			CPU_STATS_ADD_K(sys, bread, 1);
		}
	}
	/*
	 * Kick off the read-ahead block asynchronously, but only when it
	 * isn't cached and the (approximate) free-buffer count suggests
	 * buffers are available.
	 */
	if (rablkno && bfreelist.b_bcount > 1 &&
	    !bio_incore(dev, rablkno)) {
		rabp = GETBLK(dev, rablkno, bsize);
		if (rabp->b_flags & B_DONE)
			brelse(rabp);	/* already valid; caller doesn't want it */
		else {
			rabp->b_flags |= B_READ|B_ASYNC;
			rabp->b_bcount = bsize;
			(void) bdev_strategy(rabp);
			if (lwp != NULL)
				lwp->lwp_ru.inblock++;
			CPU_STATS_ADD_K(sys, bread, 1);
		}
	}
	/* Main block was cached above: fall back to a plain BREAD. */
	if (bp == NULL)
		return (BREAD(dev, blkno, bsize));
	(void) biowait(bp);
	return (bp);
}
2907c478bd9Sstevel@tonic-gate 
2917c478bd9Sstevel@tonic-gate /*
2927c478bd9Sstevel@tonic-gate  * Common code for writing a buffer with various options.
2937c478bd9Sstevel@tonic-gate  *
2947c478bd9Sstevel@tonic-gate  * force_wait  - wait for write completion regardless of B_ASYNC flag
2957c478bd9Sstevel@tonic-gate  * do_relse    - release the buffer when we are done
2967c478bd9Sstevel@tonic-gate  * clear_flags - flags to clear from the buffer
2977c478bd9Sstevel@tonic-gate  */
void
bwrite_common(void *arg, struct buf *bp, int force_wait,
    int do_relse, int clear_flags)
{
	register int do_wait;
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;	/* NULL for !ufs */
	int flag;
	klwp_t *lwp = ttolwp(curthread);
	struct cpu *cpup;

	ASSERT(SEMA_HELD(&bp->b_sem));
	flag = bp->b_flags;	/* sample flags before clearing any */
	bp->b_flags &= ~clear_flags;
	if (lwp != NULL)
		lwp->lwp_ru.oublock++;	/* charge block output to the lwp */
	CPU_STATS_ENTER_K();
	cpup = CPU;		/* get pointer AFTER preemption is disabled */
	CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
	CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
	/* Wait unless the caller requested async and isn't forcing a wait. */
	do_wait = ((flag & B_ASYNC) == 0 || force_wait);
	if (do_wait == 0)
		CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
	CPU_STATS_EXIT_K();
	/*
	 * Route the write exactly as bread_common() routes reads:
	 * device strategy, or the ufs log/snapshot hooks when present.
	 */
	if (ufsvfsp == NULL) {
		(void) bdev_strategy(bp);
	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
							/* ufs && logging */
		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
							/* ufs && snapshots */
		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
	} else {
		ub.ub_bwrites.value.ul++;		/* ufs && !logging */
		(void) bdev_strategy(bp);
	}
	if (do_wait) {
		(void) biowait(bp);
		if (do_relse) {
			brelse(bp);
		}
	}
}
3407c478bd9Sstevel@tonic-gate 
3417c478bd9Sstevel@tonic-gate /*
3427c478bd9Sstevel@tonic-gate  * Write the buffer, waiting for completion (unless B_ASYNC is set).
3437c478bd9Sstevel@tonic-gate  * Then release the buffer.
3447c478bd9Sstevel@tonic-gate  * This interface is provided for binary compatibility.  Using
3457c478bd9Sstevel@tonic-gate  * BWRITE() directly avoids the extra function call overhead invoked
3467c478bd9Sstevel@tonic-gate  * by calling this routine.
3477c478bd9Sstevel@tonic-gate  */
void
bwrite(struct buf *bp)
{
	/* Compatibility wrapper; BWRITE() handles wait/release semantics. */
	BWRITE(bp);
}
3537c478bd9Sstevel@tonic-gate 
3547c478bd9Sstevel@tonic-gate /*
3557c478bd9Sstevel@tonic-gate  * Write the buffer, waiting for completion.
3567c478bd9Sstevel@tonic-gate  * But don't release the buffer afterwards.
3577c478bd9Sstevel@tonic-gate  * This interface is provided for binary compatibility.  Using
3587c478bd9Sstevel@tonic-gate  * BWRITE2() directly avoids the extra function call overhead.
3597c478bd9Sstevel@tonic-gate  */
void
bwrite2(struct buf *bp)
{
	/* Compatibility wrapper; BWRITE2() writes without releasing bp. */
	BWRITE2(bp);
}
3657c478bd9Sstevel@tonic-gate 
3667c478bd9Sstevel@tonic-gate /*
3677c478bd9Sstevel@tonic-gate  * Release the buffer, marking it so that if it is grabbed
3687c478bd9Sstevel@tonic-gate  * for another purpose it will be written out before being
3697c478bd9Sstevel@tonic-gate  * given up (e.g. when writing a partial block where it is
3707c478bd9Sstevel@tonic-gate  * assumed that another write for the same block will soon follow).
3717c478bd9Sstevel@tonic-gate  * Also save the time that the block is first marked as delayed
3727c478bd9Sstevel@tonic-gate  * so that it will be written in a reasonable time.
3737c478bd9Sstevel@tonic-gate  */
3747c478bd9Sstevel@tonic-gate void
bdwrite(struct buf * bp)3757c478bd9Sstevel@tonic-gate bdwrite(struct buf *bp)
3767c478bd9Sstevel@tonic-gate {
3777c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
3787c478bd9Sstevel@tonic-gate 	CPU_STATS_ADD_K(sys, lwrite, 1);
3797c478bd9Sstevel@tonic-gate 	if ((bp->b_flags & B_DELWRI) == 0)
380d3d50737SRafael Vanoni 		bp->b_start = ddi_get_lbolt();
3817c478bd9Sstevel@tonic-gate 	/*
3827c478bd9Sstevel@tonic-gate 	 * B_DONE allows others to use the buffer, B_DELWRI causes the
3837c478bd9Sstevel@tonic-gate 	 * buffer to be written before being reused, and setting b_resid
3847c478bd9Sstevel@tonic-gate 	 * to zero says the buffer is complete.
3857c478bd9Sstevel@tonic-gate 	 */
3867c478bd9Sstevel@tonic-gate 	bp->b_flags |= B_DELWRI | B_DONE;
3877c478bd9Sstevel@tonic-gate 	bp->b_resid = 0;
3887c478bd9Sstevel@tonic-gate 	brelse(bp);
3897c478bd9Sstevel@tonic-gate }
3907c478bd9Sstevel@tonic-gate 
3917c478bd9Sstevel@tonic-gate /*
3927c478bd9Sstevel@tonic-gate  * Release the buffer, start I/O on it, but don't wait for completion.
3937c478bd9Sstevel@tonic-gate  */
void
bawrite(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));

	/* Use bfreelist.b_bcount as a weird-ass heuristic */
	/* (only go async when the rough free-buffer count looks healthy) */
	if (bfreelist.b_bcount > 4)
		bp->b_flags |= B_ASYNC;
	BWRITE(bp);
}
4047c478bd9Sstevel@tonic-gate 
4057c478bd9Sstevel@tonic-gate /*
4067c478bd9Sstevel@tonic-gate  * Release the buffer, with no I/O implied.
4077c478bd9Sstevel@tonic-gate  */
void
brelse(struct buf *bp)
{
	struct buf	**backp;
	uint_t		index;
	kmutex_t	*hmp;
	struct	buf	*dp;
	struct	hbuf	*hp;


	ASSERT(SEMA_HELD(&bp->b_sem));

	/*
	 * Clear the retry write flag if the buffer was written without
	 * error.  The presence of B_DELWRI means the buffer has not yet
	 * been written and the presence of B_ERROR means that an error
	 * is still occurring.
	 */
	if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
		bp->b_flags &= ~B_RETRYWRI;
	}

	/* Check for anomalous conditions */
	if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
		if (bp->b_flags & B_NOCACHE) {
			/* Don't add to the freelist. Destroy it now */
			kmem_free(bp->b_un.b_addr, bp->b_bufsize);
			sema_destroy(&bp->b_sem);
			sema_destroy(&bp->b_io);
			kmem_free(bp, sizeof (struct buf));
			return;
		}
		/*
		 * If a write failed and we are supposed to retry write,
		 * don't toss the buffer.  Keep it around and mark it
		 * delayed write in the hopes that it will eventually
		 * get flushed (and still keep the system running.)
		 */
		if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
			bp->b_flags |= B_DELWRI;
			/* keep fsflush from trying continuously to flush */
			bp->b_start = ddi_get_lbolt();
		} else
			bp->b_flags |= B_AGE|B_STALE;
		bp->b_flags &= ~B_ERROR;
		bp->b_error = 0;
	}

	/*
	 * If delayed write is set then put in on the delayed
	 * write list instead of the free buffer list.
	 */
	index = bio_bhash(bp->b_edev, bp->b_blkno);
	hmp   = &hbuf[index].b_lock;

	mutex_enter(hmp);
	hp = &hbuf[index];
	dp = (struct buf *)hp;	/* default: this bucket's free list head */

	/*
	 * Make sure that the number of entries on this list are
	 * Zero <= count <= total # buffers
	 */
	ASSERT(hp->b_length >= 0);
	ASSERT(hp->b_length < nbuf);

	hp->b_length++;		/* We are adding this buffer */

	if (bp->b_flags & B_DELWRI) {
		/*
		 * This buffer goes on the delayed write buffer list
		 */
		dp = (struct buf *)&dwbuf[index];
	}
	ASSERT(bp->b_bufsize > 0);
	ASSERT(bp->b_bcount > 0);
	ASSERT(bp->b_un.b_addr != NULL);

	if (bp->b_flags & B_AGE) {
		/* B_AGE: splice the buffer in at the head of the list */
		backp = &dp->av_forw;
		(*backp)->av_back = bp;
		bp->av_forw = *backp;
		*backp = bp;
		bp->av_back = dp;
	} else {
		/* otherwise splice it in at the tail of the list */
		backp = &dp->av_back;
		(*backp)->av_forw = bp;
		bp->av_back = *backp;
		*backp = bp;
		bp->av_forw = dp;
	}
	mutex_exit(hmp);

	if (bfreelist.b_flags & B_WANTED) {
		/*
		 * Should come here very very rarely.
		 * Recheck under bfree_lock to close the race with waiters.
		 */
		mutex_enter(&bfree_lock);
		if (bfreelist.b_flags & B_WANTED) {
			bfreelist.b_flags &= ~B_WANTED;
			cv_broadcast(&bio_mem_cv);
		}
		mutex_exit(&bfree_lock);
	}

	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
	/*
	 * Don't let anyone get the buffer off the freelist before we
	 * release our hold on it.
	 */
	sema_v(&bp->b_sem);
}
5207c478bd9Sstevel@tonic-gate 
5217c478bd9Sstevel@tonic-gate /*
5227c478bd9Sstevel@tonic-gate  * Return a count of the number of B_BUSY buffers in the system
5237c478bd9Sstevel@tonic-gate  * Can only be used as a good estimate.  If 'cleanit' is set,
5247c478bd9Sstevel@tonic-gate  * try to flush all bufs.
5257c478bd9Sstevel@tonic-gate  */
5267c478bd9Sstevel@tonic-gate int
bio_busy(int cleanit)5277c478bd9Sstevel@tonic-gate bio_busy(int cleanit)
5287c478bd9Sstevel@tonic-gate {
5297c478bd9Sstevel@tonic-gate 	struct buf *bp, *dp;
5307c478bd9Sstevel@tonic-gate 	int busy = 0;
5317c478bd9Sstevel@tonic-gate 	int i;
5327c478bd9Sstevel@tonic-gate 	kmutex_t *hmp;
5337c478bd9Sstevel@tonic-gate 
5347c478bd9Sstevel@tonic-gate 	for (i = 0; i < v.v_hbuf; i++) {
5357c478bd9Sstevel@tonic-gate 		dp = (struct buf *)&hbuf[i];
5367c478bd9Sstevel@tonic-gate 		hmp = &hbuf[i].b_lock;
5377c478bd9Sstevel@tonic-gate 
5387c478bd9Sstevel@tonic-gate 		mutex_enter(hmp);
5397c478bd9Sstevel@tonic-gate 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
5407c478bd9Sstevel@tonic-gate 			if (bp->b_flags & B_BUSY)
5417c478bd9Sstevel@tonic-gate 				busy++;
5427c478bd9Sstevel@tonic-gate 		}
5437c478bd9Sstevel@tonic-gate 		mutex_exit(hmp);
5447c478bd9Sstevel@tonic-gate 	}
5457c478bd9Sstevel@tonic-gate 
5467c478bd9Sstevel@tonic-gate 	if (cleanit && busy != 0) {
5477c478bd9Sstevel@tonic-gate 		bflush(NODEV);
5487c478bd9Sstevel@tonic-gate 	}
5497c478bd9Sstevel@tonic-gate 
5507c478bd9Sstevel@tonic-gate 	return (busy);
5517c478bd9Sstevel@tonic-gate }
5527c478bd9Sstevel@tonic-gate 
5537c478bd9Sstevel@tonic-gate /*
5547c478bd9Sstevel@tonic-gate  * this interface is provided for binary compatibility.
5557c478bd9Sstevel@tonic-gate  *
5567c478bd9Sstevel@tonic-gate  * Assign a buffer for the given block.  If the appropriate
5577c478bd9Sstevel@tonic-gate  * block is already associated, return it; otherwise search
5587c478bd9Sstevel@tonic-gate  * for the oldest non-busy buffer and reassign it.
5597c478bd9Sstevel@tonic-gate  */
5607c478bd9Sstevel@tonic-gate struct buf *
getblk(dev_t dev,daddr_t blkno,long bsize)5617c478bd9Sstevel@tonic-gate getblk(dev_t dev, daddr_t blkno, long bsize)
5627c478bd9Sstevel@tonic-gate {
5637c478bd9Sstevel@tonic-gate 	return (getblk_common(/* ufsvfsp */ NULL, dev,
564d3d50737SRafael Vanoni 	    blkno, bsize, /* errflg */ 0));
5657c478bd9Sstevel@tonic-gate }
5667c478bd9Sstevel@tonic-gate 
/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 *
 * arg    - opaque ufsvfs_t pointer, or NULL for non-ufs callers; only
 *	    used to choose a vnode for a newly assigned buffer.
 * dev    - device of the requested block; panics if the major number is
 *	    out of range.
 * blkno  - block number; hashed together with dev to pick the chain.
 * bsize  - size of the buffer to allocate on a cache miss.
 * errflg - non-zero when the caller (e.g. bread) can recover from an
 *	    error; enables the panic-time bail-out path below.
 *
 * Returns a buffer with b_sem held.  On the panic error path the buffer
 * is marked B_ERROR | B_DONE (and B_NOCACHE from geteblk()) so the
 * caller sees the error and brelse() frees it.
 */
struct buf *
getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, int errflg)
{
	ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
	struct buf *bp;
	struct buf *dp;		/* head of the hash chain */
	struct buf *nbp = NULL;	/* freshly allocated buffer, if any */
	struct buf *errbp;
	uint_t		index;
	kmutex_t	*hmp;
	struct	hbuf	*hp;

	if (getmajor(dev) >= devcnt)
		cmn_err(CE_PANIC, "blkdev");

	biostats.bio_lookup.value.ui32++;

	index = bio_bhash(dev, blkno);
	hp    = &hbuf[index];
	dp    = (struct buf *)hp;
	hmp   = &hp->b_lock;

	mutex_enter(hmp);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		/*
		 * Avoid holding the hash lock in the event that
		 * the buffer is locked by someone. Since the hash chain
		 * may change when we drop the hash lock
		 * we have to start at the beginning of the chain if the
		 * buffer identity/contents aren't valid.
		 */
		if (!sema_tryp(&bp->b_sem)) {
			biostats.bio_bufbusy.value.ui32++;
			mutex_exit(hmp);
			/*
			 * OK, we are dealing with a busy buffer.
			 * In the case that we are panicking and we
			 * got called from bread(), we have some chance
			 * for error recovery. So better bail out from
			 * here since sema_p() won't block. If we got
			 * called directly from ufs routines, there is
			 * no way to report an error yet.
			 */
			if (panicstr && errflg)
				goto errout;
			/*
			 * For the following line of code to work
			 * correctly never kmem_free the buffer "header".
			 */
			sema_p(&bp->b_sem);
			/*
			 * Re-validate identity: the buffer may have been
			 * reassigned while we slept on b_sem.
			 */
			if (bp->b_blkno != blkno || bp->b_edev != dev ||
			    (bp->b_flags & B_STALE)) {
				sema_v(&bp->b_sem);
				mutex_enter(hmp);
				goto loop;	/* start over */
			}
			mutex_enter(hmp);
		}
		/* Found */
		biostats.bio_hit.value.ui32++;
		bp->b_flags &= ~B_AGE;

		/*
		 * Yank it off the free/delayed write lists
		 */
		hp->b_length--;
		notavail(bp);
		mutex_exit(hmp);

		ASSERT((bp->b_flags & B_NOCACHE) == 0);

		if (nbp == NULL) {
			/*
			 * Make the common path short.
			 */
			ASSERT(SEMA_HELD(&bp->b_sem));
			return (bp);
		}

		biostats.bio_bufdup.value.ui32++;

		/*
		 * The buffer must have entered during the lock upgrade
		 * so free the new buffer we allocated and return the
		 * found buffer.
		 */
		kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
		nbp->b_un.b_addr = NULL;

		/*
		 * Account for the memory
		 */
		mutex_enter(&bfree_lock);
		bfreelist.b_bufsize += nbp->b_bufsize;
		mutex_exit(&bfree_lock);

		/*
		 * Destroy buf identity, and place on avail list
		 */
		nbp->b_dev = (o_dev_t)NODEV;
		nbp->b_edev = NODEV;
		nbp->b_flags = 0;
		nbp->b_file = NULL;
		nbp->b_offset = -1;

		sema_v(&nbp->b_sem);
		bio_bhdr_free(nbp);

		ASSERT(SEMA_HELD(&bp->b_sem));
		return (bp);
	}

	/*
	 * bio_getfreeblk may block so check the hash chain again.
	 */
	if (nbp == NULL) {
		mutex_exit(hmp);
		nbp = bio_getfreeblk(bsize);
		mutex_enter(hmp);
		goto loop;
	}

	/*
	 * New buffer. Assign nbp and stick it on the hash.
	 */
	nbp->b_flags = B_BUSY;
	nbp->b_edev = dev;
	nbp->b_dev = (o_dev_t)cmpdev(dev);
	nbp->b_blkno = blkno;
	nbp->b_iodone = NULL;
	nbp->b_bcount = bsize;
	/*
	 * If we are given a ufsvfsp and the vfs_root field is NULL
	 * then this must be I/O for a superblock.  A superblock's
	 * buffer is set up in mountfs() and there is no root vnode
	 * at that point.
	 */
	if (ufsvfsp && ufsvfsp->vfs_root) {
		nbp->b_vp = ufsvfsp->vfs_root;
	} else {
		nbp->b_vp = NULL;
	}

	ASSERT((nbp->b_flags & B_NOCACHE) == 0);

	binshash(nbp, dp);
	mutex_exit(hmp);

	ASSERT(SEMA_HELD(&nbp->b_sem));

	return (nbp);


	/*
	 * Come here in case of an internal error. At this point we couldn't
	 * get a buffer, but we have to return one. Hence we allocate some
	 * kind of error reply buffer on the fly. This buffer is marked as
	 * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
	 *	- B_ERROR will indicate error to the caller.
	 *	- B_DONE will prevent us from reading the buffer from
	 *	  the device.
	 *	- B_NOCACHE will cause that this buffer gets free'd in
	 *	  brelse().
	 */

errout:
	errbp = geteblk();
	sema_p(&errbp->b_sem);
	errbp->b_flags &= ~B_BUSY;
	errbp->b_flags |= (B_ERROR | B_DONE);
	return (errbp);
}
7487c478bd9Sstevel@tonic-gate 
7497c478bd9Sstevel@tonic-gate /*
7507c478bd9Sstevel@tonic-gate  * Get an empty block, not assigned to any particular device.
7517c478bd9Sstevel@tonic-gate  * Returns a locked buffer that is not on any hash or free list.
7527c478bd9Sstevel@tonic-gate  */
7537c478bd9Sstevel@tonic-gate struct buf *
ngeteblk(long bsize)7547c478bd9Sstevel@tonic-gate ngeteblk(long bsize)
7557c478bd9Sstevel@tonic-gate {
7567c478bd9Sstevel@tonic-gate 	struct buf *bp;
7577c478bd9Sstevel@tonic-gate 
7587c478bd9Sstevel@tonic-gate 	bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
7597c478bd9Sstevel@tonic-gate 	bioinit(bp);
7607c478bd9Sstevel@tonic-gate 	bp->av_forw = bp->av_back = NULL;
7617c478bd9Sstevel@tonic-gate 	bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
7627c478bd9Sstevel@tonic-gate 	bp->b_bufsize = bsize;
7637c478bd9Sstevel@tonic-gate 	bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
7647c478bd9Sstevel@tonic-gate 	bp->b_dev = (o_dev_t)NODEV;
7657c478bd9Sstevel@tonic-gate 	bp->b_edev = NODEV;
7667c478bd9Sstevel@tonic-gate 	bp->b_lblkno = 0;
7677c478bd9Sstevel@tonic-gate 	bp->b_bcount = bsize;
7687c478bd9Sstevel@tonic-gate 	bp->b_iodone = NULL;
7697c478bd9Sstevel@tonic-gate 	return (bp);
7707c478bd9Sstevel@tonic-gate }
7717c478bd9Sstevel@tonic-gate 
/*
 * Interface of geteblk() is kept intact to maintain driver compatibility.
 * Use ngeteblk() to allocate block size other than 1 KB.
 */
struct buf *
geteblk(void)
{
	/* Historical fixed size: one kilobyte. */
	return (ngeteblk(1024L));
}
7817c478bd9Sstevel@tonic-gate 
/*
 * Return a buffer w/o sleeping
 *
 * Non-blocking lookup of the cached buffer for (dev, blkno).  Returns
 * the buffer with b_sem held only when all of the following hold
 * without blocking: the hash lock is free, the buffer's b_sem can be
 * acquired with sema_tryp(), and the buffer has valid contents
 * (B_DONE).  In every other case NULL is returned.
 */
struct buf *
trygetblk(dev_t dev, daddr_t blkno)
{
	struct buf	*bp;
	struct buf	*dp;
	struct hbuf	*hp;
	kmutex_t	*hmp;
	uint_t		index;

	index = bio_bhash(dev, blkno);
	hp = &hbuf[index];
	hmp = &hp->b_lock;

	/* Never block on the hash lock; give up immediately. */
	if (!mutex_tryenter(hmp))
		return (NULL);

	dp = (struct buf *)hp;
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		/*
		 * Get access to a valid buffer without sleeping
		 */
		if (sema_tryp(&bp->b_sem)) {
			if (bp->b_flags & B_DONE) {
				/* Pull it off the freelist and hand it back. */
				hp->b_length--;
				notavail(bp);
				mutex_exit(hmp);
				return (bp);
			} else {
				/* Contents not valid; release and give up. */
				sema_v(&bp->b_sem);
				break;
			}
		}
		/* Buffer is locked by someone else; give up. */
		break;
	}
	mutex_exit(hmp);
	return (NULL);
}
8257c478bd9Sstevel@tonic-gate 
8267c478bd9Sstevel@tonic-gate /*
8277c478bd9Sstevel@tonic-gate  * Wait for I/O completion on the buffer; return errors
8287c478bd9Sstevel@tonic-gate  * to the user.
8297c478bd9Sstevel@tonic-gate  */
8307c478bd9Sstevel@tonic-gate int
iowait(struct buf * bp)8317c478bd9Sstevel@tonic-gate iowait(struct buf *bp)
8327c478bd9Sstevel@tonic-gate {
8337c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
8347c478bd9Sstevel@tonic-gate 	return (biowait(bp));
8357c478bd9Sstevel@tonic-gate }
8367c478bd9Sstevel@tonic-gate 
8377c478bd9Sstevel@tonic-gate /*
8387c478bd9Sstevel@tonic-gate  * Mark I/O complete on a buffer, release it if I/O is asynchronous,
8397c478bd9Sstevel@tonic-gate  * and wake up anyone waiting for it.
8407c478bd9Sstevel@tonic-gate  */
8417c478bd9Sstevel@tonic-gate void
iodone(struct buf * bp)8427c478bd9Sstevel@tonic-gate iodone(struct buf *bp)
8437c478bd9Sstevel@tonic-gate {
8447c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
8457c478bd9Sstevel@tonic-gate 	(void) biodone(bp);
8467c478bd9Sstevel@tonic-gate }
8477c478bd9Sstevel@tonic-gate 
8487c478bd9Sstevel@tonic-gate /*
8497c478bd9Sstevel@tonic-gate  * Zero the core associated with a buffer.
8507c478bd9Sstevel@tonic-gate  */
8517c478bd9Sstevel@tonic-gate void
clrbuf(struct buf * bp)8527c478bd9Sstevel@tonic-gate clrbuf(struct buf *bp)
8537c478bd9Sstevel@tonic-gate {
8547c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
8557c478bd9Sstevel@tonic-gate 	bzero(bp->b_un.b_addr, bp->b_bcount);
8567c478bd9Sstevel@tonic-gate 	bp->b_resid = 0;
8577c478bd9Sstevel@tonic-gate }
8587c478bd9Sstevel@tonic-gate 
8597c478bd9Sstevel@tonic-gate 
/*
 * Make sure all write-behind blocks on dev (or NODEV for all)
 * are flushed out.
 *
 * Serializes with other flushes/invalidates via blist_lock and the
 * bio_doingflush/bio_doinginval counters, gathers candidate B_DELWRI
 * buffers onto a private list threaded through b_list, then writes
 * them back asynchronously outside the hash locks.
 */
void
bflush(dev_t dev)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	struct buf *delwri_list = EMPTY_LIST;
	int i, index;
	kmutex_t *hmp;

	mutex_enter(&blist_lock);
	/*
	 * Wait for any invalidates or flushes ahead of us to finish.
	 * We really could split blist_lock up per device for better
	 * parallelism here.
	 */
	while (bio_doinginval || bio_doingflush) {
		bio_flinv_cv_wanted = 1;
		cv_wait(&bio_flushinval_cv, &blist_lock);
	}
	bio_doingflush++;
	/*
	 * Gather all B_DELWRI buffer for device.
	 * Lock ordering is b_sem > hash lock (brelse).
	 * Since we are finding the buffer via the delayed write list,
	 * it may be busy and we would block trying to get the
	 * b_sem lock while holding hash lock. So transfer all the
	 * candidates on the delwri_list and then drop the hash locks.
	 */
	for (i = 0; i < v.v_hbuf; i++) {
		hmp = &hbuf[i].b_lock;
		dp = (struct buf *)&dwbuf[i];
		mutex_enter(hmp);
		for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
			if (dev == NODEV || bp->b_edev == dev) {
				/* b_list == NULL means not yet gathered */
				if (bp->b_list == NULL) {
					bp->b_list = delwri_list;
					delwri_list = bp;
				}
			}
		}
		mutex_exit(hmp);
	}
	mutex_exit(&blist_lock);

	/*
	 * Now that the hash locks have been dropped grab the semaphores
	 * and write back all the buffers that have B_DELWRI set.
	 */
	while (delwri_list != EMPTY_LIST) {
		bp = delwri_list;

		sema_p(&bp->b_sem);	/* may block */
		/*
		 * Skip buffers that were reassigned to another device
		 * while unlocked, and busy buffers when panicking.
		 */
		if ((dev != bp->b_edev && dev != NODEV) ||
		    (panicstr && bp->b_flags & B_BUSY)) {
			sema_v(&bp->b_sem);
			delwri_list = bp->b_list;
			bp->b_list = NULL;
			continue;	/* No longer a candidate */
		}
		if (bp->b_flags & B_DELWRI) {
			index = bio_bhash(bp->b_edev, bp->b_blkno);
			hp = &hbuf[index];
			hmp = &hp->b_lock;
			dp = (struct buf *)hp;

			bp->b_flags |= B_ASYNC;
			mutex_enter(hmp);
			hp->b_length--;
			notavail(bp);
			mutex_exit(hmp);
			if (bp->b_vp == NULL) {		/* !ufs */
				BWRITE(bp);
			} else {			/* ufs */
				UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
			}
		} else {
			sema_v(&bp->b_sem);
		}
		delwri_list = bp->b_list;
		bp->b_list = NULL;
	}
	/* Drop our flush claim and wake any waiting flush/invalidate. */
	mutex_enter(&blist_lock);
	bio_doingflush--;
	if (bio_flinv_cv_wanted) {
		bio_flinv_cv_wanted = 0;
		cv_broadcast(&bio_flushinval_cv);
	}
	mutex_exit(&blist_lock);
}
9537c478bd9Sstevel@tonic-gate 
/*
 * Ensure that a specified block is up-to-date on disk.
 *
 * Looks up the cached buffer for (dev, blkno); if it is still valid
 * and delayed-write (B_DELWRI), writes it out.  The lookup is done
 * under the hash lock but the buffer is re-validated after b_sem is
 * acquired, since its identity may change while unlocked.
 */
void
blkflush(dev_t dev, daddr_t blkno)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	struct buf *sbp = NULL;	/* matching buffer, if found */
	uint_t index;
	kmutex_t *hmp;

	index = bio_bhash(dev, blkno);
	hp    = &hbuf[index];
	dp    = (struct buf *)hp;
	hmp   = &hp->b_lock;

	/*
	 * Identify the buffer in the cache belonging to
	 * this device and blkno (if any).
	 */
	mutex_enter(hmp);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		sbp = bp;
		break;
	}
	mutex_exit(hmp);
	if (sbp == NULL)
		return;
	/*
	 * Now check the buffer we have identified and
	 * make sure it still belongs to the device and is B_DELWRI
	 */
	sema_p(&sbp->b_sem);
	if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
	    (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
		/* Take it off the freelist before writing. */
		mutex_enter(hmp);
		hp->b_length--;
		notavail(sbp);
		mutex_exit(hmp);
		/*
		 * XXX - There is nothing to guarantee a synchronous
		 * write here if the B_ASYNC flag is set.  This needs
		 * some investigation.
		 */
		if (sbp->b_vp == NULL) {		/* !ufs */
			BWRITE(sbp);	/* synchronous write */
		} else {				/* ufs */
			UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
		}
	} else {
		sema_v(&sbp->b_sem);
	}
}
10117c478bd9Sstevel@tonic-gate 
/*
 * Same as binval, except can force-invalidate delayed-write buffers
 * (which are not be already flushed because of device errors).  Also
 * makes sure that the retry write flag is cleared.
 *
 * Returns 0 on success, or EIO when a B_DELWRI buffer for the device
 * could not be invalidated (only possible when 'force' is zero).
 */
int
bfinval(dev_t dev, int force)
{
	struct buf *dp;
	struct buf *bp;
	struct buf *binval_list = EMPTY_LIST;
	int i, error = 0;
	kmutex_t *hmp;
	uint_t index;
	struct buf **backp;

	mutex_enter(&blist_lock);
	/*
	 * Wait for any flushes ahead of us to finish, it's ok to
	 * do invalidates in parallel.
	 */
	while (bio_doingflush) {
		bio_flinv_cv_wanted = 1;
		cv_wait(&bio_flushinval_cv, &blist_lock);
	}
	bio_doinginval++;

	/* Gather bp's */
	for (i = 0; i < v.v_hbuf; i++) {
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			if (bp->b_edev == dev) {
				/* b_list == NULL means not yet gathered */
				if (bp->b_list == NULL) {
					bp->b_list = binval_list;
					binval_list = bp;
				}
			}
		}
		mutex_exit(hmp);
	}
	mutex_exit(&blist_lock);

	/* Invalidate all bp's found */
	while (binval_list != EMPTY_LIST) {
		bp = binval_list;

		sema_p(&bp->b_sem);
		/* Re-check identity; buffer may have been reassigned. */
		if (bp->b_edev == dev) {
			if (force && (bp->b_flags & B_DELWRI)) {
				/* clear B_DELWRI, move to non-dw freelist */
				index = bio_bhash(bp->b_edev, bp->b_blkno);
				hmp = &hbuf[index].b_lock;
				dp = (struct buf *)&hbuf[index];
				mutex_enter(hmp);

				/* remove from delayed write freelist */
				notavail(bp);

				/* add to B_AGE side of non-dw freelist */
				backp = &dp->av_forw;
				(*backp)->av_back = bp;
				bp->av_forw = *backp;
				*backp = bp;
				bp->av_back = dp;

				/*
				 * make sure write retries and busy are cleared
				 */
				bp->b_flags &=
				    ~(B_BUSY | B_DELWRI | B_RETRYWRI);
				mutex_exit(hmp);
			}
			if ((bp->b_flags & B_DELWRI) == 0)
				bp->b_flags |= B_STALE|B_AGE;
			else
				error = EIO;	/* still dirty; can't invalidate */
		}
		sema_v(&bp->b_sem);
		binval_list = bp->b_list;
		bp->b_list = NULL;
	}
	/* Drop our invalidate claim and wake any waiting flush. */
	mutex_enter(&blist_lock);
	bio_doinginval--;
	if (bio_flinv_cv_wanted) {
		cv_broadcast(&bio_flushinval_cv);
		bio_flinv_cv_wanted = 0;
	}
	mutex_exit(&blist_lock);
	return (error);
}
11057c478bd9Sstevel@tonic-gate 
11067c478bd9Sstevel@tonic-gate /*
11077c478bd9Sstevel@tonic-gate  * If possible, invalidate blocks for a dev on demand
11087c478bd9Sstevel@tonic-gate  */
11097c478bd9Sstevel@tonic-gate void
binval(dev_t dev)11107c478bd9Sstevel@tonic-gate binval(dev_t dev)
11117c478bd9Sstevel@tonic-gate {
11127c478bd9Sstevel@tonic-gate 	(void) bfinval(dev, 0);
11137c478bd9Sstevel@tonic-gate }
11147c478bd9Sstevel@tonic-gate 
11157c478bd9Sstevel@tonic-gate /*
11167c478bd9Sstevel@tonic-gate  * Initialize the buffer I/O system by freeing
11177c478bd9Sstevel@tonic-gate  * all buffers and setting all device hash buffer lists to empty.
11187c478bd9Sstevel@tonic-gate  */
11197c478bd9Sstevel@tonic-gate void
binit(void)11207c478bd9Sstevel@tonic-gate binit(void)
11217c478bd9Sstevel@tonic-gate {
11227c478bd9Sstevel@tonic-gate 	struct buf *bp;
11237c478bd9Sstevel@tonic-gate 	unsigned int i, pct;
11247c478bd9Sstevel@tonic-gate 	ulong_t	bio_max_hwm, bio_default_hwm;
11257c478bd9Sstevel@tonic-gate 
11267c478bd9Sstevel@tonic-gate 	/*
11277c478bd9Sstevel@tonic-gate 	 * Maximum/Default values for bufhwm are set to the smallest of:
11287c478bd9Sstevel@tonic-gate 	 *	- BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
11297c478bd9Sstevel@tonic-gate 	 *	- 1/4 of kernel virtual memory
11307c478bd9Sstevel@tonic-gate 	 *	- INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
11317c478bd9Sstevel@tonic-gate 	 * Additionally, in order to allow simple tuning by percentage of
11327c478bd9Sstevel@tonic-gate 	 * physical memory, bufhwm_pct is used to calculate the default if
11337c478bd9Sstevel@tonic-gate 	 * the value of this tunable is between 0 and BIO_MAX_PERCENT.
11347c478bd9Sstevel@tonic-gate 	 *
11357c478bd9Sstevel@tonic-gate 	 * Since the unit for v.v_bufhwm is kilobytes, this allows for
11367c478bd9Sstevel@tonic-gate 	 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
11377c478bd9Sstevel@tonic-gate 	 */
11387c478bd9Sstevel@tonic-gate 	bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
11397c478bd9Sstevel@tonic-gate 	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
11407c478bd9Sstevel@tonic-gate 	bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);
11417c478bd9Sstevel@tonic-gate 
11427c478bd9Sstevel@tonic-gate 	pct = BIO_BUF_PERCENT;
11437c478bd9Sstevel@tonic-gate 	if (bufhwm_pct != 0 &&
11447c478bd9Sstevel@tonic-gate 	    ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
11457c478bd9Sstevel@tonic-gate 		pct = BIO_BUF_PERCENT;
11467c478bd9Sstevel@tonic-gate 		/*
11477c478bd9Sstevel@tonic-gate 		 * Invalid user specified value, emit a warning.
11487c478bd9Sstevel@tonic-gate 		 */
11497c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of \
1150d3d50737SRafael Vanoni 		    range(1..%d). Using %d as default.",
1151d3d50737SRafael Vanoni 		    bufhwm_pct,
1152d3d50737SRafael Vanoni 		    100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
11537c478bd9Sstevel@tonic-gate 	}
11547c478bd9Sstevel@tonic-gate 
11557c478bd9Sstevel@tonic-gate 	bio_default_hwm = MIN(physmem / pct,
11567c478bd9Sstevel@tonic-gate 	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
11577c478bd9Sstevel@tonic-gate 	bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);
11587c478bd9Sstevel@tonic-gate 
11597c478bd9Sstevel@tonic-gate 	if ((v.v_bufhwm = bufhwm) == 0)
11607c478bd9Sstevel@tonic-gate 		v.v_bufhwm = bio_default_hwm;
11617c478bd9Sstevel@tonic-gate 
11627c478bd9Sstevel@tonic-gate 	if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
11637c478bd9Sstevel@tonic-gate 		v.v_bufhwm = (int)bio_max_hwm;
11647c478bd9Sstevel@tonic-gate 		/*
11657c478bd9Sstevel@tonic-gate 		 * Invalid user specified value, emit a warning.
11667c478bd9Sstevel@tonic-gate 		 */
11677c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN,
1168d3d50737SRafael Vanoni 		    "binit: bufhwm(%d) out \
1169d3d50737SRafael Vanoni 		    of range(%d..%lu). Using %lu as default",
1170d3d50737SRafael Vanoni 		    bufhwm,
1171d3d50737SRafael Vanoni 		    BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
11727c478bd9Sstevel@tonic-gate 	}
11737c478bd9Sstevel@tonic-gate 
11747c478bd9Sstevel@tonic-gate 	/*
11757c478bd9Sstevel@tonic-gate 	 * Determine the number of hash buckets. Default is to
11767c478bd9Sstevel@tonic-gate 	 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
11777c478bd9Sstevel@tonic-gate 	 * Round up number to the next power of 2.
11787c478bd9Sstevel@tonic-gate 	 */
11797c478bd9Sstevel@tonic-gate 	v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
1180