common/os/bio.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2011 Joyent, Inc.  All rights reserved.
 */

/*
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/sysmacros.h>
#include <sys/conf.h>
#include <sys/cpuvar.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/buf.h>
#include <sys/var.h>
#include <sys/vnode.h>
#include <sys/bitmap.h>
#include <sys/cmn_err.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/atomic.h>
#include <vm/seg_kmem.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <sys/vtrace.h>
#include <sys/tnf_probe.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_log.h>
#include <sys/systm.h>
#include <sys/vfs.h>
#include <sys/sdt.h>

/* Locks */
static	kmutex_t	blist_lock;	/* protects b_list */
static	kmutex_t	bhdr_lock;	/* protects the bhdrlist */
static	kmutex_t	bfree_lock;	/* protects the bfreelist structure */

struct hbuf	*hbuf;			/* Hash buckets */
struct dwbuf	*dwbuf;			/* Delayed write buckets */
static struct buf *bhdrlist;		/* buf header free list */
static int	nbuf;			/* number of buffer headers allocated */

static int	lastindex;		/* Reference point on where to start */
					/* when looking for free buffers */

#define	bio_bhash(dev, bn)	(hash2ints((dev), (int)(bn)) & v.v_hmask)
#define	EMPTY_LIST	((struct buf *)-1)

static kcondvar_t	bio_mem_cv;	/* Condition variables */
static kcondvar_t	bio_flushinval_cv;
static int	bio_doingflush;		/* flush in progress */
static int	bio_doinginval;		/* inval in progress */
static int	bio_flinv_cv_wanted;	/* someone waiting for cv */

/*
 * Statistics on the buffer cache
 */
struct biostats biostats = {
	{ "buffer_cache_lookups",		KSTAT_DATA_UINT32 },
	{ "buffer_cache_hits",			KSTAT_DATA_UINT32 },
	{ "new_buffer_requests",		KSTAT_DATA_UINT32 },
	{ "waits_for_buffer_allocs",		KSTAT_DATA_UINT32 },
	{ "buffers_locked_by_someone",		KSTAT_DATA_UINT32 },
	{ "duplicate_buffers_found",		KSTAT_DATA_UINT32 }
};

/*
 * kstat data
 */
kstat_named_t	*biostats_ptr = (kstat_named_t *)&biostats;
uint_t		biostats_ndata = (uint_t)(sizeof (biostats) /
					sizeof (kstat_named_t));

/*
 * Statistics on ufs buffer cache
 * Not protected by locks
 */
struct ufsbiostats ub = {
	{ "breads",			KSTAT_DATA_UINT32 },
	{ "bwrites",			KSTAT_DATA_UINT32 },
	{ "fbiwrites",			KSTAT_DATA_UINT32 },
	{ "getpages",			KSTAT_DATA_UINT32 },
	{ "getras",			KSTAT_DATA_UINT32 },
	{ "putsyncs",			KSTAT_DATA_UINT32 },
	{ "putasyncs",			KSTAT_DATA_UINT32 },
	{ "putpageios",			KSTAT_DATA_UINT32 },
};

/*
 * more UFS Logging eccentricities...
 *
 * required since "#pragma weak ..." doesn't work in reverse order.
 * i.e.:  genunix (bio.c) is loaded before the ufs modules and pointers
 *        to ufs routines don't get plugged into bio.c calls so
 *        we initialize it when setting up the "lufsops" table
 *        in "lufs.c:_init()"
 */
void (*bio_lufs_strategy)(void *, buf_t *);
void (*bio_snapshot_strategy)(void *, buf_t *);


/* Private routines */
static struct buf	*bio_getfreeblk(long);
static void		bio_mem_get(long);
static void		bio_bhdr_free(struct buf *);
static struct buf	*bio_bhdr_alloc(void);
static void		bio_recycle(int, long);
static void		bio_pageio_done(struct buf *);
static int		bio_incore(dev_t, daddr_t);

/*
 * Buffer cache constants
 */
#define	BIO_BUF_PERCENT	(100/2)		/* default: 2% of memory */
#define	BIO_MAX_PERCENT	(100/20)	/* max is 20% of real memory */
#define	BIO_BHDR_POOL	100		/* Default bhdr pool size */
#define	BIO_MIN_HDR	10		/* Minimum number of buffer headers */
#define	BIO_MIN_HWM	(BIO_MIN_HDR * MAXBSIZE / 1024)
#define	BIO_HASHLEN	4		/* Target length of hash chains */


/* Flags for bio_recycle() */
#define	BIO_HEADER	0x01
#define	BIO_MEM		0x02

extern	int bufhwm;		/* User tunable - high water mark for mem  */
extern	int bufhwm_pct;		/* ditto - given in % of physmem  */

/*
 * The following routines allocate and free
 * buffers with various side effects.  In general the
 * arguments to an allocate routine are a device and
 * a block number, and the value is a pointer to
 * to the buffer header; the buffer returned is locked with a
 * binary semaphore so that no one else can touch it. If the block was
 * already in core, no I/O need be done; if it is
 * already locked, the process waits until it becomes free.
 * The following routines allocate a buffer:
 *	getblk
 *	bread/BREAD
 *	breada
 * Eventually the buffer must be released, possibly with the
 * side effect of writing it out, by using one of
 *	bwrite/BWRITE/brwrite
 *	bdwrite/bdrwrite
 *	bawrite
 *	brelse
 *
 * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
 * Instead, a binary semaphore, b_sem is used to gain exclusive access to
 * a buffer and a binary semaphore, b_io is used for I/O synchronization.
 * B_DONE is still used to denote a buffer with I/O complete on it.
 *
 * The bfreelist.b_bcount field is computed everytime fsflush runs. It is
 * should not be used where a very accurate count of the free buffers is
 * needed.
 */

/*
 * Read in (if necessary) the block and return a buffer pointer.
 *
 * This interface is provided for binary compatibility.  Using
 * BREAD() directly avoids the extra function call overhead invoked
 * by calling this routine.
 */
struct buf *
bread(dev_t dev, daddr_t blkno, long bsize)
{
	return (BREAD(dev, blkno, bsize));
}

/*
 * Common code for reading a buffer with various options
 *
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
{
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
	struct buf *bp;
	klwp_t *lwp = ttolwp(curthread);

	CPU_STATS_ADD_K(sys, lread, 1);
	bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1);
	if (bp->b_flags & B_DONE)
		return (bp);
	bp->b_flags |= B_READ;
	ASSERT(bp->b_bcount == bsize);
	if (ufsvfsp == NULL) {					/* !ufs */
		(void) bdev_strategy(bp);
	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
							/* ufs && logging */
		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
							/* ufs && snapshots */
		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
	} else {
		ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
		ub.ub_breads.value.ul++;		/* ufs && !logging */
		(void) bdev_strategy(bp);
	}
	if (lwp != NULL)
		lwp->lwp_ru.inblock++;
	CPU_STATS_ADD_K(sys, bread, 1);
	(void) biowait(bp);
	return (bp);
}

/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 */
struct buf *
breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
{
	struct buf *bp, *rabp;
	klwp_t *lwp = ttolwp(curthread);

	bp = NULL;
	if (!bio_incore(dev, blkno)) {
		CPU_STATS_ADD_K(sys, lread, 1);
		bp = GETBLK(dev, blkno, bsize);
		if ((bp->b_flags & B_DONE) == 0) {
			bp->b_flags |= B_READ;
			bp->b_bcount = bsize;
			(void) bdev_strategy(bp);
			if (lwp != NULL)
				lwp->lwp_ru.inblock++;
			CPU_STATS_ADD_K(sys, bread, 1);
		}
	}
	if (rablkno && bfreelist.b_bcount > 1 &&
	    !bio_incore(dev, rablkno)) {
		rabp = GETBLK(dev, rablkno, bsize);
		if (rabp->b_flags & B_DONE)
			brelse(rabp);
		else {
			rabp->b_flags |= B_READ|B_ASYNC;
			rabp->b_bcount = bsize;
			(void) bdev_strategy(rabp);
			if (lwp != NULL)
				lwp->lwp_ru.inblock++;
			CPU_STATS_ADD_K(sys, bread, 1);
		}
	}
	if (bp == NULL)
		return (BREAD(dev, blkno, bsize));
	(void) biowait(bp);
	return (bp);
}

/*
 * Common code for writing a buffer with various options.
 *
 * force_wait  - wait for write completion regardless of B_ASYNC flag
 * do_relse    - release the buffer when we are done
 * clear_flags - flags to clear from the buffer
 */
void
bwrite_common(void *arg, struct buf *bp, int force_wait,
    int do_relse, int clear_flags)
{
	register int do_wait;
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
	int flag;
	klwp_t *lwp = ttolwp(curthread);
	struct cpu *cpup;

	ASSERT(SEMA_HELD(&bp->b_sem));
	flag = bp->b_flags;
	bp->b_flags &= ~clear_flags;
	if (lwp != NULL)
		lwp->lwp_ru.oublock++;
	CPU_STATS_ENTER_K();
	cpup = CPU;		/* get pointer AFTER preemption is disabled */
	CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
	CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
	do_wait = ((flag & B_ASYNC) == 0 || force_wait);
	if (do_wait == 0)
		CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
	CPU_STATS_EXIT_K();
	if (ufsvfsp == NULL) {
		(void) bdev_strategy(bp);
	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
							/* ufs && logging */
		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
							/* ufs && snapshots */
		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
	} else {
		ub.ub_bwrites.value.ul++;		/* ufs && !logging */
		(void) bdev_strategy(bp);
	}
	if (do_wait) {
		(void) biowait(bp);
		if (do_relse) {
			brelse(bp);
		}
	}
}

/*
 * Write the buffer, waiting for completion (unless B_ASYNC is set).
 * Then release the buffer.
 * This interface is provided for binary compatibility.  Using
 * BWRITE() directly avoids the extra function call overhead invoked
 * by calling this routine.
 */
void
bwrite(struct buf *bp)
{
	BWRITE(bp);
}

/*
 * Write the buffer, waiting for completion.
 * But don't release the buffer afterwards.
 * This interface is provided for binary compatibility.  Using
 * BWRITE2() directly avoids the extra function call overhead.
 */
void
bwrite2(struct buf *bp)
{
	BWRITE2(bp);
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * Also save the time that the block is first marked as delayed
 * so that it will be written in a reasonable time.
 */
void
bdwrite(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	CPU_STATS_ADD_K(sys, lwrite, 1);
	if ((bp->b_flags & B_DELWRI) == 0)
		bp->b_start = ddi_get_lbolt();
	/*
	 * B_DONE allows others to use the buffer, B_DELWRI causes the
	 * buffer to be written before being reused, and setting b_resid
	 * to zero says the buffer is complete.
	 */
	bp->b_flags |= B_DELWRI | B_DONE;
	bp->b_resid = 0;
	brelse(bp);
}

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
void
bawrite(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));

	/* Use bfreelist.b_bcount as a weird-ass heuristic */
	if (bfreelist.b_bcount > 4)
		bp->b_flags |= B_ASYNC;
	BWRITE(bp);
}

/*
 * Release the buffer, with no I/O implied.
 */
void
brelse(struct buf *bp)
{
	struct buf	**backp;
	uint_t		index;
	kmutex_t	*hmp;
	struct	buf	*dp;
	struct	hbuf	*hp;


	ASSERT(SEMA_HELD(&bp->b_sem));

	/*
	 * Clear the retry write flag if the buffer was written without
	 * error.  The presence of B_DELWRI means the buffer has not yet
	 * been written and the presence of B_ERROR means that an error
	 * is still occurring.
	 */
	if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
		bp->b_flags &= ~B_RETRYWRI;
	}

	/* Check for anomalous conditions */
	if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
		if (bp->b_flags & B_NOCACHE) {
			/* Don't add to the freelist. Destroy it now */
			kmem_free(bp->b_un.b_addr, bp->b_bufsize);
			sema_destroy(&bp->b_sem);
			sema_destroy(&bp->b_io);
			kmem_free(bp, sizeof (struct buf));
			return;
		}
		/*
		 * If a write failed and we are supposed to retry write,
		 * don't toss the buffer.  Keep it around and mark it
		 * delayed write in the hopes that it will eventually
		 * get flushed (and still keep the system running.)
		 */
		if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
			bp->b_flags |= B_DELWRI;
			/* keep fsflush from trying continuously to flush */
			bp->b_start = ddi_get_lbolt();
		} else
			bp->b_flags |= B_AGE|B_STALE;
		bp->b_flags &= ~B_ERROR;
		bp->b_error = 0;
	}

	/*
	 * If delayed write is set then put in on the delayed
	 * write list instead of the free buffer list.
	 */
	index = bio_bhash(bp->b_edev, bp->b_blkno);
	hmp   = &hbuf[index].b_lock;

	mutex_enter(hmp);
	hp = &hbuf[index];
	dp = (struct buf *)hp;

	/*
	 * Make sure that the number of entries on this list are
	 * Zero <= count <= total # buffers
	 */
	ASSERT(hp->b_length >= 0);
	ASSERT(hp->b_length < nbuf);

	hp->b_length++;		/* We are adding this buffer */

	if (bp->b_flags & B_DELWRI) {
		/*
		 * This buffer goes on the delayed write buffer list
		 */
		dp = (struct buf *)&dwbuf[index];
	}
	ASSERT(bp->b_bufsize > 0);
	ASSERT(bp->b_bcount > 0);
	ASSERT(bp->b_un.b_addr != NULL);

	if (bp->b_flags & B_AGE) {
		backp = &dp->av_forw;
		(*backp)->av_back = bp;
		bp->av_forw = *backp;
		*backp = bp;
		bp->av_back = dp;
	} else {
		backp = &dp->av_back;
		(*backp)->av_forw = bp;
		bp->av_back = *backp;
		*backp = bp;
		bp->av_forw = dp;
	}
	mutex_exit(hmp);

	if (bfreelist.b_flags & B_WANTED) {
		/*
		 * Should come here very very rarely.
		 */
		mutex_enter(&bfree_lock);
		if (bfreelist.b_flags & B_WANTED) {
			bfreelist.b_flags &= ~B_WANTED;
			cv_broadcast(&bio_mem_cv);
		}
		mutex_exit(&bfree_lock);
	}

	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
	/*
	 * Don't let anyone get the buffer off the freelist before we
	 * release our hold on it.
	 */
	sema_v(&bp->b_sem);
}

/*
 * Return a count of the number of B_BUSY buffers in the system
 * Can only be used as a good estimate.  If 'cleanit' is set,
 * try to flush all bufs.
 */
int
bio_busy(int cleanit)
{
	struct buf *bp, *dp;
	int busy = 0;
	int i;
	kmutex_t *hmp;

	for (i = 0; i < v.v_hbuf; i++) {
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			if (bp->b_flags & B_BUSY)
				busy++;
		}
		mutex_exit(hmp);
	}

	if (cleanit && busy != 0) {
		bflush(NODEV);
	}

	return (busy);
}

/*
 * this interface is provided for binary compatibility.
 *
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 */
struct buf *
getblk(dev_t dev, daddr_t blkno, long bsize)
{
	return (getblk_common(/* ufsvfsp */ NULL, dev,
	    blkno, bsize, /* errflg */ 0));
}

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 */
struct buf *
getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, int errflg)
{
	ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
	struct buf *bp;
	struct buf *dp;
	struct buf *nbp = NULL;
	struct buf *errbp;
	uint_t		index;
	kmutex_t	*hmp;
	struct	hbuf	*hp;

	if (getmajor(dev) >= devcnt)
		cmn_err(CE_PANIC, "blkdev");

	biostats.bio_lookup.value.ui32++;

	index = bio_bhash(dev, blkno);
	hp    = &hbuf[index];
	dp    = (struct buf *)hp;
	hmp   = &hp->b_lock;

	mutex_enter(hmp);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		/*
		 * Avoid holding the hash lock in the event that
		 * the buffer is locked by someone. Since the hash chain
		 * may change when we drop the hash lock
		 * we have to start at the beginning of the chain if the
		 * buffer identity/contents aren't valid.
		 */
		if (!sema_tryp(&bp->b_sem)) {
			biostats.bio_bufbusy.value.ui32++;
			mutex_exit(hmp);
			/*
			 * OK, we are dealing with a busy buffer.
			 * In the case that we are panicking and we
			 * got called from bread(), we have some chance
			 * for error recovery. So better bail out from
			 * here since sema_p() won't block. If we got
			 * called directly from ufs routines, there is
			 * no way to report an error yet.
			 */
			if (panicstr && errflg)
				goto errout;
			/*
			 * For the following line of code to work
			 * correctly never kmem_free the buffer "header".
			 */
			sema_p(&bp->b_sem);
			if (bp->b_blkno != blkno || bp->b_edev != dev ||
			    (bp->b_flags & B_STALE)) {
				sema_v(&bp->b_sem);
				mutex_enter(hmp);
				goto loop;	/* start over */
			}
			mutex_enter(hmp);
		}
		/* Found */
		biostats.bio_hit.value.ui32++;
		bp->b_flags &= ~B_AGE;

		/*
		 * Yank it off the free/delayed write lists
		 */
		hp->b_length--;
		notavail(bp);
		mutex_exit(hmp);

		ASSERT((bp->b_flags & B_NOCACHE) == 0);

		if (nbp == NULL) {
			/*
			 * Make the common path short.
			 */
			ASSERT(SEMA_HELD(&bp->b_sem));
			return (bp);
		}

		biostats.bio_bufdup.value.ui32++;

		/*
		 * The buffer must have entered during the lock upgrade
		 * so free the new buffer we allocated and return the
		 * found buffer.
		 */
		kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
		nbp->b_un.b_addr = NULL;

		/*
		 * Account for the memory
		 */
		mutex_enter(&bfree_lock);
		bfreelist.b_bufsize += nbp->b_bufsize;
		mutex_exit(&bfree_lock);

		/*
		 * Destroy buf identity, and place on avail list
		 */
		nbp->b_dev = (o_dev_t)NODEV;
		nbp->b_edev = NODEV;
		nbp->b_flags = 0;
		nbp->b_file = NULL;
		nbp->b_offset = -1;

		sema_v(&nbp->b_sem);
		bio_bhdr_free(nbp);

		ASSERT(SEMA_HELD(&bp->b_sem));
		return (bp);
	}

	/*
	 * bio_getfreeblk may block so check the hash chain again.
	 */
	if (nbp == NULL) {
		mutex_exit(hmp);
		nbp = bio_getfreeblk(bsize);
		mutex_enter(hmp);
		goto loop;
	}

	/*
	 * New buffer. Assign nbp and stick it on the hash.
	 */
	nbp->b_flags = B_BUSY;
	nbp->b_edev = dev;
	nbp->b_dev = (o_dev_t)cmpdev(dev);
	nbp->b_blkno = blkno;
	nbp->b_iodone = NULL;
	nbp->b_bcount = bsize;
	/*
	 * If we are given a ufsvfsp and the vfs_root field is NULL
	 * then this must be I/O for a superblock.  A superblock's
	 * buffer is set up in mountfs() and there is no root vnode
	 * at that point.
	 */
	if (ufsvfsp && ufsvfsp->vfs_root) {
		nbp->b_vp = ufsvfsp->vfs_root;
	} else {
		nbp->b_vp = NULL;
	}

	ASSERT((nbp->b_flags & B_NOCACHE) == 0);

	binshash(nbp, dp);
	mutex_exit(hmp);

	ASSERT(SEMA_HELD(&nbp->b_sem));

	return (nbp);


	/*
	 * Come here in case of an internal error. At this point we couldn't
	 * get a buffer, but we have to return one. Hence we allocate some
	 * kind of error reply buffer on the fly. This buffer is marked as
	 * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
	 *	- B_ERROR will indicate error to the caller.
	 *	- B_DONE will prevent us from reading the buffer from
	 *	  the device.
	 *	- B_NOCACHE will cause that this buffer gets free'd in
	 *	  brelse().
	 */

errout:
	errbp = geteblk();
	sema_p(&errbp->b_sem);
	errbp->b_flags &= ~B_BUSY;
	errbp->b_flags |= (B_ERROR | B_DONE);
	return (errbp);
}

/*
 * Get an empty block, not assigned to any particular device.
 * Returns a locked buffer that is not on any hash or free list.
 */
struct buf *
ngeteblk(long bsize)
{
	struct buf *bp;

	bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
	bioinit(bp);
	bp->av_forw = bp->av_back = NULL;
	bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
	bp->b_bufsize = bsize;
	bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
	bp->b_dev = (o_dev_t)NODEV;
	bp->b_edev = NODEV;
	bp->b_lblkno = 0;
	bp->b_bcount = bsize;
	bp->b_iodone = NULL;
	return (bp);
}

/*
 * Interface of geteblk() is kept intact to maintain driver compatibility.
 * Use ngeteblk() to allocate block size other than 1 KB.
 */
struct buf *
geteblk(void)
{
	return (ngeteblk((long)1024));
}

/*
 * Return a buffer w/o sleeping
 */
struct buf *
trygetblk(dev_t dev, daddr_t blkno)
{
	struct buf	*bp;
	struct buf	*dp;
	struct hbuf	*hp;
	kmutex_t	*hmp;
	uint_t		index;

	index = bio_bhash(dev, blkno);
	hp = &hbuf[index];
	hmp = &hp->b_lock;

	if (!mutex_tryenter(hmp))
		return (NULL);

	dp = (struct buf *)hp;
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		/*
		 * Get access to a valid buffer without sleeping
		 */
		if (sema_tryp(&bp->b_sem)) {
			if (bp->b_flags & B_DONE) {
				hp->b_length--;
				notavail(bp);
				mutex_exit(hmp);
				return (bp);
			} else {
				sema_v(&bp->b_sem);
				break;
			}
		}
		break;
	}
	mutex_exit(hmp);
	return (NULL);
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
int
iowait(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	return (biowait(bp));
}

/*
 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
 * and wake up anyone waiting for it.
 */
void
iodone(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	(void) biodone(bp);
}

/*
 * Zero the core associated with a buffer.
 */
void
clrbuf(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	bzero(bp->b_un.b_addr, bp->b_bcount);
	bp->b_resid = 0;
}


/*
 * Make sure all write-behind blocks on dev (or NODEV for all)
 * are flushed out.
 */
void
bflush(dev_t dev)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	struct buf *delwri_list = EMPTY_LIST;
	int i, index;
	kmutex_t *hmp;

	mutex_enter(&blist_lock);
	/*
	 * Wait for any invalidates or flushes ahead of us to finish.
	 * We really could split blist_lock up per device for better
	 * parallelism here.
	 */
	while (bio_doinginval || bio_doingflush) {
		bio_flinv_cv_wanted = 1;
		cv_wait(&bio_flushinval_cv, &blist_lock);
	}
	bio_doingflush++;
	/*
	 * Gather all B_DELWRI buffer for device.
	 * Lock ordering is b_sem > hash lock (brelse).
	 * Since we are finding the buffer via the delayed write list,
	 * it may be busy and we would block trying to get the
	 * b_sem lock while holding hash lock. So transfer all the
	 * candidates on the delwri_list and then drop the hash locks.
	 */
	for (i = 0; i < v.v_hbuf; i++) {
		hmp = &hbuf[i].b_lock;
		dp = (struct buf *)&dwbuf[i];
		mutex_enter(hmp);
		for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
			if (dev == NODEV || bp->b_edev == dev) {
				if (bp->b_list == NULL) {
					bp->b_list = delwri_list;
					delwri_list = bp;
				}
			}
		}
		mutex_exit(hmp);
	}
	mutex_exit(&blist_lock);

	/*
	 * Now that the hash locks have been dropped grab the semaphores
	 * and write back all the buffers that have B_DELWRI set.
	 */
	while (delwri_list != EMPTY_LIST) {
		bp = delwri_list;

		sema_p(&bp->b_sem);	/* may block */
		if ((dev != bp->b_edev && dev != NODEV) ||
		    (panicstr && bp->b_flags & B_BUSY)) {
			sema_v(&bp->b_sem);
			delwri_list = bp->b_list;
			bp->b_list = NULL;
			continue;	/* No longer a candidate */
		}
		if (bp->b_flags & B_DELWRI) {
			index = bio_bhash(bp->b_edev, bp->b_blkno);
			hp = &hbuf[index];
			hmp = &hp->b_lock;
			dp = (struct buf *)hp;

			bp->b_flags |= B_ASYNC;
			mutex_enter(hmp);
			hp->b_length--;
			notavail(bp);
			mutex_exit(hmp);
			if (bp->b_vp == NULL) {		/* !ufs */
				BWRITE(bp);
			} else {			/* ufs */
				UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
			}
		} else {
			sema_v(&bp->b_sem);
		}
		delwri_list = bp->b_list;
		bp->b_list = NULL;
	}
	mutex_enter(&blist_lock);
	bio_doingflush--;
	if (bio_flinv_cv_wanted) {
		bio_flinv_cv_wanted = 0;
		cv_broadcast(&bio_flushinval_cv);
	}
	mutex_exit(&blist_lock);
}

/*
 * Ensure that a specified block is up-to-date on disk.
 */
void
blkflush(dev_t dev, daddr_t blkno)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	struct buf *sbp = NULL;
	uint_t index;
	kmutex_t *hmp;

	index = bio_bhash(dev, blkno);
	hp    = &hbuf[index];
	dp    = (struct buf *)hp;
	hmp   = &hp->b_lock;

	/*
	 * Identify the buffer in the cache belonging to
	 * this device and blkno (if any).
	 */
	mutex_enter(hmp);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		sbp = bp;
		break;
	}
	mutex_exit(hmp);
	if (sbp == NULL)
		return;
	/*
	 * Now check the buffer we have identified and
	 * make sure it still belongs to the device and is B_DELWRI
	 */
	sema_p(&sbp->b_sem);
	if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
	    (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
		mutex_enter(hmp);
		hp->b_length--;
		notavail(sbp);
		mutex_exit(hmp);
		/*
		 * XXX - There is nothing to guarantee a synchronous
		 * write here if the B_ASYNC flag is set.  This needs
		 * some investigation.
		 */
		if (sbp->b_vp == NULL) {		/* !ufs */
			BWRITE(sbp);	/* synchronous write */
		} else {				/* ufs */
			UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
		}
	} else {
		sema_v(&sbp->b_sem);
	}
}

/*
 * Same as binval, except can force-invalidate delayed-write buffers
 * (which are not be already flushed because of device errors).  Also
 * makes sure that the retry write flag is cleared.
 */
int
bfinval(dev_t dev, int force)
{
	struct buf *dp;
	struct buf *bp;
	struct buf *binval_list = EMPTY_LIST;
	int i, error = 0;
	kmutex_t *hmp;
	uint_t index;
	struct buf **backp;

	mutex_enter(&blist_lock);
	/*
	 * Wait for any flushes ahead of us to finish, it's ok to
	 * do invalidates in parallel.
	 */
	while (bio_doingflush) {
		bio_flinv_cv_wanted = 1;
		cv_wait(&bio_flushinval_cv, &blist_lock);
	}
	bio_doinginval++;

	/* Gather bp's */
	for (i = 0; i < v.v_hbuf; i++) {
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			if (bp->b_edev == dev) {
				if (bp->b_list == NULL) {
					bp->b_list = binval_list;
					binval_list = bp;
				}
			}
		}
		mutex_exit(hmp);
	}
	mutex_exit(&blist_lock);

	/* Invalidate all bp's found */
	while (binval_list != EMPTY_LIST) {
		bp = binval_list;

		sema_p(&bp->b_sem);
		if (bp->b_edev == dev) {
			if (force && (bp->b_flags & B_DELWRI)) {
				/* clear B_DELWRI, move to non-dw freelist */
				index = bio_bhash(bp->b_edev, bp->b_blkno);
				hmp = &hbuf[index].b_lock;
				dp = (struct buf *)&hbuf[index];
				mutex_enter(hmp);

				/* remove from delayed write freelist */
				notavail(bp);

				/* add to B_AGE side of non-dw freelist */
				backp = &dp->av_forw;
				(*backp)->av_back = bp;
				bp->av_forw = *backp;
				*backp = bp;
				bp->av_back = dp;

				/*
				 * make sure write retries and busy are cleared
				 */
				bp->b_flags &=
				    ~(B_BUSY | B_DELWRI | B_RETRYWRI);
				mutex_exit(hmp);
			}
			if ((bp->b_flags & B_DELWRI) == 0)
				bp->b_flags |= B_STALE|B_AGE;
			else
				error = EIO;
		}
		sema_v(&bp->b_sem);
		binval_list = bp->b_list;
		bp->b_list = NULL;
	}
	mutex_enter(&blist_lock);
	bio_doinginval--;
	if (bio_flinv_cv_wanted) {
		cv_broadcast(&bio_flushinval_cv);
		bio_flinv_cv_wanted = 0;
	}
	mutex_exit(&blist_lock);
	return (error);
}

/*
 * If possible, invalidate blocks for a dev on demand
 */
void
binval(dev_t dev)
{
	(void) bfinval(dev, 0);
}

/*
 * Initialize the buffer I/O system by freeing
 * all buffers and setting all device hash buffer lists to empty.
 */
void
binit(void)
{
	struct buf *bp;
	unsigned int i, pct;
	ulong_t	bio_max_hwm, bio_default_hwm;

	/*
	 * Maximum/Default values for bufhwm are set to the smallest of:
	 *	- BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
	 *	- 1/4 of kernel virtual memory
	 *	- INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
	 * Additionally, in order to allow simple tuning by percentage of
	 * physical memory, bufhwm_pct is used to calculate the default if
	 * the value of this tunable is between 0 and BIO_MAX_PERCENT.
	 *
	 * Since the unit for v.v_bufhwm is kilobytes, this allows for
	 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
	 */
	bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
	bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);

	pct = BIO_BUF_PERCENT;
	if (bufhwm_pct != 0 &&
	    ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
		pct = BIO_BUF_PERCENT;
		/*
		 * Invalid user specified value, emit a warning.
		 */
		cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of \
		    range(1..%d). Using %d as default.",
		    bufhwm_pct,
		    100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
	}

	bio_default_hwm = MIN(physmem / pct,
	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
	bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);

	if ((v.v_bufhwm = bufhwm) == 0)
		v.v_bufhwm = bio_default_hwm;

	if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
		v.v_bufhwm = (int)bio_max_hwm;
		/*
		 * Invalid user specified value, emit a warning.
		 */
		cmn_err(CE_WARN,
		    "binit: bufhwm(%d) out \
		    of range(%d..%lu). Using %lu as default",
		    bufhwm,
		    BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
	}

	/*
	 * Determine the number of hash buckets. Default is to
	 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
	 * Round up number to the next power of 2.
	 */
	v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
	    BIO_HASHLEN);
	v.v_hmask = v.v_hbuf - 1;
	v.v_buf = BIO_BHDR_POOL;

	hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);

	dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);

	bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;
	bp = &bfreelist;
	bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;

	for (i = 0; i < v.v_hbuf; i++) {
		hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
		hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];

		/*
		 * Initialize the delayed write buffer list.
		 */
		dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
		dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
	}
}

/*
 * Wait for I/O completion on the buffer; return error code.
 * If bp was for synchronous I/O, bp is invalid and associated
 * resources are freed on return.
 */
int
biowait(struct buf *bp)
{
	int error = 0;
	struct cpu *cpup;

	ASSERT(SEMA_HELD(&bp->b_sem));

	cpup = CPU;
	atomic_inc_64(&cpup->cpu_stats.sys.iowait);
	DTRACE_IO1(wait__start, struct buf *, bp);

	/*
	 * In case of panic, busy wait for completion
	 */
	if (panicstr) {
		while ((bp->b_flags & B_DONE) == 0)
			drv_usecwait(10);
	} else
		sema_p(&bp->b_io);

	DTRACE_IO1(wait__done, struct buf *, bp);
	atomic_dec_64(&cpup->cpu_stats.sys.iowait);

	error = geterror(bp);
	if ((bp->b_flags & B_ASYNC) == 0) {
		if (bp->b_flags & B_REMAPPED)
			bp_mapout(bp);
	}
	return (error);
}

static void
biodone_tnf_probe(struct buf *bp)
{
	/* Kernel probe */
	TNF_PROBE_3(biodone, "io blockio", /* CSTYLED */,
	    tnf_device,		device,		bp->b_edev,
	    tnf_diskaddr,	block,		bp->b_lblkno,
	    tnf_opaque,		buf,		bp);
}

/*
 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
 * and wake up anyone waiting for it.
 */
void
biodone(struct buf *bp)
{
	if (bp->b_flags & B_STARTED) {
		DTRACE_IO1(done, struct buf *, bp);
		bp->b_flags &= ~B_STARTED;
	}

	/*
	 * Call the TNF probe here instead of the inline code
	 * to force our compiler to use the tail call optimization.
	 */
	biodone_tnf_probe(bp);

	if (bp->b_iodone != NULL) {
		(*(bp->b_iodone))(bp);
		return;
	}
	ASSERT((bp->b_flags & B_DONE) == 0);
	ASSERT(SEMA_HELD(&bp->b_sem));
	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_ASYNC) {
		if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
			bio_pageio_done(bp);
		else
			brelse(bp);	/* release bp to freelist */
	} else {
		sema_v(&bp->b_io);
	}
}

/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0 set a generalized code.
 */
int
geterror(struct buf *bp)
{
	int error = 0;

	ASSERT(SEMA_HELD(&bp->b_sem));
	if (bp->b_flags & B_ERROR) {
		error = bp->b_error;
		if (!error)
			error = EIO;
	}
	return (error);
}

/*
 * Support for pageio buffers.
 *
 * This stuff should be generalized to provide a generalized bp
 * header facility that can be used for things other than pageio.
 */

/*
 * Allocate and initialize a buf struct for use with pageio.
 */
struct buf *
pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
{
	struct buf *bp;
	struct cpu *cpup;

	if (flags & B_READ) {
		CPU_STATS_ENTER_K();
		cpup = CPU;	/* get pointer AFTER preemption is disabled */
		CPU_STATS_ADDQ(cpup, vm, pgin, 1);
		CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));

		atomic_add_64(&curzone->zone_pgpgin, btopr(len));

		if ((flags & B_ASYNC) == 0) {
			klwp_t *lwp = ttolwp(curthread);
			if (lwp != NULL)
				lwp->lwp_ru.majflt++;
			CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
			/* Kernel probe */
			TNF_PROBE_2(major_fault, "vm pagefault", /* CSTYLED */,
			    tnf_opaque,		vnode,		pp->p_vnode,
			    tnf_offset,		offset,		pp->p_offset);
		}
		/*
		 * Update statistics for pages being paged in
		 */
		if (pp != NULL && pp->p_vnode != NULL) {
			if (IS_SWAPFSVP(pp->p_vnode)) {
				CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len));
				atomic_add_64(&curzone->zone_anonpgin,
				    btopr(len));
			} else {
				if (pp->p_vnode->v_flag & VVMEXEC) {
					CPU_STATS_ADDQ(cpup, vm, execpgin,
					    btopr(len));
					atomic_add_64(&curzone->zone_execpgin,
					    btopr(len));
				} else {
					CPU_STATS_ADDQ(cpup, vm, fspgin,
					    btopr(len));
					atomic_add_64(&curzone->zone_fspgin,
					    btopr(len));
				}
			}
		}
		CPU_STATS_EXIT_K();
		TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
		    "page_ws_in:pp %p", pp);
		/* Kernel probe */
		TNF_PROBE_3(pagein, "vm pageio io", /* CSTYLED */,
		    tnf_opaque,	vnode,	pp->p_vnode,
		    tnf_offset,	offset,	pp->p_offset,
		    tnf_size,	size,	len);
	}

	bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
	bp->b_bcount = len;
	bp->b_bufsize = len;
	bp->b_pages = pp;
	bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
	bp->b_offset = -1;
	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);

	/* Initialize bp->b_sem in "locked" state */
	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);

	VN_HOLD(vp);
	bp->b_vp = vp;
	THREAD_KPRI_RELEASE_N(btopr(len)); /* release kpri from page_locks */

	/*
	 * Caller sets dev & blkno and can adjust
	 * b_addr for page offset and can use bp_mapin
	 * to make pages kernel addressable.
	 */
	return (bp);
}

void
pageio_done(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	if (bp->b_flags & B_REMAPPED)
		bp_mapout(bp);
	VN_RELE(bp->b_vp);
	bp->b_vp = NULL;
	ASSERT((bp->b_flags & B_NOCACHE) != 0);

	/* A sema_v(bp->b_sem) is implied if we are destroying it */
	sema_destroy(&bp->b_sem);
	sema_destroy(&bp->b_io);
	kmem_free(bp, sizeof (struct buf));
}

/*
 * Check to see whether the buffers, except the one pointed by sbp,
 * associated with the device are busy.
 * NOTE: This expensive operation shall be improved together with ufs_icheck().
 */
int
bcheck(dev_t dev, struct buf *sbp)
{
	struct buf	*bp;
	struct buf	*dp;
	int i;
	kmutex_t *hmp;

	/*
	 * check for busy bufs for this filesystem
	 */
	for (i = 0; i < v.v_hbuf; i++) {
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			/*
			 * if buf is busy or dirty, then filesystem is busy
			 */
			if ((bp->b_edev == dev) &&
			    ((bp->b_flags & B_STALE) == 0) &&
			    (bp->b_flags & (B_DELWRI|B_BUSY)) &&
			    (bp != sbp)) {
				mutex_exit(hmp);
				return (1);
			}
		}
		mutex_exit(hmp);
	}
	return (0);
}

/*
 * Hash two 32 bit entities.
 */
int
hash2ints(int x, int y)
{
	int hash = 0;

	hash = x - 1;
	hash = ((hash * 7) + (x >> 8)) - 1;
	hash = ((hash * 7) + (x >> 16)) - 1;
	hash = ((hash * 7) + (x >> 24)) - 1;
	hash = ((hash * 7) + y) - 1;
	hash = ((hash * 7) + (y >> 8)) - 1;
	hash = ((hash * 7) + (y >> 16)) - 1;
	hash = ((hash * 7) + (y >> 24)) - 1;

	return (hash);
}


/*
 * Return a new buffer struct.
 *	Create a new buffer if we haven't gone over our high water
 *	mark for memory, otherwise try to get one off the freelist.
 *
 * Returns a locked buf that has no id and is not on any hash or free
 * list.
 */
static struct buf *
bio_getfreeblk(long bsize)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	kmutex_t	*hmp;
	uint_t		start, end;

	/*
	 * mutex_enter(&bfree_lock);
	 * bfreelist.b_bufsize represents the amount of memory
	 * mutex_exit(&bfree_lock); protect ref to bfreelist
	 * we are allowed to allocate in the cache before we hit our hwm.
	 */
	bio_mem_get(bsize);	/* Account for our memory request */

again:
	bp = bio_bhdr_alloc();	/* Get a buf hdr */
	sema_p(&bp->b_sem);	/* Should never fail */

	ASSERT(bp->b_un.b_addr == NULL);
	bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
	if (bp->b_un.b_addr != NULL) {
		/*
		 * Make the common path short
		 */
		bp->b_bufsize = bsize;
		ASSERT(SEMA_HELD(&bp->b_sem));
		return (bp);
	} else {
		struct buf *save;

		save = bp;	/* Save bp we allocated */
		start = end = lastindex;

		biostats.bio_bufwant.value.ui32++;

		/*
		 * Memory isn't available from the system now. Scan
		 * the hash buckets till enough space is found.
		 */
		do {
			hp = &hbuf[start];
			hmp = &hp->b_lock;
			dp = (struct buf *)hp;

			mutex_enter(hmp);
			bp = dp->av_forw;

			while (bp != dp) {

				ASSERT(bp != NULL);

				if (!sema_tryp(&bp->b_sem)) {
					bp = bp->av_forw;
					continue;
				}

				/*
				 * Since we are going down the freelist
				 * associated with this hash bucket the
				 * B_DELWRI flag should not be set.
				 */
				ASSERT(!(bp->b_flags & B_DELWRI));

				if (bp->b_bufsize == bsize) {
					hp->b_length--;
					notavail(bp);
					bremhash(bp);
					mutex_exit(hmp);

					/*
					 * Didn't kmem_alloc any more, so don't
					 * count it twice.
					 */
					mutex_enter(&bfree_lock);
					bfreelist.b_bufsize += bsize;
					mutex_exit(&bfree_lock);

					/*
					 * Update the lastindex value.
					 */
					lastindex = start;

					/*
					 * Put our saved bp back on the list
					 */
					sema_v(&save->b_sem);
					bio_bhdr_free(save);
					ASSERT(SEMA_HELD(&bp->b_sem));
					return (bp);
				}
				sema_v(&bp->b_sem);
				bp = bp->av_forw;
			}
			mutex_exit(hmp);
			start = ((start + 1) % v.v_hbuf);
		} while (start != end);

		biostats.bio_bufwait.value.ui32++;
		bp = save;		/* Use original bp */
		bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
	}

	bp->b_bufsize = bsize;
	ASSERT(SEMA_HELD(&bp->b_sem));
	return (bp);
}

/*
 * Allocate a buffer header. If none currently available, allocate
 * a new pool.
 */
static struct buf *
bio_bhdr_alloc(void)
{
	struct buf *dp, *sdp;
	struct buf *bp;
	int i;

	for (;;) {
		mutex_enter(&bhdr_lock);
		if (bhdrlist != NULL) {
			bp = bhdrlist;
			bhdrlist = bp->av_forw;
			mutex_exit(&bhdr_lock);
			bp->av_forw = NULL;
			return (bp);
		}
		mutex_exit(&bhdr_lock);

		/*
		 * Need to allocate a new pool. If the system is currently
		 * out of memory, then try freeing things on the freelist.
		 */
		dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
		if (dp == NULL) {
			/*
			 * System can't give us a pool of headers, try
			 * recycling from the free lists.
			 */
			bio_recycle(BIO_HEADER, 0);
		} else {
			sdp = dp;
			for (i = 0; i < v.v_buf; i++, dp++) {
				/*
				 * The next two lines are needed since NODEV
				 * is -1 and not NULL
				 */
				dp->b_dev = (o_dev_t)NODEV;
				dp->b_edev = NODEV;
				dp->av_forw = dp + 1;
				sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
				    NULL);
				sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
				    NULL);
				dp->b_offset = -1;
			}
			mutex_enter(&bhdr_lock);
			(--dp)->av_forw = bhdrlist;	/* Fix last pointer */
			bhdrlist = sdp;
			nbuf += v.v_buf;
			bp = bhdrlist;
			bhdrlist = bp->av_forw;
			mutex_exit(&bhdr_lock);

			bp->av_forw = NULL;
			return (bp);
		}
	}
}

static  void
bio_bhdr_free(struct buf *bp)
{
	ASSERT(bp->b_back == NULL);
	ASSERT(bp->b_forw == NULL);
	ASSERT(bp->av_back == NULL);
	ASSERT(bp->av_forw == NULL);
	ASSERT(bp->b_un.b_addr == NULL);
	ASSERT(bp->b_dev == (o_dev_t)NODEV);
	ASSERT(bp->b_edev == NODEV);
	ASSERT(bp->b_flags == 0);

	mutex_enter(&bhdr_lock);
	bp->av_forw = bhdrlist;
	bhdrlist = bp;
	mutex_exit(&bhdr_lock);
}

/*
 * If we haven't gone over the high water mark, it's o.k. to
 * allocate more buffer space, otherwise recycle buffers
 * from the freelist until enough memory is free for a bsize request.
 *
 * We account for this memory, even though
 * we don't allocate it here.
 */
static void
bio_mem_get(long bsize)
{
	mutex_enter(&bfree_lock);
	if (bfreelist.b_bufsize > bsize) {
		bfreelist.b_bufsize -= bsize;
		mutex_exit(&bfree_lock);
		return;
	}
	mutex_exit(&bfree_lock);
	bio_recycle(BIO_MEM, bsize);
}

/*
 * flush a list of delayed write buffers.
 * (currently used only by bio_recycle below.)
 */
static void
bio_flushlist(struct buf *delwri_list)
{
	struct buf *bp;

	while (delwri_list != EMPTY_LIST) {
		bp = delwri_list;
		bp->b_flags |= B_AGE | B_ASYNC;
		if (bp->b_vp == NULL) {		/* !ufs */
			BWRITE(bp);
		} else {			/* ufs */
			UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
		}
		delwri_list = bp->b_list;
		bp->b_list = NULL;
	}
}

/*
 * Start recycling buffers on the freelist for one of 2 reasons:
 *	- we need a buffer header
 *	- we need to free up memory
 * Once started we continue to recycle buffers until the B_AGE
 * buffers are gone.
 */
static void
bio_recycle(int want, long bsize)
{
	struct buf *bp, *dp, *dwp, *nbp;
	struct hbuf *hp;
	int	found = 0;
	kmutex_t	*hmp;
	int		start, end;
	struct buf *delwri_list = EMPTY_LIST;

	/*
	 * Recycle buffers.
	 */
top:
	start = end = lastindex;
	do {
		hp = &hbuf[start];
		hmp = &hp->b_lock;
		dp = (struct buf *)hp;

		mutex_enter(hmp);
		bp = dp->av_forw;

		while (bp != dp) {

			ASSERT(bp != NULL);

			if (!sema_tryp(&bp->b_sem)) {
				bp = bp->av_forw;
				continue;
			}
			/*
			 * Do we really want to nuke all of the B_AGE stuff??
			 */
			if ((bp->b_flags & B_AGE) == 0 && found) {
				sema_v(&bp->b_sem);
				mutex_exit(hmp);
				lastindex = start;
				return;	/* All done */
			}

			ASSERT(MUTEX_HELD(&hp->b_lock));
			ASSERT(!(bp->b_flags & B_DELWRI));
			hp->b_length--;
			notavail(bp);

			/*
			 * Remove bhdr from cache, free up memory,
			 * and add the hdr to the freelist.
			 */
			bremhash(bp);
			mutex_exit(hmp);

			if (bp->b_bufsize) {
				kmem_free(bp->b_un.b_addr, bp->b_bufsize);
				bp->b_un.b_addr = NULL;
				mutex_enter(&bfree_lock);
				bfreelist.b_bufsize += bp->b_bufsize;
				mutex_exit(&bfree_lock);
			}

			bp->b_dev = (o_dev_t)NODEV;
			bp->b_edev = NODEV;
			bp->b_flags = 0;
			sema_v(&bp->b_sem);
			bio_bhdr_free(bp);
			if (want == BIO_HEADER) {
				found = 1;
			} else {
				ASSERT(want == BIO_MEM);
				if (!found && bfreelist.b_bufsize >= bsize) {
					/* Account for the memory we want */
					mutex_enter(&bfree_lock);
					if (bfreelist.b_bufsize >= bsize) {
						bfreelist.b_bufsize -= bsize;
						found = 1;
					}
					mutex_exit(&bfree_lock);
				}
			}

			/*
			 * Since we dropped hmp start from the
			 * begining.
			 */
			mutex_enter(hmp);
			bp = dp->av_forw;
		}
		mutex_exit(hmp);

		/*
		 * Look at the delayed write list.
		 * First gather into a private list, then write them.
		 */
		dwp = (struct buf *)&dwbuf[start];
		mutex_enter(&blist_lock);
		bio_doingflush++;
		mutex_enter(hmp);
		for (bp = dwp->av_forw; bp != dwp; bp = nbp) {

			ASSERT(bp != NULL);
			nbp = bp->av_forw;

			if (!sema_tryp(&bp->b_sem))
				continue;
			ASSERT(bp->b_flags & B_DELWRI);
			/*
			 * Do we really want to nuke all of the B_AGE stuff??
			 */

			if ((bp->b_flags & B_AGE) == 0 && found) {
				sema_v(&bp->b_sem);
				mutex_exit(hmp);
				lastindex = start;
				mutex_exit(&blist_lock);
				bio_flushlist(delwri_list);
				mutex_enter(&blist_lock);
				bio_doingflush--;
				if (bio_flinv_cv_wanted) {
					bio_flinv_cv_wanted = 0;
					cv_broadcast(&bio_flushinval_cv);
				}
				mutex_exit(&blist_lock);
				return; /* All done */
			}

			/*
			 * If the buffer is already on a flush or
			 * invalidate list then just skip it.
			 */
			if (bp->b_list != NULL) {
				sema_v(&bp->b_sem);
				continue;
			}
			/*
			 * We are still on the same bucket.
			 */
			hp->b_length--;
			notavail(bp);
			bp->b_list = delwri_list;
			delwri_list = bp;
		}
		mutex_exit(hmp);
		mutex_exit(&blist_lock);
		bio_flushlist(delwri_list);
		delwri_list = EMPTY_LIST;
		mutex_enter(&blist_lock);
		bio_doingflush--;
		if (bio_flinv_cv_wanted) {
			bio_flinv_cv_wanted = 0;
			cv_broadcast(&bio_flushinval_cv);
		}
		mutex_exit(&blist_lock);
		start = (start + 1) % v.v_hbuf;

	} while (start != end);

	if (found)
		return;

	/*
	 * Free lists exhausted and we haven't satisfied the request.
	 * Wait here for more entries to be added to freelist.
	 * Because this might have just happened, make it timed.
	 */
	mutex_enter(&bfree_lock);
	bfreelist.b_flags |= B_WANTED;
	(void) cv_reltimedwait(&bio_mem_cv, &bfree_lock, hz, TR_CLOCK_TICK);
	mutex_exit(&bfree_lock);
	goto top;
}

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada).
 */
static int
bio_incore(dev_t dev, daddr_t blkno)
{
	struct buf *bp;
	struct buf *dp;
	uint_t index;
	kmutex_t *hmp;

	index = bio_bhash(dev, blkno);
	dp = (struct buf *)&hbuf[index];
	hmp = &hbuf[index].b_lock;

	mutex_enter(hmp);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno == blkno && bp->b_edev == dev &&
		    (bp->b_flags & B_STALE) == 0) {
			mutex_exit(hmp);
			return (1);
		}
	}
	mutex_exit(hmp);
	return (0);
}

static void
bio_pageio_done(struct buf *bp)
{
	if (bp->b_flags & B_PAGEIO) {

		if (bp->b_flags & B_REMAPPED)
			bp_mapout(bp);

		if (bp->b_flags & B_READ)
			pvn_read_done(bp->b_pages, bp->b_flags);
		else
			pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags);
		pageio_done(bp);
	} else {
		ASSERT(bp->b_flags & B_REMAPPED);
		bp_mapout(bp);
		brelse(bp);
	}
}

/*
 * bioerror(9F) - indicate error in buffer header
 * If 'error' is zero, remove the error indication.
 */
void
bioerror(struct buf *bp, int error)
{
	ASSERT(bp != NULL);
	ASSERT(error >= 0);
	ASSERT(SEMA_HELD(&bp->b_sem));

	if (error != 0) {
		bp->b_flags |= B_ERROR;
	} else {
		bp->b_flags &= ~B_ERROR;
	}
	bp->b_error = error;
}

/*
 * bioreset(9F) - reuse a private buffer header after I/O is complete
 */
void
bioreset(struct buf *bp)
{
	ASSERT(bp != NULL);

	biofini(bp);
	bioinit(bp);
}

/*
 * biosize(9F) - return size of a buffer header
 */
size_t
biosize(void)
{
	return (sizeof (struct buf));
}

/*
 * biomodified(9F) - check if buffer is modified
 */
int
biomodified(struct buf *bp)
{
	int npf;
	int ppattr;
	struct page *pp;

	ASSERT(bp != NULL);

	if ((bp->b_flags & B_PAGEIO) == 0) {
		return (-1);
	}
	pp = bp->b_pages;
	npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET));

	while (npf > 0) {
		ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
		    HAT_SYNC_STOPON_MOD);
		if (ppattr & P_MOD)
			return (1);
		pp = pp->p_next;
		npf--;
	}

	return (0);
}

/*
 * bioinit(9F) - initialize a buffer structure
 */
void
bioinit(struct buf *bp)
{
	bzero(bp, sizeof (struct buf));
	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
	bp->b_offset = -1;
}

/*
 * biofini(9F) - uninitialize a buffer structure
 */
void
biofini(struct buf *bp)
{
	sema_destroy(&bp->b_io);
	sema_destroy(&bp->b_sem);
}

/*
 * bioclone(9F) - clone a buffer
 */
struct buf *
bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno,
    int (*iodone)(struct buf *), struct buf *bp_mem, int sleep)
{
	struct buf *bufp;

	ASSERT(bp);
	if (bp_mem == NULL) {
		bufp = kmem_alloc(sizeof (struct buf), sleep);
		if (bufp == NULL) {
			return (NULL);
		}
		bioinit(bufp);
	} else {
		bufp = bp_mem;
		bioreset(bufp);
	}

#define	BUF_CLONE_FLAGS	(B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
	B_ABRWRITE)

	/*
	 * The cloned buffer does not inherit the B_REMAPPED flag.
	 */
	bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS)  | B_BUSY;
	bufp->b_bcount = len;
	bufp->b_blkno = blkno;
	bufp->b_iodone = iodone;
	bufp->b_proc = bp->b_proc;
	bufp->b_edev = dev;
	bufp->b_file = bp->b_file;
	bufp->b_offset = bp->b_offset;

	if (bp->b_flags & B_SHADOW) {
		ASSERT(bp->b_shadow);
		ASSERT(bp->b_flags & B_PHYS);

		bufp->b_shadow = bp->b_shadow +
		    btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off);
		bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
		if (bp->b_flags & B_REMAPPED)
			bufp->b_proc = NULL;
	} else {
		if (bp->b_flags & B_PAGEIO) {
			struct page *pp;
			off_t o;
			int i;

			pp = bp->b_pages;
			o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off;
			for (i = btop(o); i > 0; i--) {
				pp = pp->p_next;
			}
			bufp->b_pages = pp;
			bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET);
		} else {
			bufp->b_un.b_addr =
			    (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
			if (bp->b_flags & B_REMAPPED)
				bufp->b_proc = NULL;
		}
	}
	return (bufp);
}