xref: /illumos-gate/usr/src/uts/common/os/bio.c (revision 7c478bd9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 /*
31  * University Copyright- Copyright (c) 1982, 1986, 1988
32  * The Regents of the University of California
33  * All Rights Reserved
34  *
35  * University Acknowledgment- Portions of this document are derived from
36  * software developed by the University of California, Berkeley, and its
37  * contributors.
38  */
39 
40 #pragma ident	"%Z%%M%	%I%	%E% SMI"
41 
42 #include <sys/types.h>
43 #include <sys/t_lock.h>
44 #include <sys/sysmacros.h>
45 #include <sys/conf.h>
46 #include <sys/cpuvar.h>
47 #include <sys/errno.h>
48 #include <sys/debug.h>
49 #include <sys/buf.h>
50 #include <sys/var.h>
51 #include <sys/vnode.h>
52 #include <sys/bitmap.h>
53 #include <sys/cmn_err.h>
54 #include <sys/kmem.h>
55 #include <sys/vmem.h>
56 #include <sys/atomic.h>
57 #include <vm/seg_kmem.h>
58 #include <vm/page.h>
59 #include <vm/pvn.h>
60 #include <sys/vtrace.h>
61 #include <sys/tnf_probe.h>
62 #include <sys/fs/ufs_inode.h>
63 #include <sys/fs/ufs_bio.h>
64 #include <sys/fs/ufs_log.h>
65 #include <sys/systm.h>
66 #include <sys/vfs.h>
67 #include <sys/sdt.h>
68 
69 /* Locks: each mutex guards the list or structure named in its comment */
70 static	kmutex_t	blist_lock;	/* protects b_list */
71 static	kmutex_t	bhdr_lock;	/* protects the bhdrlist */
72 static	kmutex_t	bfree_lock;	/* protects the bfreelist structure */
73 
74 struct hbuf	*hbuf;			/* Hash buckets */
75 struct dwbuf	*dwbuf;			/* Delayed write buckets */
76 static struct buf *bhdrlist;		/* buf header free list */
77 static int 	nbuf;			/* number of buffer headers allocated */
78 
79 static int	lastindex;		/* Reference point on where to start */
80 					/* when looking for free buffers */
81 
/* bio_bhash: hash a (dev, blkno) pair into a bucket index for hbuf[]/dwbuf[] */
82 #define	bio_bhash(dev, bn)	(hash2ints((dev), (int)(bn)) & v.v_hmask)
/* EMPTY_LIST: end-of-chain marker for the b_list chains in bflush()/bfinval() */
83 #define	EMPTY_LIST	((struct buf *)-1)
84 
85 static kcondvar_t	bio_mem_cv; 	/* Condition variables */
86 static kcondvar_t	bio_flushinval_cv; /* waited on by bflush()/bfinval() */
87 static int	bio_doingflush;		/* flush in progress */
88 static int	bio_doinginval;		/* inval in progress */
89 static int	bio_flinv_cv_wanted;	/* someone waiting for cv */
90 
91 /*
92  * Statistics on the buffer cache
93  *
94  * Named 32-bit counters; the lookup, hit, busy and duplicate counters are
 * incremented by getblk_common().
 */
94 struct biostats biostats = {
95 	{ "buffer_cache_lookups",		KSTAT_DATA_UINT32 },
96 	{ "buffer_cache_hits",			KSTAT_DATA_UINT32 },
97 	{ "new_buffer_requests",		KSTAT_DATA_UINT32 },
98 	{ "waits_for_buffer_allocs",		KSTAT_DATA_UINT32 },
99 	{ "buffers_locked_by_someone",		KSTAT_DATA_UINT32 },
100 	{ "duplicate_buffers_found",		KSTAT_DATA_UINT32 }
101 };
102 
103 /*
104  * kstat data: biostats viewed as a flat array of kstat_named_t entries,
 * together with the number of entries in that array.
105  */
106 kstat_named_t	*biostats_ptr = (kstat_named_t *)&biostats;
107 uint_t		biostats_ndata = (uint_t)(sizeof (biostats) /
108 					sizeof (kstat_named_t));
109 
110 /*
111  * Statistics on ufs buffer cache
112  * Not protected by locks
 * (within this part of the file, "breads" is bumped by bread_common() and
 * "bwrites" by bwrite_common(), both without any locking)
113  */
114 struct ufsbiostats ub = {
115 	{ "breads",			KSTAT_DATA_UINT32 },
116 	{ "bwrites",			KSTAT_DATA_UINT32 },
117 	{ "fbiwrites",			KSTAT_DATA_UINT32 },
118 	{ "getpages",			KSTAT_DATA_UINT32 },
119 	{ "getras",			KSTAT_DATA_UINT32 },
120 	{ "putsyncs",			KSTAT_DATA_UINT32 },
121 	{ "putasyncs",			KSTAT_DATA_UINT32 },
122 	{ "putpageios",			KSTAT_DATA_UINT32 },
123 };
124 
125 /*
126  * more UFS Logging eccentricities...
127  *
128  * required since "#pragma weak ..." doesn't work in reverse order.
129  * i.e.:  genunix (bio.c) is loaded before the ufs modules and pointers
130  *        to ufs routines don't get plugged into bio.c calls so
131  *        we initialize it when setting up the "lufsops" table
132  *        in "lufs.c:_init()"
 *
 * Both pointers start out NULL; bread_common() and bwrite_common() test
 * them against NULL before calling through them.
133  */
134 void (*bio_lufs_strategy)(void *, buf_t *);
135 void (*bio_snapshot_strategy)(void *, buf_t *);
136 
137 
138 /* Private routines: forward declarations of file-local helpers */
139 static struct buf	*bio_getfreeblk(long);
140 static void 		bio_mem_get(long);
141 static void		bio_bhdr_free(struct buf *);
142 static struct buf	*bio_bhdr_alloc(void);
143 static void		bio_recycle(int, long);
144 static void 		bio_pageio_done(struct buf *);
145 static int 		bio_incore(dev_t, daddr_t);
146 
147 /*
148  * Buffer cache constants
 *
 * BIO_BUF_PERCENT and BIO_MAX_PERCENT are divisors, not percentages:
 * binit() computes physmem / BIO_xxx_PERCENT, so (100/2) == 50 yields 2%
 * and (100/20) == 5 yields 20%.
149  */
150 #define	BIO_BUF_PERCENT	(100/2)		/* default: 2% of memory */
151 #define	BIO_MAX_PERCENT	(100/20)	/* max is 20% of real memory */
152 #define	BIO_BHDR_POOL	100		/* Default bhdr pool size */
153 #define	BIO_MIN_HDR	10		/* Minimum number of buffer headers */
154 #define	BIO_MIN_HWM	(BIO_MIN_HDR * MAXBSIZE / 1024)
155 #define	BIO_HASHLEN	4		/* Target length of hash chains */
156 
157 
158 /* Flags for bio_recycle() */
159 #define	BIO_HEADER	0x01
160 #define	BIO_MEM		0x02
161 
162 extern	int bufhwm;		/* User tunable - high water mark for mem  */
163 extern	int bufhwm_pct;		/* ditto - given in % of physmem  */
164 
165 /*
166  * The following routines allocate and free
167  * buffers with various side effects.  In general the
168  * arguments to an allocate routine are a device and
169  * a block number, and the value is a pointer to
170  * the buffer header; the buffer returned is locked with a
171  * binary semaphore so that no one else can touch it. If the block was
172  * already in core, no I/O need be done; if it is
173  * already locked, the process waits until it becomes free.
174  * The following routines allocate a buffer:
175  *	getblk
176  *	bread/BREAD
177  *	breada
178  * Eventually the buffer must be released, possibly with the
179  * side effect of writing it out, by using one of
180  *	bwrite/BWRITE/brwrite
181  *	bdwrite/bdrwrite
182  *	bawrite
183  *	brelse
184  *
185  * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
186  * Instead, a binary semaphore, b_sem is used to gain exclusive access to
187  * a buffer and a binary semaphore, b_io is used for I/O synchronization.
188  * B_DONE is still used to denote a buffer with I/O complete on it.
189  *
190  * The bfreelist.b_bcount field is computed every time fsflush runs. It
191  * should not be used where a very accurate count of the free buffers is
192  * needed.
193  */
194 
195 /*
196  * Read in (if necessary) the block and return a buffer pointer.
197  *
198  * This interface is provided for binary compatibility.  Using
199  * BREAD() directly avoids the extra function call overhead invoked
200  * by calling this routine.
201  */
202 struct buf *
203 bread(dev_t dev, daddr_t blkno, long bsize)
204 {
205 	return (BREAD(dev, blkno, bsize));
206 }
207 
208 /*
209  * Common code for reading a buffer with various options
210  *
211  * Read in (if necessary) the block and return a buffer pointer.
212  */
213 struct buf *
214 bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
215 {
216 	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
217 	struct buf *bp;
218 	klwp_t *lwp = ttolwp(curthread);
219 
220 	CPU_STATS_ADD_K(sys, lread, 1);
221 	bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1);
222 	if (bp->b_flags & B_DONE)
223 		return (bp);
224 	bp->b_flags |= B_READ;
225 	ASSERT(bp->b_bcount == bsize);
226 	if (ufsvfsp == NULL) {					/* !ufs */
227 		(void) bdev_strategy(bp);
228 	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
229 							/* ufs && logging */
230 		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
231 	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
232 							/* ufs && snapshots */
233 		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
234 	} else {
235 		ufsvfsp->vfs_iotstamp = lbolt;
236 		ub.ub_breads.value.ul++;		/* ufs && !logging */
237 		(void) bdev_strategy(bp);
238 	}
239 	if (lwp != NULL)
240 		lwp->lwp_ru.inblock++;
241 	CPU_STATS_ADD_K(sys, bread, 1);
242 	(void) biowait(bp);
243 	return (bp);
244 }
245 
246 /*
247  * Read in the block, like bread, but also start I/O on the
248  * read-ahead block (which is not allocated to the caller).
249  */
250 struct buf *
251 breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
252 {
253 	struct buf *bp, *rabp;
254 	klwp_t *lwp = ttolwp(curthread);
255 
256 	bp = NULL;
257 	if (!bio_incore(dev, blkno)) {
258 		CPU_STATS_ADD_K(sys, lread, 1);
259 		bp = GETBLK(dev, blkno, bsize);
260 		if ((bp->b_flags & B_DONE) == 0) {
261 			bp->b_flags |= B_READ;
262 			bp->b_bcount = bsize;
263 			(void) bdev_strategy(bp);
264 			if (lwp != NULL)
265 				lwp->lwp_ru.inblock++;
266 			CPU_STATS_ADD_K(sys, bread, 1);
267 		}
268 	}
269 	if (rablkno && bfreelist.b_bcount > 1 &&
270 	    !bio_incore(dev, rablkno)) {
271 		rabp = GETBLK(dev, rablkno, bsize);
272 		if (rabp->b_flags & B_DONE)
273 			brelse(rabp);
274 		else {
275 			rabp->b_flags |= B_READ|B_ASYNC;
276 			rabp->b_bcount = bsize;
277 			(void) bdev_strategy(rabp);
278 			if (lwp != NULL)
279 				lwp->lwp_ru.inblock++;
280 			CPU_STATS_ADD_K(sys, bread, 1);
281 		}
282 	}
283 	if (bp == NULL)
284 		return (BREAD(dev, blkno, bsize));
285 	(void) biowait(bp);
286 	return (bp);
287 }
288 
289 /*
290  * Common code for writing a buffer with various options.
291  *
292  * force_wait  - wait for write completion regardless of B_ASYNC flag
293  * do_relse    - release the buffer when we are done
294  * clear_flags - flags to clear from the buffer
295  */
296 void
297 bwrite_common(void *arg, struct buf *bp, int force_wait,
298 				int do_relse, int clear_flags)
299 {
300 	register int do_wait;
301 	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
302 	int flag;
303 	klwp_t *lwp = ttolwp(curthread);
304 	struct cpu *cpup;
305 
306 	ASSERT(SEMA_HELD(&bp->b_sem));
307 	flag = bp->b_flags;
308 	bp->b_flags &= ~clear_flags;
309 	if (lwp != NULL)
310 		lwp->lwp_ru.oublock++;
311 	CPU_STATS_ENTER_K();
312 	cpup = CPU;		/* get pointer AFTER preemption is disabled */
313 	CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
314 	CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
315 	do_wait = ((flag & B_ASYNC) == 0 || force_wait);
316 	if (do_wait == 0)
317 		CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
318 	CPU_STATS_EXIT_K();
319 	if (ufsvfsp == NULL) {
320 		(void) bdev_strategy(bp);
321 	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
322 							/* ufs && logging */
323 		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
324 	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
325 							/* ufs && snapshots */
326 		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
327 	} else {
328 		ub.ub_bwrites.value.ul++;		/* ufs && !logging */
329 		(void) bdev_strategy(bp);
330 	}
331 	if (do_wait) {
332 		(void) biowait(bp);
333 		if (do_relse) {
334 			brelse(bp);
335 		}
336 	}
337 }
338 
/*
 * bwrite - binary-compatible entry point for writing a buffer.
 *
 * Writes the buffer, waiting for completion unless B_ASYNC is set, and
 * then releases it.  This is nothing more than the BWRITE() macro wrapped
 * in a function; using BWRITE() directly saves the call overhead.
 */
void
bwrite(struct buf *bp)
{
	/* All of the work is done by the macro. */
	BWRITE(bp);
}
351 
/*
 * bwrite2 - binary-compatible entry point for a write that keeps the
 * buffer.
 *
 * Writes the buffer and waits for completion, but does not release the
 * buffer afterwards.  This is the BWRITE2() macro wrapped in a function;
 * using BWRITE2() directly saves the call overhead.
 */
void
bwrite2(struct buf *bp)
{
	/* All of the work is done by the macro. */
	BWRITE2(bp);
}
363 
364 /*
365  * Release the buffer, marking it so that if it is grabbed
366  * for another purpose it will be written out before being
367  * given up (e.g. when writing a partial block where it is
368  * assumed that another write for the same block will soon follow).
369  * Also save the time that the block is first marked as delayed
370  * so that it will be written in a reasonable time.
371  */
372 void
373 bdwrite(struct buf *bp)
374 {
375 	ASSERT(SEMA_HELD(&bp->b_sem));
376 	CPU_STATS_ADD_K(sys, lwrite, 1);
377 	if ((bp->b_flags & B_DELWRI) == 0)
378 		bp->b_start = lbolt;
379 	/*
380 	 * B_DONE allows others to use the buffer, B_DELWRI causes the
381 	 * buffer to be written before being reused, and setting b_resid
382 	 * to zero says the buffer is complete.
383 	 */
384 	bp->b_flags |= B_DELWRI | B_DONE;
385 	bp->b_resid = 0;
386 	brelse(bp);
387 }
388 
389 /*
390  * Release the buffer, start I/O on it, but don't wait for completion.
391  */
392 void
393 bawrite(struct buf *bp)
394 {
395 	ASSERT(SEMA_HELD(&bp->b_sem));
396 
397 	/* Use bfreelist.b_bcount as a weird-ass heuristic */
398 	if (bfreelist.b_bcount > 4)
399 		bp->b_flags |= B_ASYNC;
400 	BWRITE(bp);
401 }
402 
403 /*
404  * Release the buffer, with no I/O implied.
405  */
406 void
407 brelse(struct buf *bp)
408 {
409 	struct buf	**backp;
410 	uint_t		index;
411 	kmutex_t	*hmp;
412 	struct	buf	*dp;
413 	struct	hbuf	*hp;
414 
415 
416 	ASSERT(SEMA_HELD(&bp->b_sem));
417 
418 	/*
419 	 * Clear the retry write flag if the buffer was written without
420 	 * error.  The presence of B_DELWRI means the buffer has not yet
421 	 * been written and the presence of B_ERROR means that an error
422 	 * is still occurring.
423 	 */
424 	if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
425 		bp->b_flags &= ~B_RETRYWRI;
426 	}
427 
428 	/* Check for anomalous conditions */
429 	if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
430 		if (bp->b_flags & B_NOCACHE) {
431 			/* Don't add to the freelist. Destroy it now */
432 			kmem_free(bp->b_un.b_addr, bp->b_bufsize);
433 			sema_destroy(&bp->b_sem);
434 			sema_destroy(&bp->b_io);
435 			kmem_free(bp, sizeof (struct buf));
436 			return;
437 		}
438 		/*
439 		 * If a write failed and we are supposed to retry write,
440 		 * don't toss the buffer.  Keep it around and mark it
441 		 * delayed write in the hopes that it will eventually
442 		 * get flushed (and still keep the system running.)
443 		 */
444 		if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
445 			bp->b_flags |= B_DELWRI;
446 			/* keep fsflush from trying continuously to flush */
447 			bp->b_start = lbolt;
448 		} else
449 			bp->b_flags |= B_AGE|B_STALE;
450 		bp->b_flags &= ~B_ERROR;
451 		bp->b_error = 0;
452 	}
453 
454 	/*
455 	 * If delayed write is set then put in on the delayed
456 	 * write list instead of the free buffer list.
457 	 */
458 	index = bio_bhash(bp->b_edev, bp->b_blkno);
459 	hmp   = &hbuf[index].b_lock;
460 
461 	mutex_enter(hmp);
462 	hp = &hbuf[index];
463 	dp = (struct buf *)hp;
464 
465 	/*
466 	 * Make sure that the number of entries on this list are
467 	 * Zero <= count <= total # buffers
468 	 */
469 	ASSERT(hp->b_length >= 0);
470 	ASSERT(hp->b_length < nbuf);
471 
472 	hp->b_length++;		/* We are adding this buffer */
473 
474 	if (bp->b_flags & B_DELWRI) {
475 		/*
476 		 * This buffer goes on the delayed write buffer list
477 		 */
478 		dp = (struct buf *)&dwbuf[index];
479 	}
480 	ASSERT(bp->b_bufsize > 0);
481 	ASSERT(bp->b_bcount > 0);
482 	ASSERT(bp->b_un.b_addr != NULL);
483 
484 	if (bp->b_flags & B_AGE) {
485 		backp = &dp->av_forw;
486 		(*backp)->av_back = bp;
487 		bp->av_forw = *backp;
488 		*backp = bp;
489 		bp->av_back = dp;
490 	} else {
491 		backp = &dp->av_back;
492 		(*backp)->av_forw = bp;
493 		bp->av_back = *backp;
494 		*backp = bp;
495 		bp->av_forw = dp;
496 	}
497 	mutex_exit(hmp);
498 
499 	if (bfreelist.b_flags & B_WANTED) {
500 		/*
501 		 * Should come here very very rarely.
502 		 */
503 		mutex_enter(&bfree_lock);
504 		if (bfreelist.b_flags & B_WANTED) {
505 			bfreelist.b_flags &= ~B_WANTED;
506 			cv_broadcast(&bio_mem_cv);
507 		}
508 		mutex_exit(&bfree_lock);
509 	}
510 
511 	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
512 	/*
513 	 * Don't let anyone get the buffer off the freelist before we
514 	 * release our hold on it.
515 	 */
516 	sema_v(&bp->b_sem);
517 }
518 
519 /*
520  * Return a count of the number of B_BUSY buffers in the system
521  * Can only be used as a good estimate.  If 'cleanit' is set,
522  * try to flush all bufs.
523  */
524 int
525 bio_busy(int cleanit)
526 {
527 	struct buf *bp, *dp;
528 	int busy = 0;
529 	int i;
530 	kmutex_t *hmp;
531 
532 	for (i = 0; i < v.v_hbuf; i++) {
533 		vfs_syncprogress();
534 		dp = (struct buf *)&hbuf[i];
535 		hmp = &hbuf[i].b_lock;
536 
537 		mutex_enter(hmp);
538 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
539 			if (bp->b_flags & B_BUSY)
540 				busy++;
541 		}
542 		mutex_exit(hmp);
543 	}
544 
545 	if (cleanit && busy != 0) {
546 		bflush(NODEV);
547 	}
548 
549 	return (busy);
550 }
551 
552 /*
553  * this interface is provided for binary compatibility.
554  *
555  * Assign a buffer for the given block.  If the appropriate
556  * block is already associated, return it; otherwise search
557  * for the oldest non-busy buffer and reassign it.
558  */
559 struct buf *
560 getblk(dev_t dev, daddr_t blkno, long bsize)
561 {
562 	return (getblk_common(/* ufsvfsp */ NULL, dev,
563 			blkno, bsize, /* errflg */ 0));
564 }
565 
566 /*
567  * Assign a buffer for the given block.  If the appropriate
568  * block is already associated, return it; otherwise search
569  * for the oldest non-busy buffer and reassign it.
570  */
571 struct buf *
572 getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, int errflg)
573 {
574 	ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
575 	struct buf *bp;
576 	struct buf *dp;
577 	struct buf *nbp = NULL;
578 	struct buf *errbp;
579 	uint_t		index;
580 	kmutex_t	*hmp;
581 	struct	hbuf	*hp;
582 
583 	if (getmajor(dev) >= devcnt)
584 		cmn_err(CE_PANIC, "blkdev");
585 
586 	biostats.bio_lookup.value.ui32++;
587 
588 	index = bio_bhash(dev, blkno);
589 	hp    = &hbuf[index];
590 	dp    = (struct buf *)hp;
591 	hmp   = &hp->b_lock;
592 
593 	mutex_enter(hmp);
594 loop:
595 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
596 		if (bp->b_blkno != blkno || bp->b_edev != dev ||
597 		    (bp->b_flags & B_STALE))
598 			continue;
599 		/*
600 		 * Avoid holding the hash lock in the event that
601 		 * the buffer is locked by someone. Since the hash chain
602 		 * may change when we drop the hash lock
603 		 * we have to start at the beginning of the chain if the
604 		 * buffer identity/contents aren't valid.
605 		 */
606 		if (!sema_tryp(&bp->b_sem)) {
607 			biostats.bio_bufbusy.value.ui32++;
608 			mutex_exit(hmp);
609 			/*
610 			 * OK, we are dealing with a busy buffer.
611 			 * In the case that we are panicking and we
612 			 * got called from bread(), we have some chance
613 			 * for error recovery. So better bail out from
614 			 * here since sema_p() won't block. If we got
615 			 * called directly from ufs routines, there is
616 			 * no way to report an error yet.
617 			 */
618 			if (panicstr && errflg)
619 				goto errout;
620 			/*
621 			 * For the following line of code to work
622 			 * correctly never kmem_free the buffer "header".
623 			 */
624 			sema_p(&bp->b_sem);
625 			if (bp->b_blkno != blkno || bp->b_edev != dev ||
626 			    (bp->b_flags & B_STALE)) {
627 				sema_v(&bp->b_sem);
628 				mutex_enter(hmp);
629 				goto loop;	/* start over */
630 			}
631 			mutex_enter(hmp);
632 		}
633 		/* Found */
634 		biostats.bio_hit.value.ui32++;
635 		bp->b_flags &= ~B_AGE;
636 
637 		/*
638 		 * Yank it off the free/delayed write lists
639 		 */
640 		hp->b_length--;
641 		notavail(bp);
642 		mutex_exit(hmp);
643 
644 		ASSERT((bp->b_flags & B_NOCACHE) == NULL);
645 
646 		if (nbp == NULL) {
647 			/*
648 			 * Make the common path short.
649 			 */
650 			ASSERT(SEMA_HELD(&bp->b_sem));
651 			return (bp);
652 		}
653 
654 		biostats.bio_bufdup.value.ui32++;
655 
656 		/*
657 		 * The buffer must have entered during the lock upgrade
658 		 * so free the new buffer we allocated and return the
659 		 * found buffer.
660 		 */
661 		kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
662 		nbp->b_un.b_addr = NULL;
663 
664 		/*
665 		 * Account for the memory
666 		 */
667 		mutex_enter(&bfree_lock);
668 		bfreelist.b_bufsize += nbp->b_bufsize;
669 		mutex_exit(&bfree_lock);
670 
671 		/*
672 		 * Destroy buf identity, and place on avail list
673 		 */
674 		nbp->b_dev = (o_dev_t)NODEV;
675 		nbp->b_edev = NODEV;
676 		nbp->b_flags = 0;
677 		nbp->b_file = NULL;
678 		nbp->b_offset = -1;
679 
680 		sema_v(&nbp->b_sem);
681 		bio_bhdr_free(nbp);
682 
683 		ASSERT(SEMA_HELD(&bp->b_sem));
684 		return (bp);
685 	}
686 
687 	/*
688 	 * bio_getfreeblk may block so check the hash chain again.
689 	 */
690 	if (nbp == NULL) {
691 		mutex_exit(hmp);
692 		nbp = bio_getfreeblk(bsize);
693 		mutex_enter(hmp);
694 		goto loop;
695 	}
696 
697 	/*
698 	 * New buffer. Assign nbp and stick it on the hash.
699 	 */
700 	nbp->b_flags = B_BUSY;
701 	nbp->b_edev = dev;
702 	nbp->b_dev = (o_dev_t)cmpdev(dev);
703 	nbp->b_blkno = blkno;
704 	nbp->b_iodone = NULL;
705 	nbp->b_bcount = bsize;
706 	/*
707 	 * If we are given a ufsvfsp and the vfs_root field is NULL
708 	 * then this must be I/O for a superblock.  A superblock's
709 	 * buffer is set up in mountfs() and there is no root vnode
710 	 * at that point.
711 	 */
712 	if (ufsvfsp && ufsvfsp->vfs_root) {
713 		nbp->b_vp = ufsvfsp->vfs_root;
714 	} else {
715 		nbp->b_vp = NULL;
716 	}
717 
718 	ASSERT((nbp->b_flags & B_NOCACHE) == NULL);
719 
720 	binshash(nbp, dp);
721 	mutex_exit(hmp);
722 
723 	ASSERT(SEMA_HELD(&nbp->b_sem));
724 
725 	return (nbp);
726 
727 
728 	/*
729 	 * Come here in case of an internal error. At this point we couldn't
730 	 * get a buffer, but he have to return one. Hence we allocate some
731 	 * kind of error reply buffer on the fly. This buffer is marked as
732 	 * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
733 	 *	- B_ERROR will indicate error to the caller.
734 	 *	- B_DONE will prevent us from reading the buffer from
735 	 *	  the device.
736 	 *	- B_NOCACHE will cause that this buffer gets free'd in
737 	 *	  brelse().
738 	 */
739 
740 errout:
741 	errbp = geteblk();
742 	sema_p(&errbp->b_sem);
743 	errbp->b_flags &= ~B_BUSY;
744 	errbp->b_flags |= (B_ERROR | B_DONE);
745 	return (errbp);
746 }
747 
748 /*
749  * Get an empty block, not assigned to any particular device.
750  * Returns a locked buffer that is not on any hash or free list.
751  */
752 struct buf *
753 ngeteblk(long bsize)
754 {
755 	struct buf *bp;
756 
757 	bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
758 	bioinit(bp);
759 	bp->av_forw = bp->av_back = NULL;
760 	bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
761 	bp->b_bufsize = bsize;
762 	bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
763 	bp->b_dev = (o_dev_t)NODEV;
764 	bp->b_edev = NODEV;
765 	bp->b_lblkno = 0;
766 	bp->b_bcount = bsize;
767 	bp->b_iodone = NULL;
768 	return (bp);
769 }
770 
/*
 * geteblk - get an empty 1 KB block.
 *
 * The interface is kept as it is to maintain driver compatibility.  Use
 * ngeteblk() to allocate a block of any other size.
 */
struct buf *
geteblk(void)
{
	struct buf *bufp;

	bufp = ngeteblk((long)1024);
	return (bufp);
}
780 
781 /*
782  * Return a buffer w/o sleeping
783  */
784 struct buf *
785 trygetblk(dev_t dev, daddr_t blkno)
786 {
787 	struct buf	*bp;
788 	struct buf	*dp;
789 	struct hbuf	*hp;
790 	kmutex_t	*hmp;
791 	uint_t		index;
792 
793 	index = bio_bhash(dev, blkno);
794 	hp = &hbuf[index];
795 	hmp = &hp->b_lock;
796 
797 	if (!mutex_tryenter(hmp))
798 		return (NULL);
799 
800 	dp = (struct buf *)hp;
801 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
802 		if (bp->b_blkno != blkno || bp->b_edev != dev ||
803 		    (bp->b_flags & B_STALE))
804 			continue;
805 		/*
806 		 * Get access to a valid buffer without sleeping
807 		 */
808 		if (sema_tryp(&bp->b_sem)) {
809 			if (bp->b_flags & B_DONE) {
810 				hp->b_length--;
811 				notavail(bp);
812 				mutex_exit(hmp);
813 				return (bp);
814 			} else {
815 				sema_v(&bp->b_sem);
816 				break;
817 			}
818 		}
819 		break;
820 	}
821 	mutex_exit(hmp);
822 	return (NULL);
823 }
824 
825 /*
826  * Wait for I/O completion on the buffer; return errors
827  * to the user.
828  */
829 int
830 iowait(struct buf *bp)
831 {
832 	ASSERT(SEMA_HELD(&bp->b_sem));
833 	return (biowait(bp));
834 }
835 
836 /*
837  * Mark I/O complete on a buffer, release it if I/O is asynchronous,
838  * and wake up anyone waiting for it.
839  */
840 void
841 iodone(struct buf *bp)
842 {
843 	ASSERT(SEMA_HELD(&bp->b_sem));
844 	(void) biodone(bp);
845 }
846 
847 /*
848  * Zero the core associated with a buffer.
849  */
850 void
851 clrbuf(struct buf *bp)
852 {
853 	ASSERT(SEMA_HELD(&bp->b_sem));
854 	bzero(bp->b_un.b_addr, bp->b_bcount);
855 	bp->b_resid = 0;
856 }
857 
858 
859 /*
860  * Make sure all write-behind blocks on dev (or NODEV for all)
861  * are flushed out.
862  */
863 void
864 bflush(dev_t dev)
865 {
866 	struct buf *bp, *dp;
867 	struct hbuf *hp;
868 	struct buf *delwri_list = EMPTY_LIST;
869 	int i, index;
870 	kmutex_t *hmp;
871 
872 	mutex_enter(&blist_lock);
873 	/*
874 	 * Wait for any invalidates or flushes ahead of us to finish.
875 	 * We really could split blist_lock up per device for better
876 	 * parallelism here.
877 	 */
878 	while (bio_doinginval || bio_doingflush) {
879 		bio_flinv_cv_wanted = 1;
880 		cv_wait(&bio_flushinval_cv, &blist_lock);
881 	}
882 	bio_doingflush++;
883 	/*
884 	 * Gather all B_DELWRI buffer for device.
885 	 * Lock ordering is b_sem > hash lock (brelse).
886 	 * Since we are finding the buffer via the delayed write list,
887 	 * it may be busy and we would block trying to get the
888 	 * b_sem lock while holding hash lock. So transfer all the
889 	 * candidates on the delwri_list and then drop the hash locks.
890 	 */
891 	for (i = 0; i < v.v_hbuf; i++) {
892 		vfs_syncprogress();
893 		hmp = &hbuf[i].b_lock;
894 		dp = (struct buf *)&dwbuf[i];
895 		mutex_enter(hmp);
896 		for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
897 			if (dev == NODEV || bp->b_edev == dev) {
898 				if (bp->b_list == NULL) {
899 					bp->b_list = delwri_list;
900 					delwri_list = bp;
901 				}
902 			}
903 		}
904 		mutex_exit(hmp);
905 	}
906 	mutex_exit(&blist_lock);
907 
908 	/*
909 	 * Now that the hash locks have been dropped grab the semaphores
910 	 * and write back all the buffers that have B_DELWRI set.
911 	 */
912 	while (delwri_list != EMPTY_LIST) {
913 		vfs_syncprogress();
914 		bp = delwri_list;
915 
916 		sema_p(&bp->b_sem);	/* may block */
917 		if ((dev != bp->b_edev && dev != NODEV) ||
918 		    (panicstr && bp->b_flags & B_BUSY)) {
919 			sema_v(&bp->b_sem);
920 			delwri_list = bp->b_list;
921 			bp->b_list = NULL;
922 			continue;	/* No longer a candidate */
923 		}
924 		if (bp->b_flags & B_DELWRI) {
925 			index = bio_bhash(bp->b_edev, bp->b_blkno);
926 			hp = &hbuf[index];
927 			hmp = &hp->b_lock;
928 			dp = (struct buf *)hp;
929 
930 			bp->b_flags |= B_ASYNC;
931 			mutex_enter(hmp);
932 			hp->b_length--;
933 			notavail(bp);
934 			mutex_exit(hmp);
935 			if (bp->b_vp == NULL) {		/* !ufs */
936 				BWRITE(bp);
937 			} else {			/* ufs */
938 				UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
939 			}
940 		} else {
941 			sema_v(&bp->b_sem);
942 		}
943 		delwri_list = bp->b_list;
944 		bp->b_list = NULL;
945 	}
946 	mutex_enter(&blist_lock);
947 	bio_doingflush--;
948 	if (bio_flinv_cv_wanted) {
949 		bio_flinv_cv_wanted = 0;
950 		cv_broadcast(&bio_flushinval_cv);
951 	}
952 	mutex_exit(&blist_lock);
953 }
954 
955 /*
956  * Ensure that a specified block is up-to-date on disk.
957  */
958 void
959 blkflush(dev_t dev, daddr_t blkno)
960 {
961 	struct buf *bp, *dp;
962 	struct hbuf *hp;
963 	struct buf *sbp = NULL;
964 	uint_t index;
965 	kmutex_t *hmp;
966 
967 	index = bio_bhash(dev, blkno);
968 	hp    = &hbuf[index];
969 	dp    = (struct buf *)hp;
970 	hmp   = &hp->b_lock;
971 
972 	/*
973 	 * Identify the buffer in the cache belonging to
974 	 * this device and blkno (if any).
975 	 */
976 	mutex_enter(hmp);
977 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
978 		if (bp->b_blkno != blkno || bp->b_edev != dev ||
979 		    (bp->b_flags & B_STALE))
980 			continue;
981 		sbp = bp;
982 		break;
983 	}
984 	mutex_exit(hmp);
985 	if (sbp == NULL)
986 		return;
987 	/*
988 	 * Now check the buffer we have identified and
989 	 * make sure it still belongs to the device and is B_DELWRI
990 	 */
991 	sema_p(&sbp->b_sem);
992 	if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
993 	    (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
994 		mutex_enter(hmp);
995 		hp->b_length--;
996 		notavail(sbp);
997 		mutex_exit(hmp);
998 		/*
999 		 * XXX - There is nothing to guarantee a synchronous
1000 		 * write here if the B_ASYNC flag is set.  This needs
1001 		 * some investigation.
1002 		 */
1003 		if (sbp->b_vp == NULL) {		/* !ufs */
1004 			BWRITE(sbp);	/* synchronous write */
1005 		} else {				/* ufs */
1006 			UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
1007 		}
1008 	} else {
1009 		sema_v(&sbp->b_sem);
1010 	}
1011 }
1012 
1013 /*
1014  * Same as binval, except can force-invalidate delayed-write buffers
1015  * (which are not be already flushed because of device errors).  Also
1016  * makes sure that the retry write flag is cleared.
1017  */
1018 int
1019 bfinval(dev_t dev, int force)
1020 {
1021 	struct buf *dp;
1022 	struct buf *bp;
1023 	struct buf *binval_list = EMPTY_LIST;
1024 	int i, error = 0;
1025 	kmutex_t *hmp;
1026 	uint_t index;
1027 	struct buf **backp;
1028 
1029 	mutex_enter(&blist_lock);
1030 	/*
1031 	 * Wait for any flushes ahead of us to finish, it's ok to
1032 	 * do invalidates in parallel.
1033 	 */
1034 	while (bio_doingflush) {
1035 		bio_flinv_cv_wanted = 1;
1036 		cv_wait(&bio_flushinval_cv, &blist_lock);
1037 	}
1038 	bio_doinginval++;
1039 
1040 	/* Gather bp's */
1041 	for (i = 0; i < v.v_hbuf; i++) {
1042 		dp = (struct buf *)&hbuf[i];
1043 		hmp = &hbuf[i].b_lock;
1044 
1045 		mutex_enter(hmp);
1046 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1047 			if (bp->b_edev == dev) {
1048 				if (bp->b_list == NULL) {
1049 					bp->b_list = binval_list;
1050 					binval_list = bp;
1051 				}
1052 			}
1053 		}
1054 		mutex_exit(hmp);
1055 	}
1056 	mutex_exit(&blist_lock);
1057 
1058 	/* Invalidate all bp's found */
1059 	while (binval_list != EMPTY_LIST) {
1060 		bp = binval_list;
1061 
1062 		sema_p(&bp->b_sem);
1063 		if (bp->b_edev == dev) {
1064 			if (force && (bp->b_flags & B_DELWRI)) {
1065 				/* clear B_DELWRI, move to non-dw freelist */
1066 				index = bio_bhash(bp->b_edev, bp->b_blkno);
1067 				hmp = &hbuf[index].b_lock;
1068 				dp = (struct buf *)&hbuf[index];
1069 				mutex_enter(hmp);
1070 
1071 				/* remove from delayed write freelist */
1072 				notavail(bp);
1073 
1074 				/* add to B_AGE side of non-dw freelist */
1075 				backp = &dp->av_forw;
1076 				(*backp)->av_back = bp;
1077 				bp->av_forw = *backp;
1078 				*backp = bp;
1079 				bp->av_back = dp;
1080 
1081 				/*
1082 				 * make sure write retries and busy are cleared
1083 				 */
1084 				bp->b_flags &=
1085 				    ~(B_BUSY | B_DELWRI | B_RETRYWRI);
1086 				mutex_exit(hmp);
1087 			}
1088 			if ((bp->b_flags & B_DELWRI) == 0)
1089 				bp->b_flags |= B_STALE|B_AGE;
1090 			else
1091 				error = EIO;
1092 		}
1093 		sema_v(&bp->b_sem);
1094 		binval_list = bp->b_list;
1095 		bp->b_list = NULL;
1096 	}
1097 	mutex_enter(&blist_lock);
1098 	bio_doinginval--;
1099 	if (bio_flinv_cv_wanted) {
1100 		cv_broadcast(&bio_flushinval_cv);
1101 		bio_flinv_cv_wanted = 0;
1102 	}
1103 	mutex_exit(&blist_lock);
1104 	return (error);
1105 }
1106 
1107 /*
1108  * If possible, invalidate blocks for a dev on demand
1109  */
1110 void
1111 binval(dev_t dev)
1112 {
1113 	(void) bfinval(dev, 0);
1114 }
1115 
1116 /*
1117  * Initialize the buffer I/O system by freeing
1118  * all buffers and setting all device hash buffer lists to empty.
1119  */
1120 void
1121 binit(void)
1122 {
1123 	struct buf *bp;
1124 	unsigned int i, pct;
1125 	ulong_t	bio_max_hwm, bio_default_hwm;
1126 
1127 	/*
1128 	 * Maximum/Default values for bufhwm are set to the smallest of:
1129 	 *	- BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
1130 	 *	- 1/4 of kernel virtual memory
1131 	 *	- INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
1132 	 * Additionally, in order to allow simple tuning by percentage of
1133 	 * physical memory, bufhwm_pct is used to calculate the default if
1134 	 * the value of this tunable is between 0 and BIO_MAX_PERCENT.
1135 	 *
1136 	 * Since the unit for v.v_bufhwm is kilobytes, this allows for
1137 	 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
1138 	 */
1139 	bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
1140 	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1141 	bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);
1142 
1143 	pct = BIO_BUF_PERCENT;
1144 	if (bufhwm_pct != 0 &&
1145 	    ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
1146 		pct = BIO_BUF_PERCENT;
1147 		/*
1148 		 * Invalid user specified value, emit a warning.
1149 		 */
1150 		cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of \
1151 			range(1..%d). Using %d as default.",
1152 			bufhwm_pct,
1153 			100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
1154 	}
1155 
1156 	bio_default_hwm = MIN(physmem / pct,
1157 	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1158 	bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);
1159 
1160 	if ((v.v_bufhwm = bufhwm) == 0)
1161 		v.v_bufhwm = bio_default_hwm;
1162 
1163 	if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
1164 		v.v_bufhwm = (int)bio_max_hwm;
1165 		/*
1166 		 * Invalid user specified value, emit a warning.
1167 		 */
1168 		cmn_err(CE_WARN,
1169 			"binit: bufhwm(%d) out \
1170 			of range(%d..%lu). Using %lu as default",
1171 			bufhwm,
1172 			BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
1173 	}
1174 
1175 	/*
1176 	 * Determine the number of hash buckets. Default is to
1177 	 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
1178 	 * Round up number to the next power of 2.
1179 	 */
1180 	v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
1181 	    BIO_HASHLEN);
1182 	v.v_hmask = v.v_hbuf - 1;
1183 	v.v_buf = BIO_BHDR_POOL;
1184 
1185 	hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);
1186 
1187 	dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);
1188 
1189 	bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;
1190 	bp = &bfreelist;
1191 	bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;
1192 
1193 	for (i = 0; i < v.v_hbuf; i++) {
1194 		hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
1195 		hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];
1196 
1197 		/*
1198 		 * Initialize the delayed write buffer list.
1199 		 */
1200 		dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
1201 		dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
1202 	}
1203 }
1204 
1205 /*
1206  * Wait for I/O completion on the buffer; return error code.
1207  * If bp was for synchronous I/O, bp is invalid and associated
1208  * resources are freed on return.
1209  */
1210 int
1211 biowait(struct buf *bp)
1212 {
1213 	int error = 0;
1214 	struct cpu *cpup;
1215 
1216 	ASSERT(SEMA_HELD(&bp->b_sem));
1217 
1218 	cpup = CPU;
1219 	atomic_add_64(&cpup->cpu_stats.sys.iowait, 1);
1220 	DTRACE_IO1(wait__start, struct buf *, bp);
1221 
1222 	/*
1223 	 * In case of panic, busy wait for completion
1224 	 */
1225 	if (panicstr) {
1226 		while ((bp->b_flags & B_DONE) == 0)
1227 			drv_usecwait(10);
1228 	} else
1229 		sema_p(&bp->b_io);
1230 
1231 	DTRACE_IO1(wait__done, struct buf *, bp);
1232 	atomic_add_64(&cpup->cpu_stats.sys.iowait, -1);
1233 
1234 	error = geterror(bp);
1235 	if ((bp->b_flags & B_ASYNC) == 0) {
1236 		if (bp->b_flags & B_REMAPPED)
1237 			bp_mapout(bp);
1238 	}
1239 	return (error);
1240 }
1241 
1242 static void
1243 biodone_tnf_probe(struct buf *bp)
1244 {
1245 	/* Kernel probe */
1246 	TNF_PROBE_3(biodone, "io blockio", /* CSTYLED */,
1247 		tnf_device,	device,		bp->b_edev,
1248 		tnf_diskaddr,	block,		bp->b_lblkno,
1249 		tnf_opaque,	buf,		bp);
1250 }
1251 
1252 /*
1253  * Mark I/O complete on a buffer, release it if I/O is asynchronous,
1254  * and wake up anyone waiting for it.
1255  */
1256 void
1257 biodone(struct buf *bp)
1258 {
1259 	if (bp->b_flags & B_STARTED) {
1260 		DTRACE_IO1(done, struct buf *, bp);
1261 		bp->b_flags &= ~B_STARTED;
1262 	}
1263 
1264 	/*
1265 	 * Call the TNF probe here instead of the inline code
1266 	 * to force our compiler to use the tail call optimization.
1267 	 */
1268 	biodone_tnf_probe(bp);
1269 
1270 	if (bp->b_iodone != NULL) {
1271 		(*(bp->b_iodone))(bp);
1272 		return;
1273 	}
1274 	ASSERT((bp->b_flags & B_DONE) == 0);
1275 	ASSERT(SEMA_HELD(&bp->b_sem));
1276 	bp->b_flags |= B_DONE;
1277 	if (bp->b_flags & B_ASYNC) {
1278 		if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
1279 			bio_pageio_done(bp);
1280 		else
1281 			brelse(bp);	/* release bp to freelist */
1282 	} else {
1283 		sema_v(&bp->b_io);
1284 	}
1285 }
1286 
1287 /*
1288  * Pick up the device's error number and pass it to the user;
1289  * if there is an error but the number is 0 set a generalized code.
1290  */
1291 int
1292 geterror(struct buf *bp)
1293 {
1294 	int error = 0;
1295 
1296 	ASSERT(SEMA_HELD(&bp->b_sem));
1297 	if (bp->b_flags & B_ERROR) {
1298 		error = bp->b_error;
1299 		if (!error)
1300 			error = EIO;
1301 	}
1302 	return (error);
1303 }
1304 
1305 /*
1306  * Support for pageio buffers.
1307  *
1308  * This stuff should be generalized to provide a generalized bp
1309  * header facility that can be used for things other than pageio.
1310  */
1311 
1312 /*
1313  * Allocate and initialize a buf struct for use with pageio.
1314  */
1315 struct buf *
1316 pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
1317 {
1318 	struct buf *bp;
1319 	struct cpu *cpup;
1320 
1321 	if (flags & B_READ) {
1322 		CPU_STATS_ENTER_K();
1323 		cpup = CPU;	/* get pointer AFTER preemption is disabled */
1324 		CPU_STATS_ADDQ(cpup, vm, pgin, 1);
1325 		CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));
1326 		if ((flags & B_ASYNC) == 0) {
1327 			klwp_t *lwp = ttolwp(curthread);
1328 			if (lwp != NULL)
1329 				lwp->lwp_ru.majflt++;
1330 			CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
1331 			/* Kernel probe */
1332 			TNF_PROBE_2(major_fault, "vm pagefault", /* CSTYLED */,
1333 				tnf_opaque,	vnode,		pp->p_vnode,
1334 				tnf_offset,	offset,		pp->p_offset);
1335 		}
1336 		/*
1337 		 * Update statistics for pages being paged in
1338 		 */
1339 		if (pp != NULL && pp->p_vnode != NULL) {
1340 			if (IS_SWAPFSVP(pp->p_vnode)) {
1341 				CPU_STATS_ADDQ(cpup, vm, anonpgin,
1342 						btopr(len));
1343 			} else {
1344 				if (pp->p_vnode->v_flag & VVMEXEC) {
1345 					CPU_STATS_ADDQ(cpup, vm, execpgin,
1346 							btopr(len));
1347 				} else {
1348 					CPU_STATS_ADDQ(cpup, vm, fspgin,
1349 							btopr(len));
1350 				}
1351 			}
1352 		}
1353 		CPU_STATS_EXIT_K();
1354 		TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
1355 		    "page_ws_in:pp %p", pp);
1356 		/* Kernel probe */
1357 		TNF_PROBE_3(pagein, "vm pageio io", /* CSTYLED */,
1358 			tnf_opaque,	vnode,		pp->p_vnode,
1359 			tnf_offset,	offset,		pp->p_offset,
1360 			tnf_size,	size,		len);
1361 	}
1362 
1363 	bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
1364 	bp->b_bcount = len;
1365 	bp->b_bufsize = len;
1366 	bp->b_pages = pp;
1367 	bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
1368 	bp->b_offset = -1;
1369 	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
1370 
1371 	/* Initialize bp->b_sem in "locked" state */
1372 	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
1373 
1374 	VN_HOLD(vp);
1375 	bp->b_vp = vp;
1376 	THREAD_KPRI_RELEASE_N(btopr(len)); /* release kpri from page_locks */
1377 
1378 	/*
1379 	 * Caller sets dev & blkno and can adjust
1380 	 * b_addr for page offset and can use bp_mapin
1381 	 * to make pages kernel addressable.
1382 	 */
1383 	return (bp);
1384 }
1385 
1386 void
1387 pageio_done(struct buf *bp)
1388 {
1389 	ASSERT(SEMA_HELD(&bp->b_sem));
1390 	if (bp->b_flags & B_REMAPPED)
1391 		bp_mapout(bp);
1392 	VN_RELE(bp->b_vp);
1393 	bp->b_vp = NULL;
1394 	ASSERT((bp->b_flags & B_NOCACHE) != 0);
1395 
1396 	/* A sema_v(bp->b_sem) is implied if we are destroying it */
1397 	sema_destroy(&bp->b_sem);
1398 	sema_destroy(&bp->b_io);
1399 	kmem_free(bp, sizeof (struct buf));
1400 }
1401 
1402 /*
1403  * Check to see whether the buffers, except the one pointed by sbp,
1404  * associated with the device are busy.
1405  * NOTE: This expensive operation shall be improved together with ufs_icheck().
1406  */
1407 int
1408 bcheck(dev_t dev, struct buf *sbp)
1409 {
1410 	struct buf	*bp;
1411 	struct buf	*dp;
1412 	int i;
1413 	kmutex_t *hmp;
1414 
1415 	/*
1416 	 * check for busy bufs for this filesystem
1417 	 */
1418 	for (i = 0; i < v.v_hbuf; i++) {
1419 		dp = (struct buf *)&hbuf[i];
1420 		hmp = &hbuf[i].b_lock;
1421 
1422 		mutex_enter(hmp);
1423 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1424 			/*
1425 			 * if buf is busy or dirty, then filesystem is busy
1426 			 */
1427 			if ((bp->b_edev == dev) &&
1428 			    ((bp->b_flags & B_STALE) == 0) &&
1429 			    (bp->b_flags & (B_DELWRI|B_BUSY)) &&
1430 			    (bp != sbp)) {
1431 				mutex_exit(hmp);
1432 				return (1);
1433 			}
1434 		}
1435 		mutex_exit(hmp);
1436 	}
1437 	return (0);
1438 }
1439 
/*
 * Hash two 32 bit entities.
 *
 * Each of x, x >> 8, x >> 16, x >> 24 and then y, y >> 8, y >> 16,
 * y >> 24 is folded into the running value as
 * hash = hash * 7 + term - 1, starting from x - 1.
 *
 * The accumulation is done in unsigned arithmetic: the multiply-by-7
 * chain exceeds the range of int for ordinary inputs, and signed
 * overflow is undefined in C, whereas unsigned arithmetic wraps.  The
 * shifts are still performed on the signed arguments so each term has
 * the same value as before.
 */
int
hash2ints(int x, int y)
{
	unsigned int hash;

	hash = (unsigned int)x - 1;
	hash = ((hash * 7) + (unsigned int)(x >> 8)) - 1;
	hash = ((hash * 7) + (unsigned int)(x >> 16)) - 1;
	hash = ((hash * 7) + (unsigned int)(x >> 24)) - 1;
	hash = ((hash * 7) + (unsigned int)y) - 1;
	hash = ((hash * 7) + (unsigned int)(y >> 8)) - 1;
	hash = ((hash * 7) + (unsigned int)(y >> 16)) - 1;
	hash = ((hash * 7) + (unsigned int)(y >> 24)) - 1;

	return ((int)hash);
}
1459 
1460 
1461 /*
1462  * Return a new buffer struct.
1463  *	Create a new buffer if we haven't gone over our high water
1464  *	mark for memory, otherwise try to get one off the freelist.
1465  *
1466  * Returns a locked buf that has no id and is not on any hash or free
1467  * list.
1468  */
1469 static struct buf *
1470 bio_getfreeblk(long bsize)
1471 {
1472 	struct buf *bp, *dp;
1473 	struct hbuf *hp;
1474 	kmutex_t	*hmp;
1475 	uint_t		start, end;
1476 
1477 	/*
1478 	 * mutex_enter(&bfree_lock);
1479 	 * bfreelist.b_bufsize represents the amount of memory
1480 	 * mutex_exit(&bfree_lock); protect ref to bfreelist
1481 	 * we are allowed to allocate in the cache before we hit our hwm.
1482 	 */
1483 	bio_mem_get(bsize);	/* Account for our memory request */
1484 
1485 again:
1486 	bp = bio_bhdr_alloc();	/* Get a buf hdr */
1487 	sema_p(&bp->b_sem);	/* Should never fail */
1488 
1489 	ASSERT(bp->b_un.b_addr == NULL);
1490 	bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
1491 	if (bp->b_un.b_addr != NULL) {
1492 		/*
1493 		 * Make the common path short
1494 		 */
1495 		bp->b_bufsize = bsize;
1496 		ASSERT(SEMA_HELD(&bp->b_sem));
1497 		return (bp);
1498 	} else {
1499 		struct buf *save;
1500 
1501 		save = bp;	/* Save bp we allocated */
1502 		start = end = lastindex;
1503 
1504 		biostats.bio_bufwant.value.ui32++;
1505 
1506 		/*
1507 		 * Memory isn't available from the system now. Scan
1508 		 * the hash buckets till enough space is found.
1509 		 */
1510 		do {
1511 			hp = &hbuf[start];
1512 			hmp = &hp->b_lock;
1513 			dp = (struct buf *)hp;
1514 
1515 			mutex_enter(hmp);
1516 			bp = dp->av_forw;
1517 
1518 			while (bp != dp) {
1519 
1520 				ASSERT(bp != NULL);
1521 
1522 				if (!sema_tryp(&bp->b_sem)) {
1523 					bp = bp->av_forw;
1524 					continue;
1525 				}
1526 
1527 				/*
1528 				 * Since we are going down the freelist
1529 				 * associated with this hash bucket the
1530 				 * B_DELWRI flag should not be set.
1531 				 */
1532 				ASSERT(!(bp->b_flags & B_DELWRI));
1533 
1534 				if (bp->b_bufsize == bsize) {
1535 					hp->b_length--;
1536 					notavail(bp);
1537 					bremhash(bp);
1538 					mutex_exit(hmp);
1539 
1540 					/*
1541 					 * Didn't kmem_alloc any more, so don't
1542 					 * count it twice.
1543 					 */
1544 					mutex_enter(&bfree_lock);
1545 					bfreelist.b_bufsize += bsize;
1546 					mutex_exit(&bfree_lock);
1547 
1548 					/*
1549 					 * Update the lastindex value.
1550 					 */
1551 					lastindex = start;
1552 
1553 					/*
1554 					 * Put our saved bp back on the list
1555 					 */
1556 					sema_v(&save->b_sem);
1557 					bio_bhdr_free(save);
1558 					ASSERT(SEMA_HELD(&bp->b_sem));
1559 					return (bp);
1560 				}
1561 				sema_v(&bp->b_sem);
1562 				bp = bp->av_forw;
1563 			}
1564 			mutex_exit(hmp);
1565 			start = ((start + 1) % v.v_hbuf);
1566 		} while (start != end);
1567 
1568 		biostats.bio_bufwait.value.ui32++;
1569 		bp = save;		/* Use original bp */
1570 		bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
1571 	}
1572 
1573 	bp->b_bufsize = bsize;
1574 	ASSERT(SEMA_HELD(&bp->b_sem));
1575 	return (bp);
1576 }
1577 
1578 /*
1579  * Allocate a buffer header. If none currently available, allocate
1580  * a new pool.
1581  */
1582 static struct buf *
1583 bio_bhdr_alloc(void)
1584 {
1585 	struct buf *dp, *sdp;
1586 	struct buf *bp;
1587 	int i;
1588 
1589 	for (;;) {
1590 		mutex_enter(&bhdr_lock);
1591 		if (bhdrlist != NULL) {
1592 			bp = bhdrlist;
1593 			bhdrlist = bp->av_forw;
1594 			mutex_exit(&bhdr_lock);
1595 			bp->av_forw = NULL;
1596 			return (bp);
1597 		}
1598 		mutex_exit(&bhdr_lock);
1599 
1600 		/*
1601 		 * Need to allocate a new pool. If the system is currently
1602 		 * out of memory, then try freeing things on the freelist.
1603 		 */
1604 		dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
1605 		if (dp == NULL) {
1606 			/*
1607 			 * System can't give us a pool of headers, try
1608 			 * recycling from the free lists.
1609 			 */
1610 			bio_recycle(BIO_HEADER, 0);
1611 		} else {
1612 			sdp = dp;
1613 			for (i = 0; i < v.v_buf; i++, dp++) {
1614 				/*
1615 				 * The next two lines are needed since NODEV
1616 				 * is -1 and not NULL
1617 				 */
1618 				dp->b_dev = (o_dev_t)NODEV;
1619 				dp->b_edev = NODEV;
1620 				dp->av_forw = dp + 1;
1621 				sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
1622 				    NULL);
1623 				sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
1624 				    NULL);
1625 				dp->b_offset = -1;
1626 			}
1627 			mutex_enter(&bhdr_lock);
1628 			(--dp)->av_forw = bhdrlist;	/* Fix last pointer */
1629 			bhdrlist = sdp;
1630 			nbuf += v.v_buf;
1631 			bp = bhdrlist;
1632 			bhdrlist = bp->av_forw;
1633 			mutex_exit(&bhdr_lock);
1634 
1635 			bp->av_forw = NULL;
1636 			return (bp);
1637 		}
1638 	}
1639 }
1640 
1641 static  void
1642 bio_bhdr_free(struct buf *bp)
1643 {
1644 	ASSERT(bp->b_back == NULL);
1645 	ASSERT(bp->b_forw == NULL);
1646 	ASSERT(bp->av_back == NULL);
1647 	ASSERT(bp->av_forw == NULL);
1648 	ASSERT(bp->b_un.b_addr == NULL);
1649 	ASSERT(bp->b_dev == (o_dev_t)NODEV);
1650 	ASSERT(bp->b_edev == NODEV);
1651 	ASSERT(bp->b_flags == 0);
1652 
1653 	mutex_enter(&bhdr_lock);
1654 	bp->av_forw = bhdrlist;
1655 	bhdrlist = bp;
1656 	mutex_exit(&bhdr_lock);
1657 }
1658 
1659 /*
1660  * If we haven't gone over the high water mark, it's o.k. to
1661  * allocate more buffer space, otherwise recycle buffers
1662  * from the freelist until enough memory is free for a bsize request.
1663  *
1664  * We account for this memory, even though
1665  * we don't allocate it here.
1666  */
1667 static void
1668 bio_mem_get(long bsize)
1669 {
1670 	mutex_enter(&bfree_lock);
1671 	if (bfreelist.b_bufsize > bsize) {
1672 		bfreelist.b_bufsize -= bsize;
1673 		mutex_exit(&bfree_lock);
1674 		return;
1675 	}
1676 	mutex_exit(&bfree_lock);
1677 	bio_recycle(BIO_MEM, bsize);
1678 }
1679 
1680 /*
1681  * flush a list of delayed write buffers.
1682  * (currently used only by bio_recycle below.)
1683  */
1684 static void
1685 bio_flushlist(struct buf *delwri_list)
1686 {
1687 	struct buf *bp;
1688 
1689 	while (delwri_list != EMPTY_LIST) {
1690 		bp = delwri_list;
1691 		bp->b_flags |= B_AGE | B_ASYNC;
1692 		if (bp->b_vp == NULL) {		/* !ufs */
1693 			BWRITE(bp);
1694 		} else {			/* ufs */
1695 			UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
1696 		}
1697 		delwri_list = bp->b_list;
1698 		bp->b_list = NULL;
1699 	}
1700 }
1701 
/*
 * Start recycling buffers on the freelist for one of 2 reasons:
 *	- we need a buffer header
 *	- we need to free up memory
 * Once started we continue to recycle buffers until the B_AGE
 * buffers are gone.
 *
 *	want	- BIO_HEADER or BIO_MEM
 *	bsize	- number of bytes wanted; only looked at for BIO_MEM
 *
 * Every hash bucket is visited, starting at lastindex.  For each one
 * the (non delayed write) free list is drained first: each buffer that
 * can be locked is removed from the cache, its memory is freed and its
 * header goes back to the header list.  Then the bucket's delayed
 * write list is gathered into a private list and written out with
 * bio_flushlist().
 *
 * 'found' is set once the request is satisfied: for BIO_HEADER after
 * the first header has been freed, for BIO_MEM once bfreelist.b_bufsize
 * covers bsize (which is then subtracted).  From that point the
 * function returns at the first lockable buffer that is not B_AGE.
 *
 * If a complete pass does not satisfy the request, the function waits
 * on bio_mem_cv with a timeout and starts over.
 */
static void
bio_recycle(int want, long bsize)
{
	struct buf *bp, *dp, *dwp, *nbp;
	struct hbuf *hp;
	int	found = 0;
	kmutex_t	*hmp;
	int		start, end;
	struct buf *delwri_list = EMPTY_LIST;

	/*
	 * Recycle buffers.
	 */
top:
	start = end = lastindex;
	do {
		hp = &hbuf[start];
		hmp = &hp->b_lock;
		dp = (struct buf *)hp;

		mutex_enter(hmp);
		bp = dp->av_forw;

		while (bp != dp) {

			ASSERT(bp != NULL);

			/* Skip buffers that are locked by somebody else. */
			if (!sema_tryp(&bp->b_sem)) {
				bp = bp->av_forw;
				continue;
			}
			/*
			 * Do we really want to nuke all of the B_AGE stuff??
			 */
			if ((bp->b_flags & B_AGE) == 0 && found) {
				sema_v(&bp->b_sem);
				mutex_exit(hmp);
				lastindex = start;
				return;	/* All done */
			}

			ASSERT(MUTEX_HELD(&hp->b_lock));
			ASSERT(!(bp->b_flags & B_DELWRI));
			hp->b_length--;
			notavail(bp);

			/*
			 * Remove bhdr from cache, free up memory,
			 * and add the hdr to the freelist.
			 */
			bremhash(bp);
			mutex_exit(hmp);

			if (bp->b_bufsize) {
				kmem_free(bp->b_un.b_addr, bp->b_bufsize);
				bp->b_un.b_addr = NULL;
				/* Credit the freed bytes back to the pool. */
				mutex_enter(&bfree_lock);
				bfreelist.b_bufsize += bp->b_bufsize;
				mutex_exit(&bfree_lock);
			}

			/* Return the header to the idle state. */
			bp->b_dev = (o_dev_t)NODEV;
			bp->b_edev = NODEV;
			bp->b_flags = 0;
			sema_v(&bp->b_sem);
			bio_bhdr_free(bp);
			if (want == BIO_HEADER) {
				found = 1;
			} else {
				ASSERT(want == BIO_MEM);
				/*
				 * The first test is made without the lock
				 * and is repeated once bfree_lock is held.
				 */
				if (!found && bfreelist.b_bufsize >= bsize) {
					/* Account for the memory we want */
					mutex_enter(&bfree_lock);
					if (bfreelist.b_bufsize >= bsize) {
						bfreelist.b_bufsize -= bsize;
						found = 1;
					}
					mutex_exit(&bfree_lock);
				}
			}

			/*
			 * Since we dropped hmp start from the
			 * beginning.
			 */
			mutex_enter(hmp);
			bp = dp->av_forw;
		}
		mutex_exit(hmp);

		/*
		 * Look at the delayed write list.
		 * First gather into a private list, then write them.
		 *
		 * bio_doingflush is raised under blist_lock for the
		 * duration; it is lowered again, and any waiter on
		 * bio_flushinval_cv woken, after the list has been written.
		 */
		dwp = (struct buf *)&dwbuf[start];
		mutex_enter(&blist_lock);
		bio_doingflush++;
		mutex_enter(hmp);
		for (bp = dwp->av_forw; bp != dwp; bp = nbp) {

			ASSERT(bp != NULL);
			/* Fetch the successor now; bp may leave the list. */
			nbp = bp->av_forw;

			if (!sema_tryp(&bp->b_sem))
				continue;
			ASSERT(bp->b_flags & B_DELWRI);
			/*
			 * Do we really want to nuke all of the B_AGE stuff??
			 */

			if ((bp->b_flags & B_AGE) == 0 && found) {
				sema_v(&bp->b_sem);
				mutex_exit(hmp);
				lastindex = start;
				mutex_exit(&blist_lock);
				/* Write what was gathered before leaving. */
				bio_flushlist(delwri_list);
				mutex_enter(&blist_lock);
				bio_doingflush--;
				if (bio_flinv_cv_wanted) {
					bio_flinv_cv_wanted = 0;
					cv_broadcast(&bio_flushinval_cv);
				}
				mutex_exit(&blist_lock);
				return; /* All done */
			}

			/*
			 * If the buffer is already on a flush or
			 * invalidate list then just skip it.
			 */
			if (bp->b_list != NULL) {
				sema_v(&bp->b_sem);
				continue;
			}
			/*
			 * We are still on the same bucket.
			 */
			hp->b_length--;
			notavail(bp);
			/* Push onto the private list; b_sem stays held. */
			bp->b_list = delwri_list;
			delwri_list = bp;
		}
		mutex_exit(hmp);
		mutex_exit(&blist_lock);
		bio_flushlist(delwri_list);
		delwri_list = EMPTY_LIST;
		mutex_enter(&blist_lock);
		bio_doingflush--;
		if (bio_flinv_cv_wanted) {
			bio_flinv_cv_wanted = 0;
			cv_broadcast(&bio_flushinval_cv);
		}
		mutex_exit(&blist_lock);
		start = (start + 1) % v.v_hbuf;

	} while (start != end);

	if (found)
		return;

	/*
	 * Free lists exhausted and we haven't satisfied the request.
	 * Wait here for more entries to be added to freelist.
	 * Because this might have just happened, make it timed.
	 */
	mutex_enter(&bfree_lock);
	bfreelist.b_flags |= B_WANTED;
	(void) cv_timedwait(&bio_mem_cv, &bfree_lock, lbolt+hz);
	mutex_exit(&bfree_lock);
	goto top;
}
1880 
1881 /*
1882  * See if the block is associated with some buffer
1883  * (mainly to avoid getting hung up on a wait in breada).
1884  */
1885 static int
1886 bio_incore(dev_t dev, daddr_t blkno)
1887 {
1888 	struct buf *bp;
1889 	struct buf *dp;
1890 	uint_t index;
1891 	kmutex_t *hmp;
1892 
1893 	index = bio_bhash(dev, blkno);
1894 	dp = (struct buf *)&hbuf[index];
1895 	hmp = &hbuf[index].b_lock;
1896 
1897 	mutex_enter(hmp);
1898 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1899 		if (bp->b_blkno == blkno && bp->b_edev == dev &&
1900 		    (bp->b_flags & B_STALE) == 0) {
1901 			mutex_exit(hmp);
1902 			return (1);
1903 		}
1904 	}
1905 	mutex_exit(hmp);
1906 	return (0);
1907 }
1908 
1909 static void
1910 bio_pageio_done(struct buf *bp)
1911 {
1912 	if (bp->b_flags & B_PAGEIO) {
1913 
1914 		if (bp->b_flags & B_REMAPPED)
1915 			bp_mapout(bp);
1916 
1917 		if (bp->b_flags & B_READ)
1918 			pvn_read_done(bp->b_pages, bp->b_flags);
1919 		else
1920 			pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags);
1921 		pageio_done(bp);
1922 	} else {
1923 		ASSERT(bp->b_flags & B_REMAPPED);
1924 		bp_mapout(bp);
1925 		brelse(bp);
1926 	}
1927 }
1928 
1929 /*
1930  * bioerror(9F) - indicate error in buffer header
1931  * If 'error' is zero, remove the error indication.
1932  */
1933 void
1934 bioerror(struct buf *bp, int error)
1935 {
1936 	ASSERT(bp != NULL);
1937 	ASSERT(error >= 0);
1938 	ASSERT(SEMA_HELD(&bp->b_sem));
1939 
1940 	if (error != 0) {
1941 		bp->b_flags |= B_ERROR;
1942 	} else {
1943 		bp->b_flags &= ~B_ERROR;
1944 	}
1945 	bp->b_error = error;
1946 }
1947 
/*
 * bioreset(9F) - reuse a private buffer header after I/O is complete
 *
 * The header is torn down with biofini() and then set up again with
 * bioinit().
 */
void
bioreset(struct buf *bp)
{
	ASSERT(bp != NULL);

	biofini(bp);
	bioinit(bp);
}
1959 
1960 /*
1961  * biosize(9F) - return size of a buffer header
1962  */
1963 size_t
1964 biosize(void)
1965 {
1966 	return (sizeof (struct buf));
1967 }
1968 
1969 /*
1970  * biomodified(9F) - check if buffer is modified
1971  */
1972 int
1973 biomodified(struct buf *bp)
1974 {
1975 	int npf;
1976 	int ppattr;
1977 	struct page *pp;
1978 
1979 	ASSERT(bp != NULL);
1980 
1981 	if ((bp->b_flags & B_PAGEIO) == 0) {
1982 		return (-1);
1983 	}
1984 	pp = bp->b_pages;
1985 	npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET));
1986 
1987 	while (npf > 0) {
1988 		ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
1989 				HAT_SYNC_STOPON_MOD);
1990 		if (ppattr & P_MOD)
1991 			return (1);
1992 		pp = pp->p_next;
1993 		npf--;
1994 	}
1995 
1996 	return (0);
1997 }
1998 
1999 /*
2000  * bioinit(9F) - initialize a buffer structure
2001  */
2002 void
2003 bioinit(struct buf *bp)
2004 {
2005 	bzero(bp, sizeof (struct buf));
2006 	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
2007 	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
2008 	bp->b_offset = -1;
2009 }
2010 
2011 /*
2012  * biofini(9F) - uninitialize a buffer structure
2013  */
2014 void
2015 biofini(struct buf *bp)
2016 {
2017 	sema_destroy(&bp->b_io);
2018 	sema_destroy(&bp->b_sem);
2019 }
2020 
2021 /*
2022  * bioclone(9F) - clone a buffer
2023  */
2024 struct buf *
2025 bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno,
2026     int (*iodone)(struct buf *), struct buf *bp_mem, int sleep)
2027 {
2028 	struct buf *bufp;
2029 
2030 	ASSERT(bp);
2031 	if (bp_mem == NULL) {
2032 		bufp = kmem_alloc(sizeof (struct buf), sleep);
2033 		if (bufp == NULL) {
2034 			return (NULL);
2035 		}
2036 		bioinit(bufp);
2037 	} else {
2038 		bufp = bp_mem;
2039 		bioreset(bufp);
2040 	}
2041 
2042 #define	BUF_CLONE_FLAGS	(B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
2043 	B_ABRWRITE)
2044 
2045 	/*
2046 	 * the cloned buffer does not inherit the B_REMAPPED flag. A separate
2047 	 * bp_mapin(9F) has to be done to get a kernel mapping.
2048 	 */
2049 	bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS)  | B_BUSY;
2050 	bufp->b_bcount = len;
2051 	bufp->b_blkno = blkno;
2052 	bufp->b_iodone = iodone;
2053 	bufp->b_proc = bp->b_proc;
2054 	bufp->b_edev = dev;
2055 	bufp->b_file = bp->b_file;
2056 	bufp->b_offset = bp->b_offset;
2057 
2058 	if (bp->b_flags & B_SHADOW) {
2059 		ASSERT(bp->b_shadow);
2060 		ASSERT(bp->b_flags & B_PHYS);
2061 
2062 		bufp->b_shadow = bp->b_shadow +
2063 			btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off);
2064 		bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2065 	} else {
2066 		if (bp->b_flags & B_PAGEIO) {
2067 			struct page *pp;
2068 			off_t o;
2069 			int i;
2070 
2071 			pp = bp->b_pages;
2072 			o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off;
2073 			for (i = btop(o); i > 0; i--) {
2074 				pp = pp->p_next;
2075 			}
2076 			bufp->b_pages = pp;
2077 			bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET);
2078 		} else {
2079 			bufp->b_un.b_addr =
2080 				(caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2081 			if (bp->b_flags & B_REMAPPED)
2082 				bufp->b_proc = NULL;
2083 		}
2084 	}
2085 	return (bufp);
2086 }
2087