xref: /illumos-gate/usr/src/uts/common/os/bio.c (revision 1a5e258f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * University Copyright- Copyright (c) 1982, 1986, 1988
31  * The Regents of the University of California
32  * All Rights Reserved
33  *
34  * University Acknowledgment- Portions of this document are derived from
35  * software developed by the University of California, Berkeley, and its
36  * contributors.
37  */
38 
39 #include <sys/types.h>
40 #include <sys/t_lock.h>
41 #include <sys/sysmacros.h>
42 #include <sys/conf.h>
43 #include <sys/cpuvar.h>
44 #include <sys/errno.h>
45 #include <sys/debug.h>
46 #include <sys/buf.h>
47 #include <sys/var.h>
48 #include <sys/vnode.h>
49 #include <sys/bitmap.h>
50 #include <sys/cmn_err.h>
51 #include <sys/kmem.h>
52 #include <sys/vmem.h>
53 #include <sys/atomic.h>
54 #include <vm/seg_kmem.h>
55 #include <vm/page.h>
56 #include <vm/pvn.h>
57 #include <sys/vtrace.h>
58 #include <sys/tnf_probe.h>
59 #include <sys/fs/ufs_inode.h>
60 #include <sys/fs/ufs_bio.h>
61 #include <sys/fs/ufs_log.h>
62 #include <sys/systm.h>
63 #include <sys/vfs.h>
64 #include <sys/sdt.h>
65 
/* Locks */
static	kmutex_t	blist_lock;	/* protects b_list */
static	kmutex_t	bhdr_lock;	/* protects the bhdrlist */
static	kmutex_t	bfree_lock;	/* protects the bfreelist structure */

/*
 * hbuf[] and dwbuf[] are indexed by the same bio_bhash() value; the
 * per-bucket lock hbuf[i].b_lock is the one taken when either bucket's
 * lists are walked or modified in this file.
 */
struct hbuf	*hbuf;			/* Hash buckets */
struct dwbuf	*dwbuf;			/* Delayed write buckets */
static struct buf *bhdrlist;		/* buf header free list */
static int 	nbuf;			/* number of buffer headers allocated */

static int	lastindex;		/* Reference point on where to start */
					/* when looking for free buffers */

/* Map a (device, block number) pair to a bucket index in hbuf[]/dwbuf[]. */
#define	bio_bhash(dev, bn)	(hash2ints((dev), (int)(bn)) & v.v_hmask)
/*
 * Terminator for the temporary b_list chains built by bflush() and
 * bfinval().  A NULL b_list means "not on any such chain", so a distinct
 * non-NULL sentinel is needed for end-of-list.
 */
#define	EMPTY_LIST	((struct buf *)-1)

/* bio_mem_cv is broadcast by brelse() when bfreelist has B_WANTED set. */
static kcondvar_t	bio_mem_cv; 	/* Condition variables */
/* bio_flushinval_cv is waited on (under blist_lock) by bflush()/bfinval(). */
static kcondvar_t	bio_flushinval_cv;
static int	bio_doingflush;		/* flush in progress */
static int	bio_doinginval;		/* inval in progress */
static int	bio_flinv_cv_wanted;	/* someone waiting for cv */
87 
/*
 * Statistics on the buffer cache
 *
 * getblk_common() in this file bumps the bio_lookup, bio_hit,
 * bio_bufbusy and bio_bufdup members through their .value.ui32 field,
 * matching the KSTAT_DATA_UINT32 type declared for each entry here.
 */
struct biostats biostats = {
	{ "buffer_cache_lookups",		KSTAT_DATA_UINT32 },
	{ "buffer_cache_hits",			KSTAT_DATA_UINT32 },
	{ "new_buffer_requests",		KSTAT_DATA_UINT32 },
	{ "waits_for_buffer_allocs",		KSTAT_DATA_UINT32 },
	{ "buffers_locked_by_someone",		KSTAT_DATA_UINT32 },
	{ "duplicate_buffers_found",		KSTAT_DATA_UINT32 }
};

/*
 * kstat data
 *
 * biostats is exposed as a flat array of kstat_named_t; the element
 * count is derived from the structure size, so struct biostats is
 * expected to consist solely of kstat_named_t members.
 */
kstat_named_t	*biostats_ptr = (kstat_named_t *)&biostats;
uint_t		biostats_ndata = (uint_t)(sizeof (biostats) /
					sizeof (kstat_named_t));
106 
/*
 * Statistics on ufs buffer cache
 * Not protected by locks
 *
 * Every entry is declared as a 32-bit unsigned kstat.  Within this file
 * only ub_breads (bread_common) and ub_bwrites (bwrite_common) are
 * updated; the remaining counters are presumably maintained by the UFS
 * code that shares this structure.
 */
struct ufsbiostats ub = {
	{ "breads",			KSTAT_DATA_UINT32 },
	{ "bwrites",			KSTAT_DATA_UINT32 },
	{ "fbiwrites",			KSTAT_DATA_UINT32 },
	{ "getpages",			KSTAT_DATA_UINT32 },
	{ "getras",			KSTAT_DATA_UINT32 },
	{ "putsyncs",			KSTAT_DATA_UINT32 },
	{ "putasyncs",			KSTAT_DATA_UINT32 },
	{ "putpageios",			KSTAT_DATA_UINT32 },
};
121 
/*
 * more UFS Logging eccentricities...
 *
 * required since "#pragma weak ..." doesn't work in reverse order.
 * i.e.:  genunix (bio.c) is loaded before the ufs modules and pointers
 *        to ufs routines don't get plugged into bio.c calls so
 *        we initialize it when setting up the "lufsops" table
 *        in "lufs.c:_init()"
 *
 * Both hooks stay NULL until something installs them; bread_common()
 * and bwrite_common() test for NULL before calling through them and
 * otherwise fall back to bdev_strategy().
 */
void (*bio_lufs_strategy)(void *, buf_t *);
void (*bio_snapshot_strategy)(void *, buf_t *);


/* Private routines */
static struct buf	*bio_getfreeblk(long);	/* used by getblk_common() */
static void 		bio_mem_get(long);
static void		bio_bhdr_free(struct buf *);
static struct buf	*bio_bhdr_alloc(void);
static void		bio_recycle(int, long);	/* takes BIO_HEADER/BIO_MEM */
static void 		bio_pageio_done(struct buf *);
static int 		bio_incore(dev_t, daddr_t);	/* used by breada() */
143 
/*
 * Buffer cache constants
 *
 * NOTE: despite their names, BIO_BUF_PERCENT and BIO_MAX_PERCENT are
 * divisors, not percentages.  binit() computes physmem / <constant>, so
 * (100/2) == 50 yields 1/50th (2%) of memory and (100/20) == 5 yields
 * 1/5th (20%) of memory.
 */
#define	BIO_BUF_PERCENT	(100/2)		/* default: 2% of memory */
#define	BIO_MAX_PERCENT	(100/20)	/* max is 20% of real memory */
#define	BIO_BHDR_POOL	100		/* Default bhdr pool size */
#define	BIO_MIN_HDR	10		/* Minimum number of buffer headers */
/* BIO_MIN_HDR buffers of MAXBSIZE bytes each, scaled down by 1024. */
#define	BIO_MIN_HWM	(BIO_MIN_HDR * MAXBSIZE / 1024)
#define	BIO_HASHLEN	4		/* Target length of hash chains */


/* Flags for bio_recycle() */
#define	BIO_HEADER	0x01
#define	BIO_MEM		0x02

/* Defined elsewhere; binit() copies bufhwm into v.v_bufhwm when non-zero. */
extern	int bufhwm;		/* User tunable - high water mark for mem  */
extern	int bufhwm_pct;		/* ditto - given in % of physmem  */
161 
162 /*
163  * The following routines allocate and free
164  * buffers with various side effects.  In general the
165  * arguments to an allocate routine are a device and
166  * a block number, and the value is a pointer to
 * the buffer header; the buffer returned is locked with a
168  * binary semaphore so that no one else can touch it. If the block was
169  * already in core, no I/O need be done; if it is
170  * already locked, the process waits until it becomes free.
171  * The following routines allocate a buffer:
172  *	getblk
173  *	bread/BREAD
174  *	breada
175  * Eventually the buffer must be released, possibly with the
176  * side effect of writing it out, by using one of
177  *	bwrite/BWRITE/brwrite
178  *	bdwrite/bdrwrite
179  *	bawrite
180  *	brelse
181  *
182  * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
183  * Instead, a binary semaphore, b_sem is used to gain exclusive access to
184  * a buffer and a binary semaphore, b_io is used for I/O synchronization.
185  * B_DONE is still used to denote a buffer with I/O complete on it.
186  *
 * The bfreelist.b_bcount field is computed every time fsflush runs. It
 * should not be used where a very accurate count of the free buffers is
189  * needed.
190  */
191 
192 /*
193  * Read in (if necessary) the block and return a buffer pointer.
194  *
195  * This interface is provided for binary compatibility.  Using
196  * BREAD() directly avoids the extra function call overhead invoked
197  * by calling this routine.
198  */
199 struct buf *
200 bread(dev_t dev, daddr_t blkno, long bsize)
201 {
202 	return (BREAD(dev, blkno, bsize));
203 }
204 
205 /*
206  * Common code for reading a buffer with various options
207  *
208  * Read in (if necessary) the block and return a buffer pointer.
209  */
210 struct buf *
211 bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
212 {
213 	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
214 	struct buf *bp;
215 	klwp_t *lwp = ttolwp(curthread);
216 
217 	CPU_STATS_ADD_K(sys, lread, 1);
218 	bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1);
219 	if (bp->b_flags & B_DONE)
220 		return (bp);
221 	bp->b_flags |= B_READ;
222 	ASSERT(bp->b_bcount == bsize);
223 	if (ufsvfsp == NULL) {					/* !ufs */
224 		(void) bdev_strategy(bp);
225 	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
226 							/* ufs && logging */
227 		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
228 	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
229 							/* ufs && snapshots */
230 		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
231 	} else {
232 		ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
233 		ub.ub_breads.value.ul++;		/* ufs && !logging */
234 		(void) bdev_strategy(bp);
235 	}
236 	if (lwp != NULL)
237 		lwp->lwp_ru.inblock++;
238 	CPU_STATS_ADD_K(sys, bread, 1);
239 	(void) biowait(bp);
240 	return (bp);
241 }
242 
243 /*
244  * Read in the block, like bread, but also start I/O on the
245  * read-ahead block (which is not allocated to the caller).
246  */
247 struct buf *
248 breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
249 {
250 	struct buf *bp, *rabp;
251 	klwp_t *lwp = ttolwp(curthread);
252 
253 	bp = NULL;
254 	if (!bio_incore(dev, blkno)) {
255 		CPU_STATS_ADD_K(sys, lread, 1);
256 		bp = GETBLK(dev, blkno, bsize);
257 		if ((bp->b_flags & B_DONE) == 0) {
258 			bp->b_flags |= B_READ;
259 			bp->b_bcount = bsize;
260 			(void) bdev_strategy(bp);
261 			if (lwp != NULL)
262 				lwp->lwp_ru.inblock++;
263 			CPU_STATS_ADD_K(sys, bread, 1);
264 		}
265 	}
266 	if (rablkno && bfreelist.b_bcount > 1 &&
267 	    !bio_incore(dev, rablkno)) {
268 		rabp = GETBLK(dev, rablkno, bsize);
269 		if (rabp->b_flags & B_DONE)
270 			brelse(rabp);
271 		else {
272 			rabp->b_flags |= B_READ|B_ASYNC;
273 			rabp->b_bcount = bsize;
274 			(void) bdev_strategy(rabp);
275 			if (lwp != NULL)
276 				lwp->lwp_ru.inblock++;
277 			CPU_STATS_ADD_K(sys, bread, 1);
278 		}
279 	}
280 	if (bp == NULL)
281 		return (BREAD(dev, blkno, bsize));
282 	(void) biowait(bp);
283 	return (bp);
284 }
285 
286 /*
287  * Common code for writing a buffer with various options.
288  *
289  * force_wait  - wait for write completion regardless of B_ASYNC flag
290  * do_relse    - release the buffer when we are done
291  * clear_flags - flags to clear from the buffer
292  */
293 void
294 bwrite_common(void *arg, struct buf *bp, int force_wait,
295 				int do_relse, int clear_flags)
296 {
297 	register int do_wait;
298 	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
299 	int flag;
300 	klwp_t *lwp = ttolwp(curthread);
301 	struct cpu *cpup;
302 
303 	ASSERT(SEMA_HELD(&bp->b_sem));
304 	flag = bp->b_flags;
305 	bp->b_flags &= ~clear_flags;
306 	if (lwp != NULL)
307 		lwp->lwp_ru.oublock++;
308 	CPU_STATS_ENTER_K();
309 	cpup = CPU;		/* get pointer AFTER preemption is disabled */
310 	CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
311 	CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
312 	do_wait = ((flag & B_ASYNC) == 0 || force_wait);
313 	if (do_wait == 0)
314 		CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
315 	CPU_STATS_EXIT_K();
316 	if (ufsvfsp == NULL) {
317 		(void) bdev_strategy(bp);
318 	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
319 							/* ufs && logging */
320 		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
321 	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
322 							/* ufs && snapshots */
323 		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
324 	} else {
325 		ub.ub_bwrites.value.ul++;		/* ufs && !logging */
326 		(void) bdev_strategy(bp);
327 	}
328 	if (do_wait) {
329 		(void) biowait(bp);
330 		if (do_relse) {
331 			brelse(bp);
332 		}
333 	}
334 }
335 
/*
 * bwrite - function-call form of the BWRITE() macro.
 *
 * Writes the buffer and releases it, waiting for the write to finish
 * unless B_ASYNC is set on the buffer.  Exists for binary
 * compatibility; invoking BWRITE() directly saves the function call
 * made here.
 */
void
bwrite(struct buf *bp)
{
	BWRITE(bp);
}
348 
/*
 * bwrite2 - function-call form of the BWRITE2() macro.
 *
 * Writes the buffer and waits for completion, but leaves the buffer
 * held by the caller instead of releasing it.  Exists for binary
 * compatibility; invoking BWRITE2() directly saves the function call.
 */
void
bwrite2(struct buf *bp)
{
	BWRITE2(bp);
}
360 
361 /*
362  * Release the buffer, marking it so that if it is grabbed
363  * for another purpose it will be written out before being
364  * given up (e.g. when writing a partial block where it is
365  * assumed that another write for the same block will soon follow).
366  * Also save the time that the block is first marked as delayed
367  * so that it will be written in a reasonable time.
368  */
369 void
370 bdwrite(struct buf *bp)
371 {
372 	ASSERT(SEMA_HELD(&bp->b_sem));
373 	CPU_STATS_ADD_K(sys, lwrite, 1);
374 	if ((bp->b_flags & B_DELWRI) == 0)
375 		bp->b_start = ddi_get_lbolt();
376 	/*
377 	 * B_DONE allows others to use the buffer, B_DELWRI causes the
378 	 * buffer to be written before being reused, and setting b_resid
379 	 * to zero says the buffer is complete.
380 	 */
381 	bp->b_flags |= B_DELWRI | B_DONE;
382 	bp->b_resid = 0;
383 	brelse(bp);
384 }
385 
386 /*
387  * Release the buffer, start I/O on it, but don't wait for completion.
388  */
389 void
390 bawrite(struct buf *bp)
391 {
392 	ASSERT(SEMA_HELD(&bp->b_sem));
393 
394 	/* Use bfreelist.b_bcount as a weird-ass heuristic */
395 	if (bfreelist.b_bcount > 4)
396 		bp->b_flags |= B_ASYNC;
397 	BWRITE(bp);
398 }
399 
/*
 * Release the buffer, with no I/O implied.
 *
 * The caller must hold bp->b_sem.  On return the semaphore has been
 * released and, unless the buffer was B_NOCACHE (in which case it has
 * been destroyed), the buffer sits on either the free list or the
 * delayed-write list of its hash bucket.
 */
void
brelse(struct buf *bp)
{
	struct buf	**backp;
	uint_t		index;
	kmutex_t	*hmp;
	struct	buf	*dp;
	struct	hbuf	*hp;


	ASSERT(SEMA_HELD(&bp->b_sem));

	/*
	 * Clear the retry write flag if the buffer was written without
	 * error.  The presence of B_DELWRI means the buffer has not yet
	 * been written and the presence of B_ERROR means that an error
	 * is still occurring.
	 */
	if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
		bp->b_flags &= ~B_RETRYWRI;
	}

	/* Check for anomalous conditions */
	if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
		if (bp->b_flags & B_NOCACHE) {
			/* Don't add to the freelist. Destroy it now */
			kmem_free(bp->b_un.b_addr, bp->b_bufsize);
			sema_destroy(&bp->b_sem);
			sema_destroy(&bp->b_io);
			kmem_free(bp, sizeof (struct buf));
			return;
		}
		/*
		 * If a write failed and we are supposed to retry write,
		 * don't toss the buffer.  Keep it around and mark it
		 * delayed write in the hopes that it will eventually
		 * get flushed (and still keep the system running.)
		 */
		if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
			bp->b_flags |= B_DELWRI;
			/* keep fsflush from trying continuously to flush */
			bp->b_start = ddi_get_lbolt();
		} else
			bp->b_flags |= B_AGE|B_STALE;
		/* Either way the error has now been consumed. */
		bp->b_flags &= ~B_ERROR;
		bp->b_error = 0;
	}

	/*
	 * If delayed write is set then put in on the delayed
	 * write list instead of the free buffer list.
	 */
	index = bio_bhash(bp->b_edev, bp->b_blkno);
	hmp   = &hbuf[index].b_lock;

	mutex_enter(hmp);
	hp = &hbuf[index];
	dp = (struct buf *)hp;

	/*
	 * Make sure that the number of entries on this list are
	 * Zero <= count <= total # buffers
	 */
	ASSERT(hp->b_length >= 0);
	ASSERT(hp->b_length < nbuf);

	hp->b_length++;		/* We are adding this buffer */

	if (bp->b_flags & B_DELWRI) {
		/*
		 * This buffer goes on the delayed write buffer list
		 */
		dp = (struct buf *)&dwbuf[index];
	}
	ASSERT(bp->b_bufsize > 0);
	ASSERT(bp->b_bcount > 0);
	ASSERT(bp->b_un.b_addr != NULL);

	if (bp->b_flags & B_AGE) {
		/* B_AGE buffers are linked in right after the list head. */
		backp = &dp->av_forw;
		(*backp)->av_back = bp;
		bp->av_forw = *backp;
		*backp = bp;
		bp->av_back = dp;
	} else {
		/* Everything else is linked in just before the list head. */
		backp = &dp->av_back;
		(*backp)->av_forw = bp;
		bp->av_back = *backp;
		*backp = bp;
		bp->av_forw = dp;
	}
	mutex_exit(hmp);

	/*
	 * The first test is made without bfree_lock; the flag is
	 * re-checked once the lock is held before it is cleared.
	 */
	if (bfreelist.b_flags & B_WANTED) {
		/*
		 * Should come here very very rarely.
		 */
		mutex_enter(&bfree_lock);
		if (bfreelist.b_flags & B_WANTED) {
			bfreelist.b_flags &= ~B_WANTED;
			cv_broadcast(&bio_mem_cv);
		}
		mutex_exit(&bfree_lock);
	}

	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
	/*
	 * Don't let anyone get the buffer off the freelist before we
	 * release our hold on it.
	 */
	sema_v(&bp->b_sem);
}
515 
516 /*
517  * Return a count of the number of B_BUSY buffers in the system
518  * Can only be used as a good estimate.  If 'cleanit' is set,
519  * try to flush all bufs.
520  */
521 int
522 bio_busy(int cleanit)
523 {
524 	struct buf *bp, *dp;
525 	int busy = 0;
526 	int i;
527 	kmutex_t *hmp;
528 
529 	for (i = 0; i < v.v_hbuf; i++) {
530 		vfs_syncprogress();
531 		dp = (struct buf *)&hbuf[i];
532 		hmp = &hbuf[i].b_lock;
533 
534 		mutex_enter(hmp);
535 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
536 			if (bp->b_flags & B_BUSY)
537 				busy++;
538 		}
539 		mutex_exit(hmp);
540 	}
541 
542 	if (cleanit && busy != 0) {
543 		bflush(NODEV);
544 	}
545 
546 	return (busy);
547 }
548 
549 /*
550  * this interface is provided for binary compatibility.
551  *
552  * Assign a buffer for the given block.  If the appropriate
553  * block is already associated, return it; otherwise search
554  * for the oldest non-busy buffer and reassign it.
555  */
556 struct buf *
557 getblk(dev_t dev, daddr_t blkno, long bsize)
558 {
559 	return (getblk_common(/* ufsvfsp */ NULL, dev,
560 	    blkno, bsize, /* errflg */ 0));
561 }
562 
563 /*
564  * Assign a buffer for the given block.  If the appropriate
565  * block is already associated, return it; otherwise search
566  * for the oldest non-busy buffer and reassign it.
567  */
568 struct buf *
569 getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, int errflg)
570 {
571 	ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
572 	struct buf *bp;
573 	struct buf *dp;
574 	struct buf *nbp = NULL;
575 	struct buf *errbp;
576 	uint_t		index;
577 	kmutex_t	*hmp;
578 	struct	hbuf	*hp;
579 
580 	if (getmajor(dev) >= devcnt)
581 		cmn_err(CE_PANIC, "blkdev");
582 
583 	biostats.bio_lookup.value.ui32++;
584 
585 	index = bio_bhash(dev, blkno);
586 	hp    = &hbuf[index];
587 	dp    = (struct buf *)hp;
588 	hmp   = &hp->b_lock;
589 
590 	mutex_enter(hmp);
591 loop:
592 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
593 		if (bp->b_blkno != blkno || bp->b_edev != dev ||
594 		    (bp->b_flags & B_STALE))
595 			continue;
596 		/*
597 		 * Avoid holding the hash lock in the event that
598 		 * the buffer is locked by someone. Since the hash chain
599 		 * may change when we drop the hash lock
600 		 * we have to start at the beginning of the chain if the
601 		 * buffer identity/contents aren't valid.
602 		 */
603 		if (!sema_tryp(&bp->b_sem)) {
604 			biostats.bio_bufbusy.value.ui32++;
605 			mutex_exit(hmp);
606 			/*
607 			 * OK, we are dealing with a busy buffer.
608 			 * In the case that we are panicking and we
609 			 * got called from bread(), we have some chance
610 			 * for error recovery. So better bail out from
611 			 * here since sema_p() won't block. If we got
612 			 * called directly from ufs routines, there is
613 			 * no way to report an error yet.
614 			 */
615 			if (panicstr && errflg)
616 				goto errout;
617 			/*
618 			 * For the following line of code to work
619 			 * correctly never kmem_free the buffer "header".
620 			 */
621 			sema_p(&bp->b_sem);
622 			if (bp->b_blkno != blkno || bp->b_edev != dev ||
623 			    (bp->b_flags & B_STALE)) {
624 				sema_v(&bp->b_sem);
625 				mutex_enter(hmp);
626 				goto loop;	/* start over */
627 			}
628 			mutex_enter(hmp);
629 		}
630 		/* Found */
631 		biostats.bio_hit.value.ui32++;
632 		bp->b_flags &= ~B_AGE;
633 
634 		/*
635 		 * Yank it off the free/delayed write lists
636 		 */
637 		hp->b_length--;
638 		notavail(bp);
639 		mutex_exit(hmp);
640 
641 		ASSERT((bp->b_flags & B_NOCACHE) == NULL);
642 
643 		if (nbp == NULL) {
644 			/*
645 			 * Make the common path short.
646 			 */
647 			ASSERT(SEMA_HELD(&bp->b_sem));
648 			return (bp);
649 		}
650 
651 		biostats.bio_bufdup.value.ui32++;
652 
653 		/*
654 		 * The buffer must have entered during the lock upgrade
655 		 * so free the new buffer we allocated and return the
656 		 * found buffer.
657 		 */
658 		kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
659 		nbp->b_un.b_addr = NULL;
660 
661 		/*
662 		 * Account for the memory
663 		 */
664 		mutex_enter(&bfree_lock);
665 		bfreelist.b_bufsize += nbp->b_bufsize;
666 		mutex_exit(&bfree_lock);
667 
668 		/*
669 		 * Destroy buf identity, and place on avail list
670 		 */
671 		nbp->b_dev = (o_dev_t)NODEV;
672 		nbp->b_edev = NODEV;
673 		nbp->b_flags = 0;
674 		nbp->b_file = NULL;
675 		nbp->b_offset = -1;
676 
677 		sema_v(&nbp->b_sem);
678 		bio_bhdr_free(nbp);
679 
680 		ASSERT(SEMA_HELD(&bp->b_sem));
681 		return (bp);
682 	}
683 
684 	/*
685 	 * bio_getfreeblk may block so check the hash chain again.
686 	 */
687 	if (nbp == NULL) {
688 		mutex_exit(hmp);
689 		nbp = bio_getfreeblk(bsize);
690 		mutex_enter(hmp);
691 		goto loop;
692 	}
693 
694 	/*
695 	 * New buffer. Assign nbp and stick it on the hash.
696 	 */
697 	nbp->b_flags = B_BUSY;
698 	nbp->b_edev = dev;
699 	nbp->b_dev = (o_dev_t)cmpdev(dev);
700 	nbp->b_blkno = blkno;
701 	nbp->b_iodone = NULL;
702 	nbp->b_bcount = bsize;
703 	/*
704 	 * If we are given a ufsvfsp and the vfs_root field is NULL
705 	 * then this must be I/O for a superblock.  A superblock's
706 	 * buffer is set up in mountfs() and there is no root vnode
707 	 * at that point.
708 	 */
709 	if (ufsvfsp && ufsvfsp->vfs_root) {
710 		nbp->b_vp = ufsvfsp->vfs_root;
711 	} else {
712 		nbp->b_vp = NULL;
713 	}
714 
715 	ASSERT((nbp->b_flags & B_NOCACHE) == NULL);
716 
717 	binshash(nbp, dp);
718 	mutex_exit(hmp);
719 
720 	ASSERT(SEMA_HELD(&nbp->b_sem));
721 
722 	return (nbp);
723 
724 
725 	/*
726 	 * Come here in case of an internal error. At this point we couldn't
727 	 * get a buffer, but he have to return one. Hence we allocate some
728 	 * kind of error reply buffer on the fly. This buffer is marked as
729 	 * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
730 	 *	- B_ERROR will indicate error to the caller.
731 	 *	- B_DONE will prevent us from reading the buffer from
732 	 *	  the device.
733 	 *	- B_NOCACHE will cause that this buffer gets free'd in
734 	 *	  brelse().
735 	 */
736 
737 errout:
738 	errbp = geteblk();
739 	sema_p(&errbp->b_sem);
740 	errbp->b_flags &= ~B_BUSY;
741 	errbp->b_flags |= (B_ERROR | B_DONE);
742 	return (errbp);
743 }
744 
745 /*
746  * Get an empty block, not assigned to any particular device.
747  * Returns a locked buffer that is not on any hash or free list.
748  */
749 struct buf *
750 ngeteblk(long bsize)
751 {
752 	struct buf *bp;
753 
754 	bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
755 	bioinit(bp);
756 	bp->av_forw = bp->av_back = NULL;
757 	bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
758 	bp->b_bufsize = bsize;
759 	bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
760 	bp->b_dev = (o_dev_t)NODEV;
761 	bp->b_edev = NODEV;
762 	bp->b_lblkno = 0;
763 	bp->b_bcount = bsize;
764 	bp->b_iodone = NULL;
765 	return (bp);
766 }
767 
/*
 * geteblk - fixed-size (1 KB) front end to ngeteblk().
 *
 * The argument-less interface is preserved for driver compatibility;
 * callers needing any other block size should call ngeteblk().
 */
struct buf *
geteblk(void)
{
	return (ngeteblk(1024L));
}
777 
778 /*
779  * Return a buffer w/o sleeping
780  */
781 struct buf *
782 trygetblk(dev_t dev, daddr_t blkno)
783 {
784 	struct buf	*bp;
785 	struct buf	*dp;
786 	struct hbuf	*hp;
787 	kmutex_t	*hmp;
788 	uint_t		index;
789 
790 	index = bio_bhash(dev, blkno);
791 	hp = &hbuf[index];
792 	hmp = &hp->b_lock;
793 
794 	if (!mutex_tryenter(hmp))
795 		return (NULL);
796 
797 	dp = (struct buf *)hp;
798 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
799 		if (bp->b_blkno != blkno || bp->b_edev != dev ||
800 		    (bp->b_flags & B_STALE))
801 			continue;
802 		/*
803 		 * Get access to a valid buffer without sleeping
804 		 */
805 		if (sema_tryp(&bp->b_sem)) {
806 			if (bp->b_flags & B_DONE) {
807 				hp->b_length--;
808 				notavail(bp);
809 				mutex_exit(hmp);
810 				return (bp);
811 			} else {
812 				sema_v(&bp->b_sem);
813 				break;
814 			}
815 		}
816 		break;
817 	}
818 	mutex_exit(hmp);
819 	return (NULL);
820 }
821 
822 /*
823  * Wait for I/O completion on the buffer; return errors
824  * to the user.
825  */
826 int
827 iowait(struct buf *bp)
828 {
829 	ASSERT(SEMA_HELD(&bp->b_sem));
830 	return (biowait(bp));
831 }
832 
833 /*
834  * Mark I/O complete on a buffer, release it if I/O is asynchronous,
835  * and wake up anyone waiting for it.
836  */
837 void
838 iodone(struct buf *bp)
839 {
840 	ASSERT(SEMA_HELD(&bp->b_sem));
841 	(void) biodone(bp);
842 }
843 
844 /*
845  * Zero the core associated with a buffer.
846  */
847 void
848 clrbuf(struct buf *bp)
849 {
850 	ASSERT(SEMA_HELD(&bp->b_sem));
851 	bzero(bp->b_un.b_addr, bp->b_bcount);
852 	bp->b_resid = 0;
853 }
854 
855 
/*
 * Make sure all write-behind blocks on dev (or NODEV for all)
 * are flushed out.
 *
 * Works in two phases: first every candidate on the delayed-write
 * lists is chained through b_list while the hash locks are held, then
 * the chain is walked without hash locks and each buffer still marked
 * B_DELWRI is written asynchronously.  Only one flush runs at a time,
 * and a flush never overlaps an invalidate (see bio_doingflush and
 * bio_doinginval).
 */
void
bflush(dev_t dev)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	struct buf *delwri_list = EMPTY_LIST;
	int i, index;
	kmutex_t *hmp;

	mutex_enter(&blist_lock);
	/*
	 * Wait for any invalidates or flushes ahead of us to finish.
	 * We really could split blist_lock up per device for better
	 * parallelism here.
	 */
	while (bio_doinginval || bio_doingflush) {
		bio_flinv_cv_wanted = 1;
		cv_wait(&bio_flushinval_cv, &blist_lock);
	}
	bio_doingflush++;
	/*
	 * Gather all B_DELWRI buffer for device.
	 * Lock ordering is b_sem > hash lock (brelse).
	 * Since we are finding the buffer via the delayed write list,
	 * it may be busy and we would block trying to get the
	 * b_sem lock while holding hash lock. So transfer all the
	 * candidates on the delwri_list and then drop the hash locks.
	 */
	for (i = 0; i < v.v_hbuf; i++) {
		vfs_syncprogress();
		hmp = &hbuf[i].b_lock;
		dp = (struct buf *)&dwbuf[i];
		mutex_enter(hmp);
		for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
			if (dev == NODEV || bp->b_edev == dev) {
				/* NULL b_list: not yet on any gather chain */
				if (bp->b_list == NULL) {
					bp->b_list = delwri_list;
					delwri_list = bp;
				}
			}
		}
		mutex_exit(hmp);
	}
	mutex_exit(&blist_lock);

	/*
	 * Now that the hash locks have been dropped grab the semaphores
	 * and write back all the buffers that have B_DELWRI set.
	 */
	while (delwri_list != EMPTY_LIST) {
		vfs_syncprogress();
		bp = delwri_list;

		sema_p(&bp->b_sem);	/* may block */
		/*
		 * Re-validate now that b_sem is held: the buffer may have
		 * been reassigned to another device while we waited.
		 */
		if ((dev != bp->b_edev && dev != NODEV) ||
		    (panicstr && bp->b_flags & B_BUSY)) {
			sema_v(&bp->b_sem);
			delwri_list = bp->b_list;
			bp->b_list = NULL;
			continue;	/* No longer a candidate */
		}
		if (bp->b_flags & B_DELWRI) {
			index = bio_bhash(bp->b_edev, bp->b_blkno);
			hp = &hbuf[index];
			hmp = &hp->b_lock;
			dp = (struct buf *)hp;

			bp->b_flags |= B_ASYNC;
			/* Take it off the delayed-write list before writing */
			mutex_enter(hmp);
			hp->b_length--;
			notavail(bp);
			mutex_exit(hmp);
			if (bp->b_vp == NULL) {		/* !ufs */
				BWRITE(bp);
			} else {			/* ufs */
				UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
			}
		} else {
			sema_v(&bp->b_sem);
		}
		/* Advance the chain and unmark this buffer. */
		delwri_list = bp->b_list;
		bp->b_list = NULL;
	}
	/* Flush finished: let any waiting flush/invalidate proceed. */
	mutex_enter(&blist_lock);
	bio_doingflush--;
	if (bio_flinv_cv_wanted) {
		bio_flinv_cv_wanted = 0;
		cv_broadcast(&bio_flushinval_cv);
	}
	mutex_exit(&blist_lock);
}
951 
952 /*
953  * Ensure that a specified block is up-to-date on disk.
954  */
955 void
956 blkflush(dev_t dev, daddr_t blkno)
957 {
958 	struct buf *bp, *dp;
959 	struct hbuf *hp;
960 	struct buf *sbp = NULL;
961 	uint_t index;
962 	kmutex_t *hmp;
963 
964 	index = bio_bhash(dev, blkno);
965 	hp    = &hbuf[index];
966 	dp    = (struct buf *)hp;
967 	hmp   = &hp->b_lock;
968 
969 	/*
970 	 * Identify the buffer in the cache belonging to
971 	 * this device and blkno (if any).
972 	 */
973 	mutex_enter(hmp);
974 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
975 		if (bp->b_blkno != blkno || bp->b_edev != dev ||
976 		    (bp->b_flags & B_STALE))
977 			continue;
978 		sbp = bp;
979 		break;
980 	}
981 	mutex_exit(hmp);
982 	if (sbp == NULL)
983 		return;
984 	/*
985 	 * Now check the buffer we have identified and
986 	 * make sure it still belongs to the device and is B_DELWRI
987 	 */
988 	sema_p(&sbp->b_sem);
989 	if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
990 	    (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
991 		mutex_enter(hmp);
992 		hp->b_length--;
993 		notavail(sbp);
994 		mutex_exit(hmp);
995 		/*
996 		 * XXX - There is nothing to guarantee a synchronous
997 		 * write here if the B_ASYNC flag is set.  This needs
998 		 * some investigation.
999 		 */
1000 		if (sbp->b_vp == NULL) {		/* !ufs */
1001 			BWRITE(sbp);	/* synchronous write */
1002 		} else {				/* ufs */
1003 			UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
1004 		}
1005 	} else {
1006 		sema_v(&sbp->b_sem);
1007 	}
1008 }
1009 
/*
 * Same as binval, except can force-invalidate delayed-write buffers
 * (which have not already been flushed because of device errors).  Also
 * makes sure that the retry write flag is cleared.
 *
 * dev   - device whose cached buffers are to be invalidated
 * force - when non-zero, B_DELWRI buffers are stripped of B_DELWRI and
 *         B_RETRYWRI and invalidated as well
 *
 * Returns 0, or EIO if at least one buffer for dev was still B_DELWRI
 * and therefore could not be invalidated.
 */
int
bfinval(dev_t dev, int force)
{
	struct buf *dp;
	struct buf *bp;
	struct buf *binval_list = EMPTY_LIST;
	int i, error = 0;
	kmutex_t *hmp;
	uint_t index;
	struct buf **backp;

	mutex_enter(&blist_lock);
	/*
	 * Wait for any flushes ahead of us to finish, it's ok to
	 * do invalidates in parallel.
	 */
	while (bio_doingflush) {
		bio_flinv_cv_wanted = 1;
		cv_wait(&bio_flushinval_cv, &blist_lock);
	}
	bio_doinginval++;

	/* Gather bp's */
	for (i = 0; i < v.v_hbuf; i++) {
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			if (bp->b_edev == dev) {
				/* NULL b_list: not yet on any gather chain */
				if (bp->b_list == NULL) {
					bp->b_list = binval_list;
					binval_list = bp;
				}
			}
		}
		mutex_exit(hmp);
	}
	mutex_exit(&blist_lock);

	/* Invalidate all bp's found */
	while (binval_list != EMPTY_LIST) {
		bp = binval_list;

		sema_p(&bp->b_sem);
		/* Re-check identity now that the semaphore is held. */
		if (bp->b_edev == dev) {
			if (force && (bp->b_flags & B_DELWRI)) {
				/* clear B_DELWRI, move to non-dw freelist */
				index = bio_bhash(bp->b_edev, bp->b_blkno);
				hmp = &hbuf[index].b_lock;
				dp = (struct buf *)&hbuf[index];
				mutex_enter(hmp);

				/* remove from delayed write freelist */
				notavail(bp);

				/* add to B_AGE side of non-dw freelist */
				backp = &dp->av_forw;
				(*backp)->av_back = bp;
				bp->av_forw = *backp;
				*backp = bp;
				bp->av_back = dp;

				/*
				 * make sure write retries and busy are cleared
				 */
				bp->b_flags &=
				    ~(B_BUSY | B_DELWRI | B_RETRYWRI);
				mutex_exit(hmp);
			}
			if ((bp->b_flags & B_DELWRI) == 0)
				bp->b_flags |= B_STALE|B_AGE;
			else
				error = EIO;
		}
		sema_v(&bp->b_sem);
		/* Advance the chain and unmark this buffer. */
		binval_list = bp->b_list;
		bp->b_list = NULL;
	}
	/* Invalidate finished: wake anyone waiting to flush/invalidate. */
	mutex_enter(&blist_lock);
	bio_doinginval--;
	if (bio_flinv_cv_wanted) {
		cv_broadcast(&bio_flushinval_cv);
		bio_flinv_cv_wanted = 0;
	}
	mutex_exit(&blist_lock);
	return (error);
}
1103 
1104 /*
1105  * If possible, invalidate blocks for a dev on demand
1106  */
1107 void
1108 binval(dev_t dev)
1109 {
1110 	(void) bfinval(dev, 0);
1111 }
1112 
1113 /*
1114  * Initialize the buffer I/O system by freeing
1115  * all buffers and setting all device hash buffer lists to empty.
1116  */
1117 void
1118 binit(void)
1119 {
1120 	struct buf *bp;
1121 	unsigned int i, pct;
1122 	ulong_t	bio_max_hwm, bio_default_hwm;
1123 
1124 	/*
1125 	 * Maximum/Default values for bufhwm are set to the smallest of:
1126 	 *	- BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
1127 	 *	- 1/4 of kernel virtual memory
1128 	 *	- INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
1129 	 * Additionally, in order to allow simple tuning by percentage of
1130 	 * physical memory, bufhwm_pct is used to calculate the default if
1131 	 * the value of this tunable is between 0 and BIO_MAX_PERCENT.
1132 	 *
1133 	 * Since the unit for v.v_bufhwm is kilobytes, this allows for
1134 	 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
1135 	 */
1136 	bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
1137 	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1138 	bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);
1139 
1140 	pct = BIO_BUF_PERCENT;
1141 	if (bufhwm_pct != 0 &&
1142 	    ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
1143 		pct = BIO_BUF_PERCENT;
1144 		/*
1145 		 * Invalid user specified value, emit a warning.
1146 		 */
1147 		cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of \
1148 		    range(1..%d). Using %d as default.",
1149 		    bufhwm_pct,
1150 		    100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
1151 	}
1152 
1153 	bio_default_hwm = MIN(physmem / pct,
1154 	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1155 	bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);
1156 
1157 	if ((v.v_bufhwm = bufhwm) == 0)
1158 		v.v_bufhwm = bio_default_hwm;
1159 
1160 	if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
1161 		v.v_bufhwm = (int)bio_max_hwm;
1162 		/*
1163 		 * Invalid user specified value, emit a warning.
1164 		 */
1165 		cmn_err(CE_WARN,
1166 		    "binit: bufhwm(%d) out \
1167 		    of range(%d..%lu). Using %lu as default",
1168 		    bufhwm,
1169 		    BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
1170 	}
1171 
1172 	/*
1173 	 * Determine the number of hash buckets. Default is to
1174 	 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
1175 	 * Round up number to the next power of 2.
1176 	 */
1177 	v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
1178 	    BIO_HASHLEN);
1179 	v.v_hmask = v.v_hbuf - 1;
1180 	v.v_buf = BIO_BHDR_POOL;
1181 
1182 	hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);
1183 
1184 	dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);
1185 
1186 	bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;
1187 	bp = &bfreelist;
1188 	bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;
1189 
1190 	for (i = 0; i < v.v_hbuf; i++) {
1191 		hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
1192 		hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];
1193 
1194 		/*
1195 		 * Initialize the delayed write buffer list.
1196 		 */
1197 		dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
1198 		dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
1199 	}
1200 }
1201 
1202 /*
1203  * Wait for I/O completion on the buffer; return error code.
1204  * If bp was for synchronous I/O, bp is invalid and associated
1205  * resources are freed on return.
1206  */
1207 int
1208 biowait(struct buf *bp)
1209 {
1210 	int error = 0;
1211 	struct cpu *cpup;
1212 
1213 	ASSERT(SEMA_HELD(&bp->b_sem));
1214 
1215 	cpup = CPU;
1216 	atomic_inc_64(&cpup->cpu_stats.sys.iowait);
1217 	DTRACE_IO1(wait__start, struct buf *, bp);
1218 
1219 	/*
1220 	 * In case of panic, busy wait for completion
1221 	 */
1222 	if (panicstr) {
1223 		while ((bp->b_flags & B_DONE) == 0)
1224 			drv_usecwait(10);
1225 	} else
1226 		sema_p(&bp->b_io);
1227 
1228 	DTRACE_IO1(wait__done, struct buf *, bp);
1229 	atomic_dec_64(&cpup->cpu_stats.sys.iowait);
1230 
1231 	error = geterror(bp);
1232 	if ((bp->b_flags & B_ASYNC) == 0) {
1233 		if (bp->b_flags & B_REMAPPED)
1234 			bp_mapout(bp);
1235 	}
1236 	return (error);
1237 }
1238 
1239 static void
1240 biodone_tnf_probe(struct buf *bp)
1241 {
1242 	/* Kernel probe */
1243 	TNF_PROBE_3(biodone, "io blockio", /* CSTYLED */,
1244 	    tnf_device,		device,		bp->b_edev,
1245 	    tnf_diskaddr,	block,		bp->b_lblkno,
1246 	    tnf_opaque,		buf,		bp);
1247 }
1248 
1249 /*
1250  * Mark I/O complete on a buffer, release it if I/O is asynchronous,
1251  * and wake up anyone waiting for it.
1252  */
1253 void
1254 biodone(struct buf *bp)
1255 {
1256 	if (bp->b_flags & B_STARTED) {
1257 		DTRACE_IO1(done, struct buf *, bp);
1258 		bp->b_flags &= ~B_STARTED;
1259 	}
1260 
1261 	/*
1262 	 * Call the TNF probe here instead of the inline code
1263 	 * to force our compiler to use the tail call optimization.
1264 	 */
1265 	biodone_tnf_probe(bp);
1266 
1267 	if (bp->b_iodone != NULL) {
1268 		(*(bp->b_iodone))(bp);
1269 		return;
1270 	}
1271 	ASSERT((bp->b_flags & B_DONE) == 0);
1272 	ASSERT(SEMA_HELD(&bp->b_sem));
1273 	bp->b_flags |= B_DONE;
1274 	if (bp->b_flags & B_ASYNC) {
1275 		if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
1276 			bio_pageio_done(bp);
1277 		else
1278 			brelse(bp);	/* release bp to freelist */
1279 	} else {
1280 		sema_v(&bp->b_io);
1281 	}
1282 }
1283 
1284 /*
1285  * Pick up the device's error number and pass it to the user;
1286  * if there is an error but the number is 0 set a generalized code.
1287  */
1288 int
1289 geterror(struct buf *bp)
1290 {
1291 	int error = 0;
1292 
1293 	ASSERT(SEMA_HELD(&bp->b_sem));
1294 	if (bp->b_flags & B_ERROR) {
1295 		error = bp->b_error;
1296 		if (!error)
1297 			error = EIO;
1298 	}
1299 	return (error);
1300 }
1301 
1302 /*
1303  * Support for pageio buffers.
1304  *
1305  * This stuff should be generalized to provide a generalized bp
1306  * header facility that can be used for things other than pageio.
1307  */
1308 
1309 /*
1310  * Allocate and initialize a buf struct for use with pageio.
1311  */
1312 struct buf *
1313 pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
1314 {
1315 	struct buf *bp;
1316 	struct cpu *cpup;
1317 
1318 	if (flags & B_READ) {
1319 		CPU_STATS_ENTER_K();
1320 		cpup = CPU;	/* get pointer AFTER preemption is disabled */
1321 		CPU_STATS_ADDQ(cpup, vm, pgin, 1);
1322 		CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));
1323 		if ((flags & B_ASYNC) == 0) {
1324 			klwp_t *lwp = ttolwp(curthread);
1325 			if (lwp != NULL)
1326 				lwp->lwp_ru.majflt++;
1327 			CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
1328 			/* Kernel probe */
1329 			TNF_PROBE_2(major_fault, "vm pagefault", /* CSTYLED */,
1330 			    tnf_opaque,		vnode,		pp->p_vnode,
1331 			    tnf_offset,		offset,		pp->p_offset);
1332 		}
1333 		/*
1334 		 * Update statistics for pages being paged in
1335 		 */
1336 		if (pp != NULL && pp->p_vnode != NULL) {
1337 			if (IS_SWAPFSVP(pp->p_vnode)) {
1338 				CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len));
1339 			} else {
1340 				if (pp->p_vnode->v_flag & VVMEXEC) {
1341 					CPU_STATS_ADDQ(cpup, vm, execpgin,
1342 					    btopr(len));
1343 				} else {
1344 					CPU_STATS_ADDQ(cpup, vm, fspgin,
1345 					    btopr(len));
1346 				}
1347 			}
1348 		}
1349 		CPU_STATS_EXIT_K();
1350 		TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
1351 		    "page_ws_in:pp %p", pp);
1352 		/* Kernel probe */
1353 		TNF_PROBE_3(pagein, "vm pageio io", /* CSTYLED */,
1354 		    tnf_opaque,	vnode,	pp->p_vnode,
1355 		    tnf_offset,	offset,	pp->p_offset,
1356 		    tnf_size,	size,	len);
1357 	}
1358 
1359 	bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
1360 	bp->b_bcount = len;
1361 	bp->b_bufsize = len;
1362 	bp->b_pages = pp;
1363 	bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
1364 	bp->b_offset = -1;
1365 	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
1366 
1367 	/* Initialize bp->b_sem in "locked" state */
1368 	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
1369 
1370 	VN_HOLD(vp);
1371 	bp->b_vp = vp;
1372 	THREAD_KPRI_RELEASE_N(btopr(len)); /* release kpri from page_locks */
1373 
1374 	/*
1375 	 * Caller sets dev & blkno and can adjust
1376 	 * b_addr for page offset and can use bp_mapin
1377 	 * to make pages kernel addressable.
1378 	 */
1379 	return (bp);
1380 }
1381 
1382 void
1383 pageio_done(struct buf *bp)
1384 {
1385 	ASSERT(SEMA_HELD(&bp->b_sem));
1386 	if (bp->b_flags & B_REMAPPED)
1387 		bp_mapout(bp);
1388 	VN_RELE(bp->b_vp);
1389 	bp->b_vp = NULL;
1390 	ASSERT((bp->b_flags & B_NOCACHE) != 0);
1391 
1392 	/* A sema_v(bp->b_sem) is implied if we are destroying it */
1393 	sema_destroy(&bp->b_sem);
1394 	sema_destroy(&bp->b_io);
1395 	kmem_free(bp, sizeof (struct buf));
1396 }
1397 
1398 /*
1399  * Check to see whether the buffers, except the one pointed by sbp,
1400  * associated with the device are busy.
1401  * NOTE: This expensive operation shall be improved together with ufs_icheck().
1402  */
1403 int
1404 bcheck(dev_t dev, struct buf *sbp)
1405 {
1406 	struct buf	*bp;
1407 	struct buf	*dp;
1408 	int i;
1409 	kmutex_t *hmp;
1410 
1411 	/*
1412 	 * check for busy bufs for this filesystem
1413 	 */
1414 	for (i = 0; i < v.v_hbuf; i++) {
1415 		dp = (struct buf *)&hbuf[i];
1416 		hmp = &hbuf[i].b_lock;
1417 
1418 		mutex_enter(hmp);
1419 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1420 			/*
1421 			 * if buf is busy or dirty, then filesystem is busy
1422 			 */
1423 			if ((bp->b_edev == dev) &&
1424 			    ((bp->b_flags & B_STALE) == 0) &&
1425 			    (bp->b_flags & (B_DELWRI|B_BUSY)) &&
1426 			    (bp != sbp)) {
1427 				mutex_exit(hmp);
1428 				return (1);
1429 			}
1430 		}
1431 		mutex_exit(hmp);
1432 	}
1433 	return (0);
1434 }
1435 
/*
 * Hash two 32 bit entities.
 *
 * Folds x, x >> 8, x >> 16, x >> 24 and then the same four values of y
 * into a running hash using hash = hash * 7 + value - 1.
 *
 * The accumulation is done in unsigned arithmetic so that it wraps
 * modulo 2^32; the multiply-by-7 chain exceeds the range of int for
 * almost every input, and signed overflow would be undefined.  The
 * shifts are still applied to the signed arguments, so the values folded
 * in are the same as with plain int arithmetic.
 */
int
hash2ints(int x, int y)
{
	unsigned int hash;

	hash = (unsigned int)x - 1U;
	hash = ((hash * 7U) + (unsigned int)(x >> 8)) - 1U;
	hash = ((hash * 7U) + (unsigned int)(x >> 16)) - 1U;
	hash = ((hash * 7U) + (unsigned int)(x >> 24)) - 1U;
	hash = ((hash * 7U) + (unsigned int)y) - 1U;
	hash = ((hash * 7U) + (unsigned int)(y >> 8)) - 1U;
	hash = ((hash * 7U) + (unsigned int)(y >> 16)) - 1U;
	hash = ((hash * 7U) + (unsigned int)(y >> 24)) - 1U;

	return ((int)hash);
}
1455 
1456 
1457 /*
1458  * Return a new buffer struct.
1459  *	Create a new buffer if we haven't gone over our high water
1460  *	mark for memory, otherwise try to get one off the freelist.
1461  *
1462  * Returns a locked buf that has no id and is not on any hash or free
1463  * list.
1464  */
1465 static struct buf *
1466 bio_getfreeblk(long bsize)
1467 {
1468 	struct buf *bp, *dp;
1469 	struct hbuf *hp;
1470 	kmutex_t	*hmp;
1471 	uint_t		start, end;
1472 
1473 	/*
1474 	 * mutex_enter(&bfree_lock);
1475 	 * bfreelist.b_bufsize represents the amount of memory
1476 	 * mutex_exit(&bfree_lock); protect ref to bfreelist
1477 	 * we are allowed to allocate in the cache before we hit our hwm.
1478 	 */
1479 	bio_mem_get(bsize);	/* Account for our memory request */
1480 
1481 again:
1482 	bp = bio_bhdr_alloc();	/* Get a buf hdr */
1483 	sema_p(&bp->b_sem);	/* Should never fail */
1484 
1485 	ASSERT(bp->b_un.b_addr == NULL);
1486 	bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
1487 	if (bp->b_un.b_addr != NULL) {
1488 		/*
1489 		 * Make the common path short
1490 		 */
1491 		bp->b_bufsize = bsize;
1492 		ASSERT(SEMA_HELD(&bp->b_sem));
1493 		return (bp);
1494 	} else {
1495 		struct buf *save;
1496 
1497 		save = bp;	/* Save bp we allocated */
1498 		start = end = lastindex;
1499 
1500 		biostats.bio_bufwant.value.ui32++;
1501 
1502 		/*
1503 		 * Memory isn't available from the system now. Scan
1504 		 * the hash buckets till enough space is found.
1505 		 */
1506 		do {
1507 			hp = &hbuf[start];
1508 			hmp = &hp->b_lock;
1509 			dp = (struct buf *)hp;
1510 
1511 			mutex_enter(hmp);
1512 			bp = dp->av_forw;
1513 
1514 			while (bp != dp) {
1515 
1516 				ASSERT(bp != NULL);
1517 
1518 				if (!sema_tryp(&bp->b_sem)) {
1519 					bp = bp->av_forw;
1520 					continue;
1521 				}
1522 
1523 				/*
1524 				 * Since we are going down the freelist
1525 				 * associated with this hash bucket the
1526 				 * B_DELWRI flag should not be set.
1527 				 */
1528 				ASSERT(!(bp->b_flags & B_DELWRI));
1529 
1530 				if (bp->b_bufsize == bsize) {
1531 					hp->b_length--;
1532 					notavail(bp);
1533 					bremhash(bp);
1534 					mutex_exit(hmp);
1535 
1536 					/*
1537 					 * Didn't kmem_alloc any more, so don't
1538 					 * count it twice.
1539 					 */
1540 					mutex_enter(&bfree_lock);
1541 					bfreelist.b_bufsize += bsize;
1542 					mutex_exit(&bfree_lock);
1543 
1544 					/*
1545 					 * Update the lastindex value.
1546 					 */
1547 					lastindex = start;
1548 
1549 					/*
1550 					 * Put our saved bp back on the list
1551 					 */
1552 					sema_v(&save->b_sem);
1553 					bio_bhdr_free(save);
1554 					ASSERT(SEMA_HELD(&bp->b_sem));
1555 					return (bp);
1556 				}
1557 				sema_v(&bp->b_sem);
1558 				bp = bp->av_forw;
1559 			}
1560 			mutex_exit(hmp);
1561 			start = ((start + 1) % v.v_hbuf);
1562 		} while (start != end);
1563 
1564 		biostats.bio_bufwait.value.ui32++;
1565 		bp = save;		/* Use original bp */
1566 		bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
1567 	}
1568 
1569 	bp->b_bufsize = bsize;
1570 	ASSERT(SEMA_HELD(&bp->b_sem));
1571 	return (bp);
1572 }
1573 
1574 /*
1575  * Allocate a buffer header. If none currently available, allocate
1576  * a new pool.
1577  */
1578 static struct buf *
1579 bio_bhdr_alloc(void)
1580 {
1581 	struct buf *dp, *sdp;
1582 	struct buf *bp;
1583 	int i;
1584 
1585 	for (;;) {
1586 		mutex_enter(&bhdr_lock);
1587 		if (bhdrlist != NULL) {
1588 			bp = bhdrlist;
1589 			bhdrlist = bp->av_forw;
1590 			mutex_exit(&bhdr_lock);
1591 			bp->av_forw = NULL;
1592 			return (bp);
1593 		}
1594 		mutex_exit(&bhdr_lock);
1595 
1596 		/*
1597 		 * Need to allocate a new pool. If the system is currently
1598 		 * out of memory, then try freeing things on the freelist.
1599 		 */
1600 		dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
1601 		if (dp == NULL) {
1602 			/*
1603 			 * System can't give us a pool of headers, try
1604 			 * recycling from the free lists.
1605 			 */
1606 			bio_recycle(BIO_HEADER, 0);
1607 		} else {
1608 			sdp = dp;
1609 			for (i = 0; i < v.v_buf; i++, dp++) {
1610 				/*
1611 				 * The next two lines are needed since NODEV
1612 				 * is -1 and not NULL
1613 				 */
1614 				dp->b_dev = (o_dev_t)NODEV;
1615 				dp->b_edev = NODEV;
1616 				dp->av_forw = dp + 1;
1617 				sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
1618 				    NULL);
1619 				sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
1620 				    NULL);
1621 				dp->b_offset = -1;
1622 			}
1623 			mutex_enter(&bhdr_lock);
1624 			(--dp)->av_forw = bhdrlist;	/* Fix last pointer */
1625 			bhdrlist = sdp;
1626 			nbuf += v.v_buf;
1627 			bp = bhdrlist;
1628 			bhdrlist = bp->av_forw;
1629 			mutex_exit(&bhdr_lock);
1630 
1631 			bp->av_forw = NULL;
1632 			return (bp);
1633 		}
1634 	}
1635 }
1636 
1637 static  void
1638 bio_bhdr_free(struct buf *bp)
1639 {
1640 	ASSERT(bp->b_back == NULL);
1641 	ASSERT(bp->b_forw == NULL);
1642 	ASSERT(bp->av_back == NULL);
1643 	ASSERT(bp->av_forw == NULL);
1644 	ASSERT(bp->b_un.b_addr == NULL);
1645 	ASSERT(bp->b_dev == (o_dev_t)NODEV);
1646 	ASSERT(bp->b_edev == NODEV);
1647 	ASSERT(bp->b_flags == 0);
1648 
1649 	mutex_enter(&bhdr_lock);
1650 	bp->av_forw = bhdrlist;
1651 	bhdrlist = bp;
1652 	mutex_exit(&bhdr_lock);
1653 }
1654 
1655 /*
1656  * If we haven't gone over the high water mark, it's o.k. to
1657  * allocate more buffer space, otherwise recycle buffers
1658  * from the freelist until enough memory is free for a bsize request.
1659  *
1660  * We account for this memory, even though
1661  * we don't allocate it here.
1662  */
1663 static void
1664 bio_mem_get(long bsize)
1665 {
1666 	mutex_enter(&bfree_lock);
1667 	if (bfreelist.b_bufsize > bsize) {
1668 		bfreelist.b_bufsize -= bsize;
1669 		mutex_exit(&bfree_lock);
1670 		return;
1671 	}
1672 	mutex_exit(&bfree_lock);
1673 	bio_recycle(BIO_MEM, bsize);
1674 }
1675 
1676 /*
1677  * flush a list of delayed write buffers.
1678  * (currently used only by bio_recycle below.)
1679  */
1680 static void
1681 bio_flushlist(struct buf *delwri_list)
1682 {
1683 	struct buf *bp;
1684 
1685 	while (delwri_list != EMPTY_LIST) {
1686 		bp = delwri_list;
1687 		bp->b_flags |= B_AGE | B_ASYNC;
1688 		if (bp->b_vp == NULL) {		/* !ufs */
1689 			BWRITE(bp);
1690 		} else {			/* ufs */
1691 			UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
1692 		}
1693 		delwri_list = bp->b_list;
1694 		bp->b_list = NULL;
1695 	}
1696 }
1697 
1698 /*
1699  * Start recycling buffers on the freelist for one of 2 reasons:
1700  *	- we need a buffer header
1701  *	- we need to free up memory
1702  * Once started we continue to recycle buffers until the B_AGE
1703  * buffers are gone.
1704  */
1705 static void
1706 bio_recycle(int want, long bsize)
1707 {
1708 	struct buf *bp, *dp, *dwp, *nbp;
1709 	struct hbuf *hp;
1710 	int	found = 0;
1711 	kmutex_t	*hmp;
1712 	int		start, end;
1713 	struct buf *delwri_list = EMPTY_LIST;
1714 
1715 	/*
1716 	 * Recycle buffers.
1717 	 */
1718 top:
1719 	start = end = lastindex;
1720 	do {
1721 		hp = &hbuf[start];
1722 		hmp = &hp->b_lock;
1723 		dp = (struct buf *)hp;
1724 
1725 		mutex_enter(hmp);
1726 		bp = dp->av_forw;
1727 
1728 		while (bp != dp) {
1729 
1730 			ASSERT(bp != NULL);
1731 
1732 			if (!sema_tryp(&bp->b_sem)) {
1733 				bp = bp->av_forw;
1734 				continue;
1735 			}
1736 			/*
1737 			 * Do we really want to nuke all of the B_AGE stuff??
1738 			 */
1739 			if ((bp->b_flags & B_AGE) == 0 && found) {
1740 				sema_v(&bp->b_sem);
1741 				mutex_exit(hmp);
1742 				lastindex = start;
1743 				return;	/* All done */
1744 			}
1745 
1746 			ASSERT(MUTEX_HELD(&hp->b_lock));
1747 			ASSERT(!(bp->b_flags & B_DELWRI));
1748 			hp->b_length--;
1749 			notavail(bp);
1750 
1751 			/*
1752 			 * Remove bhdr from cache, free up memory,
1753 			 * and add the hdr to the freelist.
1754 			 */
1755 			bremhash(bp);
1756 			mutex_exit(hmp);
1757 
1758 			if (bp->b_bufsize) {
1759 				kmem_free(bp->b_un.b_addr, bp->b_bufsize);
1760 				bp->b_un.b_addr = NULL;
1761 				mutex_enter(&bfree_lock);
1762 				bfreelist.b_bufsize += bp->b_bufsize;
1763 				mutex_exit(&bfree_lock);
1764 			}
1765 
1766 			bp->b_dev = (o_dev_t)NODEV;
1767 			bp->b_edev = NODEV;
1768 			bp->b_flags = 0;
1769 			sema_v(&bp->b_sem);
1770 			bio_bhdr_free(bp);
1771 			if (want == BIO_HEADER) {
1772 				found = 1;
1773 			} else {
1774 				ASSERT(want == BIO_MEM);
1775 				if (!found && bfreelist.b_bufsize >= bsize) {
1776 					/* Account for the memory we want */
1777 					mutex_enter(&bfree_lock);
1778 					if (bfreelist.b_bufsize >= bsize) {
1779 						bfreelist.b_bufsize -= bsize;
1780 						found = 1;
1781 					}
1782 					mutex_exit(&bfree_lock);
1783 				}
1784 			}
1785 
1786 			/*
1787 			 * Since we dropped hmp start from the
1788 			 * begining.
1789 			 */
1790 			mutex_enter(hmp);
1791 			bp = dp->av_forw;
1792 		}
1793 		mutex_exit(hmp);
1794 
1795 		/*
1796 		 * Look at the delayed write list.
1797 		 * First gather into a private list, then write them.
1798 		 */
1799 		dwp = (struct buf *)&dwbuf[start];
1800 		mutex_enter(&blist_lock);
1801 		bio_doingflush++;
1802 		mutex_enter(hmp);
1803 		for (bp = dwp->av_forw; bp != dwp; bp = nbp) {
1804 
1805 			ASSERT(bp != NULL);
1806 			nbp = bp->av_forw;
1807 
1808 			if (!sema_tryp(&bp->b_sem))
1809 				continue;
1810 			ASSERT(bp->b_flags & B_DELWRI);
1811 			/*
1812 			 * Do we really want to nuke all of the B_AGE stuff??
1813 			 */
1814 
1815 			if ((bp->b_flags & B_AGE) == 0 && found) {
1816 				sema_v(&bp->b_sem);
1817 				mutex_exit(hmp);
1818 				lastindex = start;
1819 				mutex_exit(&blist_lock);
1820 				bio_flushlist(delwri_list);
1821 				mutex_enter(&blist_lock);
1822 				bio_doingflush--;
1823 				if (bio_flinv_cv_wanted) {
1824 					bio_flinv_cv_wanted = 0;
1825 					cv_broadcast(&bio_flushinval_cv);
1826 				}
1827 				mutex_exit(&blist_lock);
1828 				return; /* All done */
1829 			}
1830 
1831 			/*
1832 			 * If the buffer is already on a flush or
1833 			 * invalidate list then just skip it.
1834 			 */
1835 			if (bp->b_list != NULL) {
1836 				sema_v(&bp->b_sem);
1837 				continue;
1838 			}
1839 			/*
1840 			 * We are still on the same bucket.
1841 			 */
1842 			hp->b_length--;
1843 			notavail(bp);
1844 			bp->b_list = delwri_list;
1845 			delwri_list = bp;
1846 		}
1847 		mutex_exit(hmp);
1848 		mutex_exit(&blist_lock);
1849 		bio_flushlist(delwri_list);
1850 		delwri_list = EMPTY_LIST;
1851 		mutex_enter(&blist_lock);
1852 		bio_doingflush--;
1853 		if (bio_flinv_cv_wanted) {
1854 			bio_flinv_cv_wanted = 0;
1855 			cv_broadcast(&bio_flushinval_cv);
1856 		}
1857 		mutex_exit(&blist_lock);
1858 		start = (start + 1) % v.v_hbuf;
1859 
1860 	} while (start != end);
1861 
1862 	if (found)
1863 		return;
1864 
1865 	/*
1866 	 * Free lists exhausted and we haven't satisfied the request.
1867 	 * Wait here for more entries to be added to freelist.
1868 	 * Because this might have just happened, make it timed.
1869 	 */
1870 	mutex_enter(&bfree_lock);
1871 	bfreelist.b_flags |= B_WANTED;
1872 	(void) cv_reltimedwait(&bio_mem_cv, &bfree_lock, hz, TR_CLOCK_TICK);
1873 	mutex_exit(&bfree_lock);
1874 	goto top;
1875 }
1876 
1877 /*
1878  * See if the block is associated with some buffer
1879  * (mainly to avoid getting hung up on a wait in breada).
1880  */
1881 static int
1882 bio_incore(dev_t dev, daddr_t blkno)
1883 {
1884 	struct buf *bp;
1885 	struct buf *dp;
1886 	uint_t index;
1887 	kmutex_t *hmp;
1888 
1889 	index = bio_bhash(dev, blkno);
1890 	dp = (struct buf *)&hbuf[index];
1891 	hmp = &hbuf[index].b_lock;
1892 
1893 	mutex_enter(hmp);
1894 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1895 		if (bp->b_blkno == blkno && bp->b_edev == dev &&
1896 		    (bp->b_flags & B_STALE) == 0) {
1897 			mutex_exit(hmp);
1898 			return (1);
1899 		}
1900 	}
1901 	mutex_exit(hmp);
1902 	return (0);
1903 }
1904 
1905 static void
1906 bio_pageio_done(struct buf *bp)
1907 {
1908 	if (bp->b_flags & B_PAGEIO) {
1909 
1910 		if (bp->b_flags & B_REMAPPED)
1911 			bp_mapout(bp);
1912 
1913 		if (bp->b_flags & B_READ)
1914 			pvn_read_done(bp->b_pages, bp->b_flags);
1915 		else
1916 			pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags);
1917 		pageio_done(bp);
1918 	} else {
1919 		ASSERT(bp->b_flags & B_REMAPPED);
1920 		bp_mapout(bp);
1921 		brelse(bp);
1922 	}
1923 }
1924 
1925 /*
1926  * bioerror(9F) - indicate error in buffer header
1927  * If 'error' is zero, remove the error indication.
1928  */
1929 void
1930 bioerror(struct buf *bp, int error)
1931 {
1932 	ASSERT(bp != NULL);
1933 	ASSERT(error >= 0);
1934 	ASSERT(SEMA_HELD(&bp->b_sem));
1935 
1936 	if (error != 0) {
1937 		bp->b_flags |= B_ERROR;
1938 	} else {
1939 		bp->b_flags &= ~B_ERROR;
1940 	}
1941 	bp->b_error = error;
1942 }
1943 
1944 /*
1945  * bioreset(9F) - reuse a private buffer header after I/O is complete
1946  */
1947 void
1948 bioreset(struct buf *bp)
1949 {
1950 	ASSERT(bp != NULL);
1951 
1952 	biofini(bp);
1953 	bioinit(bp);
1954 }
1955 
1956 /*
1957  * biosize(9F) - return size of a buffer header
1958  */
1959 size_t
1960 biosize(void)
1961 {
1962 	return (sizeof (struct buf));
1963 }
1964 
1965 /*
1966  * biomodified(9F) - check if buffer is modified
1967  */
1968 int
1969 biomodified(struct buf *bp)
1970 {
1971 	int npf;
1972 	int ppattr;
1973 	struct page *pp;
1974 
1975 	ASSERT(bp != NULL);
1976 
1977 	if ((bp->b_flags & B_PAGEIO) == 0) {
1978 		return (-1);
1979 	}
1980 	pp = bp->b_pages;
1981 	npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET));
1982 
1983 	while (npf > 0) {
1984 		ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
1985 		    HAT_SYNC_STOPON_MOD);
1986 		if (ppattr & P_MOD)
1987 			return (1);
1988 		pp = pp->p_next;
1989 		npf--;
1990 	}
1991 
1992 	return (0);
1993 }
1994 
1995 /*
1996  * bioinit(9F) - initialize a buffer structure
1997  */
1998 void
1999 bioinit(struct buf *bp)
2000 {
2001 	bzero(bp, sizeof (struct buf));
2002 	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
2003 	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
2004 	bp->b_offset = -1;
2005 }
2006 
2007 /*
2008  * biofini(9F) - uninitialize a buffer structure
2009  */
2010 void
2011 biofini(struct buf *bp)
2012 {
2013 	sema_destroy(&bp->b_io);
2014 	sema_destroy(&bp->b_sem);
2015 }
2016 
2017 /*
2018  * bioclone(9F) - clone a buffer
2019  */
2020 struct buf *
2021 bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno,
2022     int (*iodone)(struct buf *), struct buf *bp_mem, int sleep)
2023 {
2024 	struct buf *bufp;
2025 
2026 	ASSERT(bp);
2027 	if (bp_mem == NULL) {
2028 		bufp = kmem_alloc(sizeof (struct buf), sleep);
2029 		if (bufp == NULL) {
2030 			return (NULL);
2031 		}
2032 		bioinit(bufp);
2033 	} else {
2034 		bufp = bp_mem;
2035 		bioreset(bufp);
2036 	}
2037 
2038 #define	BUF_CLONE_FLAGS	(B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
2039 	B_ABRWRITE)
2040 
2041 	/*
2042 	 * The cloned buffer does not inherit the B_REMAPPED flag.
2043 	 */
2044 	bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS)  | B_BUSY;
2045 	bufp->b_bcount = len;
2046 	bufp->b_blkno = blkno;
2047 	bufp->b_iodone = iodone;
2048 	bufp->b_proc = bp->b_proc;
2049 	bufp->b_edev = dev;
2050 	bufp->b_file = bp->b_file;
2051 	bufp->b_offset = bp->b_offset;
2052 
2053 	if (bp->b_flags & B_SHADOW) {
2054 		ASSERT(bp->b_shadow);
2055 		ASSERT(bp->b_flags & B_PHYS);
2056 
2057 		bufp->b_shadow = bp->b_shadow +
2058 		    btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off);
2059 		bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2060 		if (bp->b_flags & B_REMAPPED)
2061 			bufp->b_proc = NULL;
2062 	} else {
2063 		if (bp->b_flags & B_PAGEIO) {
2064 			struct page *pp;
2065 			off_t o;
2066 			int i;
2067 
2068 			pp = bp->b_pages;
2069 			o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off;
2070 			for (i = btop(o); i > 0; i--) {
2071 				pp = pp->p_next;
2072 			}
2073 			bufp->b_pages = pp;
2074 			bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET);
2075 		} else {
2076 			bufp->b_un.b_addr =
2077 			    (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2078 			if (bp->b_flags & B_REMAPPED)
2079 				bufp->b_proc = NULL;
2080 		}
2081 	}
2082 	return (bufp);
2083 }
2084