1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 * Copyright 2019 Joyent, Inc.
25 */
26
27 /*
28 * Copyright (c) 2016 by Delphix. All rights reserved.
29 */
30
31 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
32 /* All Rights Reserved */
33
34 /*
35 * University Copyright- Copyright (c) 1982, 1986, 1988
36 * The Regents of the University of California
37 * All Rights Reserved
38 *
39 * University Acknowledgment- Portions of this document are derived from
40 * software developed by the University of California, Berkeley, and its
41 * contributors.
42 */
43
44 #include <sys/types.h>
45 #include <sys/t_lock.h>
46 #include <sys/sysmacros.h>
47 #include <sys/conf.h>
48 #include <sys/cpuvar.h>
49 #include <sys/errno.h>
50 #include <sys/debug.h>
51 #include <sys/buf.h>
52 #include <sys/var.h>
53 #include <sys/vnode.h>
54 #include <sys/bitmap.h>
55 #include <sys/cmn_err.h>
56 #include <sys/kmem.h>
57 #include <sys/vmem.h>
58 #include <sys/atomic.h>
59 #include <vm/seg_kmem.h>
60 #include <vm/page.h>
61 #include <vm/pvn.h>
62 #include <sys/vtrace.h>
63 #include <sys/tnf_probe.h>
64 #include <sys/fs/ufs_inode.h>
65 #include <sys/fs/ufs_bio.h>
66 #include <sys/fs/ufs_log.h>
67 #include <sys/systm.h>
68 #include <sys/vfs.h>
69 #include <sys/sdt.h>
70
71 /* Locks */
72 static kmutex_t blist_lock; /* protects b_list */
73 static kmutex_t bhdr_lock; /* protects the bhdrlist */
74 static kmutex_t bfree_lock; /* protects the bfreelist structure */
75
76 struct hbuf *hbuf; /* Hash buckets */
77 struct dwbuf *dwbuf; /* Delayed write buckets */
78 static struct buf *bhdrlist; /* buf header free list */
79 static int nbuf; /* number of buffer headers allocated */
80
81 static int lastindex; /* Reference point on where to start */
82 /* when looking for free buffers */
83
84 #define bio_bhash(dev, bn) (hash2ints((dev), (int)(bn)) & v.v_hmask)
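/*
 * bio_bhash() picks the hash bucket for a (device, block) pair:
 * hash2ints() mixes the two values and the result is masked with
 * v.v_hmask, which binit() sets to v.v_hbuf - 1 (v.v_hbuf is rounded
 * up to a power of two, so the mask yields a valid bucket index).
 */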
85 #define EMPTY_LIST ((struct buf *)-1)
86
87 static kcondvar_t bio_mem_cv; /* Condition variables */
88 static kcondvar_t bio_flushinval_cv;
89 static int bio_doingflush; /* flush in progress */
90 static int bio_doinginval; /* inval in progress */
91 static int bio_flinv_cv_wanted; /* someone waiting for cv */
92
93 /*
94 * Statistics on the buffer cache
95 */
96 struct biostats biostats = {
97 { "buffer_cache_lookups", KSTAT_DATA_UINT32 },
98 { "buffer_cache_hits", KSTAT_DATA_UINT32 },
99 { "new_buffer_requests", KSTAT_DATA_UINT32 },
100 { "waits_for_buffer_allocs", KSTAT_DATA_UINT32 },
101 { "buffers_locked_by_someone", KSTAT_DATA_UINT32 },
102 { "duplicate_buffers_found", KSTAT_DATA_UINT32 }
103 };
104
105 /*
106 * kstat data
107 */
108 kstat_named_t *biostats_ptr = (kstat_named_t *)&biostats;
109 uint_t biostats_ndata = (uint_t)(sizeof (biostats) /
110 sizeof (kstat_named_t));
111
112 /*
113 * Statistics on ufs buffer cache
114 * Not protected by locks
115 */
116 struct ufsbiostats ub = {
117 { "breads", KSTAT_DATA_UINT32 },
118 { "bwrites", KSTAT_DATA_UINT32 },
119 { "fbiwrites", KSTAT_DATA_UINT32 },
120 { "getpages", KSTAT_DATA_UINT32 },
121 { "getras", KSTAT_DATA_UINT32 },
122 { "putsyncs", KSTAT_DATA_UINT32 },
123 { "putasyncs", KSTAT_DATA_UINT32 },
124 { "putpageios", KSTAT_DATA_UINT32 },
125 };
126
127 /*
128 * more UFS Logging eccentricities...
129 *
130 * required since "#pragma weak ..." doesn't work in reverse order:
131 * genunix (bio.c) is loaded before the ufs modules, so pointers
132 * to ufs routines can't be resolved here. Instead, they are
133 * initialized when the "lufsops" table is set up in
134 * "lufs.c:_init()".
135 */
136 void (*bio_lufs_strategy)(void *, buf_t *);
137 void (*bio_snapshot_strategy)(void *, buf_t *);
138
139
140 /* Private routines */
141 static struct buf *bio_getfreeblk(long);
142 static void bio_mem_get(long);
143 static void bio_bhdr_free(struct buf *);
144 static struct buf *bio_bhdr_alloc(void);
145 static void bio_recycle(int, long);
146 static void bio_pageio_done(struct buf *);
147 static int bio_incore(dev_t, daddr_t);
148
149 /*
150 * Buffer cache constants
151 */
152 #define BIO_BUF_PERCENT (100/2) /* divisor: default is 2% of memory */
153 #define BIO_MAX_PERCENT (100/20) /* divisor: max is 20% of real memory */
154 #define BIO_BHDR_POOL 100 /* Default bhdr pool size */
155 #define BIO_MIN_HDR 10 /* Minimum number of buffer headers */
156 #define BIO_MIN_HWM (BIO_MIN_HDR * MAXBSIZE / 1024)
157 #define BIO_HASHLEN 4 /* Target length of hash chains */
158
159
160 /* Flags for bio_recycle() */
161 #define BIO_HEADER 0x01
162 #define BIO_MEM 0x02
163
164 extern int bufhwm; /* User tunable - high water mark for mem */
165 extern int bufhwm_pct; /* ditto - given in % of physmem */
166
167 /*
168 * The following routines allocate and free
169 * buffers with various side effects. In general the
170 * arguments to an allocate routine are a device and
171 * a block number, and the value is a pointer
172 * to the buffer header; the buffer returned is locked with a
173 * binary semaphore so that no one else can touch it. If the block was
174 * already in core, no I/O need be done; if it is
175 * already locked, the process waits until it becomes free.
176 * The following routines allocate a buffer:
177 * getblk
178 * bread/BREAD
179 * breada
180 * Eventually the buffer must be released, possibly with the
181 * side effect of writing it out, by using one of
182 * bwrite/BWRITE/brwrite
183 * bdwrite/bdrwrite
184 * bawrite
185 * brelse
186 *
187 * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
188 * Instead, a binary semaphore, b_sem, is used to gain exclusive access to
189 * a buffer, and a binary semaphore, b_io, is used for I/O synchronization.
190 * B_DONE is still used to denote a buffer with I/O complete on it.
191 *
192 * The bfreelist.b_bcount field is computed every time fsflush runs. It
193 * should not be used where a very accurate count of the free buffers is
194 * needed.
195 */
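/*
 * A minimal usage sketch of the allocate/release pattern described
 * above (illustrative only; "mydev", "blkno" and "MYFS_BSIZE" are
 * hypothetical placeholders, not names defined in this file):
 *
 *	struct buf *bp;
 *
 *	bp = bread(mydev, blkno, MYFS_BSIZE);
 *	if (geterror(bp) != 0) {
 *		brelse(bp);
 *		return (EIO);
 *	}
 *	... examine or modify bp->b_un.b_addr ...
 *	bdwrite(bp);
 *
 * bread() returns the buffer with b_sem held; brelse() and bdwrite()
 * give it back, bdwrite() additionally marking it B_DELWRI so it is
 * written out before being reused.
 */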
196
197 /*
198 * Read in (if necessary) the block and return a buffer pointer.
199 *
200 * This interface is provided for binary compatibility. Using
201 * BREAD() directly avoids the extra function call overhead invoked
202 * by calling this routine.
203 */
204 struct buf *
205 bread(dev_t dev, daddr_t blkno, long bsize)
206 {
207 return (BREAD(dev, blkno, bsize));
208 }
209
210 /*
211 * Common code for reading a buffer with various options
212 *
213 * Read in (if necessary) the block and return a buffer pointer.
214 */
215 struct buf *
216 bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
217 {
218 struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
219 struct buf *bp;
220 klwp_t *lwp = ttolwp(curthread);
221
222 CPU_STATS_ADD_K(sys, lread, 1);
223 bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1);
224 if (bp->b_flags & B_DONE)
225 return (bp);
226 bp->b_flags |= B_READ;
227 ASSERT(bp->b_bcount == bsize);
228 if (ufsvfsp == NULL) { /* !ufs */
229 (void) bdev_strategy(bp);
230 } else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
231 /* ufs && logging */
232 (*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
233 } else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
234 /* ufs && snapshots */
235 (*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
236 } else {
237 ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
238 ub.ub_breads.value.ul++; /* ufs && !logging */
239 (void) bdev_strategy(bp);
240 }
241 if (lwp != NULL)
242 lwp->lwp_ru.inblock++;
243 CPU_STATS_ADD_K(sys, bread, 1);
244 (void) biowait(bp);
245 return (bp);
246 }
247
248 /*
249 * Read in the block, like bread, but also start I/O on the
250 * read-ahead block (which is not allocated to the caller).
251 */
252 struct buf *
253 breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
254 {
255 struct buf *bp, *rabp;
256 klwp_t *lwp = ttolwp(curthread);
257
258 bp = NULL;
259 if (!bio_incore(dev, blkno)) {
260 CPU_STATS_ADD_K(sys, lread, 1);
261 bp = GETBLK(dev, blkno, bsize);
262 if ((bp->b_flags & B_DONE) == 0) {
263 bp->b_flags |= B_READ;
264 bp->b_bcount = bsize;
265 (void) bdev_strategy(bp);
266 if (lwp != NULL)
267 lwp->lwp_ru.inblock++;
268 CPU_STATS_ADD_K(sys, bread, 1);
269 }
270 }
271 if (rablkno && bfreelist.b_bcount > 1 &&
272 !bio_incore(dev, rablkno)) {
273 rabp = GETBLK(dev, rablkno, bsize);
274 if (rabp->b_flags & B_DONE)
275 brelse(rabp);
276 else {
277 rabp->b_flags |= B_READ|B_ASYNC;
278 rabp->b_bcount = bsize;
279 (void) bdev_strategy(rabp);
280 if (lwp != NULL)
281 lwp->lwp_ru.inblock++;
282 CPU_STATS_ADD_K(sys, bread, 1);
283 }
284 }
285 if (bp == NULL)
286 return (BREAD(dev, blkno, bsize));
287 (void) biowait(bp);
288 return (bp);
289 }
290
291 /*
292 * Common code for writing a buffer with various options.
293 *
294 * force_wait - wait for write completion regardless of B_ASYNC flag
295 * do_relse - release the buffer when we are done
296 * clear_flags - flags to clear from the buffer
297 */
298 void
299 bwrite_common(void *arg, struct buf *bp, int force_wait,
300 int do_relse, int clear_flags)
301 {
302 register int do_wait;
303 struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
304 int flag;
305 klwp_t *lwp = ttolwp(curthread);
306 struct cpu *cpup;
307
308 ASSERT(SEMA_HELD(&bp->b_sem));
309 flag = bp->b_flags;
310 bp->b_flags &= ~clear_flags;
311 if (lwp != NULL)
312 lwp->lwp_ru.oublock++;
313 CPU_STATS_ENTER_K();
314 cpup = CPU; /* get pointer AFTER preemption is disabled */
315 CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
316 CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
317 do_wait = ((flag & B_ASYNC) == 0 || force_wait);
318 if (do_wait == 0)
319 CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
320 CPU_STATS_EXIT_K();
321 if (ufsvfsp == NULL) {
322 (void) bdev_strategy(bp);
323 } else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
324 /* ufs && logging */
325 (*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
326 } else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
327 /* ufs && snapshots */
328 (*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
329 } else {
330 ub.ub_bwrites.value.ul++; /* ufs && !logging */
331 (void) bdev_strategy(bp);
332 }
333 if (do_wait) {
334 (void) biowait(bp);
335 if (do_relse) {
336 brelse(bp);
337 }
338 }
339 }
340
341 /*
342 * Write the buffer, waiting for completion (unless B_ASYNC is set).
343 * Then release the buffer.
344 * This interface is provided for binary compatibility. Using
345 * BWRITE() directly avoids the extra function call overhead invoked
346 * by calling this routine.
347 */
348 void
349 bwrite(struct buf *bp)
350 {
351 BWRITE(bp);
352 }
353
354 /*
355 * Write the buffer, waiting for completion.
356 * But don't release the buffer afterwards.
357 * This interface is provided for binary compatibility. Using
358 * BWRITE2() directly avoids the extra function call overhead.
359 */
360 void
361 bwrite2(struct buf *bp)
362 {
363 BWRITE2(bp);
364 }
365
366 /*
367 * Release the buffer, marking it so that if it is grabbed
368 * for another purpose it will be written out before being
369 * given up (e.g. when writing a partial block where it is
370 * assumed that another write for the same block will soon follow).
371 * Also save the time that the block is first marked as delayed
372 * so that it will be written in a reasonable time.
373 */
374 void
375 bdwrite(struct buf *bp)
376 {
377 ASSERT(SEMA_HELD(&bp->b_sem));
378 CPU_STATS_ADD_K(sys, lwrite, 1);
379 if ((bp->b_flags & B_DELWRI) == 0)
380 bp->b_start = ddi_get_lbolt();
381 /*
382 * B_DONE allows others to use the buffer, B_DELWRI causes the
383 * buffer to be written before being reused, and setting b_resid
384 * to zero says the buffer is complete.
385 */
386 bp->b_flags |= B_DELWRI | B_DONE;
387 bp->b_resid = 0;
388 brelse(bp);
389 }
390
391 /*
392 * Release the buffer, start I/O on it, but don't wait for completion.
393 */
394 void
395 bawrite(struct buf *bp)
396 {
397 ASSERT(SEMA_HELD(&bp->b_sem));
398
399 /* Use bfreelist.b_bcount as a weird-ass heuristic */
400 if (bfreelist.b_bcount > 4)
401 bp->b_flags |= B_ASYNC;
402 BWRITE(bp);
403 }
404
405 /*
406 * Release the buffer, with no I/O implied.
407 */
408 void
409 brelse(struct buf *bp)
410 {
411 struct buf **backp;
412 uint_t index;
413 kmutex_t *hmp;
414 struct buf *dp;
415 struct hbuf *hp;
416
417
418 ASSERT(SEMA_HELD(&bp->b_sem));
419
420 /*
421 * Clear the retry write flag if the buffer was written without
422 * error. The presence of B_DELWRI means the buffer has not yet
423 * been written and the presence of B_ERROR means that an error
424 * is still occurring.
425 */
426 if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
427 bp->b_flags &= ~B_RETRYWRI;
428 }
429
430 /* Check for anomalous conditions */
431 if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
432 if (bp->b_flags & B_NOCACHE) {
433 /* Don't add to the freelist. Destroy it now */
434 kmem_free(bp->b_un.b_addr, bp->b_bufsize);
435 sema_destroy(&bp->b_sem);
436 sema_destroy(&bp->b_io);
437 kmem_free(bp, sizeof (struct buf));
438 return;
439 }
440 /*
441 * If a write failed and we are supposed to retry write,
442 * don't toss the buffer. Keep it around and mark it
443 * delayed write in the hopes that it will eventually
444 * get flushed (and still keep the system running.)
445 */
446 if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
447 bp->b_flags |= B_DELWRI;
448 /* keep fsflush from trying continuously to flush */
449 bp->b_start = ddi_get_lbolt();
450 } else
451 bp->b_flags |= B_AGE|B_STALE;
452 bp->b_flags &= ~B_ERROR;
453 bp->b_error = 0;
454 }
455
456 /*
457 * If delayed write is set then put it on the delayed
458 * write list instead of the free buffer list.
459 */
460 index = bio_bhash(bp->b_edev, bp->b_blkno);
461 hmp = &hbuf[index].b_lock;
462
463 mutex_enter(hmp);
464 hp = &hbuf[index];
465 dp = (struct buf *)hp;
466
467 /*
468 * Make sure that the number of entries on this list is in range:
469 * Zero <= count <= total # buffers
470 */
471 ASSERT(hp->b_length >= 0);
472 ASSERT(hp->b_length < nbuf);
473
474 hp->b_length++; /* We are adding this buffer */
475
476 if (bp->b_flags & B_DELWRI) {
477 /*
478 * This buffer goes on the delayed write buffer list
479 */
480 dp = (struct buf *)&dwbuf[index];
481 }
482 ASSERT(bp->b_bufsize > 0);
483 ASSERT(bp->b_bcount > 0);
484 ASSERT(bp->b_un.b_addr != NULL);
485
486 if (bp->b_flags & B_AGE) {
487 backp = &dp->av_forw;
488 (*backp)->av_back = bp;
489 bp->av_forw = *backp;
490 *backp = bp;
491 bp->av_back = dp;
492 } else {
493 backp = &dp->av_back;
494 (*backp)->av_forw = bp;
495 bp->av_back = *backp;
496 *backp = bp;
497 bp->av_forw = dp;
498 }
499 mutex_exit(hmp);
500
501 if (bfreelist.b_flags & B_WANTED) {
502 /*
503 * Should come here very very rarely.
504 */
505 mutex_enter(&bfree_lock);
506 if (bfreelist.b_flags & B_WANTED) {
507 bfreelist.b_flags &= ~B_WANTED;
508 cv_broadcast(&bio_mem_cv);
509 }
510 mutex_exit(&bfree_lock);
511 }
512
513 bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
514 /*
515 * Don't let anyone get the buffer off the freelist before we
516 * release our hold on it.
517 */
518 sema_v(&bp->b_sem);
519 }
520
521 /*
522 * Return a count of the number of B_BUSY buffers in the system
523 * Can only be used as a good estimate. If 'cleanit' is set,
524 * try to flush all bufs.
525 */
526 int
527 bio_busy(int cleanit)
528 {
529 struct buf *bp, *dp;
530 int busy = 0;
531 int i;
532 kmutex_t *hmp;
533
534 for (i = 0; i < v.v_hbuf; i++) {
535 dp = (struct buf *)&hbuf[i];
536 hmp = &hbuf[i].b_lock;
537
538 mutex_enter(hmp);
539 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
540 if (bp->b_flags & B_BUSY)
541 busy++;
542 }
543 mutex_exit(hmp);
544 }
545
546 if (cleanit && busy != 0) {
547 bflush(NODEV);
548 }
549
550 return (busy);
551 }
552
553 /*
554 * This interface is provided for binary compatibility.
555 *
556 * Assign a buffer for the given block. If the appropriate
557 * block is already associated, return it; otherwise search
558 * for the oldest non-busy buffer and reassign it.
559 */
560 struct buf *
561 getblk(dev_t dev, daddr_t blkno, long bsize)
562 {
563 return (getblk_common(/* ufsvfsp */ NULL, dev,
564 blkno, bsize, /* errflg */ 0));
565 }
566
567 /*
568 * Assign a buffer for the given block. If the appropriate
569 * block is already associated, return it; otherwise search
570 * for the oldest non-busy buffer and reassign it.
571 */
572 struct buf *
573 getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, int errflg)
574 {
575 ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
576 struct buf *bp;
577 struct buf *dp;
578 struct buf *nbp = NULL;
579 struct buf *errbp;
580 uint_t index;
581 kmutex_t *hmp;
582 struct hbuf *hp;
583
584 if (getmajor(dev) >= devcnt)
585 cmn_err(CE_PANIC, "blkdev");
586
587 biostats.bio_lookup.value.ui32++;
588
589 index = bio_bhash(dev, blkno);
590 hp = &hbuf[index];
591 dp = (struct buf *)hp;
592 hmp = &hp->b_lock;
593
594 mutex_enter(hmp);
595 loop:
596 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
597 if (bp->b_blkno != blkno || bp->b_edev != dev ||
598 (bp->b_flags & B_STALE))
599 continue;
600 /*
601 * Avoid holding the hash lock in the event that
602 * the buffer is locked by someone. Since the hash chain
603 * may change when we drop the hash lock
604 * we have to start at the beginning of the chain if the
605 * buffer identity/contents aren't valid.
606 */
607 if (!sema_tryp(&bp->b_sem)) {
608 biostats.bio_bufbusy.value.ui32++;
609 mutex_exit(hmp);
610 /*
611 * OK, we are dealing with a busy buffer.
612 * In the case that we are panicking and we
613 * got called from bread(), we have some chance
614 * for error recovery. So better bail out from
615 * here since sema_p() won't block. If we got
616 * called directly from ufs routines, there is
617 * no way to report an error yet.
618 */
619 if (panicstr && errflg)
620 goto errout;
621 /*
622 * For the following line of code to work
623 * correctly never kmem_free the buffer "header".
624 */
625 sema_p(&bp->b_sem);
626 if (bp->b_blkno != blkno || bp->b_edev != dev ||
627 (bp->b_flags & B_STALE)) {
628 sema_v(&bp->b_sem);
629 mutex_enter(hmp);
630 goto loop; /* start over */
631 }
632 mutex_enter(hmp);
633 }
634 /* Found */
635 biostats.bio_hit.value.ui32++;
636 bp->b_flags &= ~B_AGE;
637
638 /*
639 * Yank it off the free/delayed write lists
640 */
641 hp->b_length--;
642 notavail(bp);
643 mutex_exit(hmp);
644
645 ASSERT((bp->b_flags & B_NOCACHE) == 0);
646
647 if (nbp == NULL) {
648 /*
649 * Make the common path short.
650 */
651 ASSERT(SEMA_HELD(&bp->b_sem));
652 return (bp);
653 }
654
655 biostats.bio_bufdup.value.ui32++;
656
657 /*
658 * The buffer must have entered the cache during the lock upgrade,
659 * so free the new buffer we allocated and return the
660 * found buffer.
661 */
662 kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
663 nbp->b_un.b_addr = NULL;
664
665 /*
666 * Account for the memory
667 */
668 mutex_enter(&bfree_lock);
669 bfreelist.b_bufsize += nbp->b_bufsize;
670 mutex_exit(&bfree_lock);
671
672 /*
673 * Destroy buf identity, and place on avail list
674 */
675 nbp->b_dev = (o_dev_t)NODEV;
676 nbp->b_edev = NODEV;
677 nbp->b_flags = 0;
678 nbp->b_file = NULL;
679 nbp->b_offset = -1;
680
681 sema_v(&nbp->b_sem);
682 bio_bhdr_free(nbp);
683
684 ASSERT(SEMA_HELD(&bp->b_sem));
685 return (bp);
686 }
687
688 /*
689 * bio_getfreeblk may block so check the hash chain again.
690 */
691 if (nbp == NULL) {
692 mutex_exit(hmp);
693 nbp = bio_getfreeblk(bsize);
694 mutex_enter(hmp);
695 goto loop;
696 }
697
698 /*
699 * New buffer. Assign nbp and stick it on the hash.
700 */
701 nbp->b_flags = B_BUSY;
702 nbp->b_edev = dev;
703 nbp->b_dev = (o_dev_t)cmpdev(dev);
704 nbp->b_blkno = blkno;
705 nbp->b_iodone = NULL;
706 nbp->b_bcount = bsize;
707 /*
708 * If we are given a ufsvfsp and the vfs_root field is NULL
709 * then this must be I/O for a superblock. A superblock's
710 * buffer is set up in mountfs() and there is no root vnode
711 * at that point.
712 */
713 if (ufsvfsp && ufsvfsp->vfs_root) {
714 nbp->b_vp = ufsvfsp->vfs_root;
715 } else {
716 nbp->b_vp = NULL;
717 }
718
719 ASSERT((nbp->b_flags & B_NOCACHE) == 0);
720
721 binshash(nbp, dp);
722 mutex_exit(hmp);
723
724 ASSERT(SEMA_HELD(&nbp->b_sem));
725
726 return (nbp);
727
728
729 /*
730 * Come here in case of an internal error. At this point we couldn't
731 * get a buffer, but we have to return one. Hence we allocate some
732 * kind of error reply buffer on the fly. This buffer is marked as
733 * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
734 * - B_ERROR will indicate error to the caller.
735 * - B_DONE will prevent us from reading the buffer from
736 * the device.
737 * - B_NOCACHE will cause this buffer to be freed in
738 * brelse().
739 */
740
741 errout:
742 errbp = geteblk();
743 sema_p(&errbp->b_sem);
744 errbp->b_flags &= ~B_BUSY;
745 errbp->b_flags |= (B_ERROR | B_DONE);
746 return (errbp);
747 }
748
749 /*
750 * Get an empty block, not assigned to any particular device.
751 * Returns a locked buffer that is not on any hash or free list.
752 */
753 struct buf *
754 ngeteblk(long bsize)
755 {
756 struct buf *bp;
757
758 bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
759 bioinit(bp);
760 bp->av_forw = bp->av_back = NULL;
761 bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
762 bp->b_bufsize = bsize;
763 bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
764 bp->b_dev = (o_dev_t)NODEV;
765 bp->b_edev = NODEV;
766 bp->b_lblkno = 0;
767 bp->b_bcount = bsize;
768 bp->b_iodone = NULL;
769 return (bp);
770 }
771
772 /*
773 * Interface of geteblk() is kept intact to maintain driver compatibility.
774 * Use ngeteblk() to allocate a block size other than 1 KB.
775 */
776 struct buf *
777 geteblk(void)
778 {
779 return (ngeteblk((long)1024));
780 }
781
782 /*
783 * Return a buffer w/o sleeping
784 */
785 struct buf *
786 trygetblk(dev_t dev, daddr_t blkno)
787 {
788 struct buf *bp;
789 struct buf *dp;
790 struct hbuf *hp;
791 kmutex_t *hmp;
792 uint_t index;
793
794 index = bio_bhash(dev, blkno);
795 hp = &hbuf[index];
796 hmp = &hp->b_lock;
797
798 if (!mutex_tryenter(hmp))
799 return (NULL);
800
801 dp = (struct buf *)hp;
802 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
803 if (bp->b_blkno != blkno || bp->b_edev != dev ||
804 (bp->b_flags & B_STALE))
805 continue;
806 /*
807 * Get access to a valid buffer without sleeping
808 */
809 if (sema_tryp(&bp->b_sem)) {
810 if (bp->b_flags & B_DONE) {
811 hp->b_length--;
812 notavail(bp);
813 mutex_exit(hmp);
814 return (bp);
815 } else {
816 sema_v(&bp->b_sem);
817 break;
818 }
819 }
820 break;
821 }
822 mutex_exit(hmp);
823 return (NULL);
824 }
825
826 /*
827 * Wait for I/O completion on the buffer; return errors
828 * to the user.
829 */
830 int
831 iowait(struct buf *bp)
832 {
833 ASSERT(SEMA_HELD(&bp->b_sem));
834 return (biowait(bp));
835 }
836
837 /*
838 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
839 * and wake up anyone waiting for it.
840 */
841 void
842 iodone(struct buf *bp)
843 {
844 ASSERT(SEMA_HELD(&bp->b_sem));
845 (void) biodone(bp);
846 }
847
848 /*
849 * Zero the core associated with a buffer.
850 */
851 void
852 clrbuf(struct buf *bp)
853 {
854 ASSERT(SEMA_HELD(&bp->b_sem));
855 bzero(bp->b_un.b_addr, bp->b_bcount);
856 bp->b_resid = 0;
857 }
858
859
860 /*
861 * Make sure all write-behind blocks on dev (or NODEV for all)
862 * are flushed out.
863 */
864 void
865 bflush(dev_t dev)
866 {
867 struct buf *bp, *dp;
868 struct hbuf *hp;
869 struct buf *delwri_list = EMPTY_LIST;
870 int i, index;
871 kmutex_t *hmp;
872
873 mutex_enter(&blist_lock);
874 /*
875 * Wait for any invalidates or flushes ahead of us to finish.
876 * We really could split blist_lock up per device for better
877 * parallelism here.
878 */
879 while (bio_doinginval || bio_doingflush) {
880 bio_flinv_cv_wanted = 1;
881 cv_wait(&bio_flushinval_cv, &blist_lock);
882 }
883 bio_doingflush++;
884 /*
885 * Gather all B_DELWRI buffers for the device.
886 * Lock ordering is b_sem > hash lock (brelse).
887 * Since we are finding the buffer via the delayed write list,
888 * it may be busy and we would block trying to get the
889 * b_sem lock while holding the hash lock. So transfer all the
890 * candidates onto delwri_list and then drop the hash locks.
891 */
892 for (i = 0; i < v.v_hbuf; i++) {
893 hmp = &hbuf[i].b_lock;
894 dp = (struct buf *)&dwbuf[i];
895 mutex_enter(hmp);
896 for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
897 if (dev == NODEV || bp->b_edev == dev) {
898 if (bp->b_list == NULL) {
899 bp->b_list = delwri_list;
900 delwri_list = bp;
901 }
902 }
903 }
904 mutex_exit(hmp);
905 }
906 mutex_exit(&blist_lock);
907
908 /*
909 * Now that the hash locks have been dropped grab the semaphores
910 * and write back all the buffers that have B_DELWRI set.
911 */
912 while (delwri_list != EMPTY_LIST) {
913 bp = delwri_list;
914
915 sema_p(&bp->b_sem); /* may block */
916 if ((dev != bp->b_edev && dev != NODEV) ||
917 (panicstr && bp->b_flags & B_BUSY)) {
918 sema_v(&bp->b_sem);
919 delwri_list = bp->b_list;
920 bp->b_list = NULL;
921 continue; /* No longer a candidate */
922 }
923 if (bp->b_flags & B_DELWRI) {
924 index = bio_bhash(bp->b_edev, bp->b_blkno);
925 hp = &hbuf[index];
926 hmp = &hp->b_lock;
927 dp = (struct buf *)hp;
928
929 bp->b_flags |= B_ASYNC;
930 mutex_enter(hmp);
931 hp->b_length--;
932 notavail(bp);
933 mutex_exit(hmp);
934 if (bp->b_vp == NULL) { /* !ufs */
935 BWRITE(bp);
936 } else { /* ufs */
937 UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
938 }
939 } else {
940 sema_v(&bp->b_sem);
941 }
942 delwri_list = bp->b_list;
943 bp->b_list = NULL;
944 }
945 mutex_enter(&blist_lock);
946 bio_doingflush--;
947 if (bio_flinv_cv_wanted) {
948 bio_flinv_cv_wanted = 0;
949 cv_broadcast(&bio_flushinval_cv);
950 }
951 mutex_exit(&blist_lock);
952 }
953
954 /*
955 * Ensure that a specified block is up-to-date on disk.
956 */
957 void
958 blkflush(dev_t dev, daddr_t blkno)
959 {
960 struct buf *bp, *dp;
961 struct hbuf *hp;
962 struct buf *sbp = NULL;
963 uint_t index;
964 kmutex_t *hmp;
965
966 index = bio_bhash(dev, blkno);
967 hp = &hbuf[index];
968 dp = (struct buf *)hp;
969 hmp = &hp->b_lock;
970
971 /*
972 * Identify the buffer in the cache belonging to
973 * this device and blkno (if any).
974 */
975 mutex_enter(hmp);
976 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
977 if (bp->b_blkno != blkno || bp->b_edev != dev ||
978 (bp->b_flags & B_STALE))
979 continue;
980 sbp = bp;
981 break;
982 }
983 mutex_exit(hmp);
984 if (sbp == NULL)
985 return;
986 /*
987 * Now check the buffer we have identified and
988 * make sure it still belongs to the device and is B_DELWRI
989 */
990 sema_p(&sbp->b_sem);
991 if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
992 (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
993 mutex_enter(hmp);
994 hp->b_length--;
995 notavail(sbp);
996 mutex_exit(hmp);
997 /*
998 * XXX - There is nothing to guarantee a synchronous
999 * write here if the B_ASYNC flag is set. This needs
1000 * some investigation.
1001 */
1002 if (sbp->b_vp == NULL) { /* !ufs */
1003 BWRITE(sbp); /* synchronous write */
1004 } else { /* ufs */
1005 UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
1006 }
1007 } else {
1008 sema_v(&sbp->b_sem);
1009 }
1010 }
1011
1012 /*
1013 * Same as binval, except can force-invalidate delayed-write buffers
1014 * (which may not have been flushed because of device errors). Also
1015 * makes sure that the retry write flag is cleared.
1016 */
1017 int
1018 bfinval(dev_t dev, int force)
1019 {
1020 struct buf *dp;
1021 struct buf *bp;
1022 struct buf *binval_list = EMPTY_LIST;
1023 int i, error = 0;
1024 kmutex_t *hmp;
1025 uint_t index;
1026 struct buf **backp;
1027
1028 mutex_enter(&blist_lock);
1029 /*
1030 * Wait for any flushes ahead of us to finish, it's ok to
1031 * do invalidates in parallel.
1032 */
1033 while (bio_doingflush) {
1034 bio_flinv_cv_wanted = 1;
1035 cv_wait(&bio_flushinval_cv, &blist_lock);
1036 }
1037 bio_doinginval++;
1038
1039 /* Gather bp's */
1040 for (i = 0; i < v.v_hbuf; i++) {
1041 dp = (struct buf *)&hbuf[i];
1042 hmp = &hbuf[i].b_lock;
1043
1044 mutex_enter(hmp);
1045 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1046 if (bp->b_edev == dev) {
1047 if (bp->b_list == NULL) {
1048 bp->b_list = binval_list;
1049 binval_list = bp;
1050 }
1051 }
1052 }
1053 mutex_exit(hmp);
1054 }
1055 mutex_exit(&blist_lock);
1056
1057 /* Invalidate all bp's found */
1058 while (binval_list != EMPTY_LIST) {
1059 bp = binval_list;
1060
1061 sema_p(&bp->b_sem);
1062 if (bp->b_edev == dev) {
1063 if (force && (bp->b_flags & B_DELWRI)) {
1064 /* clear B_DELWRI, move to non-dw freelist */
1065 index = bio_bhash(bp->b_edev, bp->b_blkno);
1066 hmp = &hbuf[index].b_lock;
1067 dp = (struct buf *)&hbuf[index];
1068 mutex_enter(hmp);
1069
1070 /* remove from delayed write freelist */
1071 notavail(bp);
1072
1073 /* add to B_AGE side of non-dw freelist */
1074 backp = &dp->av_forw;
1075 (*backp)->av_back = bp;
1076 bp->av_forw = *backp;
1077 *backp = bp;
1078 bp->av_back = dp;
1079
1080 /*
1081 * make sure write retries and busy are cleared
1082 */
1083 bp->b_flags &=
1084 ~(B_BUSY | B_DELWRI | B_RETRYWRI);
1085 mutex_exit(hmp);
1086 }
1087 if ((bp->b_flags & B_DELWRI) == 0)
1088 bp->b_flags |= B_STALE|B_AGE;
1089 else
1090 error = EIO;
1091 }
1092 sema_v(&bp->b_sem);
1093 binval_list = bp->b_list;
1094 bp->b_list = NULL;
1095 }
1096 mutex_enter(&blist_lock);
1097 bio_doinginval--;
1098 if (bio_flinv_cv_wanted) {
1099 cv_broadcast(&bio_flushinval_cv);
1100 bio_flinv_cv_wanted = 0;
1101 }
1102 mutex_exit(&blist_lock);
1103 return (error);
1104 }
1105
1106 /*
1107 * If possible, invalidate blocks for a dev on demand
1108 */
1109 void
1110 binval(dev_t dev)
1111 {
1112 (void) bfinval(dev, 0);
1113 }
1114
1115 /*
1116 * Initialize the buffer I/O system by freeing
1117 * all buffers and setting all device hash buffer lists to empty.
1118 */
1119 void
1120 binit(void)
1121 {
1122 struct buf *bp;
1123 unsigned int i, pct;
1124 ulong_t bio_max_hwm, bio_default_hwm;
1125
1126 /*
1127 * Maximum/Default values for bufhwm are set to the smallest of:
1128 * - BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
1129 * - 1/4 of kernel virtual memory
1130 * - INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
1131 * Additionally, in order to allow simple tuning by percentage of
1132 * physical memory, bufhwm_pct is used to calculate the default if
1133 * the value of this tunable is between 0 and BIO_MAX_PERCENT.
1134 *
1135 * Since the unit for v.v_bufhwm is kilobytes, this allows for
1136 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
1137 */
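/*
 * Worked example of the default calculation below, assuming a
 * hypothetical machine with 8 GB of physical memory, 4 KB pages, and
 * enough free kernel heap that the physmem term is the smaller one:
 *
 *	physmem                    = 8 GB / 4 KB  = 2097152 pages
 *	physmem / BIO_BUF_PERCENT  = 2097152 / 50 = 41943 pages
 *	41943 * (PAGESIZE / 1024)  = 41943 * 4    = 167772 KB
 *
 * i.e. roughly 164 MB (2% of 8 GB), stored in v.v_bufhwm in kilobytes.
 */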
1138 bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
1139 btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1140 bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);
1141
1142 pct = BIO_BUF_PERCENT;
1143 if (bufhwm_pct != 0 &&
1144 ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
1145 pct = BIO_BUF_PERCENT;
1146 /*
1147 * Invalid user specified value, emit a warning.
1148 */
1149 cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of \
1150 range(1..%d). Using %d as default.",
1151 bufhwm_pct,
1152 100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
1153 }
1154
1155 bio_default_hwm = MIN(physmem / pct,
1156 btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1157 bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);
1158
1159 if ((v.v_bufhwm = bufhwm) == 0)
1160 v.v_bufhwm = bio_default_hwm;
1161
1162 if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
1163 v.v_bufhwm = (int)bio_max_hwm;
1164 /*
1165 * Invalid user specified value, emit a warning.
1166 */
1167 cmn_err(CE_WARN,
1168 "binit: bufhwm(%d) out \
1169 of range(%d..%lu). Using %lu as default",
1170 bufhwm,
1171 BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
1172 }
1173
1174 /*
1175 * Determine the number of hash buckets. Default is to
1176 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
1177 * Round up number to the next power of 2.
1178 */
1179 v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
1180 BIO_HASHLEN);
1181 v.v_hmask = v.v_hbuf - 1;
1182 v.v_buf = BIO_BHDR_POOL;
1183
1184 hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);
1185
1186 dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);
1187
1188 bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;
1189 bp = &bfreelist;
1190 bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;
1191
1192 for (i = 0; i < v.v_hbuf; i++) {
1193 hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
1194 hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];
1195
1196 /*
1197 * Initialize the delayed write buffer list.
1198 */
1199 dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
1200 dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
1201 }
1202 }
1203
1204 /*
1205 * Wait for I/O completion on the buffer; return error code.
1206 * If bp was for synchronous I/O, bp is invalid and associated
1207 * resources are freed on return.
1208 */
1209 int
1210 biowait(struct buf *bp)
1211 {
1212 int error = 0;
1213 struct cpu *cpup;
1214
1215 ASSERT(SEMA_HELD(&bp->b_sem));
1216
1217 cpup = CPU;
1218 atomic_inc_64(&cpup->cpu_stats.sys.iowait);
1219 DTRACE_IO1(wait__start, struct buf *, bp);
1220
1221 /*
1222 * In case of panic, busy wait for completion
1223 */
1224 if (panicstr) {
1225 while ((bp->b_flags & B_DONE) == 0)
1226 drv_usecwait(10);
1227 } else
1228 sema_p(&bp->b_io);
1229
1230 DTRACE_IO1(wait__done, struct buf *, bp);
1231 atomic_dec_64(&cpup->cpu_stats.sys.iowait);
1232
1233 error = geterror(bp);
1234 if ((bp->b_flags & B_ASYNC) == 0) {
1235 if (bp->b_flags & B_REMAPPED)
1236 bp_mapout(bp);
1237 }
1238 return (error);
1239 }
1240
1241 static void
1242 biodone_tnf_probe(struct buf *bp)
1243 {
1244 /* Kernel probe */
1245 TNF_PROBE_3(biodone, "io blockio", /* CSTYLED */,
1246 tnf_device, device, bp->b_edev,
1247 tnf_diskaddr, block, bp->b_lblkno,
1248 tnf_opaque, buf, bp);
1249 }
1250
1251 /*
1252 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
1253 * and wake up anyone waiting for it.
1254 */
1255 void
1256 biodone(struct buf *bp)
1257 {
1258 if (bp->b_flags & B_STARTED) {
1259 DTRACE_IO1(done, struct buf *, bp);
1260 bp->b_flags &= ~B_STARTED;
1261 }
1262
1263 /*
1264 * Call the TNF probe here instead of the inline code
1265 * to force our compiler to use the tail call optimization.
1266 */
1267 biodone_tnf_probe(bp);
1268
1269 if (bp->b_iodone != NULL) {
1270 (*(bp->b_iodone))(bp);
1271 return;
1272 }
1273 ASSERT((bp->b_flags & B_DONE) == 0);
1274 ASSERT(SEMA_HELD(&bp->b_sem));
1275 bp->b_flags |= B_DONE;
1276 if (bp->b_flags & B_ASYNC) {
1277 if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
1278 bio_pageio_done(bp);
1279 else
1280 brelse(bp); /* release bp to freelist */
1281 } else {
1282 sema_v(&bp->b_io);
1283 }
1284 }
1285
1286 /*
1287 * Pick up the device's error number and pass it to the user;
1288 * if there is an error but the number is 0 set a generalized code.
1289 */
1290 int
1291 geterror(struct buf *bp)
1292 {
1293 int error = 0;
1294
1295 ASSERT(SEMA_HELD(&bp->b_sem));
1296 if (bp->b_flags & B_ERROR) {
1297 error = bp->b_error;
1298 if (!error)
1299 error = EIO;
1300 }
1301 return (error);
1302 }
1303
1304 /*
1305 * Support for pageio buffers.
1306 *
1307 * This stuff should be generalized to provide a bp header
1308 * facility that can be used for things other than pageio.
1309 */
1310
1311 /*
1312 * Allocate and initialize a buf struct for use with pageio.
1313 */
1314 struct buf *
1315 pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
1316 {
1317 struct buf *bp;
1318 struct cpu *cpup;
1319
1320 if (flags & B_READ) {
1321 CPU_STATS_ENTER_K();
1322 cpup = CPU; /* get pointer AFTER preemption is disabled */
1323 CPU_STATS_ADDQ(cpup, vm, pgin, 1);
1324 CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));
1325
1326 atomic_add_64(&curzone->zone_pgpgin, btopr(len));
1327
1328 if ((flags & B_ASYNC) == 0) {
1329 klwp_t *lwp = ttolwp(curthread);
1330 if (lwp != NULL)
1331 lwp->lwp_ru.majflt++;
1332 CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
1333 /* Kernel probe */
1334 TNF_PROBE_2(major_fault, "vm pagefault", /* CSTYLED */,
1335 tnf_opaque, vnode, pp->p_vnode,
1336 tnf_offset, offset, pp->p_offset);
1337 }
1338 /*
1339 * Update statistics for pages being paged in
1340 */
1341 if (pp != NULL && pp->p_vnode != NULL) {
1342 if (IS_SWAPFSVP(pp->p_vnode)) {
1343 CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len));
1344 atomic_add_64(&curzone->zone_anonpgin,
1345 btopr(len));
1346 } else {
1347 if (pp->p_vnode->v_flag & VVMEXEC) {
1348 CPU_STATS_ADDQ(cpup, vm, execpgin,
1349 btopr(len));
1350 atomic_add_64(&curzone->zone_execpgin,
1351 btopr(len));
1352 } else {
1353 CPU_STATS_ADDQ(cpup, vm, fspgin,
1354 btopr(len));
1355 atomic_add_64(&curzone->zone_fspgin,
1356 btopr(len));
1357 }
1358 }
1359 }
1360 CPU_STATS_EXIT_K();
1361 TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
1362 "page_ws_in:pp %p", pp);
1363 /* Kernel probe */
1364 TNF_PROBE_3(pagein, "vm pageio io", /* CSTYLED */,
1365 tnf_opaque, vnode, pp->p_vnode,
1366 tnf_offset, offset, pp->p_offset,
1367 tnf_size, size, len);
1368 }
1369
1370 bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
1371 bp->b_bcount = len;
1372 bp->b_bufsize = len;
1373 bp->b_pages = pp;
1374 bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
1375 bp->b_offset = -1;
1376 sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
1377
1378 /* Initialize bp->b_sem in "locked" state */
1379 sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
1380
1381 VN_HOLD(vp);
1382 bp->b_vp = vp;
1383
1384 /*
1385 * Caller sets dev & blkno and can adjust
1386 * b_addr for page offset and can use bp_mapin
1387 * to make pages kernel addressable.
1388 */
1389 return (bp);
1390 }
1391
1392 void
1393 pageio_done(struct buf *bp)
1394 {
1395 ASSERT(SEMA_HELD(&bp->b_sem));
1396 if (bp->b_flags & B_REMAPPED)
1397 bp_mapout(bp);
1398 VN_RELE(bp->b_vp);
1399 bp->b_vp = NULL;
1400 ASSERT((bp->b_flags & B_NOCACHE) != 0);
1401
1402 /* A sema_v(bp->b_sem) is implied if we are destroying it */
1403 sema_destroy(&bp->b_sem);
1404 sema_destroy(&bp->b_io);
1405 kmem_free(bp, sizeof (struct buf));
1406 }
1407
1408 /*
1409 * Check to see whether any buffers associated with the device,
1410 * except the one pointed to by sbp, are busy.
1411 * NOTE: This expensive operation should be improved together with ufs_icheck().
1412 */
1413 int
1414 bcheck(dev_t dev, struct buf *sbp)
1415 {
1416 struct buf *bp;
1417 struct buf *dp;
1418 int i;
1419 kmutex_t *hmp;
1420
1421 /*
1422 * check for busy bufs for this filesystem
1423 */
1424 for (i = 0; i < v.v_hbuf; i++) {
1425 dp = (struct buf *)&hbuf[i];
1426 hmp = &hbuf[i].b_lock;
1427
1428 mutex_enter(hmp);
1429 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1430 /*
1431 * if buf is busy or dirty, then filesystem is busy
1432 */
1433 if ((bp->b_edev == dev) &&
1434 ((bp->b_flags & B_STALE) == 0) &&
1435 (bp->b_flags & (B_DELWRI|B_BUSY)) &&
1436 (bp != sbp)) {
1437 mutex_exit(hmp);
1438 return (1);
1439 }
1440 }
1441 mutex_exit(hmp);
1442 }
1443 return (0);
1444 }
1445
1446 /*
1447 * Hash two 32 bit entities.
1448 */
1449 int
1450 hash2ints(int x, int y)
1451 {
1452 int hash = 0;
1453
1454 hash = x - 1;
1455 hash = ((hash * 7) + (x >> 8)) - 1;
1456 hash = ((hash * 7) + (x >> 16)) - 1;
1457 hash = ((hash * 7) + (x >> 24)) - 1;
1458 hash = ((hash * 7) + y) - 1;
1459 hash = ((hash * 7) + (y >> 8)) - 1;
1460 hash = ((hash * 7) + (y >> 16)) - 1;
1461 hash = ((hash * 7) + (y >> 24)) - 1;
1462
1463 return (hash);
1464 }
1465
1466
1467 /*
1468 * Return a new buffer struct.
1469 * Create a new buffer if we haven't gone over our high water
1470 * mark for memory, otherwise try to get one off the freelist.
1471 *
1472 * Returns a locked buf that has no id and is not on any hash or free
1473 * list.
1474 */
1475 static struct buf *
1476 bio_getfreeblk(long bsize)
1477 {
1478 struct buf *bp, *dp;
1479 struct hbuf *hp;
1480 kmutex_t *hmp;
1481 uint_t start, end;
1482
1483 /*
1484 * bfreelist.b_bufsize represents the amount of memory
1485 * we are allowed to allocate in the cache before we hit our hwm.
1486 * References to bfreelist must be protected by bfree_lock
1487 * (mutex_enter(&bfree_lock) / mutex_exit(&bfree_lock)).
1488 */
1489 bio_mem_get(bsize); /* Account for our memory request */
1490
1491 again:
1492 bp = bio_bhdr_alloc(); /* Get a buf hdr */
1493 sema_p(&bp->b_sem); /* Should never fail */
1494
1495 ASSERT(bp->b_un.b_addr == NULL);
1496 bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
1497 if (bp->b_un.b_addr != NULL) {
1498 /*
1499 * Make the common path short
1500 */
1501 bp->b_bufsize = bsize;
1502 ASSERT(SEMA_HELD(&bp->b_sem));
1503 return (bp);
1504 } else {
1505 struct buf *save;
1506
1507 save = bp; /* Save bp we allocated */
1508 start = end = lastindex;
1509
1510 biostats.bio_bufwant.value.ui32++;
1511
1512 /*
1513 * Memory isn't available from the system now. Scan
1514 * the hash buckets till enough space is found.
1515 */
1516 do {
1517 hp = &hbuf[start];
1518 hmp = &hp->b_lock;
1519 dp = (struct buf *)hp;
1520
1521 mutex_enter(hmp);
1522 bp = dp->av_forw;
1523
1524 while (bp != dp) {
1525
1526 ASSERT(bp != NULL);
1527
1528 if (!sema_tryp(&bp->b_sem)) {
1529 bp = bp->av_forw;
1530 continue;
1531 }
1532
1533 /*
1534 * Since we are going down the freelist
1535 * associated with this hash bucket the
1536 * B_DELWRI flag should not be set.
1537 */
1538 ASSERT(!(bp->b_flags & B_DELWRI));
1539
1540 if (bp->b_bufsize == bsize) {
1541 hp->b_length--;
1542 notavail(bp);
1543 bremhash(bp);
1544 mutex_exit(hmp);
1545
1546 /*
1547 * Didn't kmem_alloc any more, so don't
1548 * count it twice.
1549 */
1550 mutex_enter(&bfree_lock);
1551 bfreelist.b_bufsize += bsize;
1552 mutex_exit(&bfree_lock);
1553
1554 /*
1555 * Update the lastindex value.
1556 */
1557 lastindex = start;
1558
1559 /*
1560 * Put our saved bp back on the list
1561 */
1562 sema_v(&save->b_sem);
1563 bio_bhdr_free(save);
1564 ASSERT(SEMA_HELD(&bp->b_sem));
1565 return (bp);
1566 }
1567 sema_v(&bp->b_sem);
1568 bp = bp->av_forw;
1569 }
1570 mutex_exit(hmp);
1571 start = ((start + 1) % v.v_hbuf);
1572 } while (start != end);
1573
1574 biostats.bio_bufwait.value.ui32++;
1575 bp = save; /* Use original bp */
1576 bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
1577 }
1578
1579 bp->b_bufsize = bsize;
1580 ASSERT(SEMA_HELD(&bp->b_sem));
1581 return (bp);
1582 }
1583
1584 /*
1585 * Allocate a buffer header. If none currently available, allocate
1586 * a new pool.
1587 */
1588 static struct buf *
1589 bio_bhdr_alloc(void)
1590 {
1591 struct buf *dp, *sdp;
1592 struct buf *bp;
1593 int i;
1594
1595 for (;;) {
1596 mutex_enter(&bhdr_lock);
1597 if (bhdrlist != NULL) {
1598 bp = bhdrlist;
1599 bhdrlist = bp->av_forw;
1600 mutex_exit(&bhdr_lock);
1601 bp->av_forw = NULL;
1602 return (bp);
1603 }
1604 mutex_exit(&bhdr_lock);
1605
1606 /*
1607 * Need to allocate a new pool. If the system is currently
1608 * out of memory, then try freeing things on the freelist.
1609 */
1610 dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
1611 if (dp == NULL) {
1612 /*
1613 * System can't give us a pool of headers, try
1614 * recycling from the free lists.
1615 */
1616 bio_recycle(BIO_HEADER, 0);
1617 } else {
1618 sdp = dp;
1619 for (i = 0; i < v.v_buf; i++, dp++) {
1620 /*
1621 * The next two lines are needed since NODEV
1622 * is -1 and not NULL
1623 */
1624 dp->b_dev = (o_dev_t)NODEV;
1625 dp->b_edev = NODEV;
1626 dp->av_forw = dp + 1;
1627 sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
1628 NULL);
1629 sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
1630 NULL);
1631 dp->b_offset = -1;
1632 }
1633 mutex_enter(&bhdr_lock);
1634 (--dp)->av_forw = bhdrlist; /* Fix last pointer */
1635 bhdrlist = sdp;
1636 nbuf += v.v_buf;
1637 bp = bhdrlist;
1638 bhdrlist = bp->av_forw;
1639 mutex_exit(&bhdr_lock);
1640
1641 bp->av_forw = NULL;
1642 return (bp);
1643 }
1644 }
1645 }
1646
1647 static void
1648 bio_bhdr_free(struct buf *bp)
1649 {
1650 ASSERT(bp->b_back == NULL);
1651 ASSERT(bp->b_forw == NULL);
1652 ASSERT(bp->av_back == NULL);
1653 ASSERT(bp->av_forw == NULL);
1654 ASSERT(bp->b_un.b_addr == NULL);
1655 ASSERT(bp->b_dev == (o_dev_t)NODEV);
1656 ASSERT(bp->b_edev == NODEV);
1657 ASSERT(bp->b_flags == 0);
1658
1659 mutex_enter(&bhdr_lock);
1660 bp->av_forw = bhdrlist;
1661 bhdrlist = bp;
1662 mutex_exit(&bhdr_lock);
1663 }
1664
1665 /*
1666 * If we haven't gone over the high water mark, it's o.k. to
1667 * allocate more buffer space, otherwise recycle buffers
1668 * from the freelist until enough memory is free for a bsize request.
1669 *
1670 * We account for this memory, even though
1671 * we don't allocate it here.
1672 */
1673 static void
1674 bio_mem_get(long bsize)
1675 {
1676 mutex_enter(&bfree_lock);
1677 if (bfreelist.b_bufsize > bsize) {
1678 bfreelist.b_bufsize -= bsize;
1679 mutex_exit(&bfree_lock);
1680 return;
1681 }
1682 mutex_exit(&bfree_lock);
1683 bio_recycle(BIO_MEM, bsize);
1684 }
1685
1686 /*
1687 * flush a list of delayed write buffers.
1688 * (currently used only by bio_recycle below.)
1689 */
1690 static void
1691 bio_flushlist(struct buf *delwri_list)
1692 {
1693 struct buf *bp;
1694
1695 while (delwri_list != EMPTY_LIST) {
1696 bp = delwri_list;
1697 bp->b_flags |= B_AGE | B_ASYNC;
1698 if (bp->b_vp == NULL) { /* !ufs */
1699 BWRITE(bp);
1700 } else { /* ufs */
1701 UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
1702 }
1703 delwri_list = bp->b_list;
1704 bp->b_list = NULL;
1705 }
1706 }
1707
1708 /*
1709 * Start recycling buffers on the freelist for one of 2 reasons:
1710 * - we need a buffer header
1711 * - we need to free up memory
1712 * Once started we continue to recycle buffers until the B_AGE
1713 * buffers are gone.
1714 */
1715 static void
1716 bio_recycle(int want, long bsize)
1717 {
1718 struct buf *bp, *dp, *dwp, *nbp;
1719 struct hbuf *hp;
1720 int found = 0;
1721 kmutex_t *hmp;
1722 int start, end;
1723 struct buf *delwri_list = EMPTY_LIST;
1724
1725 /*
1726 * Recycle buffers.
1727 */
1728 top:
1729 start = end = lastindex;
1730 do {
1731 hp = &hbuf[start];
1732 hmp = &hp->b_lock;
1733 dp = (struct buf *)hp;
1734
1735 mutex_enter(hmp);
1736 bp = dp->av_forw;
1737
1738 while (bp != dp) {
1739
1740 ASSERT(bp != NULL);
1741
1742 if (!sema_tryp(&bp->b_sem)) {
1743 bp = bp->av_forw;
1744 continue;
1745 }
1746 /*
1747 * Do we really want to nuke all of the B_AGE stuff??
1748 */
1749 if ((bp->b_flags & B_AGE) == 0 && found) {
1750 sema_v(&bp->b_sem);
1751 mutex_exit(hmp);
1752 lastindex = start;
1753 return; /* All done */
1754 }
1755
1756 ASSERT(MUTEX_HELD(&hp->b_lock));
1757 ASSERT(!(bp->b_flags & B_DELWRI));
1758 hp->b_length--;
1759 notavail(bp);
1760
1761 /*
1762 * Remove bhdr from cache, free up memory,
1763 * and add the hdr to the freelist.
1764 */
1765 bremhash(bp);
1766 mutex_exit(hmp);
1767
1768 if (bp->b_bufsize) {
1769 kmem_free(bp->b_un.b_addr, bp->b_bufsize);
1770 bp->b_un.b_addr = NULL;
1771 mutex_enter(&bfree_lock);
1772 bfreelist.b_bufsize += bp->b_bufsize;
1773 mutex_exit(&bfree_lock);
1774 }
1775
1776 bp->b_dev = (o_dev_t)NODEV;
1777 bp->b_edev = NODEV;
1778 bp->b_flags = 0;
1779 sema_v(&bp->b_sem);
1780 bio_bhdr_free(bp);
1781 if (want == BIO_HEADER) {
1782 found = 1;
1783 } else {
1784 ASSERT(want == BIO_MEM);
1785 if (!found && bfreelist.b_bufsize >= bsize) {
1786 /* Account for the memory we want */
1787 mutex_enter(&bfree_lock);
1788 if (bfreelist.b_bufsize >= bsize) {
1789 bfreelist.b_bufsize -= bsize;
1790 found = 1;
1791 }
1792 mutex_exit(&bfree_lock);
1793 }
1794 }
1795
1796 /*
1797 * Since we dropped hmp start from the
1798 * beginning.
1799 */
1800 mutex_enter(hmp);
1801 bp = dp->av_forw;
1802 }
1803 mutex_exit(hmp);
1804
1805 /*
1806 * Look at the delayed write list.
1807 * First gather into a private list, then write them.
1808 */
1809 dwp = (struct buf *)&dwbuf[start];
1810 mutex_enter(&blist_lock);
1811 bio_doingflush++;
1812 mutex_enter(hmp);
1813 for (bp = dwp->av_forw; bp != dwp; bp = nbp) {
1814
1815 ASSERT(bp != NULL);
1816 nbp = bp->av_forw;
1817
1818 if (!sema_tryp(&bp->b_sem))
1819 continue;
1820 ASSERT(bp->b_flags & B_DELWRI);
1821 /*
1822 * Do we really want to nuke all of the B_AGE stuff??
1823 */
1824
1825 if ((bp->b_flags & B_AGE) == 0 && found) {
1826 sema_v(&bp->b_sem);
1827 mutex_exit(hmp);
1828 lastindex = start;
1829 mutex_exit(&blist_lock);
1830 bio_flushlist(delwri_list);
1831 mutex_enter(&blist_lock);
1832 bio_doingflush--;
1833 if (bio_flinv_cv_wanted) {
1834 bio_flinv_cv_wanted = 0;
1835 cv_broadcast(&bio_flushinval_cv);
1836 }
1837 mutex_exit(&blist_lock);
1838 return; /* All done */
1839 }
1840
1841 /*
1842 * If the buffer is already on a flush or
1843 * invalidate list then just skip it.
1844 */
1845 if (bp->b_list != NULL) {
1846 sema_v(&bp->b_sem);
1847 continue;
1848 }
1849 /*
1850 * We are still on the same bucket.
1851 */
1852 hp->b_length--;
1853 notavail(bp);
1854 bp->b_list = delwri_list;
1855 delwri_list = bp;
1856 }
1857 mutex_exit(hmp);
1858 mutex_exit(&blist_lock);
1859 bio_flushlist(delwri_list);
1860 delwri_list = EMPTY_LIST;
1861 mutex_enter(&blist_lock);
1862 bio_doingflush--;
1863 if (bio_flinv_cv_wanted) {
1864 bio_flinv_cv_wanted = 0;
1865 cv_broadcast(&bio_flushinval_cv);
1866 }
1867 mutex_exit(&blist_lock);
1868 start = (start + 1) % v.v_hbuf;
1869
1870 } while (start != end);
1871
1872 if (found)
1873 return;
1874
1875 /*
1876 * Free lists exhausted and we haven't satisfied the request.
1877 * Wait here for more entries to be added to freelist.
1878 * Because this might have just happened, make it timed.
1879 */
1880 mutex_enter(&bfree_lock);
1881 bfreelist.b_flags |= B_WANTED;
1882 (void) cv_reltimedwait(&bio_mem_cv, &bfree_lock, hz, TR_CLOCK_TICK);
1883 mutex_exit(&bfree_lock);
1884 goto top;
1885 }
1886
1887 /*
1888 * See if the block is associated with some buffer
1889 * (mainly to avoid getting hung up on a wait in breada).
1890 */
1891 static int
1892 bio_incore(dev_t dev, daddr_t blkno)
1893 {
1894 struct buf *bp;
1895 struct buf *dp;
1896 uint_t index;
1897 kmutex_t *hmp;
1898
1899 index = bio_bhash(dev, blkno);
1900 dp = (struct buf *)&hbuf[index];
1901 hmp = &hbuf[index].b_lock;
1902
1903 mutex_enter(hmp);
1904 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1905 if (bp->b_blkno == blkno && bp->b_edev == dev &&
1906 (bp->b_flags & B_STALE) == 0) {
1907 mutex_exit(hmp);
1908 return (1);
1909 }
1910 }
1911 mutex_exit(hmp);
1912 return (0);
1913 }
1914
1915 static void
1916 bio_pageio_done(struct buf *bp)
1917 {
1918 if (bp->b_flags & B_PAGEIO) {
1919
1920 if (bp->b_flags & B_REMAPPED)
1921 bp_mapout(bp);
1922
1923 if (bp->b_flags & B_READ)
1924 pvn_read_done(bp->b_pages, bp->b_flags);
1925 else
1926 pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags);
1927 pageio_done(bp);
1928 } else {
1929 ASSERT(bp->b_flags & B_REMAPPED);
1930 bp_mapout(bp);
1931 brelse(bp);
1932 }
1933 }
1934
1935 /*
1936 * bioerror(9F) - indicate error in buffer header
1937 * If 'error' is zero, remove the error indication.
1938 */
1939 void
1940 bioerror(struct buf *bp, int error)
1941 {
1942 ASSERT(bp != NULL);
1943 ASSERT(error >= 0);
1944 ASSERT(SEMA_HELD(&bp->b_sem));
1945
1946 if (error != 0) {
1947 bp->b_flags |= B_ERROR;
1948 } else {
1949 bp->b_flags &= ~B_ERROR;
1950 }
1951 bp->b_error = error;
1952 }
1953
1954 /*
1955 * bioreset(9F) - reuse a private buffer header after I/O is complete
1956 */
1957 void
1958 bioreset(struct buf *bp)
1959 {
1960 ASSERT(bp != NULL);
1961
1962 biofini(bp);
1963 bioinit(bp);
1964 }
1965
1966 /*
1967 * biosize(9F) - return size of a buffer header
1968 */
1969 size_t
1970 biosize(void)
1971 {
1972 return (sizeof (struct buf));
1973 }
1974
1975 /*
1976 * biomodified(9F) - check if buffer is modified
1977 */
1978 int
1979 biomodified(struct buf *bp)
1980 {
1981 int npf;
1982 int ppattr;
1983 struct page *pp;
1984
1985 ASSERT(bp != NULL);
1986
1987 if ((bp->b_flags & B_PAGEIO) == 0) {
1988 return (-1);
1989 }
1990 pp = bp->b_pages;
1991 npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET));
1992
1993 while (npf > 0) {
1994 ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
1995 HAT_SYNC_STOPON_MOD);
1996 if (ppattr & P_MOD)
1997 return (1);
1998 pp = pp->p_next;
1999 npf--;
2000 }
2001
2002 return (0);
2003 }
2004
2005 /*
2006 * bioinit(9F) - initialize a buffer structure
2007 */
2008 void
2009 bioinit(struct buf *bp)
2010 {
2011 bzero(bp, sizeof (struct buf));
2012 sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
2013 sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
2014 bp->b_offset = -1;
2015 }
2016
2017 /*
2018 * biofini(9F) - uninitialize a buffer structure
2019 */
2020 void
2021 biofini(struct buf *bp)
2022 {
2023 sema_destroy(&bp->b_io);
2024 sema_destroy(&bp->b_sem);
2025 }
2026
2027 /*
2028 * bioclone(9F) - clone a buffer
2029 */
2030 struct buf *
2031 bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno,
2032 int (*iodone)(struct buf *), struct buf *bp_mem, int sleep)
2033 {
2034 struct buf *bufp;
2035
2036 ASSERT(bp);
2037 if (bp_mem == NULL) {
2038 bufp = kmem_alloc(sizeof (struct buf), sleep);
2039 if (bufp == NULL) {
2040 return (NULL);
2041 }
2042 bioinit(bufp);
2043 } else {
2044 bufp = bp_mem;
2045 bioreset(bufp);
2046 }
2047
2048 #define BUF_CLONE_FLAGS (B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
2049 B_ABRWRITE)
2050
2051 /*
2052 * The cloned buffer does not inherit the B_REMAPPED flag.
2053 */
2054 bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS) | B_BUSY;
2055 bufp->b_bcount = len;
2056 bufp->b_blkno = blkno;
2057 bufp->b_iodone = iodone;
2058 bufp->b_proc = bp->b_proc;
2059 bufp->b_edev = dev;
2060 bufp->b_file = bp->b_file;
2061 bufp->b_offset = bp->b_offset;
2062
2063 if (bp->b_flags & B_SHADOW) {
2064 ASSERT(bp->b_shadow);
2065 ASSERT(bp->b_flags & B_PHYS);
2066
2067 bufp->b_shadow = bp->b_shadow +
2068 btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off);
2069 bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2070 if (bp->b_flags & B_REMAPPED)
2071 bufp->b_proc = NULL;
2072 } else {
2073 if (bp->b_flags & B_PAGEIO) {
2074 struct page *pp;
2075 off_t o;
2076 int i;
2077
2078 pp = bp->b_pages;
2079 o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off;
2080 for (i = btop(o); i > 0; i--) {
2081 pp = pp->p_next;
2082 }
2083 bufp->b_pages = pp;
2084 bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET);
2085 } else {
2086 bufp->b_un.b_addr =
2087 (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2088 if (bp->b_flags & B_REMAPPED)
2089 bufp->b_proc = NULL;
2090 }
2091 }
2092 return (bufp);
2093 }
2094