xref: /illumos-gate/usr/src/uts/common/fs/ufs/ufs_bmap.c (revision 80d34432)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * University Copyright- Copyright (c) 1982, 1986, 1988
31  * The Regents of the University of California
32  * All Rights Reserved
33  *
34  * University Acknowledgment- Portions of this document are derived from
35  * software developed by the University of California, Berkeley, and its
36  * contributors.
37  */
38 
39 
40 #pragma ident	"%Z%%M%	%I%	%E% SMI"
41 
42 #include <sys/types.h>
43 #include <sys/t_lock.h>
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/signal.h>
47 #include <sys/user.h>
48 #include <sys/vnode.h>
49 #include <sys/buf.h>
50 #include <sys/disp.h>
51 #include <sys/proc.h>
52 #include <sys/conf.h>
53 #include <sys/fs/ufs_inode.h>
54 #include <sys/fs/ufs_fs.h>
55 #include <sys/fs/ufs_quota.h>
56 #include <sys/fs/ufs_trans.h>
57 #include <sys/fs/ufs_bio.h>
58 #include <vm/seg.h>
59 #include <sys/errno.h>
60 #include <sys/sysmacros.h>
61 #include <sys/vfs.h>
62 #include <sys/debug.h>
63 #include <sys/kmem.h>
64 #include <sys/cmn_err.h>
65 
66 /*
67  * This structure is used to track blocks as we allocate them, so that
68  * we can free them if we encounter an error during allocation.  We
69  * keep track of five pieces of information for each allocated block:
70  *   - The number of the newly allocated block
71  *   - The size of the block (lets us deal with fragments if we want)
72  *   - The number of the block containing a pointer to it; or whether
73  *     the pointer is in the inode
74  *   - The offset within the block (or inode) containing a pointer to it.
75  *   - A flag indicating the usage of the block.  (Logging needs to know
76  *     this to avoid overwriting a data block if it was previously used
77  *     for metadata.)
78  */
79 
/*
 * Records which object, if any, currently holds the pointer to a block
 * that was allocated during a bmap_write() call.
 */
enum ufs_owner_type {
	/* Owner has not yet been updated */
	ufs_no_owner = 0,
	/* Listed in inode's direct block table */
	ufs_inode_direct = 1,
	/* Listed in inode's indirect block table */
	ufs_inode_indirect = 2,
	/* Listed in an indirect block */
	ufs_indirect_block = 3
};
86 
87 struct ufs_allocated_block {
88 	daddr_t this_block;	    /* Number of this block */
89 	off_t block_size;	    /* Size of this block, in bytes */
90 	enum ufs_owner_type owner;  /* Who points to this block? */
91 	daddr_t owner_block;	    /* Number of the owning block */
92 	uint_t owner_offset;	    /* Offset within that block or inode */
93 	int usage_flags;	    /* Usage flags, as expected by free() */
94 };
95 
96 
97 static int findextent(struct fs *fs, daddr32_t *sbp, int n, int *lenp,
98 		int maxtrans);
99 
100 static void ufs_undo_allocation(inode_t *ip, int block_count,
101 	struct ufs_allocated_block table[], int inode_sector_adjust);
102 
/*
 * Find the extent and the matching block number.
 *
 * bsize > PAGESIZE
 *	boff indicates that we want a page in the middle
 *	min expression is supposed to make sure no extra page[s] after EOF
 * PAGESIZE >= bsize
 *	we assume that a page is a multiple of bsize, i.e.,
 *	boff always == 0
 *
 * We always return a length that is suitable for a disk transfer.
 *
 * Arguments:
 *	fs	 file system pointer
 *	lbn	 logical block number being mapped (used only when chkfrag
 *		 is non-zero)
 *	boff	 byte offset within the block
 *	bnp	 out: disk block number, or UFS_HOLE
 *	lenp	 in: handed to findextent(); out: extent length in bytes.
 *		 Written only when a mapping is found.
 *	size	 file size, used with chkfrag to clip the extent at the
 *		 fragment-rounded end of file
 *	tblp	 pointer to the block-pointer table entry for lbn
 *	n	 number of table entries available starting at tblp
 *	chkfrag	 non-zero to apply the end-of-file clipping
 *	maxtrans handed to findextent() as the maximum transfer size
 *
 * *bnp is set to UFS_HOLE if the table entry is zero or if nothing is
 * left of the extent once boff has been subtracted.
 *
 * Every argument is parenthesized in the expansion, and the macro's own
 * locals carry a leading underscore so that they cannot capture names
 * used in the caller's argument expressions.  Several arguments are
 * evaluated more than once, so they must be free of side effects.
 */
#define	DOEXTENT(fs, lbn, boff, bnp, lenp, size, tblp, n, chkfrag, maxtrans) {\
	daddr32_t *_dp = (tblp);					\
	int _chkfrag = (chkfrag); /* for lint. sigh */			\
									\
	if (*_dp == 0) {						\
		*(bnp) = UFS_HOLE;					\
	} else {							\
		int _len;						\
									\
		_len = findextent((fs), _dp, (int)(n), (lenp),		\
		    (maxtrans)) << (fs)->fs_bshift;			\
		if (_chkfrag) {						\
			u_offset_t _tmp;				\
									\
			_tmp = fragroundup((fs), (size)) -		\
			    (((u_offset_t)(lbn)) << (fs)->fs_bshift);	\
			_len = (int)MIN(_tmp, _len);			\
		}							\
		_len -= (boff);						\
		if (_len <= 0) {					\
			*(bnp) = UFS_HOLE;				\
		} else {						\
			*(bnp) = fsbtodb((fs), *_dp) + btodb(boff);	\
			*(lenp) = _len;					\
		}							\
	}								\
}
142 
143 /*
144  * The maximum supported file size is actually somewhat less than 1
145  * terabyte.  This is because the total number of blocks used for the
146  * file and its metadata must fit into the ic_blocks field of the
147  * inode, which is a signed 32-bit quantity.  The metadata allocated
148  * for a file (that is, the single, double, and triple indirect blocks
149  * used to reference the file blocks) is actually quite small,
150  * but just to make sure, we check for overflow in the
151  * ic_blocks field for all files whose total block count is
152  * within 1 GB of a terabyte.  VERYLARGEFILESIZE below is the number of
153  * 512-byte blocks in a terabyte (2^31), minus the number of 512-byte blocks
154  * in a gigabyte (2^21).  We only check for overflow in the ic_blocks
155  * field if the number of blocks currently allocated to the file is
156  * greater than or equal to VERYLARGEFILESIZE.
157  *
158  * Note that file "size" is not the same as file "length".  A
159  * file's "size" is the number of blocks allocated to it.  A file's
160  * "length" is the maximum offset in the file.  A UFS file can have a
161  * length of a terabyte, but the size is limited to somewhat less than
162  * a terabyte, as described above.
163  */
164 #define	VERYLARGEFILESIZE	0x7FE00000
165 
166 /*
167  * bmap{read,write} define the structure of file system storage by mapping
168  * a logical offset in a file to a physical block number on the device.
169  * It should be called with a locked inode when allocation is to be
170  * done (bmap_write).  Note this strangeness: bmap_write is always called from
171  * getpage(), not putpage(), since getpage() is where all the allocation
172  * is done.
173  *
174  * S_READ, S_OTHER -> bmap_read; S_WRITE -> bmap_write.
175  *
176  * NOTICE: the block number returned is the disk block number, not the
177  * file system block number.  All the worries about block offsets and
178  * page/block sizes are hidden inside of bmap.  Well, not quite,
179  * unfortunately.  It's impossible to find one place to hide all this
180  * mess.  There are 3 cases:
181  *
182  * PAGESIZE < bsize
183  *	In this case, the {get,put}page routines will attempt to align to
184  *	a file system block boundary (XXX - maybe this is a mistake?).  Since
185  *	the kluster routines may be out of memory, we don't always get all
186  *	the pages we wanted.  If we called bmap first, to find out how much
187  *	to kluster, we handed in the block aligned offset.  If we didn't get
188  *	all the pages, we have to chop off the amount we didn't get from the
189  *	amount handed back by bmap.
190  *
191  * PAGESIZE == bsize
192  *	Life is quite pleasant here, no extra work needed, mainly because we
193  *	(probably?) won't kluster backwards, just forwards.
194  *
195  * PAGESIZE > bsize
196  *	This one has a different set of problems, specifically, we may have to
197  *	do N reads to fill one page.  Let us hope that Sun will stay with small
198  *	pages.
199  *
200  * Returns 0 on success, or a non-zero errno if an error occurs.
201  *
202  * TODO
203  *	LMXXX - add a bmap cache.  This could be a couple of extents in the
204  *	inode.  Two is nice for PAGESIZE > bsize.
205  */
206 
207 int
208 bmap_read(struct inode *ip, u_offset_t off, daddr_t *bnp, int *lenp)
209 {
210 	daddr_t lbn;
211 	ufsvfs_t *ufsvfsp = ip->i_ufsvfs;
212 	struct	fs *fs = ufsvfsp->vfs_fs;
213 	struct	buf *bp;
214 	int	i, j, boff;
215 	int	shft;			/* we maintain sh = 1 << shft */
216 	daddr_t	ob, nb, tbn;
217 	daddr32_t *bap;
218 	int	nindirshift, nindiroffset;
219 
220 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
221 	lbn = (daddr_t)lblkno(fs, off);
222 	boff = (int)blkoff(fs, off);
223 	if (lbn < 0)
224 		return (EFBIG);
225 
226 	/*
227 	 * The first NDADDR blocks are direct blocks.
228 	 */
229 	if (lbn < NDADDR) {
230 		DOEXTENT(fs, lbn, boff, bnp, lenp,
231 		    ip->i_size, &ip->i_db[lbn], NDADDR - lbn, 1,
232 		    ufsvfsp->vfs_iotransz);
233 		return (0);
234 	}
235 
236 	nindirshift = ufsvfsp->vfs_nindirshift;
237 	nindiroffset = ufsvfsp->vfs_nindiroffset;
238 	/*
239 	 * Determine how many levels of indirection.
240 	 */
241 	shft = 0;				/* sh = 1 */
242 	tbn = lbn - NDADDR;
243 	for (j = NIADDR; j > 0; j--) {
244 		longlong_t	sh;
245 
246 		shft += nindirshift;		/* sh *= nindir */
247 		sh = 1LL << shft;
248 		if (tbn < sh)
249 			break;
250 		tbn -= sh;
251 	}
252 	if (j == 0)
253 		return (EFBIG);
254 
255 	/*
256 	 * Fetch the first indirect block.
257 	 */
258 	nb = ip->i_ib[NIADDR - j];
259 	if (nb == 0) {
260 		*bnp = UFS_HOLE;
261 		return (0);
262 	}
263 
264 	/*
265 	 * Fetch through the indirect blocks.
266 	 */
267 	for (; j <= NIADDR; j++) {
268 		ob = nb;
269 		bp = UFS_BREAD(ufsvfsp,
270 		    ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize);
271 		if (bp->b_flags & B_ERROR) {
272 			brelse(bp);
273 			return (EIO);
274 		}
275 		bap = bp->b_un.b_daddr;
276 
277 		ASSERT(!ufs_indir_badblock(ip, bap));
278 
279 		shft -= nindirshift;		/* sh / nindir */
280 		i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
281 		nb = bap[i];
282 		if (nb == 0) {
283 			*bnp = UFS_HOLE;
284 			brelse(bp);
285 			return (0);
286 		}
287 		if (j != NIADDR)
288 			brelse(bp);
289 	}
290 	DOEXTENT(fs, lbn, boff, bnp, lenp, ip->i_size, &bap[i],
291 	    MIN(NINDIR(fs) - i, (daddr_t)lblkno(fs, ip->i_size - 1) - lbn + 1),
292 	    0, ufsvfsp->vfs_iotransz);
293 	brelse(bp);
294 	return (0);
295 }
296 
297 /*
298  * See bmap_read for general notes.
299  *
300  * The block must be at least size bytes and will be extended or
301  * allocated as needed.  If alloc_type is of type BI_ALLOC_ONLY, then bmap
302  * will not create any in-core pages that correspond to the new disk allocation.
303  * If alloc_type is of BI_FALLOCATE, blocks will be stored as (-1) * block addr
304  * and security is maintained b/c upon reading a negative block number pages
305  * are zeroed. For all other allocation types (BI_NORMAL) the in-core pages will
306  * be created and initialized as needed.
 *
 * In this function the negation for BI_FALLOCATE is applied only to
 * data block pointers stored in the last-level indirect block
 * (j == NIADDR); newly allocated direct blocks are zeroed with fbzero
 * instead.
 *
 * Arguments:
 *	ip		inode to map; i_contents must be held as writer.
 *	off		byte offset in the file; converted to a logical
 *			block number with lblkno().
 *	size		number of bytes the block must be able to hold.
 *	alloc_type	allocation mode; forced to BI_NORMAL for
 *			directories, attribute directories, the quota
 *			inode, and inodes with ISYNC set.
 *	allocblk	if non-NULL, cleared on entry and set to the number
 *			of the block most recently allocated or reallocated
 *			for lbn (in the indirect path it is assigned for
 *			every block allocated along the chain, so the last
 *			assignment wins).  The initial extension of the old
 *			last fragment does not set it.
 *	cr		credentials handed to the allocator and quota code.
 *
 * EFBIG is returned if the logical block number is negative, lies beyond
 * NIADDR levels of indirection, or (for files with i_blocks at or above
 * VERYLARGEFILESIZE) if the allocation would push i_blocks past INT_MAX.
 * Other errors come from fbread, alloc, realloccg, or geterror() on a
 * failed buffer read or write.
307  *
308  * Returns 0 on success, or a non-zero errno if an error occurs.
309  */
310 int
311 bmap_write(struct inode	*ip, u_offset_t	off, int size,
312     enum bi_type alloc_type, daddr_t *allocblk, struct cred *cr)
313 {
314 	struct	fs *fs;
315 	struct	buf *bp;
316 	int	i;
317 	struct	buf *nbp;
318 	int	j;
319 	int	shft;				/* we maintain sh = 1 << shft */
320 	daddr_t	ob, nb, pref, lbn, llbn, tbn;
321 	daddr32_t *bap;
322 	struct	vnode *vp = ITOV(ip);
323 	long	bsize = VBSIZE(vp);
324 	long	osize, nsize;
325 	int	issync, metaflag, isdirquota;
326 	int	err;
327 	dev_t	dev;
328 	struct	fbuf *fbp;
329 	int	nindirshift;
330 	int	nindiroffset;
331 	struct	ufsvfs	*ufsvfsp;
332 	int	added_sectors;		/* sectors added to this inode */
333 	int	alloced_blocks;		/* fs blocks newly allocated */
334 	struct  ufs_allocated_block undo_table[NIADDR+1];
335 	int	verylargefile = 0;
336 
337 	ASSERT(RW_WRITE_HELD(&ip->i_contents));
338 
339 	if (allocblk)
340 		*allocblk = 0;
341 
342 	ufsvfsp = ip->i_ufsvfs;
343 	fs = ufsvfsp->vfs_bufp->b_un.b_fs;
344 	lbn = (daddr_t)lblkno(fs, off);
345 	if (lbn < 0)
346 		return (EFBIG);
	/*
	 * Only files already near the i_blocks limit pay for the
	 * overflow checks made before each allocation below.
	 */
347 	if (ip->i_blocks >= VERYLARGEFILESIZE)
348 		verylargefile = 1;
	/* llbn: logical block of the file's last byte (0 if i_size is 0) */
349 	llbn = (daddr_t)((ip->i_size) ? lblkno(fs, ip->i_size - 1) : 0);
350 	metaflag = isdirquota = 0;
351 	if (((ip->i_mode & IFMT) == IFDIR) ||
352 	    ((ip->i_mode & IFMT) == IFATTRDIR))
353 		isdirquota = metaflag = I_DIR;
354 	else if ((ip->i_mode & IFMT) == IFSHAD)
355 		metaflag = I_SHAD;
356 	else if (ip->i_ufsvfs->vfs_qinod == ip)
357 		isdirquota = metaflag = I_QUOTA;
358 
359 	issync = ((ip->i_flag & ISYNC) != 0);
360 
361 	if (isdirquota || issync) {
362 		alloc_type = BI_NORMAL;	/* make sure */
363 	}
364 
365 	/*
366 	 * If the next write will extend the file into a new block,
367 	 * and the file is currently composed of a fragment
368 	 * this fragment has to be extended to be a full block.
369 	 */
370 	if (llbn < NDADDR && llbn < lbn && (ob = ip->i_db[llbn]) != 0) {
371 		osize = blksize(fs, ip, llbn);
372 		if (osize < bsize && osize > 0) {
373 			/*
374 			 * Check to see if doing this will make the file too
375 			 * big.  Only check if we are dealing with a very
376 			 * large file.
377 			 */
378 			if (verylargefile == 1) {
379 				if (((unsigned)ip->i_blocks +
380 				    btodb(bsize - osize)) > INT_MAX) {
381 					return (EFBIG);
382 				}
383 			}
384 			/*
385 			 * Make sure we have all needed pages setup correctly.
386 			 *
387 			 * We pass S_OTHER to fbread here because we want
388 			 * an exclusive lock on the page in question
389 			 * (see ufs_getpage). I/O to the old block location
390 			 * may still be in progress and we are about to free
391 			 * the old block. We don't want anyone else to get
392 			 * a hold of the old block once we free it until
393 			 * the I/O is complete.
394 			 */
395 			err =
396 			    fbread(ITOV(ip), ((offset_t)llbn << fs->fs_bshift),
397 			    (uint_t)bsize, S_OTHER, &fbp);
398 			if (err)
399 				return (err);
400 			pref = blkpref(ip, llbn, (int)llbn, &ip->i_db[0]);
401 			err = realloccg(ip, ob, pref, (int)osize, (int)bsize,
402 			    &nb, cr);
403 			if (err) {
404 				if (fbp)
405 					fbrelse(fbp, S_OTHER);
406 				return (err);
407 			}
408 			ASSERT(!ufs_badblock(ip, nb));
409 
410 			/*
411 			 * Update the inode before releasing the
412 			 * lock on the page. If we released the page
413 			 * lock first, the data could be written to its
414 			 * old address and then destroyed.
415 			 */
416 			TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 0);
417 			ip->i_db[llbn] = nb;
418 			UFS_SET_ISIZE(((u_offset_t)(llbn + 1)) << fs->fs_bshift,
419 			    ip);
420 			ip->i_blocks += btodb(bsize - osize);
421 			ASSERT((unsigned)ip->i_blocks <= INT_MAX);
422 			TRANS_INODE(ufsvfsp, ip);
423 			ip->i_flag |= IUPD | ICHG | IATTCHG;
424 
425 			/* Caller is responsible for updating i_seq */
426 			/*
427 			 * Don't check metaflag here, directories won't do this
428 			 *
429 			 */
430 			if (issync) {
431 				(void) ufs_fbiwrite(fbp, ip, nb, fs->fs_fsize);
432 			} else {
433 				ASSERT(fbp);
434 				fbrelse(fbp, S_WRITE);
435 			}
436 
			/*
			 * realloccg returned a different block; free
			 * the old fragment.
			 */
437 			if (nb != ob) {
438 				(void) free(ip, ob, (off_t)osize, metaflag);
439 			}
440 		}
441 	}
442 
443 	/*
444 	 * The first NDADDR blocks are direct blocks.
445 	 */
446 	if (lbn < NDADDR) {
447 		nb = ip->i_db[lbn];
448 		if (nb == 0 ||
449 		    ip->i_size < ((u_offset_t)(lbn + 1)) << fs->fs_bshift) {
450 			if (nb != 0) {
451 				/* consider need to reallocate a frag */
452 				osize = fragroundup(fs, blkoff(fs, ip->i_size));
453 				nsize = fragroundup(fs, size);
454 				if (nsize <= osize)
455 					goto gotit;
456 				/*
457 				 * Check to see if doing this will make the
458 				 * file too big.  Only check if we are dealing
459 				 * with a very large file.
460 				 */
461 				if (verylargefile == 1) {
462 					if (((unsigned)ip->i_blocks +
463 					    btodb(nsize - osize)) > INT_MAX) {
464 						return (EFBIG);
465 					}
466 				}
467 				/*
468 				 * need to re-allocate a block or frag
469 				 */
470 				ob = nb;
471 				pref = blkpref(ip, lbn, (int)lbn,
472 				    &ip->i_db[0]);
473 				err = realloccg(ip, ob, pref, (int)osize,
474 				    (int)nsize, &nb, cr);
475 				if (err)
476 					return (err);
477 				if (allocblk)
478 					*allocblk = nb;
479 				ASSERT(!ufs_badblock(ip, nb));
480 
481 			} else {
482 				/*
483 				 * need to allocate a block or frag
484 				 */
485 				osize = 0;
486 				if (ip->i_size <
487 				    ((u_offset_t)(lbn + 1)) << fs->fs_bshift)
488 					nsize = fragroundup(fs, size);
489 				else
490 					nsize = bsize;
491 				/*
492 				 * Check to see if doing this will make the
493 				 * file too big.  Only check if we are dealing
494 				 * with a very large file.
495 				 */
496 				if (verylargefile == 1) {
497 					if (((unsigned)ip->i_blocks +
498 					    btodb(nsize - osize)) > INT_MAX) {
499 						return (EFBIG);
500 					}
501 				}
502 				pref = blkpref(ip, lbn, (int)lbn, &ip->i_db[0]);
503 				err = alloc(ip, pref, (int)nsize, &nb, cr);
504 				if (err)
505 					return (err);
506 				if (allocblk)
507 					*allocblk = nb;
508 				ASSERT(!ufs_badblock(ip, nb));
				/*
				 * ob == nb here, so the "nb != ob" free
				 * at the end of this path is skipped.
				 */
509 				ob = nb;
510 			}
511 
512 			/*
513 			 * Read old/create new zero pages
514 			 */
515 			fbp = NULL;
516 			if (osize == 0) {
517 				/*
518 				 * mmap S_WRITE faults always enter here
519 				 */
520 				/*
521 				 * We zero it if its also BI_FALLOCATE, but
522 				 * only for direct blocks!
523 				 */
524 				if (alloc_type == BI_NORMAL ||
525 				    alloc_type == BI_FALLOCATE ||
526 				    P2ROUNDUP_TYPED(size,
527 				    PAGESIZE, u_offset_t) < nsize) {
528 					/* fbzero doesn't cause a pagefault */
529 					fbzero(ITOV(ip),
530 					    ((offset_t)lbn << fs->fs_bshift),
531 					    (uint_t)nsize, &fbp);
532 				}
533 			} else {
534 				err = fbread(vp,
535 				    ((offset_t)lbn << fs->fs_bshift),
536 				    (uint_t)nsize, S_OTHER, &fbp);
537 				if (err) {
					/*
					 * Give back what realloccg handed
					 * out: the whole new block if a
					 * different block was returned,
					 * otherwise only the added tail.
					 */
538 					if (nb != ob) {
539 						(void) free(ip, nb,
540 						    (off_t)nsize, metaflag);
541 					} else {
542 						(void) free(ip,
543 						    ob + numfrags(fs, osize),
544 						    (off_t)(nsize - osize),
545 						    metaflag);
546 					}
547 					ASSERT(nsize >= osize);
548 					(void) chkdq(ip,
549 					    -(long)btodb(nsize - osize),
550 					    0, cr, (char **)NULL,
551 					    (size_t *)NULL);
552 					return (err);
553 				}
554 			}
555 			TRANS_MATA_ALLOC(ufsvfsp, ip, nb, nsize, 0);
556 			ip->i_db[lbn] = nb;
557 			ip->i_blocks += btodb(nsize - osize);
558 			ASSERT((unsigned)ip->i_blocks <= INT_MAX);
559 			TRANS_INODE(ufsvfsp, ip);
560 			ip->i_flag |= IUPD | ICHG | IATTCHG;
561 
562 			/* Caller is responsible for updating i_seq */
563 
564 			/*
565 			 * Write directory and shadow blocks synchronously so
566 			 * that they never appear with garbage in them on the
567 			 * disk.
568 			 *
569 			 */
570 			if (isdirquota && (ip->i_size ||
571 			    TRANS_ISTRANS(ufsvfsp))) {
572 			/*
573 			 * XXX may not be necessary with harpy trans
574 			 * bug id 1130055
575 			 */
576 				(void) ufs_fbiwrite(fbp, ip, nb, fs->fs_fsize);
577 			} else if (fbp) {
578 				fbrelse(fbp, S_WRITE);
579 			}
580 
581 			if (nb != ob)
582 				(void) free(ip, ob, (off_t)osize, metaflag);
583 		}
584 gotit:
585 		return (0);
586 	}
587 
588 	added_sectors = alloced_blocks = 0;	/* No blocks alloced yet */
589 
590 	/*
591 	 * Determine how many levels of indirection.
592 	 */
593 	nindirshift = ip->i_ufsvfs->vfs_nindirshift;
594 	nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
595 	pref = 0;
596 	shft = 0;				/* sh = 1 */
597 	tbn = lbn - NDADDR;
598 	for (j = NIADDR; j > 0; j--) {
599 		longlong_t	sh;
600 
601 		shft += nindirshift;		/* sh *= nindir */
602 		sh = 1LL << shft;
603 		if (tbn < sh)
604 			break;
605 		tbn -= sh;
606 	}
607 
608 	if (j == 0)
609 		return (EFBIG);
610 
611 	/*
612 	 * Fetch the first indirect block.
613 	 */
614 	dev = ip->i_dev;
615 	nb = ip->i_ib[NIADDR - j];
616 	if (nb == 0) {
617 		/*
618 		 * Check to see if doing this will make the
619 		 * file too big.  Only check if we are dealing
620 		 * with a very large file.
621 		 */
622 		if (verylargefile == 1) {
623 			if (((unsigned)ip->i_blocks + btodb(bsize))
624 			    > INT_MAX) {
625 				return (EFBIG);
626 			}
627 		}
628 		/*
629 		 * Need to allocate an indirect block.
630 		 */
631 		pref = blkpref(ip, lbn, 0, (daddr32_t *)0);
632 		err = alloc(ip, pref, (int)bsize, &nb, cr);
633 		if (err)
634 			return (err);
635 		TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 1);
636 		ASSERT(!ufs_badblock(ip, nb));
637 
638 		/*
639 		 * Keep track of this allocation so we can undo it if we
640 		 * get an error later.
641 		 */
642 
643 		ASSERT(alloced_blocks <= NIADDR);
644 
645 		undo_table[alloced_blocks].this_block = nb;
646 		undo_table[alloced_blocks].block_size = bsize;
647 		undo_table[alloced_blocks].owner = ufs_no_owner;
648 		undo_table[alloced_blocks].usage_flags = metaflag | I_IBLK;
649 
650 		alloced_blocks++;
651 
652 		/*
653 		 * Write zero block synchronously so that
654 		 * indirect blocks never point at garbage.
655 		 */
656 		bp = UFS_GETBLK(ufsvfsp, dev, fsbtodb(fs, nb), bsize);
657 
658 		clrbuf(bp);
659 		/* XXX Maybe special-case this? */
660 		TRANS_BUF(ufsvfsp, 0, bsize, bp, DT_ABZERO);
661 		UFS_BWRITE2(ufsvfsp, bp);
662 		if (bp->b_flags & B_ERROR) {
663 			err = geterror(bp);
664 			brelse(bp);
665 			ufs_undo_allocation(ip, alloced_blocks,
666 			    undo_table, added_sectors);
667 			return (err);
668 		}
669 		brelse(bp);
670 
671 		ip->i_ib[NIADDR - j] = nb;
672 		added_sectors += btodb(bsize);
673 		ip->i_blocks += btodb(bsize);
674 		ASSERT((unsigned)ip->i_blocks <= INT_MAX);
675 		TRANS_INODE(ufsvfsp, ip);
676 		ip->i_flag |= IUPD | ICHG | IATTCHG;
677 		/* Caller is responsible for updating i_seq */
678 
679 		/*
680 		 * Update the 'undo table' now that we've linked this block
681 		 * to an inode.
682 		 */
683 
684 		undo_table[alloced_blocks-1].owner = ufs_inode_indirect;
685 		undo_table[alloced_blocks-1].owner_offset = NIADDR - j;
686 
687 		/*
688 		 * In the ISYNC case, wrip will notice that the block
689 		 * count on the inode has changed and will be sure to
690 		 * ufs_iupdat the inode at the end of wrip.
691 		 */
692 	}
693 
694 	/*
695 	 * Fetch through the indirect blocks.
696 	 */
697 	for (; j <= NIADDR; j++) {
698 		ob = nb;
699 		bp = UFS_BREAD(ufsvfsp, ip->i_dev, fsbtodb(fs, ob), bsize);
700 
701 		if (bp->b_flags & B_ERROR) {
702 			err = geterror(bp);
703 			brelse(bp);
704 			/*
705 			 * Return any partial allocations.
706 			 *
707 			 * It is possible that we have not yet made any
708 			 * allocations at this point (if this is the first
709 			 * pass through the loop and we didn't have to
710 			 * allocate the first indirect block, above).
711 			 * In this case, alloced_blocks and added_sectors will
712 			 * be zero, and ufs_undo_allocation will do nothing.
713 			 */
714 			ufs_undo_allocation(ip, alloced_blocks,
715 			    undo_table, added_sectors);
716 			return (err);
717 		}
718 		bap = bp->b_un.b_daddr;
719 		shft -= nindirshift;		/* sh /= nindir */
720 		i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
721 		nb = bap[i];
722 
723 		if (nb == 0) {
724 			/*
725 			 * Check to see if doing this will make the
726 			 * file too big.  Only check if we are dealing
727 			 * with a very large file.
728 			 */
729 			if (verylargefile == 1) {
730 				if (((unsigned)ip->i_blocks + btodb(bsize))
731 				    > INT_MAX) {
732 					brelse(bp);
733 					ufs_undo_allocation(ip, alloced_blocks,
734 					    undo_table, added_sectors);
735 					return (EFBIG);
736 				}
737 			}
			/*
			 * pref is computed only while it is still zero;
			 * once set it is reused for later allocations
			 * in this call.
			 */
738 			if (pref == 0) {
739 				if (j < NIADDR) {
740 					/* Indirect block */
741 					pref = blkpref(ip, lbn, 0,
742 					    (daddr32_t *)0);
743 				} else {
744 					/* Data block */
745 					pref = blkpref(ip, lbn, i, &bap[0]);
746 				}
747 			}
748 
749 			/*
750 			 * release "bp" buf to avoid deadlock (re-bread later)
751 			 */
752 			brelse(bp);
753 
754 			err = alloc(ip, pref, (int)bsize, &nb, cr);
755 			if (err) {
756 				/*
757 				 * Return any partial allocations.
758 				 */
759 				ufs_undo_allocation(ip, alloced_blocks,
760 				    undo_table, added_sectors);
761 				return (err);
762 			}
763 
764 			ASSERT(!ufs_badblock(ip, nb));
765 			ASSERT(alloced_blocks <= NIADDR);
766 
			/*
			 * Assigned for every block allocated in this
			 * loop, indirect or data; the caller sees the
			 * last one.
			 */
767 			if (allocblk)
768 				*allocblk = nb;
769 
770 			undo_table[alloced_blocks].this_block = nb;
771 			undo_table[alloced_blocks].block_size = bsize;
772 			undo_table[alloced_blocks].owner = ufs_no_owner;
773 			undo_table[alloced_blocks].usage_flags = metaflag |
774 			    ((j < NIADDR) ? I_IBLK : 0);
775 
776 			alloced_blocks++;
777 
778 			if (j < NIADDR) {
779 				TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 1);
780 				/*
781 				 * Write synchronously so indirect
782 				 * blocks never point at garbage.
783 				 */
784 				nbp = UFS_GETBLK(
785 				    ufsvfsp, dev, fsbtodb(fs, nb), bsize);
786 
787 				clrbuf(nbp);
788 				/* XXX Maybe special-case this? */
789 				TRANS_BUF(ufsvfsp, 0, bsize, nbp, DT_ABZERO);
790 				UFS_BWRITE2(ufsvfsp, nbp);
791 				if (nbp->b_flags & B_ERROR) {
792 					err = geterror(nbp);
793 					brelse(nbp);
794 					/*
795 					 * Return any partial
796 					 * allocations.
797 					 */
798 					ufs_undo_allocation(ip,
799 					    alloced_blocks,
800 					    undo_table, added_sectors);
801 					return (err);
802 				}
803 				brelse(nbp);
804 			} else if (alloc_type == BI_NORMAL ||
805 			    P2ROUNDUP_TYPED(size,
806 			    PAGESIZE, u_offset_t) < bsize) {
807 				TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 0);
808 				fbzero(ITOV(ip),
809 				    ((offset_t)lbn << fs->fs_bshift),
810 				    (uint_t)bsize, &fbp);
811 
812 				/*
813 				 * Cases which we need to do a synchronous
814 				 * write of the zeroed data pages:
815 				 *
816 				 * 1) If we are writing a directory then we
817 				 * want to write synchronously so blocks in
818 				 * directories never contain garbage.
819 				 *
820 				 * 2) If we are filling in a hole and the
821 				 * indirect block is going to be synchronously
822 				 * written back below we need to make sure
823 				 * that the zeroes are written here before
824 				 * the indirect block is updated so that if
825 				 * we crash before the real data is pushed
826 				 * we will not end up with random data in
827 				 * the middle of the file.
828 				 *
829 				 * 3) If the size of the request rounded up
830 				 * to the system page size is smaller than
831 				 * the file system block size, we want to
832 				 * write out all the pages now so that
833 				 * they are not aborted before they actually
834 				 * make it to ufs_putpage since the length
835 				 * of the inode will not include the pages.
836 				 */
837 
838 				if (isdirquota || (issync &&
839 				    lbn < llbn))
840 					(void) ufs_fbiwrite(fbp, ip, nb,
841 					    fs->fs_fsize);
842 				else
843 					fbrelse(fbp, S_WRITE);
844 			}
845 
846 			/*
847 			 * re-acquire "bp" buf
848 			 */
849 			bp = UFS_BREAD(ufsvfsp,
850 			    ip->i_dev, fsbtodb(fs, ob), bsize);
851 			if (bp->b_flags & B_ERROR) {
852 				err = geterror(bp);
853 				brelse(bp);
854 				/*
855 				 * Return any partial allocations.
856 				 */
857 				ufs_undo_allocation(ip,
858 				    alloced_blocks,
859 				    undo_table, added_sectors);
860 				return (err);
861 			}
862 			bap = bp->b_un.b_daddr;
863 			bap[i] = nb;
864 
865 			/*
866 			 * The magic explained: j will be equal to NIADDR
867 			 * when we are at the lowest level, this is where the
868 			 * array entries point directly to data blocks. Since
869 			 * we will be 'fallocate'ing we will go ahead and negate
870 			 * the addresses.
871 			 */
872 			if (alloc_type == BI_FALLOCATE && j == NIADDR)
873 				bap[i] = -bap[i];
874 
875 			TRANS_BUF_ITEM_128(ufsvfsp, bap[i], bap, bp, DT_AB);
876 			added_sectors += btodb(bsize);
877 			ip->i_blocks += btodb(bsize);
878 			ASSERT((unsigned)ip->i_blocks <= INT_MAX);
879 			TRANS_INODE(ufsvfsp, ip);
880 			ip->i_flag |= IUPD | ICHG | IATTCHG;
881 
882 			/* Caller is responsible for updating i_seq */
883 
			/*
			 * The block is now linked into indirect block
			 * "ob" at slot i; record that for the undo path.
			 */
884 			undo_table[alloced_blocks-1].owner =
885 			    ufs_indirect_block;
886 			undo_table[alloced_blocks-1].owner_block = ob;
887 			undo_table[alloced_blocks-1].owner_offset = i;
888 
889 			if (issync) {
890 				UFS_BWRITE2(ufsvfsp, bp);
891 				if (bp->b_flags & B_ERROR) {
892 					err = geterror(bp);
893 					brelse(bp);
894 					/*
895 					 * Return any partial
896 					 * allocations.
897 					 */
898 					ufs_undo_allocation(ip,
899 					    alloced_blocks,
900 					    undo_table, added_sectors);
901 					return (err);
902 				}
903 				brelse(bp);
904 			} else {
905 				bdrwrite(bp);
906 			}
907 		} else {
908 			brelse(bp);
909 		}
910 	}
911 	return (0);
912 }
913 
914 /*
915  * Return 1 if inode has unmapped blocks (UFS holes).
916  */
917 int
918 bmap_has_holes(struct inode *ip)
919 {
920 	struct fs *fs = ip->i_fs;
921 	uint_t	dblks; 			/* # of data blocks */
922 	uint_t	mblks;			/* # of data + metadata blocks */
923 	int	nindirshift;
924 	int	nindiroffset;
925 	uint_t	cnt;
926 	int	n, j, shft;
927 	uint_t nindirblks;
928 
929 	int	fsbshift = fs->fs_bshift;
930 	int	fsboffset = (1 << fsbshift) - 1;
931 
932 	dblks = (ip->i_size + fsboffset) >> fsbshift;
933 	mblks = (ldbtob((u_offset_t)ip->i_blocks) + fsboffset) >> fsbshift;
934 
935 	/*
936 	 * File has only direct blocks.
937 	 */
938 	if (dblks <= NDADDR)
939 		return (mblks < dblks);
940 	nindirshift = ip->i_ufsvfs->vfs_nindirshift;
941 
942 	nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
943 	nindirblks = nindiroffset + 1;
944 
945 	dblks -= NDADDR;
946 	shft = 0;
947 	/*
948 	 * Determine how many levels of indirection.
949 	 */
950 	for (j = NIADDR; j > 0; j--) {
951 		longlong_t	sh;
952 
953 		shft += nindirshift;	/* sh *= nindir */
954 		sh = 1LL << shft;
955 		if (dblks <= sh)
956 			break;
957 		dblks -= sh;
958 	}
959 	/* LINTED: warning: logical expression always true: op "||" */
960 	ASSERT(NIADDR <= 3);
961 	ASSERT(j <= NIADDR);
962 	if (j == NIADDR)	/* single level indirection */
963 		cnt = NDADDR + 1 + dblks;
964 	else if (j == NIADDR-1) /* double indirection */
965 		cnt = NDADDR + 1 + nindirblks +
966 		    1 + (dblks + nindiroffset)/nindirblks + dblks;
967 	else if (j == NIADDR-2) { /* triple indirection */
968 		n = (dblks + nindiroffset)/nindirblks;
969 		cnt = NDADDR + 1 + nindirblks +
970 		    1 + nindirblks + nindirblks*nindirblks +
971 		    1 + (n + nindiroffset)/nindirblks + n + dblks;
972 	}
973 
974 	return (mblks < cnt);
975 }
976 
977 /*
978  * find some contig blocks starting at *sbp and going for min(n, max_contig)
979  * return the number of blocks (not frags) found.
980  * The array passed in must be at least [0..n-1].
981  */
982 static int
983 findextent(struct fs *fs, daddr32_t *sbp, int n, int *lenp, int maxtransfer)
984 {
985 	register daddr_t bn, nextbn;
986 	register daddr32_t *bp;
987 	register int diff;
988 	int maxtransblk;
989 
990 	if (n <= 0)
991 		return (0);
992 	bn = *sbp;
993 	if (bn == 0)
994 		return (0);
995 
996 	diff = fs->fs_frag;
997 	if (*lenp) {
998 		n = MIN(n, lblkno(fs, *lenp));
999 	} else {
1000 		/*
1001 		 * If the user has set the value for maxcontig lower than
1002 		 * the drive transfer size, then assume they want this
1003 		 * to be the maximum value for the size of the data transfer.
1004 		 */
1005 		maxtransblk = maxtransfer >> DEV_BSHIFT;
1006 		if (fs->fs_maxcontig < maxtransblk) {
1007 			n = MIN(n, fs->fs_maxcontig);
1008 		} else {
1009 			n = MIN(n, maxtransblk);
1010 		}
1011 	}
1012 	bp = sbp;
1013 	while (--n > 0) {
1014 		nextbn = *(bp + 1);
1015 		if (nextbn == 0 || bn + diff != nextbn)
1016 			break;
1017 		bn = nextbn;
1018 		bp++;
1019 	}
1020 	return ((int)(bp - sbp) + 1);
1021 }
1022 
1023 /*
1024  * Free any blocks which had been successfully allocated.  Always called
1025  * as a result of an error, so we don't bother returning an error code
1026  * from here.
1027  *
1028  * If block_count and inode_sector_adjust are both zero, we'll do nothing.
1029  * Thus it is safe to call this as part of error handling, whether or not
1030  * any blocks have been allocated.
1031  *
1032  * The ufs_inode_direct case is currently unused.
1033  */
1034 
1035 static void
1036 ufs_undo_allocation(
1037 	inode_t *ip,
1038 	int block_count,
1039 	struct ufs_allocated_block table[],
1040 	int inode_sector_adjust)
1041 {
1042 	int i;
1043 	int inode_changed;
1044 	int error_updating_pointers;
1045 	struct ufsvfs *ufsvfsp;
1046 
1047 	inode_changed = 0;
1048 	error_updating_pointers = 0;
1049 
1050 	ufsvfsp = ip->i_ufsvfs;
1051 
1052 	/*
1053 	 * Update pointers on disk before freeing blocks.  If we fail,
1054 	 * some blocks may remain busy; but they will be reclaimed by
1055 	 * an fsck.  (This is better than letting a block wind up with
1056 	 * two owners if we successfully freed it but could not remove
1057 	 * the pointer to it.)
1058 	 */
1059 
1060 	for (i = 0; i < block_count; i++) {
1061 		switch (table[i].owner) {
1062 		case ufs_no_owner:
1063 			/* Nothing to do here, nobody points to us */
1064 			break;
1065 		case ufs_inode_direct:
1066 			ASSERT(table[i].owner_offset < NDADDR);
1067 			ip->i_db[table[i].owner_offset] = 0;
1068 			inode_changed = 1;
1069 			break;
1070 		case ufs_inode_indirect:
1071 			ASSERT(table[i].owner_offset < NIADDR);
1072 			ip->i_ib[table[i].owner_offset] = 0;
1073 			inode_changed = 1;
1074 			break;
1075 		case ufs_indirect_block: {
1076 			buf_t *bp;
1077 			daddr32_t *block_data;
1078 
1079 			/* Read/modify/log/write. */
1080 
1081 			ASSERT(table[i].owner_offset <
1082 			    (VBSIZE(ITOV(ip)) / sizeof (daddr32_t)));
1083 
1084 			bp = UFS_BREAD(ufsvfsp, ip->i_dev,
1085 			    fsbtodb(ufsvfsp->vfs_fs, table[i].owner_block),
1086 			    VBSIZE(ITOV(ip)));
1087 
1088 			if (bp->b_flags & B_ERROR) {
1089 				/* Couldn't read this block; give up. */
1090 				error_updating_pointers = 1;
1091 				brelse(bp);
1092 				break;		/* out of SWITCH */
1093 			}
1094 
1095 			block_data = bp->b_un.b_daddr;
1096 			block_data[table[i].owner_offset] = 0;
1097 
1098 			/* Write a log entry which includes the zero. */
1099 			/* It might be possible to optimize this by using */
1100 			/* TRANS_BUF directly and zeroing only the four */
1101 			/* bytes involved, but an attempt to do that led */
1102 			/* to panics in the logging code.  The attempt was */
1103 			/* TRANS_BUF(ufsvfsp,				  */
1104 			/*    table[i].owner_offset * sizeof (daddr32_t), */
1105 			/*    sizeof (daddr32_t),			  */
1106 			/*    bp,					  */
1107 			/*    DT_ABZERO);				  */
1108 
1109 			TRANS_BUF_ITEM_128(ufsvfsp,
1110 			    block_data[table[i].owner_offset],
1111 			    block_data, bp, DT_AB);
1112 
1113 			/* Now we can write the buffer itself. */
1114 
1115 			UFS_BWRITE2(ufsvfsp, bp);
1116 
1117 			if (bp->b_flags & B_ERROR) {
1118 				error_updating_pointers = 1;
1119 			}
1120 
1121 			brelse(bp);
1122 			break;
1123 		}
1124 		default:
1125 			(void) ufs_fault(ITOV(ip),
1126 			    "ufs_undo_allocation failure\n");
1127 			break;
1128 		}
1129 	}
1130 
1131 	/*
1132 	 * If the inode changed, or if we need to update its block count,
1133 	 * then do that now.  We update the inode synchronously on disk
1134 	 * to ensure that it won't transiently point at a block we've
1135 	 * freed (only necessary if we're not logging).
1136 	 *
1137 	 * NOTE: Currently ufs_iupdat() does not check for errors.  When
1138 	 * it is fixed, we should verify that we successfully updated the
1139 	 * inode before freeing blocks below.
1140 	 */
1141 
1142 	if (inode_changed || (inode_sector_adjust != 0)) {
1143 		ip->i_blocks -= inode_sector_adjust;
1144 		ASSERT((unsigned)ip->i_blocks <= INT_MAX);
1145 		TRANS_INODE(ufsvfsp, ip);
1146 		ip->i_flag |= IUPD | ICHG | IATTCHG;
1147 		ip->i_seq++;
1148 		if (!TRANS_ISTRANS(ufsvfsp))
1149 			ufs_iupdat(ip, I_SYNC);
1150 	}
1151 
1152 	/*
1153 	 * Now we go through and actually free the blocks, but only if we
1154 	 * successfully removed the pointers to them.
1155 	 */
1156 
1157 	if (!error_updating_pointers) {
1158 		for (i = 0; i < block_count; i++) {
1159 			free(ip, table[i].this_block, table[i].block_size,
1160 			    table[i].usage_flags);
1161 		}
1162 	}
1163 }
1164 
1165 /*
1166  * Find the next hole or data block in file starting at *off
1167  * Return found offset in *off, which can be less than the
1168  * starting offset if not block aligned.
1169  * This code is based on bmap_read().
1170  * Errors: ENXIO for end of file
1171  *         EIO for block read error.
1172  */
1173 int
1174 bmap_find(struct inode *ip, boolean_t hole, u_offset_t *off)
1175 {
1176 	ufsvfs_t *ufsvfsp = ip->i_ufsvfs;
1177 	struct fs *fs = ufsvfsp->vfs_fs;
1178 	buf_t *bp[NIADDR];
1179 	int i, j;
1180 	int shft;			/* we maintain sh = 1 << shft */
1181 	int nindirshift, nindiroffset;
1182 	daddr_t	ob, nb, tbn, lbn, skip;
1183 	daddr32_t *bap;
1184 	u_offset_t isz = (offset_t)ip->i_size;
1185 	int32_t bs = fs->fs_bsize; /* file system block size */
1186 	int32_t nindir = fs->fs_nindir;
1187 	dev_t dev;
1188 	int error = 0;
1189 	daddr_t limits[NIADDR];
1190 
1191 	ASSERT(*off < isz);
1192 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
1193 	lbn = (daddr_t)lblkno(fs, *off);
1194 	ASSERT(lbn >= 0);
1195 
1196 	for (i = 0; i < NIADDR; i++)
1197 		bp[i] = NULL;
1198 
1199 	/*
1200 	 * The first NDADDR blocks are direct blocks.
1201 	 */
1202 	if (lbn < NDADDR) {
1203 		for (; lbn < NDADDR; lbn++) {
1204 			if ((hole && (ip->i_db[lbn] == 0)) ||
1205 			    (!hole && (ip->i_db[lbn] != 0))) {
1206 				goto out;
1207 			}
1208 		}
1209 		if ((u_offset_t)lbn << fs->fs_bshift >= isz)
1210 			goto out;
1211 	}
1212 
1213 	nindir = fs->fs_nindir;
1214 	nindirshift = ufsvfsp->vfs_nindirshift;
1215 	nindiroffset = ufsvfsp->vfs_nindiroffset;
1216 	dev = ip->i_dev;
1217 
1218 	/* Set up limits array */
1219 	for (limits[0] = NDADDR, j = 1; j  < NIADDR; j++)
1220 		limits[j] = limits[j-1] + (1ULL << (nindirshift * j));
1221 
1222 loop:
1223 	/*
1224 	 * Determine how many levels of indirection.
1225 	 */
1226 	shft = 0;				/* sh = 1 */
1227 	tbn = lbn - NDADDR;
1228 	for (j = NIADDR; j > 0; j--) {
1229 		longlong_t sh;
1230 
1231 		shft += nindirshift;		/* sh *= nindir */
1232 		sh = 1LL << shft;
1233 		if (tbn < sh)
1234 			break;
1235 		tbn -= sh;
1236 	}
1237 	if (j == 0) {
1238 		/* must have passed end of file */
1239 		ASSERT(((u_offset_t)lbn << fs->fs_bshift) >= isz);
1240 		goto out;
1241 	}
1242 
1243 	/*
1244 	 * Fetch the first indirect block.
1245 	 */
1246 	nb = ip->i_ib[NIADDR - j];
1247 	if (nb == 0) {
1248 		if (hole) {
1249 			lbn = limits[NIADDR - j];
1250 			goto out;
1251 		} else {
1252 			lbn = limits[NIADDR - j + 1];
1253 			if ((u_offset_t)lbn << fs->fs_bshift >= isz)
1254 				goto out;
1255 			goto loop;
1256 		}
1257 	}
1258 
1259 	/*
1260 	 * Fetch through the indirect blocks.
1261 	 */
1262 	for (; ((j <= NIADDR) && (nb != 0)); j++) {
1263 		ob = nb;
1264 		/*
1265 		 * if there's a different block at this level then release
1266 		 * the old one and in with the new.
1267 		 */
1268 		if ((bp[j-1] == NULL) || bp[j-1]->b_blkno != fsbtodb(fs, ob)) {
1269 			if (bp[j-1] != NULL)
1270 				brelse(bp[j-1]);
1271 			bp[j-1] = UFS_BREAD(ufsvfsp, dev, fsbtodb(fs, ob), bs);
1272 			if (bp[j-1]->b_flags & B_ERROR) {
1273 				error = EIO;
1274 				goto out;
1275 			}
1276 		}
1277 		bap = bp[j-1]->b_un.b_daddr;
1278 
1279 		shft -= nindirshift;		/* sh / nindir */
1280 		i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
1281 		nb = bap[i];
1282 		skip = 1LL << (nindirshift * (NIADDR - j));
1283 	}
1284 
1285 	/*
1286 	 * Scan through the blocks in this array.
1287 	 */
1288 	for (; i < nindir; i++, lbn += skip) {
1289 		if (hole && (bap[i] == 0))
1290 			goto out;
1291 		if (!hole && (bap[i] != 0)) {
1292 			if (skip == 1) {
1293 				/* we're at the lowest level */
1294 				goto out;
1295 			} else {
1296 				goto loop;
1297 			}
1298 		}
1299 	}
1300 	if (((u_offset_t)lbn << fs->fs_bshift) < isz)
1301 		goto loop;
1302 out:
1303 	for (i = 0; i < NIADDR; i++) {
1304 		if (bp[i])
1305 			brelse(bp[i]);
1306 	}
1307 	if (error == 0) {
1308 		if (((u_offset_t)lbn << fs->fs_bshift) >= isz) {
1309 			error = ENXIO;
1310 		} else {
1311 			/* success */
1312 			*off = (u_offset_t)lbn << fs->fs_bshift;
1313 		}
1314 	}
1315 	return (error);
1316 }
1317 
1318 /*
1319  * Set a particular offset in the inode list to be a certain block.
1320  * User is responsible for calling TRANS* functions
1321  */
1322 int
1323 bmap_set_bn(struct vnode *vp, u_offset_t off, daddr32_t bn)
1324 {
1325 	daddr_t lbn;
1326 	struct inode *ip;
1327 	ufsvfs_t *ufsvfsp;
1328 	struct	fs *fs;
1329 	struct	buf *bp;
1330 	int	i, j;
1331 	int	shft;			/* we maintain sh = 1 << shft */
1332 	int err;
1333 	daddr_t	ob, nb, tbn;
1334 	daddr32_t *bap;
1335 	int	nindirshift, nindiroffset;
1336 
1337 	ip = VTOI(vp);
1338 	ufsvfsp = ip->i_ufsvfs;
1339 	fs = ufsvfsp->vfs_fs;
1340 	lbn = (daddr_t)lblkno(fs, off);
1341 
1342 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
1343 
1344 	if (lbn < 0)
1345 		return (EFBIG);
1346 
1347 	/*
1348 	 * Take care of direct block assignment
1349 	 */
1350 	if (lbn < NDADDR) {
1351 		ip->i_db[lbn] = bn;
1352 		return (0);
1353 	}
1354 
1355 	nindirshift = ip->i_ufsvfs->vfs_nindirshift;
1356 	nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
1357 	/*
1358 	 * Determine how many levels of indirection.
1359 	 */
1360 	shft = 0;				/* sh = 1 */
1361 	tbn = lbn - NDADDR;
1362 	for (j = NIADDR; j > 0; j--) {
1363 		longlong_t	sh;
1364 
1365 		shft += nindirshift;		/* sh *= nindir */
1366 		sh = 1LL << shft;
1367 		if (tbn < sh)
1368 			break;
1369 		tbn -= sh;
1370 	}
1371 	if (j == 0)
1372 		return (EFBIG);
1373 
1374 	/*
1375 	 * Fetch the first indirect block.
1376 	 */
1377 	nb = ip->i_ib[NIADDR - j];
1378 	if (nb == 0) {
1379 		err = ufs_fault(ITOV(ip), "ufs_set_bn: nb == UFS_HOLE");
1380 		return (err);
1381 	}
1382 
1383 	/*
1384 	 * Fetch through the indirect blocks.
1385 	 */
1386 	for (; j <= NIADDR; j++) {
1387 		ob = nb;
1388 		bp = UFS_BREAD(ufsvfsp,
1389 		    ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize);
1390 		if (bp->b_flags & B_ERROR) {
1391 			err = geterror(bp);
1392 			brelse(bp);
1393 			return (err);
1394 		}
1395 		bap = bp->b_un.b_daddr;
1396 
1397 		ASSERT(!ufs_indir_badblock(ip, bap));
1398 
1399 		shft -= nindirshift;		/* sh / nindir */
1400 		i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
1401 
1402 		nb = bap[i];
1403 		if (nb == 0) {
1404 			err = ufs_fault(ITOV(ip), "ufs_set_bn: nb == UFS_HOLE");
1405 			return (err);
1406 		}
1407 
1408 		if (j == NIADDR) {
1409 			bap[i] = bn;
1410 			bdrwrite(bp);
1411 			return (0);
1412 		}
1413 
1414 		brelse(bp);
1415 	}
1416 	return (0);
1417 }
1418