xref: /illumos-gate/usr/src/uts/common/fs/ufs/ufs_alloc.c (revision 80d3443290aca22ad7fb6c18568d19d37517ebbf)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * University Copyright- Copyright (c) 1982, 1986, 1988
31  * The Regents of the University of California
32  * All Rights Reserved
33  *
34  * University Acknowledgment- Portions of this document are derived from
35  * software developed by the University of California, Berkeley, and its
36  * contributors.
37  */
38 
39 
40 #pragma ident	"%Z%%M%	%I%	%E% SMI"
41 
42 #include <sys/condvar_impl.h>
43 #include <sys/types.h>
44 #include <sys/t_lock.h>
45 #include <sys/debug.h>
46 #include <sys/param.h>
47 #include <sys/systm.h>
48 #include <sys/signal.h>
49 #include <sys/cred.h>
50 #include <sys/proc.h>
51 #include <sys/disp.h>
52 #include <sys/user.h>
53 #include <sys/buf.h>
54 #include <sys/vfs.h>
55 #include <sys/vnode.h>
56 #include <sys/acl.h>
57 #include <sys/fs/ufs_fs.h>
58 #include <sys/fs/ufs_inode.h>
59 #include <sys/fs/ufs_acl.h>
60 #include <sys/fs/ufs_bio.h>
61 #include <sys/fs/ufs_quota.h>
62 #include <sys/kmem.h>
63 #include <sys/fs/ufs_trans.h>
64 #include <sys/fs/ufs_panic.h>
65 #include <sys/errno.h>
66 #include <sys/time.h>
67 #include <sys/sysmacros.h>
68 #include <sys/file.h>
69 #include <sys/fcntl.h>
70 #include <sys/flock.h>
71 #include <fs/fs_subr.h>
72 #include <sys/cmn_err.h>
73 #include <sys/policy.h>
74 
75 static ino_t	hashalloc();
76 static daddr_t	fragextend();
77 static daddr_t	alloccg();
78 static daddr_t	alloccgblk();
79 static ino_t	ialloccg();
80 static daddr_t	mapsearch();
81 
82 extern int	inside[], around[];
83 extern uchar_t	*fragtbl[];
84 void delay();
85 
86 /*
87  * Allocate a block in the file system.
88  *
89  * The size of the requested block is given, which must be some
90  * multiple of fs_fsize and <= fs_bsize.
91  * A preference may be optionally specified. If a preference is given
92  * the following hierarchy is used to allocate a block:
93  *   1) allocate the requested block.
94  *   2) allocate a rotationally optimal block in the same cylinder.
95  *   3) allocate a block in the same cylinder group.
96  *   4) quadratically rehash into other cylinder groups, until an
97  *	available block is located.
98  * If no block preference is given the following hierarchy is used
99  * to allocate a block:
100  *   1) allocate a block in the cylinder group that contains the
101  *	inode for the file.
102  *   2) quadratically rehash into other cylinder groups, until an
103  *	available block is located.
104  */
105 int
106 alloc(struct inode *ip, daddr_t bpref, int size, daddr_t *bnp, cred_t *cr)
107 {
108 	struct fs *fs;
109 	struct ufsvfs *ufsvfsp;
110 	daddr_t bno;
111 	int cg;
112 	int err;
113 	char *errmsg = NULL;
114 	size_t len;
115 
116 	ufsvfsp = ip->i_ufsvfs;
117 	fs = ufsvfsp->vfs_fs;
118 	if ((unsigned)size > fs->fs_bsize || fragoff(fs, size) != 0) {
119 		err = ufs_fault(ITOV(ip), "alloc: bad size, dev = 0x%lx,"
120 		    " bsize = %d, size = %d, fs = %s\n",
121 		    ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
122 		return (err);
123 	}
124 	if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
125 		goto nospace;
126 	if (freespace(fs, ufsvfsp) <= 0 &&
127 	    secpolicy_fs_minfree(cr, ufsvfsp->vfs_vfs) != 0)
128 		goto nospace;
129 	err = chkdq(ip, (long)btodb(size), 0, cr, &errmsg, &len);
130 	/* Note that may not have err, but may have errmsg */
131 	if (errmsg != NULL) {
132 		uprintf(errmsg);
133 		kmem_free(errmsg, len);
134 		errmsg = NULL;
135 	}
136 	if (err)
137 		return (err);
138 	if (bpref >= fs->fs_size)
139 		bpref = 0;
140 	if (bpref == 0)
141 		cg = (int)itog(fs, ip->i_number);
142 	else
143 		cg = dtog(fs, bpref);
144 
145 	bno = (daddr_t)hashalloc(ip, cg, (long)bpref, size,
146 	    (ulong_t (*)())alloccg);
147 	if (bno > 0) {
148 		*bnp = bno;
149 		return (0);
150 	}
151 
152 	/*
153 	 * hashalloc() failed because some other thread grabbed
154 	 * the last block so unwind the quota operation.  We can
155 	 * ignore the return because subtractions don't fail and
156 	 * size is guaranteed to be >= zero by our caller.
157 	 */
158 	(void) chkdq(ip, -(long)btodb(size), 0, cr, (char **)NULL,
159 	    (size_t *)NULL);
160 
161 nospace:
162 	mutex_enter(&ufsvfsp->vfs_lock);
163 	if ((lbolt - ufsvfsp->vfs_lastwhinetime) > (hz << 2) &&
164 	    (!(TRANS_ISTRANS(ufsvfsp)) || !(ip->i_flag & IQUIET))) {
165 		ufsvfsp->vfs_lastwhinetime = lbolt;
166 		cmn_err(CE_NOTE, "alloc: %s: file system full", fs->fs_fsmnt);
167 	}
168 	mutex_exit(&ufsvfsp->vfs_lock);
169 	return (ENOSPC);
170 }
171 
172 /*
173  * Reallocate a fragment to a bigger size
174  *
175  * The number and size of the old block is given, and a preference
176  * and new size is also specified.  The allocator attempts to extend
177  * the original block.  Failing that, the regular block allocator is
178  * invoked to get an appropriate block.
179  */
180 int
181 realloccg(struct inode *ip, daddr_t bprev, daddr_t bpref, int osize,
182     int nsize, daddr_t *bnp, cred_t *cr)
183 {
184 	daddr_t bno;
185 	struct fs *fs;
186 	struct ufsvfs *ufsvfsp;
187 	int cg, request;
188 	int err;
189 	char *errmsg = NULL;
190 	size_t len;
191 
192 	ufsvfsp = ip->i_ufsvfs;
193 	fs = ufsvfsp->vfs_fs;
194 	if ((unsigned)osize > fs->fs_bsize || fragoff(fs, osize) != 0 ||
195 	    (unsigned)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) {
196 		err = ufs_fault(ITOV(ip),
197 		    "realloccg: bad size, dev=0x%lx, bsize=%d, "
198 		    "osize=%d, nsize=%d, fs=%s\n",
199 		    ip->i_dev, fs->fs_bsize, osize, nsize, fs->fs_fsmnt);
200 		return (err);
201 	}
202 	if (freespace(fs, ufsvfsp) <= 0 &&
203 	    secpolicy_fs_minfree(cr, ufsvfsp->vfs_vfs) != 0)
204 		goto nospace;
205 	if (bprev == 0) {
206 		err = ufs_fault(ITOV(ip),
207 		    "realloccg: bad bprev, dev = 0x%lx, bsize = %d,"
208 		    " bprev = %ld, fs = %s\n", ip->i_dev, fs->fs_bsize, bprev,
209 		    fs->fs_fsmnt);
210 		return (err);
211 	}
212 	err = chkdq(ip, (long)btodb(nsize - osize), 0, cr, &errmsg, &len);
213 	/* Note that may not have err, but may have errmsg */
214 	if (errmsg != NULL) {
215 		uprintf(errmsg);
216 		kmem_free(errmsg, len);
217 		errmsg = NULL;
218 	}
219 	if (err)
220 		return (err);
221 	cg = dtog(fs, bprev);
222 	bno = fragextend(ip, cg, (long)bprev, osize, nsize);
223 	if (bno != 0) {
224 		*bnp = bno;
225 		return (0);
226 	}
227 	if (bpref >= fs->fs_size)
228 		bpref = 0;
229 
230 	/*
231 	 * When optimizing for time we allocate a full block and
232 	 * then only use the upper portion for this request. When
233 	 * this file grows again it will grow into the unused portion
234 	 * of the block (See fragextend() above).  This saves time
235 	 * because an extra disk write would be needed if the frags
236 	 * following the current allocation were not free. The extra
237 	 * disk write is needed to move the data from its current
238 	 * location into the newly allocated position.
239 	 *
240 	 * When optimizing for space we allocate a run of frags
241 	 * that is just the right size for this request.
242 	 */
243 	request = (fs->fs_optim == FS_OPTTIME) ? fs->fs_bsize : nsize;
244 	bno = (daddr_t)hashalloc(ip, cg, (long)bpref, request,
245 	    (ulong_t (*)())alloccg);
246 	if (bno > 0) {
247 		*bnp = bno;
248 		if (nsize < request)
249 			(void) free(ip, bno + numfrags(fs, nsize),
250 			    (off_t)(request - nsize), I_NOCANCEL);
251 		return (0);
252 	}
253 
254 	/*
255 	 * hashalloc() failed because some other thread grabbed
256 	 * the last block so unwind the quota operation.  We can
257 	 * ignore the return because subtractions don't fail, and
258 	 * our caller guarantees nsize >= osize.
259 	 */
260 	(void) chkdq(ip, -(long)btodb(nsize - osize), 0, cr, (char **)NULL,
261 	    (size_t *)NULL);
262 
263 nospace:
264 	mutex_enter(&ufsvfsp->vfs_lock);
265 	if ((lbolt - ufsvfsp->vfs_lastwhinetime) > (hz << 2) &&
266 	    (!(TRANS_ISTRANS(ufsvfsp)) || !(ip->i_flag & IQUIET))) {
267 		ufsvfsp->vfs_lastwhinetime = lbolt;
268 		cmn_err(CE_NOTE,
269 		    "realloccg %s: file system full", fs->fs_fsmnt);
270 	}
271 	mutex_exit(&ufsvfsp->vfs_lock);
272 	return (ENOSPC);
273 }
274 
275 /*
276  * Allocate an inode in the file system.
277  *
278  * A preference may be optionally specified. If a preference is given
279  * the following hierarchy is used to allocate an inode:
280  *   1) allocate the requested inode.
281  *   2) allocate an inode in the same cylinder group.
282  *   3) quadratically rehash into other cylinder groups, until an
283  *	available inode is located.
284  * If no inode preference is given the following hierarchy is used
285  * to allocate an inode:
286  *   1) allocate an inode in cylinder group 0.
287  *   2) quadratically rehash into other cylinder groups, until an
288  *	available inode is located.
289  */
290 int
291 ufs_ialloc(struct inode *pip,
292     ino_t ipref, mode_t mode, struct inode **ipp, cred_t *cr)
293 {
294 	struct inode *ip;
295 	struct fs *fs;
296 	int cg;
297 	ino_t ino;
298 	int err;
299 	int nifree;
300 	struct ufsvfs *ufsvfsp = pip->i_ufsvfs;
301 	char *errmsg = NULL;
302 	size_t len;
303 
304 	ASSERT(RW_WRITE_HELD(&pip->i_rwlock));
305 	fs = pip->i_fs;
306 loop:
307 	nifree = fs->fs_cstotal.cs_nifree;
308 
309 	if (nifree == 0)
310 		goto noinodes;
311 	/*
312 	 * Shadow inodes don't count against a user's inode allocation.
313 	 * They are an implementation method and not a resource.
314 	 */
315 	if ((mode != IFSHAD) && (mode != IFATTRDIR)) {
316 		err = chkiq((struct ufsvfs *)ITOV(pip)->v_vfsp->vfs_data,
317 		    /* change */ 1, (struct inode *)NULL, crgetuid(cr), 0,
318 		    cr, &errmsg, &len);
319 		/*
320 		 * As we haven't acquired any locks yet, dump the message
321 		 * now.
322 		 */
323 		if (errmsg != NULL) {
324 			uprintf(errmsg);
325 			kmem_free(errmsg, len);
326 			errmsg = NULL;
327 		}
328 		if (err)
329 			return (err);
330 	}
331 
332 	if (ipref >= (ulong_t)(fs->fs_ncg * fs->fs_ipg))
333 		ipref = 0;
334 	cg = (int)itog(fs, ipref);
335 	ino = (ino_t)hashalloc(pip, cg, (long)ipref, (int)mode,
336 	    (ulong_t (*)())ialloccg);
337 	if (ino == 0) {
338 		if ((mode != IFSHAD) && (mode != IFATTRDIR)) {
339 			/*
340 			 * We can safely ignore the return from chkiq()
341 			 * because deallocations can only fail if we
342 			 * can't get the user's quota info record off
343 			 * the disk due to an I/O error.  In that case,
344 			 * the quota subsystem is already messed up.
345 			 */
346 			(void) chkiq(ufsvfsp, /* change */ -1,
347 			    (struct inode *)NULL, crgetuid(cr), 0, cr,
348 			    (char **)NULL, (size_t *)NULL);
349 		}
350 		goto noinodes;
351 	}
352 	err = ufs_iget(pip->i_vfs, ino, ipp, cr);
353 	if (err) {
354 		if ((mode != IFSHAD) && (mode != IFATTRDIR)) {
355 			/*
356 			 * See above comment about why it is safe to ignore an
357 			 * error return here.
358 			 */
359 			(void) chkiq(ufsvfsp, /* change */ -1,
360 			    (struct inode *)NULL, crgetuid(cr), 0, cr,
361 			    (char **)NULL, (size_t *)NULL);
362 		}
363 		ufs_ifree(pip, ino, 0);
364 		return (err);
365 	}
366 	ip = *ipp;
367 	ASSERT(!ip->i_ufs_acl);
368 	ASSERT(!ip->i_dquot);
369 	rw_enter(&ip->i_contents, RW_WRITER);
370 
371 	/*
372 	 * Check if we really got a free inode, if not then complain
373 	 * and mark the inode ISTALE so that it will be freed by the
374 	 * ufs idle thread eventually and will not be sent to ufs_delete().
375 	 */
376 	if (ip->i_mode || (ip->i_nlink > 0)) {
377 		ip->i_flag |= ISTALE;
378 		rw_exit(&ip->i_contents);
379 		VN_RELE(ITOV(ip));
380 		cmn_err(CE_WARN,
381 		    "%s: unexpected allocated inode %d, run fsck(1M)%s",
382 		    fs->fs_fsmnt, (int)ino,
383 		    (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
384 		goto loop;
385 	}
386 
387 	/*
388 	 * Check the inode has no size or data blocks.
389 	 * This could have happened if the truncation failed when
390 	 * deleting the inode. It used to be possible for this to occur
391 	 * if a block allocation failed when iteratively truncating a
392 	 * large file using logging and with a full file system.
393 	 * This was fixed with bug fix 4348738. However, truncation may
394 	 * still fail on an IO error. So in all cases for safety and
395 	 * security we clear out the size; the blocks allocated; and
396 	 * pointers to the blocks. This will ultimately cause a fsck
397 	 * error of un-accounted for blocks, but its a fairly benign error,
398 	 * and possibly the correct thing to do anyway as accesssing those
399 	 * blocks agains may lead to more IO errors.
400 	 */
401 	if (ip->i_size || ip->i_blocks) {
402 		int i;
403 
404 		if (ip->i_size) {
405 			cmn_err(CE_WARN,
406 			    "%s: free inode %d had size 0x%llx, run fsck(1M)%s",
407 			    fs->fs_fsmnt, (int)ino, ip->i_size,
408 			    (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
409 		}
410 		/*
411 		 * Clear any garbage left behind.
412 		 */
413 		ip->i_size = (u_offset_t)0;
414 		ip->i_blocks = 0;
415 		for (i = 0; i < NDADDR; i++)
416 			ip->i_db[i] = 0;
417 		for (i = 0; i < NIADDR; i++)
418 			ip->i_ib[i] = 0;
419 	}
420 
421 	/*
422 	 * Initialize the link count
423 	 */
424 	ip->i_nlink = 0;
425 
426 	/*
427 	 * Clear the old flags
428 	 */
429 	ip->i_flag &= IREF;
430 
431 	/*
432 	 * Access times are not really defined if the fs is mounted
433 	 * with 'noatime'. But it can cause nfs clients to fail
434 	 * open() if the atime is not a legal value. Set a legal value
435 	 * here when the inode is allocated.
436 	 */
437 	if (ufsvfsp->vfs_noatime) {
438 		mutex_enter(&ufs_iuniqtime_lock);
439 		ip->i_atime = iuniqtime;
440 		mutex_exit(&ufs_iuniqtime_lock);
441 	}
442 	rw_exit(&ip->i_contents);
443 	return (0);
444 noinodes:
445 	if (!(TRANS_ISTRANS(ufsvfsp)) || !(pip->i_flag & IQUIET))
446 		cmn_err(CE_NOTE, "%s: out of inodes\n", fs->fs_fsmnt);
447 	return (ENOSPC);
448 }
449 
450 /*
451  * Find a cylinder group to place a directory.
452  * Returns an inumber within the selected cylinder group.
453  * Note, the vfs_lock is not needed as we don't require exact cg summary info.
454  *
455  * If the switch ufs_close_dirs is set, then the policy is to use
456  * the current cg if it has more than 25% free inodes and more
457  * than 25% free blocks. Otherwise the cgs are searched from
458  * the beginning and the first cg with the same criteria is
459  * used. If that is also null then we revert to the old algorithm.
460  * This tends to cluster files at the beginning of the disk
461  * until the disk gets full.
462  *
463  * Otherwise if ufs_close_dirs is not set then the original policy is
464  * used which is to select from among those cylinder groups with
465  * above the average number of free inodes, the one with the smallest
466  * number of directories.
467  */
468 
469 int ufs_close_dirs = 1;	/* allocate directories close as possible */
470 
471 ino_t
472 dirpref(inode_t *dp)
473 {
474 	int cg, minndir, mincg, avgifree, mininode, minbpg, ifree;
475 	struct fs *fs = dp->i_fs;
476 
477 	cg = itog(fs, dp->i_number);
478 	mininode = fs->fs_ipg >> 2;
479 	minbpg = fs->fs_maxbpg >> 2;
480 	if (ufs_close_dirs &&
481 	    (fs->fs_cs(fs, cg).cs_nifree > mininode) &&
482 	    (fs->fs_cs(fs, cg).cs_nbfree > minbpg)) {
483 		return (dp->i_number);
484 	}
485 
486 	avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
487 	minndir = fs->fs_ipg;
488 	mincg = 0;
489 	for (cg = 0; cg < fs->fs_ncg; cg++) {
490 		ifree = fs->fs_cs(fs, cg).cs_nifree;
491 		if (ufs_close_dirs &&
492 		    (ifree > mininode) &&
493 		    (fs->fs_cs(fs, cg).cs_nbfree > minbpg)) {
494 			return ((ino_t)(fs->fs_ipg * cg));
495 		}
496 		if ((fs->fs_cs(fs, cg).cs_ndir < minndir) &&
497 		    (ifree >= avgifree)) {
498 			mincg = cg;
499 			minndir = fs->fs_cs(fs, cg).cs_ndir;
500 		}
501 	}
502 	return ((ino_t)(fs->fs_ipg * mincg));
503 }
504 
505 /*
506  * Select the desired position for the next block in a file.  The file is
507  * logically divided into sections. The first section is composed of the
508  * direct blocks. Each additional section contains fs_maxbpg blocks.
509  *
510  * If no blocks have been allocated in the first section, the policy is to
511  * request a block in the same cylinder group as the inode that describes
512  * the file. If no blocks have been allocated in any other section, the
513  * policy is to place the section in a cylinder group with a greater than
514  * average number of free blocks.  An appropriate cylinder group is found
515  * by using a rotor that sweeps the cylinder groups. When a new group of
516  * blocks is needed, the sweep begins in the cylinder group following the
517  * cylinder group from which the previous allocation was made. The sweep
518  * continues until a cylinder group with greater than the average number
519  * of free blocks is found. If the allocation is for the first block in an
520  * indirect block, the information on the previous allocation is unavailable;
521  * here a best guess is made based upon the logical block number being
522  * allocated.
523  *
524  * If a section is already partially allocated, the policy is to
525  * contiguously allocate fs_maxcontig blocks.  The end of one of these
526  * contiguous blocks and the beginning of the next is physically separated
527  * so that the disk head will be in transit between them for at least
528  * fs_rotdelay milliseconds.  This is to allow time for the processor to
529  * schedule another I/O transfer.
530  */
531 daddr_t
532 blkpref(struct inode *ip, daddr_t lbn, int indx, daddr32_t *bap)
533 {
534 	struct fs *fs;
535 	struct ufsvfs *ufsvfsp;
536 	int cg;
537 	int avgbfree, startcg;
538 	daddr_t nextblk;
539 
540 	ufsvfsp = ip->i_ufsvfs;
541 	fs = ip->i_fs;
542 	if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
543 		if (lbn < NDADDR) {
544 			cg = itog(fs, ip->i_number);
545 			return (fs->fs_fpg * cg + fs->fs_frag);
546 		}
547 		/*
548 		 * Find a cylinder with greater than average
549 		 * number of unused data blocks.
550 		 */
551 		if (indx == 0 || bap[indx - 1] == 0)
552 			startcg = itog(fs, ip->i_number) + lbn / fs->fs_maxbpg;
553 		else
554 			startcg = dtog(fs, bap[indx - 1]) + 1;
555 		startcg %= fs->fs_ncg;
556 
557 		mutex_enter(&ufsvfsp->vfs_lock);
558 		avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
559 		/*
560 		 * used for computing log space for writes/truncs
561 		 */
562 		ufsvfsp->vfs_avgbfree = avgbfree;
563 		for (cg = startcg; cg < fs->fs_ncg; cg++)
564 			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
565 				fs->fs_cgrotor = cg;
566 				mutex_exit(&ufsvfsp->vfs_lock);
567 				return (fs->fs_fpg * cg + fs->fs_frag);
568 			}
569 		for (cg = 0; cg <= startcg; cg++)
570 			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
571 				fs->fs_cgrotor = cg;
572 				mutex_exit(&ufsvfsp->vfs_lock);
573 				return (fs->fs_fpg * cg + fs->fs_frag);
574 			}
575 		mutex_exit(&ufsvfsp->vfs_lock);
576 		return (NULL);
577 	}
578 	/*
579 	 * One or more previous blocks have been laid out. If less
580 	 * than fs_maxcontig previous blocks are contiguous, the
581 	 * next block is requested contiguously, otherwise it is
582 	 * requested rotationally delayed by fs_rotdelay milliseconds.
583 	 */
584 
585 	nextblk = bap[indx - 1];
586 	/*
587 	 * Provision for fallocate to return positive
588 	 * blk preference based on last allocation
589 	 */
590 	if (nextblk < 0 && nextblk != UFS_HOLE) {
591 		nextblk = (-bap[indx - 1]) + fs->fs_frag;
592 	} else {
593 		nextblk = bap[indx - 1] + fs->fs_frag;
594 	}
595 
596 	if (indx > fs->fs_maxcontig && bap[indx - fs->fs_maxcontig] +
597 	    blkstofrags(fs, fs->fs_maxcontig) != nextblk) {
598 		return (nextblk);
599 	}
600 	if (fs->fs_rotdelay != 0)
601 		/*
602 		 * Here we convert ms of delay to frags as:
603 		 * (frags) = (ms) * (rev/sec) * (sect/rev) /
604 		 * 	((sect/frag) * (ms/sec))
605 		 * then round up to the next block.
606 		 */
607 		nextblk += roundup(fs->fs_rotdelay * fs->fs_rps * fs->fs_nsect /
608 		    (NSPF(fs) * 1000), fs->fs_frag);
609 	return (nextblk);
610 }
611 
612 /*
613  * Free a block or fragment.
614  *
615  * The specified block or fragment is placed back in the
616  * free map. If a fragment is deallocated, a possible
617  * block reassembly is checked.
618  */
619 void
620 free(struct inode *ip, daddr_t bno, off_t size, int flags)
621 {
622 	struct fs *fs = ip->i_fs;
623 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
624 	struct ufs_q *delq = &ufsvfsp->vfs_delete;
625 	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
626 	struct cg *cgp;
627 	struct buf *bp;
628 	int cg, bmap, bbase;
629 	int i;
630 	uchar_t *blksfree;
631 	int *blktot;
632 	short *blks;
633 	daddr_t blkno, cylno, rpos;
634 
635 	/*
636 	 * fallocate'd files will have negative block address.
637 	 * So negate it again to get original block address.
638 	 */
639 	if (bno < 0 && (bno % fs->fs_frag == 0) && bno != UFS_HOLE) {
640 		bno = -bno;
641 	}
642 
643 	if ((unsigned long)size > fs->fs_bsize || fragoff(fs, size) != 0) {
644 		(void) ufs_fault(ITOV(ip),
645 		    "free: bad size, dev = 0x%lx, bsize = %d, size = %d, "
646 		    "fs = %s\n", ip->i_dev, fs->fs_bsize,
647 		    (int)size, fs->fs_fsmnt);
648 		return;
649 	}
650 	cg = dtog(fs, bno);
651 	ASSERT(!ufs_badblock(ip, bno));
652 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
653 	    (int)fs->fs_cgsize);
654 
655 	cgp = bp->b_un.b_cg;
656 	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
657 		brelse(bp);
658 		return;
659 	}
660 
661 	if (!(flags & I_NOCANCEL))
662 		TRANS_CANCEL(ufsvfsp, ldbtob(fsbtodb(fs, bno)), size, flags);
663 	if (flags & (I_DIR|I_IBLK|I_SHAD|I_QUOTA)) {
664 		TRANS_MATA_FREE(ufsvfsp, ldbtob(fsbtodb(fs, bno)), size);
665 	}
666 	blksfree = cg_blksfree(cgp);
667 	blktot = cg_blktot(cgp);
668 	mutex_enter(&ufsvfsp->vfs_lock);
669 	cgp->cg_time = gethrestime_sec();
670 	bno = dtogd(fs, bno);
671 	if (size == fs->fs_bsize) {
672 		blkno = fragstoblks(fs, bno);
673 		cylno = cbtocylno(fs, bno);
674 		rpos = cbtorpos(ufsvfsp, bno);
675 		blks = cg_blks(ufsvfsp, cgp, cylno);
676 		if (!isclrblock(fs, blksfree, blkno)) {
677 			mutex_exit(&ufsvfsp->vfs_lock);
678 			brelse(bp);
679 			(void) ufs_fault(ITOV(ip), "free: freeing free block, "
680 			    "dev:0x%lx, block:%ld, ino:%lu, fs:%s",
681 			    ip->i_dev, bno, ip->i_number, fs->fs_fsmnt);
682 			return;
683 		}
684 		setblock(fs, blksfree, blkno);
685 		blks[rpos]++;
686 		blktot[cylno]++;
687 		cgp->cg_cs.cs_nbfree++;		/* Log below */
688 		fs->fs_cstotal.cs_nbfree++;
689 		fs->fs_cs(fs, cg).cs_nbfree++;
690 		if (TRANS_ISTRANS(ufsvfsp) && (flags & I_ACCT)) {
691 			mutex_enter(&delq->uq_mutex);
692 			delq_info->delq_unreclaimed_blocks -=
693 			    btodb(fs->fs_bsize);
694 			mutex_exit(&delq->uq_mutex);
695 		}
696 	} else {
697 		bbase = bno - fragnum(fs, bno);
698 		/*
699 		 * Decrement the counts associated with the old frags
700 		 */
701 		bmap = blkmap(fs, blksfree, bbase);
702 		fragacct(fs, bmap, cgp->cg_frsum, -1);
703 		/*
704 		 * Deallocate the fragment
705 		 */
706 		for (i = 0; i < numfrags(fs, size); i++) {
707 			if (isset(blksfree, bno + i)) {
708 				brelse(bp);
709 				mutex_exit(&ufsvfsp->vfs_lock);
710 				(void) ufs_fault(ITOV(ip),
711 				    "free: freeing free frag, "
712 				    "dev:0x%lx, blk:%ld, cg:%d, "
713 				    "ino:%lu, fs:%s",
714 				    ip->i_dev,
715 				    bno + i,
716 				    cgp->cg_cgx,
717 				    ip->i_number,
718 				    fs->fs_fsmnt);
719 				return;
720 			}
721 			setbit(blksfree, bno + i);
722 		}
723 		cgp->cg_cs.cs_nffree += i;
724 		fs->fs_cstotal.cs_nffree += i;
725 		fs->fs_cs(fs, cg).cs_nffree += i;
726 		if (TRANS_ISTRANS(ufsvfsp) && (flags & I_ACCT)) {
727 			mutex_enter(&delq->uq_mutex);
728 			delq_info->delq_unreclaimed_blocks -=
729 			    btodb(i * fs->fs_fsize);
730 			mutex_exit(&delq->uq_mutex);
731 		}
732 		/*
733 		 * Add back in counts associated with the new frags
734 		 */
735 		bmap = blkmap(fs, blksfree, bbase);
736 		fragacct(fs, bmap, cgp->cg_frsum, 1);
737 		/*
738 		 * If a complete block has been reassembled, account for it
739 		 */
740 		blkno = fragstoblks(fs, bbase);
741 		if (isblock(fs, blksfree, blkno)) {
742 			cylno = cbtocylno(fs, bbase);
743 			rpos = cbtorpos(ufsvfsp, bbase);
744 			blks = cg_blks(ufsvfsp, cgp, cylno);
745 			blks[rpos]++;
746 			blktot[cylno]++;
747 			cgp->cg_cs.cs_nffree -= fs->fs_frag;
748 			fs->fs_cstotal.cs_nffree -= fs->fs_frag;
749 			fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
750 			cgp->cg_cs.cs_nbfree++;
751 			fs->fs_cstotal.cs_nbfree++;
752 			fs->fs_cs(fs, cg).cs_nbfree++;
753 		}
754 	}
755 	fs->fs_fmod = 1;
756 	ufs_notclean(ufsvfsp);
757 	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
758 	TRANS_SI(ufsvfsp, fs, cg);
759 	bdrwrite(bp);
760 }
761 
762 /*
763  * Free an inode.
764  *
765  * The specified inode is placed back in the free map.
766  */
767 void
768 ufs_ifree(struct inode *ip, ino_t ino, mode_t mode)
769 {
770 	struct fs *fs = ip->i_fs;
771 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
772 	struct cg *cgp;
773 	struct buf *bp;
774 	unsigned int inot;
775 	int cg;
776 	char *iused;
777 
778 	if (ip->i_number == ino && ip->i_mode != 0) {
779 		(void) ufs_fault(ITOV(ip),
780 		    "ufs_ifree: illegal mode: (imode) %o, (omode) %o, ino %d, "
781 		    "fs = %s\n",
782 		    ip->i_mode, mode, (int)ip->i_number, fs->fs_fsmnt);
783 		return;
784 	}
785 	if (ino >= fs->fs_ipg * fs->fs_ncg) {
786 		(void) ufs_fault(ITOV(ip),
787 		    "ifree: range, dev = 0x%x, ino = %d, fs = %s\n",
788 		    (int)ip->i_dev, (int)ino, fs->fs_fsmnt);
789 		return;
790 	}
791 	cg = (int)itog(fs, ino);
792 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
793 	    (int)fs->fs_cgsize);
794 
795 	cgp = bp->b_un.b_cg;
796 	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
797 		brelse(bp);
798 		return;
799 	}
800 	mutex_enter(&ufsvfsp->vfs_lock);
801 	cgp->cg_time = gethrestime_sec();
802 	iused = cg_inosused(cgp);
803 	inot = (unsigned int)(ino % (ulong_t)fs->fs_ipg);
804 	if (isclr(iused, inot)) {
805 		mutex_exit(&ufsvfsp->vfs_lock);
806 		brelse(bp);
807 		(void) ufs_fault(ITOV(ip), "ufs_ifree: freeing free inode, "
808 		    "mode: (imode) %o, (omode) %o, ino:%d, "
809 		    "fs:%s",
810 		    ip->i_mode, mode, (int)ino, fs->fs_fsmnt);
811 		return;
812 	}
813 	clrbit(iused, inot);
814 
815 	if (inot < (ulong_t)cgp->cg_irotor)
816 		cgp->cg_irotor = inot;
817 	cgp->cg_cs.cs_nifree++;
818 	fs->fs_cstotal.cs_nifree++;
819 	fs->fs_cs(fs, cg).cs_nifree++;
820 	if (((mode & IFMT) == IFDIR) || ((mode & IFMT) == IFATTRDIR)) {
821 		cgp->cg_cs.cs_ndir--;
822 		fs->fs_cstotal.cs_ndir--;
823 		fs->fs_cs(fs, cg).cs_ndir--;
824 	}
825 	fs->fs_fmod = 1;
826 	ufs_notclean(ufsvfsp);
827 	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
828 	TRANS_SI(ufsvfsp, fs, cg);
829 	bdrwrite(bp);
830 }
831 
832 /*
833  * Implement the cylinder overflow algorithm.
834  *
835  * The policy implemented by this algorithm is:
836  *   1) allocate the block in its requested cylinder group.
837  *   2) quadratically rehash on the cylinder group number.
838  *   3) brute force search for a free block.
839  * The size parameter means size for data blocks, mode for inodes.
840  */
841 static ino_t
842 hashalloc(struct inode *ip, int cg, long pref, int size, ulong_t (*allocator)())
843 {
844 	struct fs *fs;
845 	int i;
846 	long result;
847 	int icg = cg;
848 
849 	fs = ip->i_fs;
850 	/*
851 	 * 1: preferred cylinder group
852 	 */
853 	result = (*allocator)(ip, cg, pref, size);
854 	if (result)
855 		return (result);
856 	/*
857 	 * 2: quadratic rehash
858 	 */
859 	for (i = 1; i < fs->fs_ncg; i *= 2) {
860 		cg += i;
861 		if (cg >= fs->fs_ncg)
862 			cg -= fs->fs_ncg;
863 		result = (*allocator)(ip, cg, 0, size);
864 		if (result)
865 			return (result);
866 	}
867 	/*
868 	 * 3: brute force search
869 	 * Note that we start at i == 2, since 0 was checked initially,
870 	 * and 1 is always checked in the quadratic rehash.
871 	 */
872 	cg = (icg + 2) % fs->fs_ncg;
873 	for (i = 2; i < fs->fs_ncg; i++) {
874 		result = (*allocator)(ip, cg, 0, size);
875 		if (result)
876 			return (result);
877 		cg++;
878 		if (cg == fs->fs_ncg)
879 			cg = 0;
880 	}
881 	return (NULL);
882 }
883 
884 /*
885  * Determine whether a fragment can be extended.
886  *
887  * Check to see if the necessary fragments are available, and
888  * if they are, allocate them.
889  */
890 static daddr_t
891 fragextend(struct inode *ip, int cg, long bprev, int osize, int nsize)
892 {
893 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
894 	struct fs *fs = ip->i_fs;
895 	struct buf *bp;
896 	struct cg *cgp;
897 	uchar_t *blksfree;
898 	long bno;
899 	int frags, bbase;
900 	int i, j;
901 
902 	if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize))
903 		return (NULL);
904 	frags = numfrags(fs, nsize);
905 	bbase = (int)fragnum(fs, bprev);
906 	if (bbase > fragnum(fs, (bprev + frags - 1))) {
907 		/* cannot extend across a block boundary */
908 		return (NULL);
909 	}
910 
911 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
912 	    (int)fs->fs_cgsize);
913 	cgp = bp->b_un.b_cg;
914 	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
915 		brelse(bp);
916 		return (NULL);
917 	}
918 
919 	blksfree = cg_blksfree(cgp);
920 	mutex_enter(&ufsvfsp->vfs_lock);
921 	bno = dtogd(fs, bprev);
922 	for (i = numfrags(fs, osize); i < frags; i++) {
923 		if (isclr(blksfree, bno + i)) {
924 			mutex_exit(&ufsvfsp->vfs_lock);
925 			brelse(bp);
926 			return (NULL);
927 		}
928 		if ((TRANS_ISCANCEL(ufsvfsp, ldbtob(fsbtodb(fs, bprev + i)),
929 		    fs->fs_fsize))) {
930 			mutex_exit(&ufsvfsp->vfs_lock);
931 			brelse(bp);
932 			return (NULL);
933 		}
934 	}
935 
936 	cgp->cg_time = gethrestime_sec();
937 	/*
938 	 * The current fragment can be extended,
939 	 * deduct the count on fragment being extended into
940 	 * increase the count on the remaining fragment (if any)
941 	 * allocate the extended piece.
942 	 */
943 	for (i = frags; i < fs->fs_frag - bbase; i++)
944 		if (isclr(blksfree, bno + i))
945 			break;
946 	j = i - numfrags(fs, osize);
947 	cgp->cg_frsum[j]--;
948 	ASSERT(cgp->cg_frsum[j] >= 0);
949 	if (i != frags)
950 		cgp->cg_frsum[i - frags]++;
951 	for (i = numfrags(fs, osize); i < frags; i++) {
952 		clrbit(blksfree, bno + i);
953 		cgp->cg_cs.cs_nffree--;
954 		fs->fs_cs(fs, cg).cs_nffree--;
955 		fs->fs_cstotal.cs_nffree--;
956 	}
957 	fs->fs_fmod = 1;
958 	ufs_notclean(ufsvfsp);
959 	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
960 	TRANS_SI(ufsvfsp, fs, cg);
961 	bdrwrite(bp);
962 	return ((daddr_t)bprev);
963 }
964 
965 /*
966  * Determine whether a block can be allocated.
967  *
968  * Check to see if a block of the appropriate size
969  * is available, and if it is, allocate it.
970  */
971 static daddr_t
972 alloccg(struct inode *ip, int cg, daddr_t bpref, int size)
973 {
974 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
975 	struct fs *fs = ip->i_fs;
976 	struct buf *bp;
977 	struct cg *cgp;
978 	uchar_t *blksfree;
979 	int bno, frags;
980 	int allocsiz;
981 	int i;
982 
983 	/*
984 	 * Searching for space could be time expensive so do some
985 	 * up front checking to verify that there is actually space
986 	 * available (free blocks or free frags).
987 	 */
988 	if (fs->fs_cs(fs, cg).cs_nbfree == 0) {
989 		if (size == fs->fs_bsize)
990 			return (0);
991 
992 		/*
993 		 * If there are not enough free frags then return.
994 		 */
995 		if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, size))
996 			return (0);
997 	}
998 
999 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
1000 	    (int)fs->fs_cgsize);
1001 
1002 	cgp = bp->b_un.b_cg;
1003 	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp) ||
1004 	    (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) {
1005 		brelse(bp);
1006 		return (0);
1007 	}
1008 	blksfree = cg_blksfree(cgp);
1009 	mutex_enter(&ufsvfsp->vfs_lock);
1010 	cgp->cg_time = gethrestime_sec();
1011 	if (size == fs->fs_bsize) {
1012 		if ((bno = alloccgblk(ufsvfsp, cgp, bpref, bp)) == 0)
1013 			goto errout;
1014 		fs->fs_fmod = 1;
1015 		ufs_notclean(ufsvfsp);
1016 		TRANS_SI(ufsvfsp, fs, cg);
1017 		bdrwrite(bp);
1018 		return (bno);
1019 	}
1020 	/*
1021 	 * Check to see if any fragments are already available
1022 	 * allocsiz is the size which will be allocated, hacking
1023 	 * it down to a smaller size if necessary.
1024 	 */
1025 	frags = numfrags(fs, size);
1026 	for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++)
1027 		if (cgp->cg_frsum[allocsiz] != 0)
1028 			break;
1029 
1030 	if (allocsiz != fs->fs_frag)
1031 		bno = mapsearch(ufsvfsp, cgp, bpref, allocsiz);
1032 
1033 	if (allocsiz == fs->fs_frag || bno < 0) {
1034 		/*
1035 		 * No fragments were available, so a block
1036 		 * will be allocated and hacked up.
1037 		 */
1038 		if (cgp->cg_cs.cs_nbfree == 0)
1039 			goto errout;
1040 		if ((bno = alloccgblk(ufsvfsp, cgp, bpref, bp)) == 0)
1041 			goto errout;
1042 		bpref = dtogd(fs, bno);
1043 		for (i = frags; i < fs->fs_frag; i++)
1044 			setbit(blksfree, bpref + i);
1045 		i = fs->fs_frag - frags;
1046 		cgp->cg_cs.cs_nffree += i;
1047 		fs->fs_cstotal.cs_nffree += i;
1048 		fs->fs_cs(fs, cg).cs_nffree += i;
1049 		cgp->cg_frsum[i]++;
1050 		fs->fs_fmod = 1;
1051 		ufs_notclean(ufsvfsp);
1052 		TRANS_SI(ufsvfsp, fs, cg);
1053 		bdrwrite(bp);
1054 		return (bno);
1055 	}
1056 
1057 	for (i = 0; i < frags; i++)
1058 		clrbit(blksfree, bno + i);
1059 	cgp->cg_cs.cs_nffree -= frags;
1060 	fs->fs_cstotal.cs_nffree -= frags;
1061 	fs->fs_cs(fs, cg).cs_nffree -= frags;
1062 	cgp->cg_frsum[allocsiz]--;
1063 	ASSERT(cgp->cg_frsum[allocsiz] >= 0);
1064 	if (frags != allocsiz) {
1065 		cgp->cg_frsum[allocsiz - frags]++;
1066 	}
1067 	fs->fs_fmod = 1;
1068 	ufs_notclean(ufsvfsp);
1069 	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
1070 	TRANS_SI(ufsvfsp, fs, cg);
1071 	bdrwrite(bp);
1072 	return (cg * fs->fs_fpg + bno);
1073 errout:
1074 	mutex_exit(&ufsvfsp->vfs_lock);
1075 	brelse(bp);
1076 	return (0);
1077 }
1078 
/*
 * Allocate a block in a cylinder group.
 *
 * This algorithm implements the following policy:
 *   1) allocate the requested block.
 *   2) allocate a rotationally optimal block in the same cylinder.
 *   3) allocate the next available block on the block rotor for the
 *	specified cylinder group.
 * Note that this routine only allocates fs_bsize blocks; these
 * blocks may be fragmented by the routine that allocates them.
 *
 * Arguments:
 *   ufsvfsp	file system being allocated from; its vfs_lock must be
 *		held by the caller (asserted below).
 *   cgp	in-core cylinder group to allocate from.
 *   bpref	preferred block, or 0 to start from the group's cg_rotor.
 *   bp		buffer holding cgp; only handed to TRANS_BUF() here.
 *
 * Returns the file system fragment number of the block allocated
 * (cg_cgx * fs_fpg + offset within the group), or 0 if no block could
 * be allocated or the rotational tables were found to be inconsistent.
 * The buffer is neither written nor released here; that is left to
 * the caller.
 */
static daddr_t
alloccgblk(
	struct ufsvfs *ufsvfsp,
	struct cg *cgp,
	daddr_t bpref,
	struct buf *bp)
{
	daddr_t bno;
	int cylno, pos, delta, rotbl_size;
	short *cylbp;
	int i;
	struct fs *fs;
	uchar_t *blksfree;
	daddr_t blkno, rpos, frag;
	short *blks;
	int32_t *blktot;

	ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));
	fs = ufsvfsp->vfs_fs;
	blksfree = cg_blksfree(cgp);
	/* No preference given: resume from wherever the last search ended. */
	if (bpref == 0) {
		bpref = cgp->cg_rotor;
		goto norot;
	}
	/* Reduce the preference to a block-aligned, cg-relative frag number. */
	bpref = blknum(fs, bpref);
	bpref = dtogd(fs, bpref);
	/*
	 * If the requested block is available, use it.
	 */
	if (isblock(fs, blksfree, (daddr_t)fragstoblks(fs, bpref))) {
		bno = bpref;
		goto gotit;
	}
	/*
	 * Check for a block available on the same cylinder.
	 */
	cylno = cbtocylno(fs, bpref);
	if (cg_blktot(cgp)[cylno] == 0)
		goto norot;
	if (fs->fs_cpc == 0) {
		/*
		 * Block layout info is not available, so just
		 * have to take any block in this cylinder.
		 */
		bpref = howmany(fs->fs_spc * cylno, NSPF(fs));
		goto norot;
	}
	/*
	 * Check the summary information to see if a block is
	 * available in the requested cylinder starting at the
	 * requested rotational position and proceeding around.
	 */
	cylbp = cg_blks(ufsvfsp, cgp, cylno);
	pos = cbtorpos(ufsvfsp, bpref);
	for (i = pos; i < ufsvfsp->vfs_nrpos; i++)
		if (cylbp[i] > 0)
			break;
	/* Nothing from pos to the end; wrap and try positions below pos. */
	if (i == ufsvfsp->vfs_nrpos)
		for (i = 0; i < pos; i++)
			if (cylbp[i] > 0)
				break;
	/*
	 * If both scans came up empty, i equals pos here and cylbp[pos]
	 * is already known not to be positive, so control falls through
	 * to norot.
	 */
	if (cylbp[i] > 0) {
		/*
		 * Found a rotational position, now find the actual
		 * block.  A "panic" if none is actually there.
		 */

		/*
		 * Up to this point, "pos" has referred to the rotational
		 * position of the desired block.  From now on, it holds
		 * the offset of the current cylinder within a cylinder
		 * cycle.  (A cylinder cycle refers to a set of cylinders
		 * which are described by a single rotational table; the
		 * size of the cycle is fs_cpc.)
		 *
		 * bno is set to the block number of the first block within
		 * the current cylinder cycle.
		 */

		pos = cylno % fs->fs_cpc;
		bno = (cylno - pos) * fs->fs_spc / NSPB(fs);

		/*
		 * The blocks within a cylinder are grouped into equivalence
		 * classes according to their "rotational position."  There
		 * are two tables used to determine these classes.
		 *
		 * The positional offset table (fs_postbl) has an entry for
		 * each rotational position of each cylinder in a cylinder
		 * cycle.  This entry contains the relative block number
		 * (counting from the start of the cylinder cycle) of the
		 * first block in the equivalence class for that position
		 * and that cylinder.  Positions for which no blocks exist
		 * are indicated by a -1.
		 *
		 * The rotational delta table (fs_rotbl) has an entry for
		 * each block in a cylinder cycle.  This entry contains
		 * the offset from that block to the next block in the
		 * same equivalence class.  The last block in the class
		 * is indicated by a zero in the table.
		 *
		 * The following code, then, walks through all of the blocks
		 * in the cylinder (cylno) which we're allocating within
		 * which are in the equivalence class for the rotational
		 * position (i) which we're allocating within.
		 */

		/*
		 * The summary claimed a free block at this position but the
		 * position table says no block exists there: report the
		 * inconsistency and fail the allocation.
		 */
		if (fs_postbl(ufsvfsp, pos)[i] == -1) {
			(void) ufs_fault(ufsvfsp->vfs_root,
			    "alloccgblk: cyl groups corrupted, pos = %d, "
			    "i = %d, fs = %s\n", pos, i, fs->fs_fsmnt);
			return (0);
		}

		/*
		 * There is one entry in the rotational table for each block
		 * in the cylinder cycle.  These are whole blocks, not frags.
		 */

		rotbl_size = (fs->fs_cpc * fs->fs_spc) >>
		    (fs->fs_fragshift + fs->fs_fsbtodb);

		/*
		 * As we start, "i" is the rotational position within which
		 * we're searching.  After the next line, it will be a block
		 * number (relative to the start of the cylinder cycle)
		 * within the equivalence class of that rotational position.
		 */

		i = fs_postbl(ufsvfsp, pos)[i];

		/*
		 * NOTE(review): the bounds test below lets an index equal
		 * to rotbl_size through; confirm fs_rotbl has room for that
		 * or whether ">=" was intended.
		 */
		for (;;) {
			if (isblock(fs, blksfree, (daddr_t)(bno + i))) {
				bno = blkstofrags(fs, (bno + i));
				goto gotit;
			}
			delta = fs_rotbl(fs)[i];
			if (delta <= 0 ||		/* End of chain, or */
			    delta + i > rotbl_size)	/* end of table? */
				break;			/* If so, panic. */
			i += delta;
		}
		(void) ufs_fault(ufsvfsp->vfs_root,
		    "alloccgblk: can't find blk in cyl, pos:%d, i:%d, "
		    "fs:%s bno: %x\n", pos, i, fs->fs_fsmnt, (int)bno);
		return (0);
	}
norot:
	/*
	 * No blocks in the requested cylinder, so take
	 * next available one in this cylinder group.
	 */
	bno = mapsearch(ufsvfsp, cgp, bpref, (int)fs->fs_frag);
	if (bno < 0)
		return (0);
	cgp->cg_rotor = bno;
gotit:
	/*
	 * bno is a cg-relative fragment number here; derive the block
	 * index used by the free map and the fs-wide fragment number.
	 */
	blkno = fragstoblks(fs, bno);
	frag = (cgp->cg_cgx * fs->fs_fpg) + bno;
	/*
	 * A block for which TRANS_ISCANCEL() reports true may not be
	 * handed out; fall back to a map search for a different one.
	 */
	if (TRANS_ISCANCEL(ufsvfsp, ldbtob(fsbtodb(fs, frag)), fs->fs_bsize))
		goto norot;
	clrblock(fs, blksfree, (long)blkno);
	/*
	 * the other cg/sb/si fields are TRANS'ed by the caller
	 */
	cgp->cg_cs.cs_nbfree--;
	fs->fs_cstotal.cs_nbfree--;
	fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--;
	/* Keep the per-cylinder and per-rotational-position counts in step. */
	cylno = cbtocylno(fs, bno);
	blks = cg_blks(ufsvfsp, cgp, cylno);
	rpos = cbtorpos(ufsvfsp, bno);
	blktot = cg_blktot(cgp);
	blks[rpos]--;
	blktot[cylno]--;
	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
	fs->fs_fmod = 1;
	return (frag);
}
1268 
1269 /*
1270  * Determine whether an inode can be allocated.
1271  *
1272  * Check to see if an inode is available, and if it is,
1273  * allocate it using the following policy:
1274  *   1) allocate the requested inode.
1275  *   2) allocate the next available inode after the requested
1276  *	inode in the specified cylinder group.
1277  */
1278 static ino_t
1279 ialloccg(struct inode *ip, int cg, daddr_t ipref, int mode)
1280 {
1281 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
1282 	struct fs *fs = ip->i_fs;
1283 	struct cg *cgp;
1284 	struct buf *bp;
1285 	int start, len, loc, map, i;
1286 	char *iused;
1287 
1288 	if (fs->fs_cs(fs, cg).cs_nifree == 0)
1289 		return (0);
1290 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
1291 	    (int)fs->fs_cgsize);
1292 
1293 	cgp = bp->b_un.b_cg;
1294 	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp) ||
1295 	    cgp->cg_cs.cs_nifree == 0) {
1296 		brelse(bp);
1297 		return (0);
1298 	}
1299 	iused = cg_inosused(cgp);
1300 	mutex_enter(&ufsvfsp->vfs_lock);
1301 	/*
1302 	 * While we are waiting for the mutex, someone may have taken
1303 	 * the last available inode.  Need to recheck.
1304 	 */
1305 	if (cgp->cg_cs.cs_nifree == 0) {
1306 		mutex_exit(&ufsvfsp->vfs_lock);
1307 		brelse(bp);
1308 		return (0);
1309 	}
1310 
1311 	cgp->cg_time = gethrestime_sec();
1312 	if (ipref) {
1313 		ipref %= fs->fs_ipg;
1314 		if (isclr(iused, ipref))
1315 			goto gotit;
1316 	}
1317 	start = cgp->cg_irotor / NBBY;
1318 	len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY);
1319 	loc = skpc(0xff, (uint_t)len, &iused[start]);
1320 	if (loc == 0) {
1321 		len = start + 1;
1322 		start = 0;
1323 		loc = skpc(0xff, (uint_t)len, &iused[0]);
1324 		if (loc == 0) {
1325 			mutex_exit(&ufsvfsp->vfs_lock);
1326 			(void) ufs_fault(ITOV(ip),
1327 			    "ialloccg: map corrupted, cg = %d, irotor = %d, "
1328 			    "fs = %s\n", cg, (int)cgp->cg_irotor, fs->fs_fsmnt);
1329 			return (0);
1330 		}
1331 	}
1332 	i = start + len - loc;
1333 	map = iused[i];
1334 	ipref = i * NBBY;
1335 	for (i = 1; i < (1 << NBBY); i <<= 1, ipref++) {
1336 		if ((map & i) == 0) {
1337 			cgp->cg_irotor = ipref;
1338 			goto gotit;
1339 		}
1340 	}
1341 
1342 	mutex_exit(&ufsvfsp->vfs_lock);
1343 	(void) ufs_fault(ITOV(ip), "ialloccg: block not in mapfs = %s",
1344 	    fs->fs_fsmnt);
1345 	return (0);
1346 gotit:
1347 	setbit(iused, ipref);
1348 	cgp->cg_cs.cs_nifree--;
1349 	fs->fs_cstotal.cs_nifree--;
1350 	fs->fs_cs(fs, cg).cs_nifree--;
1351 	if (((mode & IFMT) == IFDIR) || ((mode & IFMT) == IFATTRDIR)) {
1352 		cgp->cg_cs.cs_ndir++;
1353 		fs->fs_cstotal.cs_ndir++;
1354 		fs->fs_cs(fs, cg).cs_ndir++;
1355 	}
1356 	fs->fs_fmod = 1;
1357 	ufs_notclean(ufsvfsp);
1358 	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
1359 	TRANS_SI(ufsvfsp, fs, cg);
1360 	bdrwrite(bp);
1361 	return (cg * fs->fs_ipg + ipref);
1362 }
1363 
/*
 * Find a block of the specified size in the specified cylinder group.
 *
 * Arguments:
 *   ufsvfsp	file system being searched; the caller holds vfs_lock.
 *   cgp	cylinder group whose free map is searched.
 *   bpref	preferred starting point, or 0 to start at cg_frotor.
 *   allocsiz	length, in fragments, of the free run wanted.
 *
 * Returns the cylinder-group-relative fragment number at which the run
 * starts, or -1 if no usable run was found.  Runs for which
 * TRANS_ISCANCEL() reports true are passed over.  (Historically a
 * failed search was described as a panic; this code simply reports the
 * failure to its caller.)
 */
static daddr_t
mapsearch(struct ufsvfs *ufsvfsp, struct cg *cgp, daddr_t bpref,
	int allocsiz)
{
	struct fs *fs	= ufsvfsp->vfs_fs;
	daddr_t bno, cfrag;
	int start, len, loc, i, last, first, secondtime;
	int blk, field, subfield, pos;
	int gotit;

	/*
	 * ufsvfs->vfs_lock is held when calling this.
	 */
	/*
	 * Find the fragment by searching through the
	 * free block map for an appropriate bit pattern.
	 */
	if (bpref)
		start = dtogd(fs, bpref) / NBBY;
	else
		start = cgp->cg_frotor / NBBY;
	/*
	 * the following loop performs two scans -- the first scan
	 * covers the map from the starting byte to its end and the
	 * second scan covers the map from its beginning up to and
	 * including the starting byte.  The loops have been merged
	 * just to make things difficult.
	 */
	first = start;
	last = howmany(fs->fs_fpg, NBBY);
	secondtime = 0;
	/* fs-wide fragment number of this group's first fragment */
	cfrag = cgp->cg_cgx * fs->fs_fpg;
	while (first < last) {
		len = last - first;
		/*
		 * search the array for a match
		 */
		loc = scanc((unsigned)len, (uchar_t *)&cg_blksfree(cgp)[first],
		    (uchar_t *)fragtbl[fs->fs_frag],
		    (int)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
		/*
		 * match found
		 */
		if (loc) {
			/*
			 * loc is used as the distance of the matching
			 * byte from the end of the range just scanned.
			 */
			bno = (last - loc) * NBBY;

			/*
			 * Found the byte in the map, sift
			 * through the bits to find the selected frag
			 */
			cgp->cg_frotor = bno;
			gotit = 0;
			for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
				blk = blkmap(fs, cg_blksfree(cgp), bno);
				blk <<= 1;
				field = around[allocsiz];
				subfield = inside[allocsiz];
				for (pos = 0;
				    pos <= fs->fs_frag - allocsiz;
				    pos++) {
					if ((blk & field) == subfield) {
						gotit++;
						break;
					}
					field <<= 1;
					subfield <<= 1;
				}
				if (gotit)
					break;
			}
			/*
			 * NOTE(review): if the sift above never sets
			 * gotit, bno and pos hold their loop-exit values
			 * and the sum below does not describe a matching
			 * run; nothing here checks for that case.
			 */
			bno += pos;

			/*
			 * success if block is *not* being converted from
			 * metadata into userdata (harpy).  If so, ignore.
			 */
			if (!TRANS_ISCANCEL(ufsvfsp,
			    ldbtob(fsbtodb(fs, (cfrag+bno))),
			    allocsiz * fs->fs_fsize))
				return (bno);

			/*
			 * keep looking -- this block is being converted
			 */
			first = (last - loc) + 1;
			loc = 0;
			if (first < last)
				continue;
		}
		/*
		 * no usable matches in the first range -- now search
		 * the part of the map below the starting byte
		 */
		if (secondtime)
			/*
			 * no usable matches in the second range either
			 * -- all done
			 */
			break;
		secondtime = 1;
		last = start + 1;
		first = 0;
	}
	/*
	 * no usable matches
	 */
	return ((daddr_t)-1);
}
1475 
/*
 * Names for positions in an inode's block pointer list: the total
 * number of pointers, the index of the i'th indirect pointer, and the
 * three levels of indirection.  NDADDR and NIADDR are defined outside
 * this file.
 * NOTE(review): no user of these macros appears nearby -- confirm they
 * are still referenced before relying on them.
 */
#define	UFSNADDR (NDADDR + NIADDR)	/* NADDR applies to (obsolete) S5FS */
#define	IB(i)	(NDADDR + (i))	/* index of i'th indirect block ptr */
#define	SINGLE	0		/* single indirect block ptr */
#define	DOUBLE	1		/* double indirect block ptr */
#define	TRIPLE	2		/* triple indirect block ptr */
1481 
1482 /*
1483  * Acquire a write lock, and keep trying till we get it
1484  */
1485 static int
1486 allocsp_wlockfs(struct vnode *vp, struct lockfs *lf)
1487 {
1488 	int err = 0;
1489 
1490 lockagain:
1491 	do {
1492 		err = ufs_fiolfss(vp, lf);
1493 		if (err)
1494 			return (err);
1495 	} while (!LOCKFS_IS_ULOCK(lf));
1496 
1497 	lf->lf_lock = LOCKFS_WLOCK;
1498 	lf->lf_flags = 0;
1499 	lf->lf_comment = NULL;
1500 	err = ufs__fiolfs(vp, lf, 1, 0);
1501 
1502 	if (err == EBUSY || err == EINVAL)
1503 		goto lockagain;
1504 
1505 	return (err);
1506 }
1507 
1508 /*
1509  * Release the write lock
1510  */
1511 static int
1512 allocsp_unlockfs(struct vnode *vp, struct lockfs *lf)
1513 {
1514 	int err = 0;
1515 
1516 	lf->lf_lock = LOCKFS_ULOCK;
1517 	lf->lf_flags = 0;
1518 	err = ufs__fiolfs(vp, lf, 1, 0);
1519 	return (err);
1520 }
1521 
/*
 * Undo record kept by ufs_allocsp() for each block newly allocated
 * during its indirect-block phase, so that the allocation can be backed
 * out if a later one fails.  Records are pushed on the head of a singly
 * linked list.
 *
 * NOTE(review): "offset" is assigned from a u_offset_t byte offset --
 * confirm daddr_t is wide enough on every supported kernel.
 */
struct allocsp_undo {
	daddr_t offset;			/* file offset given to bmap_write() */
	daddr_t blk;			/* block bmap_write() reported back */
	struct allocsp_undo *next;	/* previously pushed record, or NULL */
};
1527 
1528 /*
1529  * ufs_allocsp() can be used to pre-allocate blocks for a file on a given
1530  * file system. The blocks are not initialized and are only marked as allocated.
1531  * These addresses are then stored as negative block numbers in the inode to
1532  * imply special handling. UFS has been modified where necessary to understand
1533  * this new notion. Successfully fallocated files will have IFALLOCATE cflag
1534  * set in the inode.
1535  */
1536 int
1537 ufs_allocsp(struct vnode *vp, struct flock64 *lp, cred_t *cr)
1538 {
1539 	struct lockfs lf;
1540 	int berr, err, resv, issync;
1541 	off_t start, istart, len; /* istart, special for idb */
1542 	struct inode *ip;
1543 	struct fs *fs;
1544 	struct ufsvfs *ufsvfsp;
1545 	u_offset_t resid, i;
1546 	daddr32_t db_undo[NDADDR];	/* old direct blocks */
1547 	struct allocsp_undo *ib_undo = NULL;	/* ib undo */
1548 	struct allocsp_undo *undo = NULL;
1549 	u_offset_t osz;			/* old file size */
1550 	int chunkblks = 0;		/* # of blocks in 1 allocation */
1551 	int cnt = 0;
1552 	daddr_t allocblk;
1553 	daddr_t totblks = 0;
1554 	struct ulockfs	*ulp;
1555 
1556 	ASSERT(vp->v_type == VREG);
1557 
1558 	ip = VTOI(vp);
1559 	fs = ip->i_fs;
1560 	if ((ufsvfsp = ip->i_ufsvfs) == NULL) {
1561 		err = EIO;
1562 		goto out_allocsp;
1563 	}
1564 
1565 	istart = start = blkroundup(fs, (lp->l_start));
1566 	len = blkroundup(fs, (lp->l_len));
1567 	chunkblks = blkroundup(fs, ufsvfsp->vfs_iotransz) / fs->fs_bsize;
1568 	ulp = &ufsvfsp->vfs_ulockfs;
1569 
1570 	if (lp->l_start < 0 || lp->l_len <= 0)
1571 		return (EINVAL);
1572 
1573 	/* Quickly check to make sure we have space before we proceed */
1574 	if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree) {
1575 		if (TRANS_ISTRANS(ufsvfsp)) {
1576 			ufs_delete_drain_wait(ufsvfsp, 1);
1577 			if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree)
1578 				return (ENOSPC);
1579 		} else
1580 			return (ENOSPC);
1581 	}
1582 
1583 	/*
1584 	 * We will keep i_rwlock locked as WRITER through out the function
1585 	 * since we don't want anyone else reading or writing to the inode
1586 	 * while we are in the middle of fallocating the file.
1587 	 */
1588 	rw_enter(&ip->i_rwlock, RW_WRITER);
1589 
1590 	/* Back up the direct block list, used for undo later if necessary */
1591 	rw_enter(&ip->i_contents, RW_READER);
1592 	for (i = 0; i < NDADDR; i++)
1593 		db_undo[i] = ip->i_db[i];
1594 	osz = ip->i_size;
1595 	rw_exit(&ip->i_contents);
1596 
1597 	/* Allocate any direct blocks now before we write lock the fs */
1598 	if (lblkno(fs, start) < NDADDR) {
1599 		ufs_trans_trunc_resv(ip, ip->i_size + (NDADDR * fs->fs_bsize),
1600 		    &resv, &resid);
1601 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
1602 
1603 		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1604 		rw_enter(&ip->i_contents, RW_WRITER);
1605 
1606 		for (i = start; (i < (start + len)) && (lblkno(fs, i) < NDADDR);
1607 		    i += fs->fs_bsize) {
1608 			berr = bmap_write(ip, i, fs->fs_bsize, BI_FALLOCATE,
1609 			    &allocblk, cr);
1610 			/* Yikes error, quit */
1611 			if (berr) {
1612 				TRANS_INODE(ufsvfsp, ip);
1613 				rw_exit(&ip->i_contents);
1614 				rw_exit(&ufsvfsp->vfs_dqrwlock);
1615 				TRANS_END_CSYNC(ufsvfsp, err, issync,
1616 				    TOP_ALLOCSP, resv);
1617 				goto exit;
1618 			}
1619 
1620 			if (allocblk) {
1621 				totblks++;
1622 				if (i >= ip->i_size)
1623 					ip->i_size += fs->fs_bsize;
1624 			}
1625 		}
1626 
1627 		TRANS_INODE(ufsvfsp, ip);
1628 		rw_exit(&ip->i_contents);
1629 		rw_exit(&ufsvfsp->vfs_dqrwlock);
1630 		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv);
1631 
1632 		istart =  i;	/* start offset for indirect allocation */
1633 	}
1634 
1635 	/* Write lock the file system */
1636 	if (err = allocsp_wlockfs(vp, &lf))
1637 		goto exit;
1638 
1639 	/* Break the transactions into vfs_iotransz units */
1640 	ufs_trans_trunc_resv(ip, ip->i_size +
1641 	    blkroundup(fs, ufsvfsp->vfs_iotransz), &resv, &resid);
1642 	TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
1643 
1644 	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1645 	rw_enter(&ip->i_contents, RW_WRITER);
1646 
1647 	/* Now go about fallocating necessary indirect blocks */
1648 	for (i = istart; i < (start + len); i += fs->fs_bsize) {
1649 		berr = bmap_write(ip, i, fs->fs_bsize, BI_FALLOCATE,
1650 		    &allocblk, cr);
1651 		if (berr) {
1652 			TRANS_INODE(ufsvfsp, ip);
1653 			rw_exit(&ip->i_contents);
1654 			rw_exit(&ufsvfsp->vfs_dqrwlock);
1655 			TRANS_END_CSYNC(ufsvfsp, err, issync,
1656 			    TOP_ALLOCSP, resv);
1657 			err = allocsp_unlockfs(vp, &lf);
1658 			goto exit;
1659 		}
1660 
1661 		/* Update the blk counter only if new block was added */
1662 		if (allocblk) {
1663 			/* Save undo information */
1664 			undo = kmem_alloc(sizeof (struct allocsp_undo),
1665 			    KM_SLEEP);
1666 			undo->offset = i;
1667 			undo->blk = allocblk;
1668 			undo->next = ib_undo;
1669 			ib_undo = undo;
1670 			totblks++;
1671 
1672 			if (i >= ip->i_size)
1673 				ip->i_size += fs->fs_bsize;
1674 		}
1675 		cnt++;
1676 
1677 		/* Being a good UFS citizen, let others get a share */
1678 		if (cnt == chunkblks) {
1679 			/*
1680 			 * If there are waiters or the fs is hard locked,
1681 			 * error locked, or read-only error locked,
1682 			 * quit with EIO
1683 			 */
1684 			if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp) ||
1685 			    ULOCKFS_IS_ROELOCK(ulp)) {
1686 				ip->i_cflags |= IFALLOCATE;
1687 				TRANS_INODE(ufsvfsp, ip);
1688 				rw_exit(&ip->i_contents);
1689 				rw_exit(&ufsvfsp->vfs_dqrwlock);
1690 
1691 				TRANS_END_CSYNC(ufsvfsp, err, issync,
1692 				    TOP_ALLOCSP, resv);
1693 				rw_exit(&ip->i_rwlock);
1694 				return (EIO);
1695 			}
1696 
1697 			TRANS_INODE(ufsvfsp, ip);
1698 			rw_exit(&ip->i_contents);
1699 			rw_exit(&ufsvfsp->vfs_dqrwlock);
1700 
1701 			/* End the current transaction */
1702 			TRANS_END_CSYNC(ufsvfsp, err, issync,
1703 			    TOP_ALLOCSP, resv);
1704 
1705 			if (CV_HAS_WAITERS(&ulp->ul_cv)) {
1706 				/* Release the write lock */
1707 				if (err = allocsp_unlockfs(vp, &lf))
1708 					goto exit;
1709 
1710 				/* Wake up others waiting to do operations */
1711 				mutex_enter(&ulp->ul_lock);
1712 				cv_broadcast(&ulp->ul_cv);
1713 				mutex_exit(&ulp->ul_lock);
1714 
1715 				/* Grab the write lock again */
1716 				if (err = allocsp_wlockfs(vp, &lf))
1717 					goto exit;
1718 			} /* end of CV_HAS_WAITERS(&ulp->ul_cv) */
1719 
1720 			/* Reserve more space in log for this file */
1721 			ufs_trans_trunc_resv(ip,
1722 			    ip->i_size + blkroundup(fs, ufsvfsp->vfs_iotransz),
1723 			    &resv, &resid);
1724 			TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
1725 
1726 			rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1727 			rw_enter(&ip->i_contents, RW_WRITER);
1728 
1729 			cnt = 0;	/* reset cnt b/c of new transaction */
1730 		}
1731 	}
1732 
1733 	if (!err && !berr)
1734 		ip->i_cflags |= IFALLOCATE;
1735 
1736 	/* Release locks, end log transaction and unlock fs */
1737 	TRANS_INODE(ufsvfsp, ip);
1738 	rw_exit(&ip->i_contents);
1739 	rw_exit(&ufsvfsp->vfs_dqrwlock);
1740 
1741 	TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv);
1742 	err = allocsp_unlockfs(vp, &lf);
1743 
1744 	/*
1745 	 * @ exit label, we should no longer be holding the fs write lock, and
1746 	 * all logging transactions should have been ended. We still hold
1747 	 * ip->i_rwlock.
1748 	 */
1749 exit:
1750 	/*
1751 	 * File has grown larger than 2GB. Set flag
1752 	 * in superblock to indicate this, if it
1753 	 * is not already set.
1754 	 */
1755 	if ((ip->i_size > MAXOFF32_T) &&
1756 	    !(fs->fs_flags & FSLARGEFILES)) {
1757 		ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
1758 		mutex_enter(&ufsvfsp->vfs_lock);
1759 		fs->fs_flags |= FSLARGEFILES;
1760 		ufs_sbwrite(ufsvfsp);
1761 		mutex_exit(&ufsvfsp->vfs_lock);
1762 	}
1763 
1764 	/*
1765 	 * Since we couldn't allocate completely, we will undo the allocations.
1766 	 */
1767 	if (berr) {
1768 		ufs_trans_trunc_resv(ip, totblks * fs->fs_bsize, &resv, &resid);
1769 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
1770 
1771 		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1772 		rw_enter(&ip->i_contents, RW_WRITER);
1773 
1774 		/* Direct blocks */
1775 		for (i = 0; i < NDADDR; i++) {
1776 			/*
1777 			 * Only free the block if they are not same, and
1778 			 * the old one isn't zero (the fragment was
1779 			 * re-allocated).
1780 			 */
1781 			if (db_undo[i] != ip->i_db[i] && db_undo[i] == 0) {
1782 				free(ip, ip->i_db[i], fs->fs_bsize, 0);
1783 				ip->i_db[i] = 0;
1784 			}
1785 		}
1786 
1787 		/* Undo the indirect blocks */
1788 		while (ib_undo != NULL) {
1789 			undo = ib_undo;
1790 			err = bmap_set_bn(vp, undo->offset, 0);
1791 			if (err)
1792 				cmn_err(CE_PANIC, "ufs_allocsp(): failed to "
1793 				    "undo allocation of block %ld",
1794 				    undo->offset);
1795 			free(ip, undo->blk, fs->fs_bsize, I_IBLK);
1796 			ib_undo = undo->next;
1797 			kmem_free(undo, sizeof (struct allocsp_undo));
1798 		}
1799 
1800 		ip->i_size = osz;
1801 		TRANS_INODE(ufsvfsp, ip);
1802 
1803 		rw_exit(&ip->i_contents);
1804 		rw_exit(&ufsvfsp->vfs_dqrwlock);
1805 
1806 		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv);
1807 
1808 		rw_exit(&ip->i_rwlock);
1809 		return (berr);
1810 	}
1811 
1812 	/*
1813 	 * Don't forget to free the undo chain :)
1814 	 */
1815 	while (ib_undo != NULL) {
1816 		undo = ib_undo;
1817 		ib_undo = undo->next;
1818 		kmem_free(undo, sizeof (struct allocsp_undo));
1819 	}
1820 
1821 	rw_exit(&ip->i_rwlock);
1822 
1823 out_allocsp:
1824 	return (err);
1825 }
1826 
1827 /*
1828  * Free storage space associated with the specified inode.  The portion
1829  * to be freed is specified by lp->l_start and lp->l_len (already
1830  * normalized to a "whence" of 0).
1831  *
1832  * This is an experimental facility whose continued existence is not
1833  * guaranteed.  Currently, we only support the special case
1834  * of l_len == 0, meaning free to end of file.
1835  *
1836  * Blocks are freed in reverse order.  This FILO algorithm will tend to
1837  * maintain a contiguous free list much longer than FIFO.
1838  * See also ufs_itrunc() in ufs_inode.c.
1839  *
1840  * Bug: unused bytes in the last retained block are not cleared.
1841  * This may result in a "hole" in the file that does not read as zeroes.
1842  */
1843 /* ARGSUSED */
1844 int
1845 ufs_freesp(struct vnode *vp, struct flock64 *lp, int flag, cred_t *cr)
1846 {
1847 	int i;
1848 	struct inode *ip = VTOI(vp);
1849 	int error;
1850 
1851 	ASSERT(vp->v_type == VREG);
1852 	ASSERT(lp->l_start >= 0);	/* checked by convoff */
1853 
1854 	if (lp->l_len != 0)
1855 		return (EINVAL);
1856 
1857 	rw_enter(&ip->i_contents, RW_READER);
1858 	if (ip->i_size == (u_offset_t)lp->l_start) {
1859 		rw_exit(&ip->i_contents);
1860 		return (0);
1861 	}
1862 
1863 	/*
1864 	 * Check if there is any active mandatory lock on the
1865 	 * range that will be truncated/expanded.
1866 	 */
1867 	if (MANDLOCK(vp, ip->i_mode)) {
1868 		offset_t save_start;
1869 
1870 		save_start = lp->l_start;
1871 
1872 		if (ip->i_size < lp->l_start) {
1873 			/*
1874 			 * "Truncate up" case: need to make sure there
1875 			 * is no lock beyond current end-of-file. To
1876 			 * do so, we need to set l_start to the size
1877 			 * of the file temporarily.
1878 			 */
1879 			lp->l_start = ip->i_size;
1880 		}
1881 		lp->l_type = F_WRLCK;
1882 		lp->l_sysid = 0;
1883 		lp->l_pid = ttoproc(curthread)->p_pid;
1884 		i = (flag & (FNDELAY|FNONBLOCK)) ? 0 : SLPFLCK;
1885 		rw_exit(&ip->i_contents);
1886 		if ((i = reclock(vp, lp, i, 0, lp->l_start, NULL)) != 0 ||
1887 		    lp->l_type != F_UNLCK) {
1888 			return (i ? i : EAGAIN);
1889 		}
1890 		rw_enter(&ip->i_contents, RW_READER);
1891 
1892 		lp->l_start = save_start;
1893 	}
1894 
1895 	/*
1896 	 * Make sure a write isn't in progress (allocating blocks)
1897 	 * by acquiring i_rwlock (we promised ufs_bmap we wouldn't
1898 	 * truncate while it was allocating blocks).
1899 	 * Grab the locks in the right order.
1900 	 */
1901 	rw_exit(&ip->i_contents);
1902 	rw_enter(&ip->i_rwlock, RW_WRITER);
1903 	error = TRANS_ITRUNC(ip, (u_offset_t)lp->l_start, 0, cr);
1904 	rw_exit(&ip->i_rwlock);
1905 	return (error);
1906 }
1907 
1908 /*
1909  * Find a cg with as close to nb contiguous bytes as possible
1910  *	THIS MAY TAKE MANY DISK READS!
1911  *
1912  * Implemented in an attempt to allocate contiguous blocks for
1913  * writing the ufs log file to, minimizing future disk head seeking
1914  */
1915 daddr_t
1916 contigpref(ufsvfs_t *ufsvfsp, size_t nb)
1917 {
1918 	struct fs	*fs	= ufsvfsp->vfs_fs;
1919 	daddr_t		nblk	= lblkno(fs, blkroundup(fs, nb));
1920 	daddr_t		savebno, curbno, cgbno;
1921 	int		cg, cgblks, savecg, savenblk, curnblk;
1922 	uchar_t		*blksfree;
1923 	buf_t		*bp;
1924 	struct cg	*cgp;
1925 
1926 	savenblk = 0;
1927 	savecg = 0;
1928 	savebno = 0;
1929 	for (cg = 0; cg < fs->fs_ncg; ++cg) {
1930 
1931 		/* not enough free blks for a contig check */
1932 		if (fs->fs_cs(fs, cg).cs_nbfree < nblk)
1933 			continue;
1934 
1935 		/*
1936 		 * find the largest contiguous range in this cg
1937 		 */
1938 		bp = UFS_BREAD(ufsvfsp, ufsvfsp->vfs_dev,
1939 		    (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
1940 		    (int)fs->fs_cgsize);
1941 		cgp = bp->b_un.b_cg;
1942 		if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
1943 			brelse(bp);
1944 			continue;
1945 		}
1946 		blksfree = cg_blksfree(cgp);	    /* free array */
1947 		cgblks = fragstoblks(fs, fs->fs_fpg); /* blks in free array */
1948 		cgbno = 0;
1949 		while (cgbno < cgblks && savenblk < nblk) {
1950 			/* find a free block */
1951 			for (; cgbno < cgblks; ++cgbno)
1952 				if (isblock(fs, blksfree, cgbno))
1953 					break;
1954 			curbno = cgbno;
1955 			/* count the number of free blocks */
1956 			for (curnblk = 0; cgbno < cgblks; ++cgbno) {
1957 				if (!isblock(fs, blksfree, cgbno))
1958 					break;
1959 				if (++curnblk >= nblk)
1960 					break;
1961 			}
1962 			if (curnblk > savenblk) {
1963 				savecg = cg;
1964 				savenblk = curnblk;
1965 				savebno = curbno;
1966 			}
1967 		}
1968 		brelse(bp);
1969 		if (savenblk >= nblk)
1970 			break;
1971 	}
1972 
1973 	/* convert block offset in cg to frag offset in cg */
1974 	savebno = blkstofrags(fs, savebno);
1975 
1976 	/* convert frag offset in cg to frag offset in fs */
1977 	savebno += (savecg * fs->fs_fpg);
1978 
1979 	return (savebno);
1980 }
1981