1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
27/*	  All Rights Reserved	*/
28
29/*
30 * University Copyright- Copyright (c) 1982, 1986, 1988
31 * The Regents of the University of California
32 * All Rights Reserved
33 *
34 * University Acknowledgment- Portions of this document are derived from
35 * software developed by the University of California, Berkeley, and its
36 * contributors.
37 */
38
39#include <sys/condvar_impl.h>
40#include <sys/types.h>
41#include <sys/t_lock.h>
42#include <sys/debug.h>
43#include <sys/param.h>
44#include <sys/systm.h>
45#include <sys/signal.h>
46#include <sys/cred.h>
47#include <sys/proc.h>
48#include <sys/disp.h>
49#include <sys/user.h>
50#include <sys/buf.h>
51#include <sys/vfs.h>
52#include <sys/vnode.h>
53#include <sys/acl.h>
54#include <sys/fs/ufs_fs.h>
55#include <sys/fs/ufs_inode.h>
56#include <sys/fs/ufs_acl.h>
57#include <sys/fs/ufs_bio.h>
58#include <sys/fs/ufs_quota.h>
59#include <sys/kmem.h>
60#include <sys/fs/ufs_trans.h>
61#include <sys/fs/ufs_panic.h>
62#include <sys/errno.h>
63#include <sys/time.h>
64#include <sys/sysmacros.h>
65#include <sys/file.h>
66#include <sys/fcntl.h>
67#include <sys/flock.h>
68#include <fs/fs_subr.h>
69#include <sys/cmn_err.h>
70#include <sys/policy.h>
71#include <sys/fs/ufs_log.h>
72
73static ino_t	hashalloc();
74static daddr_t	fragextend();
75static daddr_t	alloccg();
76static daddr_t	alloccgblk();
77static ino_t	ialloccg();
78static daddr_t	mapsearch();
79static int	findlogstartcg();
80
81extern int	inside[], around[];
82extern uchar_t	*fragtbl[];
83void delay();
84
85/*
86 * Allocate a block in the file system.
87 *
88 * The size of the requested block is given, which must be some
89 * multiple of fs_fsize and <= fs_bsize.
90 * A preference may be optionally specified. If a preference is given
91 * the following hierarchy is used to allocate a block:
92 *   1) allocate the requested block.
93 *   2) allocate a rotationally optimal block in the same cylinder.
94 *   3) allocate a block in the same cylinder group.
95 *   4) quadratically rehash into other cylinder groups, until an
96 *	available block is located.
97 * If no block preference is given the following hierarchy is used
98 * to allocate a block:
99 *   1) allocate a block in the cylinder group that contains the
100 *	inode for the file.
101 *   2) quadratically rehash into other cylinder groups, until an
102 *	available block is located.
103 */
104int
105alloc(struct inode *ip, daddr_t bpref, int size, daddr_t *bnp, cred_t *cr)
106{
107	struct fs *fs;
108	struct ufsvfs *ufsvfsp;
109	daddr_t bno;
110	int cg;
111	int err;
112	char *errmsg = NULL;
113	size_t len;
114	clock_t	now;
115
116	ufsvfsp = ip->i_ufsvfs;
117	fs = ufsvfsp->vfs_fs;
118	if ((unsigned)size > fs->fs_bsize || fragoff(fs, size) != 0) {
119		err = ufs_fault(ITOV(ip), "alloc: bad size, dev = 0x%lx,"
120		    " bsize = %d, size = %d, fs = %s\n",
121		    ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
122		return (err);
123	}
124	if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
125		goto nospace;
126	if (freespace(fs, ufsvfsp) <= 0 &&
127	    secpolicy_fs_minfree(cr, ufsvfsp->vfs_vfs) != 0)
128		goto nospace;
129	err = chkdq(ip, (long)btodb(size), 0, cr, &errmsg, &len);
130	/* Note that may not have err, but may have errmsg */
131	if (errmsg != NULL) {
132		uprintf(errmsg);
133		kmem_free(errmsg, len);
134		errmsg = NULL;
135	}
136	if (err)
137		return (err);
138	if (bpref >= fs->fs_size)
139		bpref = 0;
140	if (bpref == 0)
141		cg = (int)itog(fs, ip->i_number);
142	else
143		cg = dtog(fs, bpref);
144
145	bno = (daddr_t)hashalloc(ip, cg, (long)bpref, size,
146	    (ulong_t (*)())alloccg);
147	if (bno > 0) {
148		*bnp = bno;
149		return (0);
150	}
151
152	/*
153	 * hashalloc() failed because some other thread grabbed
154	 * the last block so unwind the quota operation.  We can
155	 * ignore the return because subtractions don't fail and
156	 * size is guaranteed to be >= zero by our caller.
157	 */
158	(void) chkdq(ip, -(long)btodb(size), 0, cr, (char **)NULL,
159	    (size_t *)NULL);
160
161nospace:
162	now = ddi_get_lbolt();
163	mutex_enter(&ufsvfsp->vfs_lock);
164	if ((now - ufsvfsp->vfs_lastwhinetime) > (hz << 2) &&
165	    (!(TRANS_ISTRANS(ufsvfsp)) || !(ip->i_flag & IQUIET))) {
166		ufsvfsp->vfs_lastwhinetime = now;
167		cmn_err(CE_NOTE, "alloc: %s: file system full", fs->fs_fsmnt);
168	}
169	mutex_exit(&ufsvfsp->vfs_lock);
170	return (ENOSPC);
171}
172
173/*
174 * Reallocate a fragment to a bigger size
175 *
176 * The number and size of the old block is given, and a preference
177 * and new size is also specified.  The allocator attempts to extend
178 * the original block.  Failing that, the regular block allocator is
179 * invoked to get an appropriate block.
180 */
181int
182realloccg(struct inode *ip, daddr_t bprev, daddr_t bpref, int osize,
183    int nsize, daddr_t *bnp, cred_t *cr)
184{
185	daddr_t bno;
186	struct fs *fs;
187	struct ufsvfs *ufsvfsp;
188	int cg, request;
189	int err;
190	char *errmsg = NULL;
191	size_t len;
192	clock_t	now;
193
194	ufsvfsp = ip->i_ufsvfs;
195	fs = ufsvfsp->vfs_fs;
196	if ((unsigned)osize > fs->fs_bsize || fragoff(fs, osize) != 0 ||
197	    (unsigned)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) {
198		err = ufs_fault(ITOV(ip),
199		    "realloccg: bad size, dev=0x%lx, bsize=%d, "
200		    "osize=%d, nsize=%d, fs=%s\n",
201		    ip->i_dev, fs->fs_bsize, osize, nsize, fs->fs_fsmnt);
202		return (err);
203	}
204	if (freespace(fs, ufsvfsp) <= 0 &&
205	    secpolicy_fs_minfree(cr, ufsvfsp->vfs_vfs) != 0)
206		goto nospace;
207	if (bprev == 0) {
208		err = ufs_fault(ITOV(ip),
209		    "realloccg: bad bprev, dev = 0x%lx, bsize = %d,"
210		    " bprev = %ld, fs = %s\n", ip->i_dev, fs->fs_bsize, bprev,
211		    fs->fs_fsmnt);
212		return (err);
213	}
214	err = chkdq(ip, (long)btodb(nsize - osize), 0, cr, &errmsg, &len);
215	/* Note that may not have err, but may have errmsg */
216	if (errmsg != NULL) {
217		uprintf(errmsg);
218		kmem_free(errmsg, len);
219		errmsg = NULL;
220	}
221	if (err)
222		return (err);
223	cg = dtog(fs, bprev);
224	bno = fragextend(ip, cg, (long)bprev, osize, nsize);
225	if (bno != 0) {
226		*bnp = bno;
227		return (0);
228	}
229	if (bpref >= fs->fs_size)
230		bpref = 0;
231
232	/*
233	 * When optimizing for time we allocate a full block and
234	 * then only use the upper portion for this request. When
235	 * this file grows again it will grow into the unused portion
236	 * of the block (See fragextend() above).  This saves time
237	 * because an extra disk write would be needed if the frags
238	 * following the current allocation were not free. The extra
239	 * disk write is needed to move the data from its current
240	 * location into the newly allocated position.
241	 *
242	 * When optimizing for space we allocate a run of frags
243	 * that is just the right size for this request.
244	 */
245	request = (fs->fs_optim == FS_OPTTIME) ? fs->fs_bsize : nsize;
246	bno = (daddr_t)hashalloc(ip, cg, (long)bpref, request,
247	    (ulong_t (*)())alloccg);
248	if (bno > 0) {
249		*bnp = bno;
250		if (nsize < request)
251			(void) free(ip, bno + numfrags(fs, nsize),
252			    (off_t)(request - nsize), I_NOCANCEL);
253		return (0);
254	}
255
256	/*
257	 * hashalloc() failed because some other thread grabbed
258	 * the last block so unwind the quota operation.  We can
259	 * ignore the return because subtractions don't fail, and
260	 * our caller guarantees nsize >= osize.
261	 */
262	(void) chkdq(ip, -(long)btodb(nsize - osize), 0, cr, (char **)NULL,
263	    (size_t *)NULL);
264
265nospace:
266	now = ddi_get_lbolt();
267	mutex_enter(&ufsvfsp->vfs_lock);
268	if ((now - ufsvfsp->vfs_lastwhinetime) > (hz << 2) &&
269	    (!(TRANS_ISTRANS(ufsvfsp)) || !(ip->i_flag & IQUIET))) {
270		ufsvfsp->vfs_lastwhinetime = now;
271		cmn_err(CE_NOTE,
272		    "realloccg %s: file system full", fs->fs_fsmnt);
273	}
274	mutex_exit(&ufsvfsp->vfs_lock);
275	return (ENOSPC);
276}
277
278/*
279 * Allocate an inode in the file system.
280 *
281 * A preference may be optionally specified. If a preference is given
282 * the following hierarchy is used to allocate an inode:
283 *   1) allocate the requested inode.
284 *   2) allocate an inode in the same cylinder group.
285 *   3) quadratically rehash into other cylinder groups, until an
286 *	available inode is located.
287 * If no inode preference is given the following hierarchy is used
288 * to allocate an inode:
289 *   1) allocate an inode in cylinder group 0.
290 *   2) quadratically rehash into other cylinder groups, until an
291 *	available inode is located.
292 */
293int
294ufs_ialloc(struct inode *pip,
295    ino_t ipref, mode_t mode, struct inode **ipp, cred_t *cr)
296{
297	struct inode *ip;
298	struct fs *fs;
299	int cg;
300	ino_t ino;
301	int err;
302	int nifree;
303	struct ufsvfs *ufsvfsp = pip->i_ufsvfs;
304	char *errmsg = NULL;
305	size_t len;
306
307	ASSERT(RW_WRITE_HELD(&pip->i_rwlock));
308	fs = pip->i_fs;
309loop:
310	nifree = fs->fs_cstotal.cs_nifree;
311
312	if (nifree == 0)
313		goto noinodes;
314	/*
315	 * Shadow inodes don't count against a user's inode allocation.
316	 * They are an implementation method and not a resource.
317	 */
318	if ((mode != IFSHAD) && (mode != IFATTRDIR)) {
319		err = chkiq((struct ufsvfs *)ITOV(pip)->v_vfsp->vfs_data,
320		    /* change */ 1, (struct inode *)NULL, crgetuid(cr), 0,
321		    cr, &errmsg, &len);
322		/*
323		 * As we haven't acquired any locks yet, dump the message
324		 * now.
325		 */
326		if (errmsg != NULL) {
327			uprintf(errmsg);
328			kmem_free(errmsg, len);
329			errmsg = NULL;
330		}
331		if (err)
332			return (err);
333	}
334
335	if (ipref >= (ulong_t)(fs->fs_ncg * fs->fs_ipg))
336		ipref = 0;
337	cg = (int)itog(fs, ipref);
338	ino = (ino_t)hashalloc(pip, cg, (long)ipref, (int)mode,
339	    (ulong_t (*)())ialloccg);
340	if (ino == 0) {
341		if ((mode != IFSHAD) && (mode != IFATTRDIR)) {
342			/*
343			 * We can safely ignore the return from chkiq()
344			 * because deallocations can only fail if we
345			 * can't get the user's quota info record off
346			 * the disk due to an I/O error.  In that case,
347			 * the quota subsystem is already messed up.
348			 */
349			(void) chkiq(ufsvfsp, /* change */ -1,
350			    (struct inode *)NULL, crgetuid(cr), 0, cr,
351			    (char **)NULL, (size_t *)NULL);
352		}
353		goto noinodes;
354	}
355	err = ufs_iget(pip->i_vfs, ino, ipp, cr);
356	if (err) {
357		if ((mode != IFSHAD) && (mode != IFATTRDIR)) {
358			/*
359			 * See above comment about why it is safe to ignore an
360			 * error return here.
361			 */
362			(void) chkiq(ufsvfsp, /* change */ -1,
363			    (struct inode *)NULL, crgetuid(cr), 0, cr,
364			    (char **)NULL, (size_t *)NULL);
365		}
366		ufs_ifree(pip, ino, 0);
367		return (err);
368	}
369	ip = *ipp;
370	ASSERT(!ip->i_ufs_acl);
371	ASSERT(!ip->i_dquot);
372	rw_enter(&ip->i_contents, RW_WRITER);
373
374	/*
375	 * Check if we really got a free inode, if not then complain
376	 * and mark the inode ISTALE so that it will be freed by the
377	 * ufs idle thread eventually and will not be sent to ufs_delete().
378	 */
379	if (ip->i_mode || (ip->i_nlink > 0)) {
380		ip->i_flag |= ISTALE;
381		rw_exit(&ip->i_contents);
382		VN_RELE(ITOV(ip));
383		cmn_err(CE_WARN,
384		    "%s: unexpected allocated inode %d, run fsck(1M)%s",
385		    fs->fs_fsmnt, (int)ino,
386		    (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
387		goto loop;
388	}
389
390	/*
391	 * Check the inode has no size or data blocks.
392	 * This could have happened if the truncation failed when
393	 * deleting the inode. It used to be possible for this to occur
394	 * if a block allocation failed when iteratively truncating a
395	 * large file using logging and with a full file system.
396	 * This was fixed with bug fix 4348738. However, truncation may
397	 * still fail on an IO error. So in all cases for safety and
398	 * security we clear out the size; the blocks allocated; and
399	 * pointers to the blocks. This will ultimately cause a fsck
400	 * error of un-accounted for blocks, but its a fairly benign error,
401	 * and possibly the correct thing to do anyway as accesssing those
402	 * blocks agains may lead to more IO errors.
403	 */
404	if (ip->i_size || ip->i_blocks) {
405		int i;
406
407		if (ip->i_size) {
408			cmn_err(CE_WARN,
409			    "%s: free inode %d had size 0x%llx, run fsck(1M)%s",
410			    fs->fs_fsmnt, (int)ino, ip->i_size,
411			    (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
412		}
413		/*
414		 * Clear any garbage left behind.
415		 */
416		ip->i_size = (u_offset_t)0;
417		ip->i_blocks = 0;
418		for (i = 0; i < NDADDR; i++)
419			ip->i_db[i] = 0;
420		for (i = 0; i < NIADDR; i++)
421			ip->i_ib[i] = 0;
422	}
423
424	/*
425	 * Initialize the link count
426	 */
427	ip->i_nlink = 0;
428
429	/*
430	 * Clear the old flags
431	 */
432	ip->i_flag &= IREF;
433
434	/*
435	 * Access times are not really defined if the fs is mounted
436	 * with 'noatime'. But it can cause nfs clients to fail
437	 * open() if the atime is not a legal value. Set a legal value
438	 * here when the inode is allocated.
439	 */
440	if (ufsvfsp->vfs_noatime) {
441		mutex_enter(&ufs_iuniqtime_lock);
442		ip->i_atime = iuniqtime;
443		mutex_exit(&ufs_iuniqtime_lock);
444	}
445	rw_exit(&ip->i_contents);
446	return (0);
447noinodes:
448	if (!(TRANS_ISTRANS(ufsvfsp)) || !(pip->i_flag & IQUIET))
449		cmn_err(CE_NOTE, "%s: out of inodes\n", fs->fs_fsmnt);
450	return (ENOSPC);
451}
452
453/*
454 * Find a cylinder group to place a directory.
455 * Returns an inumber within the selected cylinder group.
456 * Note, the vfs_lock is not needed as we don't require exact cg summary info.
457 *
458 * If the switch ufs_close_dirs is set, then the policy is to use
459 * the current cg if it has more than 25% free inodes and more
460 * than 25% free blocks. Otherwise the cgs are searched from
461 * the beginning and the first cg with the same criteria is
462 * used. If that is also null then we revert to the old algorithm.
463 * This tends to cluster files at the beginning of the disk
464 * until the disk gets full.
465 *
466 * Otherwise if ufs_close_dirs is not set then the original policy is
467 * used which is to select from among those cylinder groups with
468 * above the average number of free inodes, the one with the smallest
469 * number of directories.
470 */
471
472int ufs_close_dirs = 1;	/* allocate directories close as possible */
473
474ino_t
475dirpref(inode_t *dp)
476{
477	int cg, minndir, mincg, avgifree, mininode, minbpg, ifree;
478	struct fs *fs = dp->i_fs;
479
480	cg = itog(fs, dp->i_number);
481	mininode = fs->fs_ipg >> 2;
482	minbpg = fs->fs_maxbpg >> 2;
483	if (ufs_close_dirs &&
484	    (fs->fs_cs(fs, cg).cs_nifree > mininode) &&
485	    (fs->fs_cs(fs, cg).cs_nbfree > minbpg)) {
486		return (dp->i_number);
487	}
488
489	avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
490	minndir = fs->fs_ipg;
491	mincg = 0;
492	for (cg = 0; cg < fs->fs_ncg; cg++) {
493		ifree = fs->fs_cs(fs, cg).cs_nifree;
494		if (ufs_close_dirs &&
495		    (ifree > mininode) &&
496		    (fs->fs_cs(fs, cg).cs_nbfree > minbpg)) {
497			return ((ino_t)(fs->fs_ipg * cg));
498		}
499		if ((fs->fs_cs(fs, cg).cs_ndir < minndir) &&
500		    (ifree >= avgifree)) {
501			mincg = cg;
502			minndir = fs->fs_cs(fs, cg).cs_ndir;
503		}
504	}
505	return ((ino_t)(fs->fs_ipg * mincg));
506}
507
508/*
509 * Select the desired position for the next block in a file.  The file is
510 * logically divided into sections. The first section is composed of the
511 * direct blocks. Each additional section contains fs_maxbpg blocks.
512 *
513 * If no blocks have been allocated in the first section, the policy is to
514 * request a block in the same cylinder group as the inode that describes
515 * the file. If no blocks have been allocated in any other section, the
516 * policy is to place the section in a cylinder group with a greater than
517 * average number of free blocks.  An appropriate cylinder group is found
518 * by using a rotor that sweeps the cylinder groups. When a new group of
519 * blocks is needed, the sweep begins in the cylinder group following the
520 * cylinder group from which the previous allocation was made. The sweep
521 * continues until a cylinder group with greater than the average number
522 * of free blocks is found. If the allocation is for the first block in an
523 * indirect block, the information on the previous allocation is unavailable;
524 * here a best guess is made based upon the logical block number being
525 * allocated.
526 *
527 * If a section is already partially allocated, the policy is to
528 * contiguously allocate fs_maxcontig blocks.  The end of one of these
529 * contiguous blocks and the beginning of the next is physically separated
530 * so that the disk head will be in transit between them for at least
531 * fs_rotdelay milliseconds.  This is to allow time for the processor to
532 * schedule another I/O transfer.
533 */
534daddr_t
535blkpref(struct inode *ip, daddr_t lbn, int indx, daddr32_t *bap)
536{
537	struct fs *fs;
538	struct ufsvfs *ufsvfsp;
539	int cg;
540	int avgbfree, startcg;
541	daddr_t nextblk;
542
543	ufsvfsp = ip->i_ufsvfs;
544	fs = ip->i_fs;
545	if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
546		if (lbn < NDADDR) {
547			cg = itog(fs, ip->i_number);
548			return (fs->fs_fpg * cg + fs->fs_frag);
549		}
550		/*
551		 * Find a cylinder with greater than average
552		 * number of unused data blocks.
553		 */
554		if (indx == 0 || bap[indx - 1] == 0)
555			startcg = itog(fs, ip->i_number) + lbn / fs->fs_maxbpg;
556		else
557			startcg = dtog(fs, bap[indx - 1]) + 1;
558		startcg %= fs->fs_ncg;
559
560		mutex_enter(&ufsvfsp->vfs_lock);
561		avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
562		/*
563		 * used for computing log space for writes/truncs
564		 */
565		ufsvfsp->vfs_avgbfree = avgbfree;
566		for (cg = startcg; cg < fs->fs_ncg; cg++)
567			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
568				fs->fs_cgrotor = cg;
569				mutex_exit(&ufsvfsp->vfs_lock);
570				return (fs->fs_fpg * cg + fs->fs_frag);
571			}
572		for (cg = 0; cg <= startcg; cg++)
573			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
574				fs->fs_cgrotor = cg;
575				mutex_exit(&ufsvfsp->vfs_lock);
576				return (fs->fs_fpg * cg + fs->fs_frag);
577			}
578		mutex_exit(&ufsvfsp->vfs_lock);
579		return (0);
580	}
581	/*
582	 * One or more previous blocks have been laid out. If less
583	 * than fs_maxcontig previous blocks are contiguous, the
584	 * next block is requested contiguously, otherwise it is
585	 * requested rotationally delayed by fs_rotdelay milliseconds.
586	 */
587
588	nextblk = bap[indx - 1];
589	/*
590	 * Provision for fallocate to return positive
591	 * blk preference based on last allocation
592	 */
593	if (nextblk < 0 && nextblk != UFS_HOLE) {
594		nextblk = (-bap[indx - 1]) + fs->fs_frag;
595	} else {
596		nextblk = bap[indx - 1] + fs->fs_frag;
597	}
598
599	if (indx > fs->fs_maxcontig && bap[indx - fs->fs_maxcontig] +
600	    blkstofrags(fs, fs->fs_maxcontig) != nextblk) {
601		return (nextblk);
602	}
603	if (fs->fs_rotdelay != 0)
604		/*
605		 * Here we convert ms of delay to frags as:
606		 * (frags) = (ms) * (rev/sec) * (sect/rev) /
607		 *	((sect/frag) * (ms/sec))
608		 * then round up to the next block.
609		 */
610		nextblk += roundup(fs->fs_rotdelay * fs->fs_rps * fs->fs_nsect /
611		    (NSPF(fs) * 1000), fs->fs_frag);
612	return (nextblk);
613}
614
615/*
616 * Free a block or fragment.
617 *
618 * The specified block or fragment is placed back in the
619 * free map. If a fragment is deallocated, a possible
620 * block reassembly is checked.
621 */
622void
623free(struct inode *ip, daddr_t bno, off_t size, int flags)
624{
625	struct fs *fs = ip->i_fs;
626	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
627	struct ufs_q *delq = &ufsvfsp->vfs_delete;
628	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
629	struct cg *cgp;
630	struct buf *bp;
631	int cg, bmap, bbase;
632	int i;
633	uchar_t *blksfree;
634	int *blktot;
635	short *blks;
636	daddr_t blkno, cylno, rpos;
637
638	/*
639	 * fallocate'd files will have negative block address.
640	 * So negate it again to get original block address.
641	 */
642	if (bno < 0 && (bno % fs->fs_frag == 0) && bno != UFS_HOLE) {
643		bno = -bno;
644	}
645
646	if ((unsigned long)size > fs->fs_bsize || fragoff(fs, size) != 0) {
647		(void) ufs_fault(ITOV(ip),
648		    "free: bad size, dev = 0x%lx, bsize = %d, size = %d, "
649		    "fs = %s\n", ip->i_dev, fs->fs_bsize,
650		    (int)size, fs->fs_fsmnt);
651		return;
652	}
653	cg = dtog(fs, bno);
654	ASSERT(!ufs_badblock(ip, bno));
655	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
656	    (int)fs->fs_cgsize);
657
658	cgp = bp->b_un.b_cg;
659	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
660		brelse(bp);
661		return;
662	}
663
664	if (!(flags & I_NOCANCEL))
665		TRANS_CANCEL(ufsvfsp, ldbtob(fsbtodb(fs, bno)), size, flags);
666	if (flags & (I_DIR|I_IBLK|I_SHAD|I_QUOTA)) {
667		TRANS_MATA_FREE(ufsvfsp, ldbtob(fsbtodb(fs, bno)), size);
668	}
669	blksfree = cg_blksfree(cgp);
670	blktot = cg_blktot(cgp);
671	mutex_enter(&ufsvfsp->vfs_lock);
672	cgp->cg_time = gethrestime_sec();
673	bno = dtogd(fs, bno);
674	if (size == fs->fs_bsize) {
675		blkno = fragstoblks(fs, bno);
676		cylno = cbtocylno(fs, bno);
677		rpos = cbtorpos(ufsvfsp, bno);
678		blks = cg_blks(ufsvfsp, cgp, cylno);
679		if (!isclrblock(fs, blksfree, blkno)) {
680			mutex_exit(&ufsvfsp->vfs_lock);
681			brelse(bp);
682			(void) ufs_fault(ITOV(ip), "free: freeing free block, "
683			    "dev:0x%lx, block:%ld, ino:%lu, fs:%s",
684			    ip->i_dev, bno, ip->i_number, fs->fs_fsmnt);
685			return;
686		}
687		setblock(fs, blksfree, blkno);
688		blks[rpos]++;
689		blktot[cylno]++;
690		cgp->cg_cs.cs_nbfree++;		/* Log below */
691		fs->fs_cstotal.cs_nbfree++;
692		fs->fs_cs(fs, cg).cs_nbfree++;
693		if (TRANS_ISTRANS(ufsvfsp) && (flags & I_ACCT)) {
694			mutex_enter(&delq->uq_mutex);
695			delq_info->delq_unreclaimed_blocks -=
696			    btodb(fs->fs_bsize);
697			mutex_exit(&delq->uq_mutex);
698		}
699	} else {
700		bbase = bno - fragnum(fs, bno);
701		/*
702		 * Decrement the counts associated with the old frags
703		 */
704		bmap = blkmap(fs, blksfree, bbase);
705		fragacct(fs, bmap, cgp->cg_frsum, -1);
706		/*
707		 * Deallocate the fragment
708		 */
709		for (i = 0; i < numfrags(fs, size); i++) {
710			if (isset(blksfree, bno + i)) {
711				brelse(bp);
712				mutex_exit(&ufsvfsp->vfs_lock);
713				(void) ufs_fault(ITOV(ip),
714				    "free: freeing free frag, "
715				    "dev:0x%lx, blk:%ld, cg:%d, "
716				    "ino:%lu, fs:%s",
717				    ip->i_dev,
718				    bno + i,
719				    cgp->cg_cgx,
720				    ip->i_number,
721				    fs->fs_fsmnt);
722				return;
723			}
724			setbit(blksfree, bno + i);
725		}
726		cgp->cg_cs.cs_nffree += i;
727		fs->fs_cstotal.cs_nffree += i;
728		fs->fs_cs(fs, cg).cs_nffree += i;
729		if (TRANS_ISTRANS(ufsvfsp) && (flags & I_ACCT)) {
730			mutex_enter(&delq->uq_mutex);
731			delq_info->delq_unreclaimed_blocks -=
732			    btodb(i * fs->fs_fsize);
733			mutex_exit(&delq->uq_mutex);
734		}
735		/*
736		 * Add back in counts associated with the new frags
737		 */
738		bmap = blkmap(fs, blksfree, bbase);
739		fragacct(fs, bmap, cgp->cg_frsum, 1);
740		/*
741		 * If a complete block has been reassembled, account for it
742		 */
743		blkno = fragstoblks(fs, bbase);
744		if (isblock(fs, blksfree, blkno)) {
745			cylno = cbtocylno(fs, bbase);
746			rpos = cbtorpos(ufsvfsp, bbase);
747			blks = cg_blks(ufsvfsp, cgp, cylno);
748			blks[rpos]++;
749			blktot[cylno]++;
750			cgp->cg_cs.cs_nffree -= fs->fs_frag;
751			fs->fs_cstotal.cs_nffree -= fs->fs_frag;
752			fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
753			cgp->cg_cs.cs_nbfree++;
754			fs->fs_cstotal.cs_nbfree++;
755			fs->fs_cs(fs, cg).cs_nbfree++;
756		}
757	}
758	fs->fs_fmod = 1;
759	ufs_notclean(ufsvfsp);
760	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
761	TRANS_SI(ufsvfsp, fs, cg);
762	bdrwrite(bp);
763}
764
765/*
766 * Free an inode.
767 *
768 * The specified inode is placed back in the free map.
769 */
770void
771ufs_ifree(struct inode *ip, ino_t ino, mode_t mode)
772{
773	struct fs *fs = ip->i_fs;
774	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
775	struct cg *cgp;
776	struct buf *bp;
777	unsigned int inot;
778	int cg;
779	char *iused;
780
781	if (ip->i_number == ino && ip->i_mode != 0) {
782		(void) ufs_fault(ITOV(ip),
783		    "ufs_ifree: illegal mode: (imode) %o, (omode) %o, ino %d, "
784		    "fs = %s\n",
785		    ip->i_mode, mode, (int)ip->i_number, fs->fs_fsmnt);
786		return;
787	}
788	if (ino >= fs->fs_ipg * fs->fs_ncg) {
789		(void) ufs_fault(ITOV(ip),
790		    "ifree: range, dev = 0x%x, ino = %d, fs = %s\n",
791		    (int)ip->i_dev, (int)ino, fs->fs_fsmnt);
792		return;
793	}
794	cg = (int)itog(fs, ino);
795	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
796	    (int)fs->fs_cgsize);
797
798	cgp = bp->b_un.b_cg;
799	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
800		brelse(bp);
801		return;
802	}
803	mutex_enter(&ufsvfsp->vfs_lock);
804	cgp->cg_time = gethrestime_sec();
805	iused = cg_inosused(cgp);
806	inot = (unsigned int)(ino % (ulong_t)fs->fs_ipg);
807	if (isclr(iused, inot)) {
808		mutex_exit(&ufsvfsp->vfs_lock);
809		brelse(bp);
810		(void) ufs_fault(ITOV(ip), "ufs_ifree: freeing free inode, "
811		    "mode: (imode) %o, (omode) %o, ino:%d, "
812		    "fs:%s",
813		    ip->i_mode, mode, (int)ino, fs->fs_fsmnt);
814		return;
815	}
816	clrbit(iused, inot);
817
818	if (inot < (ulong_t)cgp->cg_irotor)
819		cgp->cg_irotor = inot;
820	cgp->cg_cs.cs_nifree++;
821	fs->fs_cstotal.cs_nifree++;
822	fs->fs_cs(fs, cg).cs_nifree++;
823	if (((mode & IFMT) == IFDIR) || ((mode & IFMT) == IFATTRDIR)) {
824		cgp->cg_cs.cs_ndir--;
825		fs->fs_cstotal.cs_ndir--;
826		fs->fs_cs(fs, cg).cs_ndir--;
827	}
828	fs->fs_fmod = 1;
829	ufs_notclean(ufsvfsp);
830	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
831	TRANS_SI(ufsvfsp, fs, cg);
832	bdrwrite(bp);
833}
834
835/*
836 * Implement the cylinder overflow algorithm.
837 *
838 * The policy implemented by this algorithm is:
839 *   1) allocate the block in its requested cylinder group.
840 *   2) quadratically rehash on the cylinder group number.
841 *   3) brute force search for a free block.
842 * The size parameter means size for data blocks, mode for inodes.
843 */
844static ino_t
845hashalloc(struct inode *ip, int cg, long pref, int size, ulong_t (*allocator)())
846{
847	struct fs *fs;
848	int i;
849	long result;
850	int icg = cg;
851
852	fs = ip->i_fs;
853	/*
854	 * 1: preferred cylinder group
855	 */
856	result = (*allocator)(ip, cg, pref, size);
857	if (result)
858		return (result);
859	/*
860	 * 2: quadratic rehash
861	 */
862	for (i = 1; i < fs->fs_ncg; i *= 2) {
863		cg += i;
864		if (cg >= fs->fs_ncg)
865			cg -= fs->fs_ncg;
866		result = (*allocator)(ip, cg, 0, size);
867		if (result)
868			return (result);
869	}
870	/*
871	 * 3: brute force search
872	 * Note that we start at i == 2, since 0 was checked initially,
873	 * and 1 is always checked in the quadratic rehash.
874	 */
875	cg = (icg + 2) % fs->fs_ncg;
876	for (i = 2; i < fs->fs_ncg; i++) {
877		result = (*allocator)(ip, cg, 0, size);
878		if (result)
879			return (result);
880		cg++;
881		if (cg == fs->fs_ncg)
882			cg = 0;
883	}
884	return (0);
885}
886
887/*
888 * Determine whether a fragment can be extended.
889 *
890 * Check to see if the necessary fragments are available, and
891 * if they are, allocate them.
892 */
893static daddr_t
894fragextend(struct inode *ip, int cg, long bprev, int osize, int nsize)
895{
896	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
897	struct fs *fs = ip->i_fs;
898	struct buf *bp;
899	struct cg *cgp;
900	uchar_t *blksfree;
901	long bno;
902	int frags, bbase;
903	int i, j;
904
905	if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize))
906		return (0);
907	frags = numfrags(fs, nsize);
908	bbase = (int)fragnum(fs, bprev);
909	if (bbase > fragnum(fs, (bprev + frags - 1))) {
910		/* cannot extend across a block boundary */
911		return (0);
912	}
913
914	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
915	    (int)fs->fs_cgsize);
916	cgp = bp->b_un.b_cg;
917	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
918		brelse(bp);
919		return (0);
920	}
921
922	blksfree = cg_blksfree(cgp);
923	mutex_enter(&ufsvfsp->vfs_lock);
924	bno = dtogd(fs, bprev);
925	for (i = numfrags(fs, osize); i < frags; i++) {
926		if (isclr(blksfree, bno + i)) {
927			mutex_exit(&ufsvfsp->vfs_lock);
928			brelse(bp);
929			return (0);
930		}
931		if ((TRANS_ISCANCEL(ufsvfsp, ldbtob(fsbtodb(fs, bprev + i)),
932		    fs->fs_fsize))) {
933			mutex_exit(&ufsvfsp->vfs_lock);
934			brelse(bp);
935			return (0);
936		}
937	}
938
939	cgp->cg_time = gethrestime_sec();
940	/*
941	 * The current fragment can be extended,
942	 * deduct the count on fragment being extended into
943	 * increase the count on the remaining fragment (if any)
944	 * allocate the extended piece.
945	 */
946	for (i = frags; i < fs->fs_frag - bbase; i++)
947		if (isclr(blksfree, bno + i))
948			break;
949	j = i - numfrags(fs, osize);
950	cgp->cg_frsum[j]--;
951	ASSERT(cgp->cg_frsum[j] >= 0);
952	if (i != frags)
953		cgp->cg_frsum[i - frags]++;
954	for (i = numfrags(fs, osize); i < frags; i++) {
955		clrbit(blksfree, bno + i);
956		cgp->cg_cs.cs_nffree--;
957		fs->fs_cs(fs, cg).cs_nffree--;
958		fs->fs_cstotal.cs_nffree--;
959	}
960	fs->fs_fmod = 1;
961	ufs_notclean(ufsvfsp);
962	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
963	TRANS_SI(ufsvfsp, fs, cg);
964	bdrwrite(bp);
965	return ((daddr_t)bprev);
966}
967
968/*
969 * Determine whether a block can be allocated.
970 *
971 * Check to see if a block of the apprpriate size
972 * is available, and if it is, allocate it.
973 */
974static daddr_t
975alloccg(struct inode *ip, int cg, daddr_t bpref, int size)
976{
977	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
978	struct fs *fs = ip->i_fs;
979	struct buf *bp;
980	struct cg *cgp;
981	uchar_t *blksfree;
982	int bno, frags;
983	int allocsiz;
984	int i;
985
986	/*
987	 * Searching for space could be time expensive so do some
988	 * up front checking to verify that there is actually space
989	 * available (free blocks or free frags).
990	 */
991	if (fs->fs_cs(fs, cg).cs_nbfree == 0) {
992		if (size == fs->fs_bsize)
993			return (0);
994
995		/*
996		 * If there are not enough free frags then return.
997		 */
998		if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, size))
999			return (0);
1000	}
1001
1002	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
1003	    (int)fs->fs_cgsize);
1004
1005	cgp = bp->b_un.b_cg;
1006	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp) ||
1007	    (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) {
1008		brelse(bp);
1009		return (0);
1010	}
1011	blksfree = cg_blksfree(cgp);
1012	mutex_enter(&ufsvfsp->vfs_lock);
1013	cgp->cg_time = gethrestime_sec();
1014	if (size == fs->fs_bsize) {
1015		if ((bno = alloccgblk(ufsvfsp, cgp, bpref, bp)) == 0)
1016			goto errout;
1017		fs->fs_fmod = 1;
1018		ufs_notclean(ufsvfsp);
1019		TRANS_SI(ufsvfsp, fs, cg);
1020		bdrwrite(bp);
1021		return (bno);
1022	}
1023	/*
1024	 * Check fragment bitmap to see if any fragments are already available.
1025	 * mapsearch() may fail because the fragment that fits this request
1026	 * might still be on the cancel list and not available for re-use yet.
1027	 * Look for a bigger sized fragment to allocate first before we have
1028	 * to give up and fragment a whole new block eventually.
1029	 */
1030	frags = numfrags(fs, size);
1031	allocsiz = frags;
1032next_size:
1033	for (; allocsiz < fs->fs_frag; allocsiz++)
1034		if (cgp->cg_frsum[allocsiz] != 0)
1035			break;
1036
1037	if (allocsiz != fs->fs_frag) {
1038		bno = mapsearch(ufsvfsp, cgp, bpref, allocsiz);
1039		if (bno < 0 && allocsiz < (fs->fs_frag - 1)) {
1040			allocsiz++;
1041			goto next_size;
1042		}
1043	}
1044
1045	if (allocsiz == fs->fs_frag || bno < 0) {
1046		/*
1047		 * No fragments were available, so a block
1048		 * will be allocated and hacked up.
1049		 */
1050		if (cgp->cg_cs.cs_nbfree == 0)
1051			goto errout;
1052		if ((bno = alloccgblk(ufsvfsp, cgp, bpref, bp)) == 0)
1053			goto errout;
1054		bpref = dtogd(fs, bno);
1055		for (i = frags; i < fs->fs_frag; i++)
1056			setbit(blksfree, bpref + i);
1057		i = fs->fs_frag - frags;
1058		cgp->cg_cs.cs_nffree += i;
1059		fs->fs_cstotal.cs_nffree += i;
1060		fs->fs_cs(fs, cg).cs_nffree += i;
1061		cgp->cg_frsum[i]++;
1062		fs->fs_fmod = 1;
1063		ufs_notclean(ufsvfsp);
1064		TRANS_SI(ufsvfsp, fs, cg);
1065		bdrwrite(bp);
1066		return (bno);
1067	}
1068
1069	for (i = 0; i < frags; i++)
1070		clrbit(blksfree, bno + i);
1071	cgp->cg_cs.cs_nffree -= frags;
1072	fs->fs_cstotal.cs_nffree -= frags;
1073	fs->fs_cs(fs, cg).cs_nffree -= frags;
1074	cgp->cg_frsum[allocsiz]--;
1075	ASSERT(cgp->cg_frsum[allocsiz] >= 0);
1076	if (frags != allocsiz) {
1077		cgp->cg_frsum[allocsiz - frags]++;
1078	}
1079	fs->fs_fmod = 1;
1080	ufs_notclean(ufsvfsp);
1081	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
1082	TRANS_SI(ufsvfsp, fs, cg);
1083	bdrwrite(bp);
1084	return (cg * fs->fs_fpg + bno);
1085errout:
1086	mutex_exit(&ufsvfsp->vfs_lock);
1087	brelse(bp);
1088	return (0);
1089}
1090
1091/*
1092 * Allocate a block in a cylinder group.
1093 *
1094 * This algorithm implements the following policy:
1095 *   1) allocate the requested block.
1096 *   2) allocate a rotationally optimal block in the same cylinder.
1097 *   3) allocate the next available block on the block rotor for the
1098 *	specified cylinder group.
1099 * Note that this routine only allocates fs_bsize blocks; these
1100 * blocks may be fragmented by the routine that allocates them.
1101 */
1102static daddr_t
1103alloccgblk(
1104	struct ufsvfs *ufsvfsp,
1105	struct cg *cgp,
1106	daddr_t bpref,
1107	struct buf *bp)
1108{
1109	daddr_t bno;
1110	int cylno, pos, delta, rotbl_size;
1111	short *cylbp;
1112	int i;
1113	struct fs *fs;
1114	uchar_t *blksfree;
1115	daddr_t blkno, rpos, frag;
1116	short *blks;
1117	int32_t *blktot;
1118
1119	ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));
1120	fs = ufsvfsp->vfs_fs;
1121	blksfree = cg_blksfree(cgp);
1122	if (bpref == 0) {
1123		bpref = cgp->cg_rotor;
1124		goto norot;
1125	}
1126	bpref = blknum(fs, bpref);
1127	bpref = dtogd(fs, bpref);
1128	/*
1129	 * If the requested block is available, use it.
1130	 */
1131	if (isblock(fs, blksfree, (daddr_t)fragstoblks(fs, bpref))) {
1132		bno = bpref;
1133		goto gotit;
1134	}
1135	/*
1136	 * Check for a block available on the same cylinder.
1137	 */
1138	cylno = cbtocylno(fs, bpref);
1139	if (cg_blktot(cgp)[cylno] == 0)
1140		goto norot;
1141	if (fs->fs_cpc == 0) {
1142		/*
1143		 * Block layout info is not available, so just
1144		 * have to take any block in this cylinder.
1145		 */
1146		bpref = howmany(fs->fs_spc * cylno, NSPF(fs));
1147		goto norot;
1148	}
1149	/*
1150	 * Check the summary information to see if a block is
1151	 * available in the requested cylinder starting at the
1152	 * requested rotational position and proceeding around.
1153	 */
1154	cylbp = cg_blks(ufsvfsp, cgp, cylno);
1155	pos = cbtorpos(ufsvfsp, bpref);
1156	for (i = pos; i < ufsvfsp->vfs_nrpos; i++)
1157		if (cylbp[i] > 0)
1158			break;
1159	if (i == ufsvfsp->vfs_nrpos)
1160		for (i = 0; i < pos; i++)
1161			if (cylbp[i] > 0)
1162				break;
1163	if (cylbp[i] > 0) {
1164		/*
1165		 * Found a rotational position, now find the actual
1166		 * block.  A "panic" if none is actually there.
1167		 */
1168
1169		/*
1170		 * Up to this point, "pos" has referred to the rotational
1171		 * position of the desired block.  From now on, it holds
1172		 * the offset of the current cylinder within a cylinder
1173		 * cycle.  (A cylinder cycle refers to a set of cylinders
1174		 * which are described by a single rotational table; the
1175		 * size of the cycle is fs_cpc.)
1176		 *
1177		 * bno is set to the block number of the first block within
1178		 * the current cylinder cycle.
1179		 */
1180
1181		pos = cylno % fs->fs_cpc;
1182		bno = (cylno - pos) * fs->fs_spc / NSPB(fs);
1183
1184		/*
1185		 * The blocks within a cylinder are grouped into equivalence
1186		 * classes according to their "rotational position."  There
1187		 * are two tables used to determine these classes.
1188		 *
1189		 * The positional offset table (fs_postbl) has an entry for
1190		 * each rotational position of each cylinder in a cylinder
1191		 * cycle.  This entry contains the relative block number
1192		 * (counting from the start of the cylinder cycle) of the
1193		 * first block in the equivalence class for that position
1194		 * and that cylinder.  Positions for which no blocks exist
1195		 * are indicated by a -1.
1196		 *
1197		 * The rotational delta table (fs_rotbl) has an entry for
1198		 * each block in a cylinder cycle.  This entry contains
1199		 * the offset from that block to the next block in the
1200		 * same equivalence class.  The last block in the class
1201		 * is indicated by a zero in the table.
1202		 *
1203		 * The following code, then, walks through all of the blocks
1204		 * in the cylinder (cylno) which we're allocating within
1205		 * which are in the equivalence class for the rotational
1206		 * position (i) which we're allocating within.
1207		 */
1208
1209		if (fs_postbl(ufsvfsp, pos)[i] == -1) {
1210			(void) ufs_fault(ufsvfsp->vfs_root,
1211			    "alloccgblk: cyl groups corrupted, pos = %d, "
1212			    "i = %d, fs = %s\n", pos, i, fs->fs_fsmnt);
1213			return (0);
1214		}
1215
1216		/*
1217		 * There is one entry in the rotational table for each block
1218		 * in the cylinder cycle.  These are whole blocks, not frags.
1219		 */
1220
1221		rotbl_size = (fs->fs_cpc * fs->fs_spc) >>
1222		    (fs->fs_fragshift + fs->fs_fsbtodb);
1223
1224		/*
1225		 * As we start, "i" is the rotational position within which
1226		 * we're searching.  After the next line, it will be a block
1227		 * number (relative to the start of the cylinder cycle)
1228		 * within the equivalence class of that rotational position.
1229		 */
1230
1231		i = fs_postbl(ufsvfsp, pos)[i];
1232
1233		for (;;) {
1234			if (isblock(fs, blksfree, (daddr_t)(bno + i))) {
1235				bno = blkstofrags(fs, (bno + i));
1236				goto gotit;
1237			}
1238			delta = fs_rotbl(fs)[i];
1239			if (delta <= 0 ||		/* End of chain, or */
1240			    delta + i > rotbl_size)	/* end of table? */
1241				break;			/* If so, panic. */
1242			i += delta;
1243		}
1244		(void) ufs_fault(ufsvfsp->vfs_root,
1245		    "alloccgblk: can't find blk in cyl, pos:%d, i:%d, "
1246		    "fs:%s bno: %x\n", pos, i, fs->fs_fsmnt, (int)bno);
1247		return (0);
1248	}
1249norot:
1250	/*
1251	 * No blocks in the requested cylinder, so take
1252	 * next available one in this cylinder group.
1253	 */
1254	bno = mapsearch(ufsvfsp, cgp, bpref, (int)fs->fs_frag);
1255	if (bno < 0)
1256		return (0);
1257	cgp->cg_rotor = bno;
1258gotit:
1259	blkno = fragstoblks(fs, bno);
1260	frag = (cgp->cg_cgx * fs->fs_fpg) + bno;
1261	if (TRANS_ISCANCEL(ufsvfsp, ldbtob(fsbtodb(fs, frag)), fs->fs_bsize))
1262		goto norot;
1263	clrblock(fs, blksfree, (long)blkno);
1264	/*
1265	 * the other cg/sb/si fields are TRANS'ed by the caller
1266	 */
1267	cgp->cg_cs.cs_nbfree--;
1268	fs->fs_cstotal.cs_nbfree--;
1269	fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--;
1270	cylno = cbtocylno(fs, bno);
1271	blks = cg_blks(ufsvfsp, cgp, cylno);
1272	rpos = cbtorpos(ufsvfsp, bno);
1273	blktot = cg_blktot(cgp);
1274	blks[rpos]--;
1275	blktot[cylno]--;
1276	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
1277	fs->fs_fmod = 1;
1278	return (frag);
1279}
1280
1281/*
1282 * Determine whether an inode can be allocated.
1283 *
1284 * Check to see if an inode is available, and if it is,
1285 * allocate it using the following policy:
1286 *   1) allocate the requested inode.
1287 *   2) allocate the next available inode after the requested
1288 *	inode in the specified cylinder group.
1289 */
1290static ino_t
1291ialloccg(struct inode *ip, int cg, daddr_t ipref, int mode)
1292{
1293	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
1294	struct fs *fs = ip->i_fs;
1295	struct cg *cgp;
1296	struct buf *bp;
1297	int start, len, loc, map, i;
1298	char *iused;
1299
1300	if (fs->fs_cs(fs, cg).cs_nifree == 0)
1301		return (0);
1302	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
1303	    (int)fs->fs_cgsize);
1304
1305	cgp = bp->b_un.b_cg;
1306	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp) ||
1307	    cgp->cg_cs.cs_nifree == 0) {
1308		brelse(bp);
1309		return (0);
1310	}
1311	iused = cg_inosused(cgp);
1312	mutex_enter(&ufsvfsp->vfs_lock);
1313	/*
1314	 * While we are waiting for the mutex, someone may have taken
1315	 * the last available inode.  Need to recheck.
1316	 */
1317	if (cgp->cg_cs.cs_nifree == 0) {
1318		mutex_exit(&ufsvfsp->vfs_lock);
1319		brelse(bp);
1320		return (0);
1321	}
1322
1323	cgp->cg_time = gethrestime_sec();
1324	if (ipref) {
1325		ipref %= fs->fs_ipg;
1326		if (isclr(iused, ipref))
1327			goto gotit;
1328	}
1329	start = cgp->cg_irotor / NBBY;
1330	len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY);
1331	loc = skpc(0xff, (uint_t)len, &iused[start]);
1332	if (loc == 0) {
1333		len = start + 1;
1334		start = 0;
1335		loc = skpc(0xff, (uint_t)len, &iused[0]);
1336		if (loc == 0) {
1337			mutex_exit(&ufsvfsp->vfs_lock);
1338			(void) ufs_fault(ITOV(ip),
1339			    "ialloccg: map corrupted, cg = %d, irotor = %d, "
1340			    "fs = %s\n", cg, (int)cgp->cg_irotor, fs->fs_fsmnt);
1341			return (0);
1342		}
1343	}
1344	i = start + len - loc;
1345	map = iused[i];
1346	ipref = i * NBBY;
1347	for (i = 1; i < (1 << NBBY); i <<= 1, ipref++) {
1348		if ((map & i) == 0) {
1349			cgp->cg_irotor = ipref;
1350			goto gotit;
1351		}
1352	}
1353
1354	mutex_exit(&ufsvfsp->vfs_lock);
1355	(void) ufs_fault(ITOV(ip), "ialloccg: block not in mapfs = %s",
1356	    fs->fs_fsmnt);
1357	return (0);
1358gotit:
1359	setbit(iused, ipref);
1360	cgp->cg_cs.cs_nifree--;
1361	fs->fs_cstotal.cs_nifree--;
1362	fs->fs_cs(fs, cg).cs_nifree--;
1363	if (((mode & IFMT) == IFDIR) || ((mode & IFMT) == IFATTRDIR)) {
1364		cgp->cg_cs.cs_ndir++;
1365		fs->fs_cstotal.cs_ndir++;
1366		fs->fs_cs(fs, cg).cs_ndir++;
1367	}
1368	fs->fs_fmod = 1;
1369	ufs_notclean(ufsvfsp);
1370	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
1371	TRANS_SI(ufsvfsp, fs, cg);
1372	bdrwrite(bp);
1373	return (cg * fs->fs_ipg + ipref);
1374}
1375
1376/*
1377 * Find a block of the specified size in the specified cylinder group.
1378 *
1379 * It is a panic if a request is made to find a block if none are
1380 * available.
1381 */
1382static daddr_t
1383mapsearch(struct ufsvfs *ufsvfsp, struct cg *cgp, daddr_t bpref, int allocsiz)
1384{
1385	struct fs *fs	= ufsvfsp->vfs_fs;
1386	daddr_t bno, cfrag;
1387	int start, len, loc, i, last, first, secondtime;
1388	int blk, field, subfield, pos;
1389	int gotit;
1390
1391	/*
1392	 * ufsvfs->vfs_lock is held when calling this.
1393	 */
1394	/*
1395	 * Find the fragment by searching through the
1396	 * free block map for an appropriate bit pattern.
1397	 */
1398	if (bpref)
1399		start = dtogd(fs, bpref) / NBBY;
1400	else
1401		start = cgp->cg_frotor / NBBY;
1402	/*
1403	 * the following loop performs two scans -- the first scan
1404	 * searches the bottom half of the array for a match and the
1405	 * second scan searches the top half of the array.  The loops
1406	 * have been merged just to make things difficult.
1407	 */
1408	first = start;
1409	last = howmany(fs->fs_fpg, NBBY);
1410	secondtime = 0;
1411	cfrag = cgp->cg_cgx * fs->fs_fpg;
1412	while (first < last) {
1413		len = last - first;
1414		/*
1415		 * search the array for a match
1416		 */
1417		loc = scanc((unsigned)len, (uchar_t *)&cg_blksfree(cgp)[first],
1418		    (uchar_t *)fragtbl[fs->fs_frag],
1419		    (int)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
1420		/*
1421		 * match found
1422		 */
1423		if (loc) {
1424			bno = (last - loc) * NBBY;
1425
1426			/*
1427			 * Found the byte in the map, sift
1428			 * through the bits to find the selected frag
1429			 */
1430			cgp->cg_frotor = bno;
1431			gotit = 0;
1432			for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
1433				blk = blkmap(fs, cg_blksfree(cgp), bno);
1434				blk <<= 1;
1435				field = around[allocsiz];
1436				subfield = inside[allocsiz];
1437				for (pos = 0;
1438				    pos <= fs->fs_frag - allocsiz;
1439				    pos++) {
1440					if ((blk & field) == subfield) {
1441						gotit++;
1442						break;
1443					}
1444					field <<= 1;
1445					subfield <<= 1;
1446				}
1447				if (gotit)
1448					break;
1449			}
1450			bno += pos;
1451
1452			/*
1453			 * success if block is *not* being converted from
1454			 * metadata into userdata (harpy).  If so, ignore.
1455			 */
1456			if (!TRANS_ISCANCEL(ufsvfsp,
1457			    ldbtob(fsbtodb(fs, (cfrag+bno))),
1458			    allocsiz * fs->fs_fsize))
1459				return (bno);
1460
1461			/*
1462			 * keep looking -- this block is being converted
1463			 */
1464			first = (last - loc) + 1;
1465			loc = 0;
1466			if (first < last)
1467				continue;
1468		}
1469		/*
1470		 * no usable matches in bottom half -- now search the top half
1471		 */
1472		if (secondtime)
1473			/*
1474			 * no usable matches in top half -- all done
1475			 */
1476			break;
1477		secondtime = 1;
1478		last = start + 1;
1479		first = 0;
1480	}
1481	/*
1482	 * no usable matches
1483	 */
1484	return ((daddr_t)-1);
1485}
1486
1487#define	UFSNADDR (NDADDR + NIADDR)	/* NADDR applies to (obsolete) S5FS */
1488#define	IB(i)	(NDADDR + (i))	/* index of i'th indirect block ptr */
1489#define	SINGLE	0		/* single indirect block ptr */
1490#define	DOUBLE	1		/* double indirect block ptr */
1491#define	TRIPLE	2		/* triple indirect block ptr */
1492
1493/*
1494 * Acquire a write lock, and keep trying till we get it
1495 */
1496static int
1497allocsp_wlockfs(struct vnode *vp, struct lockfs *lf)
1498{
1499	int err = 0;
1500
1501lockagain:
1502	do {
1503		err = ufs_fiolfss(vp, lf);
1504		if (err)
1505			return (err);
1506	} while (!LOCKFS_IS_ULOCK(lf));
1507
1508	lf->lf_lock = LOCKFS_WLOCK;
1509	lf->lf_flags = 0;
1510	lf->lf_comment = NULL;
1511	err = ufs__fiolfs(vp, lf, 1, 0);
1512
1513	if (err == EBUSY || err == EINVAL)
1514		goto lockagain;
1515
1516	return (err);
1517}
1518
1519/*
1520 * Release the write lock
1521 */
1522static int
1523allocsp_unlockfs(struct vnode *vp, struct lockfs *lf)
1524{
1525	int err = 0;
1526
1527	lf->lf_lock = LOCKFS_ULOCK;
1528	lf->lf_flags = 0;
1529	err = ufs__fiolfs(vp, lf, 1, 0);
1530	return (err);
1531}
1532
1533struct allocsp_undo {
1534	daddr_t offset;
1535	daddr_t blk;
1536	struct allocsp_undo *next;
1537};
1538
1539/*
1540 * ufs_allocsp() can be used to pre-allocate blocks for a file on a given
1541 * file system. For direct blocks, the blocks are allocated from the offset
1542 * requested to the block boundary, then any full blocks are allocated,
1543 * and finally any remainder.
1544 * For indirect blocks the blocks are not initialized and are
1545 * only marked as allocated. These addresses are then stored as negative
1546 * block numbers in the inode to imply special handling. UFS has been modified
1547 * where necessary to understand this new notion.
1548 * Successfully fallocated files will have IFALLOCATE cflag set in the inode.
1549 */
1550int
1551ufs_allocsp(struct vnode *vp, struct flock64 *lp, cred_t *cr)
1552{
1553	struct lockfs lf;
1554	int berr, err, resv, issync;
1555	off_t istart, len; /* istart, special for idb */
1556	struct inode *ip;
1557	struct fs *fs;
1558	struct ufsvfs *ufsvfsp;
1559	u_offset_t resid, i, uoff;
1560	daddr32_t db_undo[NDADDR];	/* old direct blocks */
1561	struct allocsp_undo *ib_undo = NULL;	/* ib undo */
1562	struct allocsp_undo *undo = NULL;
1563	u_offset_t osz;			/* old file size */
1564	int chunkblks = 0;		/* # of blocks in 1 allocation */
1565	int cnt = 0;
1566	daddr_t allocblk;
1567	daddr_t totblks = 0;
1568	struct ulockfs	*ulp;
1569	size_t done_len;
1570	int nbytes, offsetn;
1571
1572
1573	ASSERT(vp->v_type == VREG);
1574
1575	ip = VTOI(vp);
1576	fs = ip->i_fs;
1577	if ((ufsvfsp = ip->i_ufsvfs) == NULL) {
1578		err = EIO;
1579		goto out_allocsp;
1580	}
1581
1582	istart = blkroundup(fs, (lp->l_start));
1583	len = blkroundup(fs, (lp->l_len));
1584	chunkblks = blkroundup(fs, ufsvfsp->vfs_iotransz) / fs->fs_bsize;
1585	ulp = &ufsvfsp->vfs_ulockfs;
1586
1587	if (lp->l_start < 0 || lp->l_len <= 0)
1588		return (EINVAL);
1589
1590	/* Quickly check to make sure we have space before we proceed */
1591	if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree) {
1592		if (TRANS_ISTRANS(ufsvfsp)) {
1593			ufs_delete_drain_wait(ufsvfsp, 1);
1594			if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree)
1595				return (ENOSPC);
1596		} else
1597			return (ENOSPC);
1598	}
1599
1600	/*
1601	 * We will keep i_rwlock locked as WRITER through out the function
1602	 * since we don't want anyone else reading or writing to the inode
1603	 * while we are in the middle of fallocating the file.
1604	 */
1605	rw_enter(&ip->i_rwlock, RW_WRITER);
1606
1607	/* Back up the direct block list, used for undo later if necessary */
1608	rw_enter(&ip->i_contents, RW_READER);
1609	for (i = 0; i < NDADDR; i++)
1610		db_undo[i] = ip->i_db[i];
1611	osz = ip->i_size;
1612	rw_exit(&ip->i_contents);
1613
1614	/* Write lock the file system */
1615	if (err = allocsp_wlockfs(vp, &lf))
1616		goto exit;
1617
1618	/*
1619	 * Allocate any direct blocks now.
1620	 * Blocks are allocated from the offset requested to the block
1621	 * boundary, then any full blocks are allocated, and finally any
1622	 * remainder.
1623	 */
1624	if (lblkno(fs, lp->l_start) < NDADDR) {
1625		ufs_trans_trunc_resv(ip, ip->i_size + (NDADDR * fs->fs_bsize),
1626		    &resv, &resid);
1627		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
1628
1629		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1630		rw_enter(&ip->i_contents, RW_WRITER);
1631
1632		done_len = 0;
1633		while ((done_len < lp->l_len) &&
1634		    (lblkno(fs, lp->l_start + done_len) < NDADDR)) {
1635			uoff = (offset_t)(lp->l_start + done_len);
1636			offsetn = (int)blkoff(fs, uoff);
1637			nbytes = (int)MIN(fs->fs_bsize - offsetn,
1638			    lp->l_len - done_len);
1639
1640			berr = bmap_write(ip, uoff, offsetn + nbytes,
1641			    BI_FALLOCATE, &allocblk, cr);
1642			/* Yikes error, quit */
1643			if (berr) {
1644				TRANS_INODE(ufsvfsp, ip);
1645				rw_exit(&ip->i_contents);
1646				rw_exit(&ufsvfsp->vfs_dqrwlock);
1647				TRANS_END_CSYNC(ufsvfsp, err, issync,
1648				    TOP_ALLOCSP, resv);
1649				err = allocsp_unlockfs(vp, &lf);
1650				goto exit;
1651			}
1652
1653			if (allocblk) {
1654				totblks++;
1655				if ((uoff + nbytes) > ip->i_size)
1656					ip->i_size = (uoff + nbytes);
1657			}
1658			done_len += nbytes;
1659		}
1660
1661		TRANS_INODE(ufsvfsp, ip);
1662		rw_exit(&ip->i_contents);
1663		rw_exit(&ufsvfsp->vfs_dqrwlock);
1664		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv);
1665
1666		/* start offset for indirect allocation */
1667		istart =  (uoff + nbytes);
1668	}
1669
1670	/* Break the transactions into vfs_iotransz units */
1671	ufs_trans_trunc_resv(ip, ip->i_size +
1672	    blkroundup(fs, ufsvfsp->vfs_iotransz), &resv, &resid);
1673	TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
1674
1675	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1676	rw_enter(&ip->i_contents, RW_WRITER);
1677
1678	/* Now go about fallocating necessary indirect blocks */
1679	for (i = istart; i < (lp->l_start + lp->l_len); i += fs->fs_bsize) {
1680		berr = bmap_write(ip, i, fs->fs_bsize, BI_FALLOCATE,
1681		    &allocblk, cr);
1682		if (berr) {
1683			TRANS_INODE(ufsvfsp, ip);
1684			rw_exit(&ip->i_contents);
1685			rw_exit(&ufsvfsp->vfs_dqrwlock);
1686			TRANS_END_CSYNC(ufsvfsp, err, issync,
1687			    TOP_ALLOCSP, resv);
1688			err = allocsp_unlockfs(vp, &lf);
1689			goto exit;
1690		}
1691
1692		/* Update the blk counter only if new block was added */
1693		if (allocblk) {
1694			/* Save undo information */
1695			undo = kmem_alloc(sizeof (struct allocsp_undo),
1696			    KM_SLEEP);
1697			undo->offset = i;
1698			undo->blk = allocblk;
1699			undo->next = ib_undo;
1700			ib_undo = undo;
1701			totblks++;
1702
1703			if (i >= ip->i_size)
1704				ip->i_size += fs->fs_bsize;
1705		}
1706		cnt++;
1707
1708		/* Being a good UFS citizen, let others get a share */
1709		if (cnt == chunkblks) {
1710			/*
1711			 * If there are waiters or the fs is hard locked,
1712			 * error locked, or read-only error locked,
1713			 * quit with EIO
1714			 */
1715			if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp) ||
1716			    ULOCKFS_IS_ROELOCK(ulp)) {
1717				ip->i_cflags |= IFALLOCATE;
1718				TRANS_INODE(ufsvfsp, ip);
1719				rw_exit(&ip->i_contents);
1720				rw_exit(&ufsvfsp->vfs_dqrwlock);
1721
1722				TRANS_END_CSYNC(ufsvfsp, err, issync,
1723				    TOP_ALLOCSP, resv);
1724				rw_exit(&ip->i_rwlock);
1725				(void) allocsp_unlockfs(vp, &lf);
1726				return (EIO);
1727			}
1728
1729			TRANS_INODE(ufsvfsp, ip);
1730			rw_exit(&ip->i_contents);
1731			rw_exit(&ufsvfsp->vfs_dqrwlock);
1732
1733			/* End the current transaction */
1734			TRANS_END_CSYNC(ufsvfsp, err, issync,
1735			    TOP_ALLOCSP, resv);
1736
1737			if (CV_HAS_WAITERS(&ulp->ul_cv)) {
1738				/* Release the write lock */
1739				if (err = allocsp_unlockfs(vp, &lf))
1740					goto exit;
1741
1742				/* Wake up others waiting to do operations */
1743				mutex_enter(&ulp->ul_lock);
1744				cv_broadcast(&ulp->ul_cv);
1745				mutex_exit(&ulp->ul_lock);
1746
1747				/* Grab the write lock again */
1748				if (err = allocsp_wlockfs(vp, &lf))
1749					goto exit;
1750			} /* end of CV_HAS_WAITERS(&ulp->ul_cv) */
1751
1752			/* Reserve more space in log for this file */
1753			ufs_trans_trunc_resv(ip,
1754			    ip->i_size + blkroundup(fs, ufsvfsp->vfs_iotransz),
1755			    &resv, &resid);
1756			TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
1757
1758			rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1759			rw_enter(&ip->i_contents, RW_WRITER);
1760
1761			cnt = 0;	/* reset cnt b/c of new transaction */
1762		}
1763	}
1764
1765	if (!err && !berr)
1766		ip->i_cflags |= IFALLOCATE;
1767
1768	/* If the file has grown then correct the file size */
1769	if (osz < (lp->l_start + lp->l_len))
1770		ip->i_size = (lp->l_start + lp->l_len);
1771
1772	/* Release locks, end log transaction and unlock fs */
1773	TRANS_INODE(ufsvfsp, ip);
1774	rw_exit(&ip->i_contents);
1775	rw_exit(&ufsvfsp->vfs_dqrwlock);
1776
1777	TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv);
1778	err = allocsp_unlockfs(vp, &lf);
1779
1780	/*
1781	 * @ exit label, we should no longer be holding the fs write lock, and
1782	 * all logging transactions should have been ended. We still hold
1783	 * ip->i_rwlock.
1784	 */
1785exit:
1786	/*
1787	 * File has grown larger than 2GB. Set flag
1788	 * in superblock to indicate this, if it
1789	 * is not already set.
1790	 */
1791	if ((ip->i_size > MAXOFF32_T) &&
1792	    !(fs->fs_flags & FSLARGEFILES)) {
1793		ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
1794		mutex_enter(&ufsvfsp->vfs_lock);
1795		fs->fs_flags |= FSLARGEFILES;
1796		ufs_sbwrite(ufsvfsp);
1797		mutex_exit(&ufsvfsp->vfs_lock);
1798	}
1799
1800	/*
1801	 * Since we couldn't allocate completely, we will undo the allocations.
1802	 */
1803	if (berr) {
1804		ufs_trans_trunc_resv(ip, totblks * fs->fs_bsize, &resv, &resid);
1805		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
1806
1807		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1808		rw_enter(&ip->i_contents, RW_WRITER);
1809
1810		/* Direct blocks */
1811		for (i = 0; i < NDADDR; i++) {
1812			/*
1813			 * Only free the block if they are not same, and
1814			 * the old one isn't zero (the fragment was
1815			 * re-allocated).
1816			 */
1817			if (db_undo[i] != ip->i_db[i] && db_undo[i] == 0) {
1818				free(ip, ip->i_db[i], fs->fs_bsize, 0);
1819				ip->i_db[i] = 0;
1820			}
1821		}
1822
1823		/* Undo the indirect blocks */
1824		while (ib_undo != NULL) {
1825			undo = ib_undo;
1826			err = bmap_set_bn(vp, undo->offset, 0);
1827			if (err)
1828				cmn_err(CE_PANIC, "ufs_allocsp(): failed to "
1829				    "undo allocation of block %ld",
1830				    undo->offset);
1831			free(ip, undo->blk, fs->fs_bsize, I_IBLK);
1832			ib_undo = undo->next;
1833			kmem_free(undo, sizeof (struct allocsp_undo));
1834		}
1835
1836		ip->i_size = osz;
1837		TRANS_INODE(ufsvfsp, ip);
1838
1839		rw_exit(&ip->i_contents);
1840		rw_exit(&ufsvfsp->vfs_dqrwlock);
1841
1842		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv);
1843
1844		rw_exit(&ip->i_rwlock);
1845		return (berr);
1846	}
1847
1848	/*
1849	 * Don't forget to free the undo chain :)
1850	 */
1851	while (ib_undo != NULL) {
1852		undo = ib_undo;
1853		ib_undo = undo->next;
1854		kmem_free(undo, sizeof (struct allocsp_undo));
1855	}
1856
1857	rw_exit(&ip->i_rwlock);
1858
1859out_allocsp:
1860	return (err);
1861}
1862
1863/*
1864 * Free storage space associated with the specified inode.  The portion
1865 * to be freed is specified by lp->l_start and lp->l_len (already
1866 * normalized to a "whence" of 0).
1867 *
1868 * This is an experimental facility whose continued existence is not
1869 * guaranteed.  Currently, we only support the special case
1870 * of l_len == 0, meaning free to end of file.
1871 *
1872 * Blocks are freed in reverse order.  This FILO algorithm will tend to
1873 * maintain a contiguous free list much longer than FIFO.
1874 * See also ufs_itrunc() in ufs_inode.c.
1875 *
1876 * Bug: unused bytes in the last retained block are not cleared.
1877 * This may result in a "hole" in the file that does not read as zeroes.
1878 */
1879/* ARGSUSED */
1880int
1881ufs_freesp(struct vnode *vp, struct flock64 *lp, int flag, cred_t *cr)
1882{
1883	int i;
1884	struct inode *ip = VTOI(vp);
1885	int error;
1886
1887	ASSERT(vp->v_type == VREG);
1888	ASSERT(lp->l_start >= 0);	/* checked by convoff */
1889
1890	if (lp->l_len != 0)
1891		return (EINVAL);
1892
1893	rw_enter(&ip->i_contents, RW_READER);
1894	if (ip->i_size == (u_offset_t)lp->l_start) {
1895		rw_exit(&ip->i_contents);
1896		return (0);
1897	}
1898
1899	/*
1900	 * Check if there is any active mandatory lock on the
1901	 * range that will be truncated/expanded.
1902	 */
1903	if (MANDLOCK(vp, ip->i_mode)) {
1904		offset_t save_start;
1905
1906		save_start = lp->l_start;
1907
1908		if (ip->i_size < lp->l_start) {
1909			/*
1910			 * "Truncate up" case: need to make sure there
1911			 * is no lock beyond current end-of-file. To
1912			 * do so, we need to set l_start to the size
1913			 * of the file temporarily.
1914			 */
1915			lp->l_start = ip->i_size;
1916		}
1917		lp->l_type = F_WRLCK;
1918		lp->l_sysid = 0;
1919		lp->l_pid = ttoproc(curthread)->p_pid;
1920		i = (flag & (FNDELAY|FNONBLOCK)) ? 0 : SLPFLCK;
1921		rw_exit(&ip->i_contents);
1922		if ((i = reclock(vp, lp, i, 0, lp->l_start, NULL)) != 0 ||
1923		    lp->l_type != F_UNLCK) {
1924			return (i ? i : EAGAIN);
1925		}
1926		rw_enter(&ip->i_contents, RW_READER);
1927
1928		lp->l_start = save_start;
1929	}
1930
1931	/*
1932	 * Make sure a write isn't in progress (allocating blocks)
1933	 * by acquiring i_rwlock (we promised ufs_bmap we wouldn't
1934	 * truncate while it was allocating blocks).
1935	 * Grab the locks in the right order.
1936	 */
1937	rw_exit(&ip->i_contents);
1938	rw_enter(&ip->i_rwlock, RW_WRITER);
1939	error = TRANS_ITRUNC(ip, (u_offset_t)lp->l_start, 0, cr);
1940	rw_exit(&ip->i_rwlock);
1941	return (error);
1942}
1943
1944/*
1945 * Find a cg with as close to nb contiguous bytes as possible
1946 *	THIS MAY TAKE MANY DISK READS!
1947 *
1948 * Implemented in an attempt to allocate contiguous blocks for
1949 * writing the ufs log file to, minimizing future disk head seeking
1950 */
1951daddr_t
1952contigpref(ufsvfs_t *ufsvfsp, size_t nb, size_t minb)
1953{
1954	struct fs	*fs	= ufsvfsp->vfs_fs;
1955	daddr_t		nblk	= lblkno(fs, blkroundup(fs, nb));
1956	daddr_t		minblk	= lblkno(fs, blkroundup(fs, minb));
1957	daddr_t		savebno, curbno, cgbno;
1958	int		cg, cgblks, savecg, savenblk, curnblk, startcg;
1959	uchar_t		*blksfree;
1960	buf_t		*bp;
1961	struct cg	*cgp;
1962
1963	savenblk = 0;
1964	savecg = 0;
1965	savebno = 0;
1966
1967	if ((startcg = findlogstartcg(fs, nblk, minblk)) == -1)
1968		cg = 0;	/* Nothing suitable found */
1969	else
1970		cg = startcg;
1971
1972	for (; cg < fs->fs_ncg; ++cg) {
1973		/*
1974		 * find the largest contiguous range in this cg
1975		 */
1976		bp = UFS_BREAD(ufsvfsp, ufsvfsp->vfs_dev,
1977		    (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
1978		    (int)fs->fs_cgsize);
1979		cgp = bp->b_un.b_cg;
1980		if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
1981			brelse(bp);
1982			continue;
1983		}
1984		blksfree = cg_blksfree(cgp);	    /* free array */
1985		cgblks = fragstoblks(fs, fs->fs_fpg); /* blks in free array */
1986		cgbno = 0;
1987		while (cgbno < cgblks && savenblk < nblk) {
1988			/* find a free block */
1989			for (; cgbno < cgblks; ++cgbno) {
1990				if (isblock(fs, blksfree, cgbno)) {
1991					if (startcg != -1) {
1992						brelse(bp);
1993						savecg = startcg;
1994						savebno = cgbno;
1995						goto done;
1996					} else
1997						break;
1998				}
1999			}
2000			curbno = cgbno;
2001			/* count the number of free blocks */
2002			for (curnblk = 0; cgbno < cgblks; ++cgbno) {
2003				if (!isblock(fs, blksfree, cgbno))
2004					break;
2005				if (++curnblk >= nblk)
2006					break;
2007			}
2008			if (curnblk > savenblk) {
2009				savecg = cg;
2010				savenblk = curnblk;
2011				savebno = curbno;
2012			}
2013		}
2014		brelse(bp);
2015		if (savenblk >= nblk)
2016			break;
2017	}
2018
2019done:
2020
2021	/* convert block offset in cg to frag offset in cg */
2022	savebno = blkstofrags(fs, savebno);
2023
2024	/* convert frag offset in cg to frag offset in fs */
2025	savebno += (savecg * fs->fs_fpg);
2026
2027	return (savebno);
2028}
2029
2030/*
2031 * The object of this routine is to find a start point for the UFS log.
2032 * Ideally the space should be allocated from the smallest possible number
2033 * of contiguous cylinder groups. This is found by using a sliding window
2034 * technique. The smallest window of contiguous cylinder groups, which is
2035 * still able to accommodate the target, is found by moving the window
2036 * through the cylinder groups in a single pass. The end of the window is
2037 * advanced until the space is accommodated, then the start is advanced until
2038 * it no longer fits, the end is then advanced again and so on until the
2039 * final cylinder group is reached. The first suitable instance is recorded
2040 * and its starting cg number is returned.
2041 *
2042 * If we are not able to find a minimum amount of space, represented by
2043 * minblk, or to do so uses more than the available extents, then return -1.
2044 */
2045
2046int
2047findlogstartcg(struct fs *fs, daddr_t requested, daddr_t minblk)
2048{
2049	int	 ncgs;		 /* number of cylinder groups */
2050	daddr_t target;		 /* amount of space sought */
2051	int	 cwidth, ctotal; /* current window width and total */
2052	int	 bwidth, btotal; /* best window width and total so far */
2053	int	 s;	/* index of the first element in the current window */
2054	int	 e;	/* index of the first element + the width */
2055			/*  (i.e. 1 + index of last element) */
2056	int	 bs; /* index of the first element in the best window so far */
2057	int	 header, max_extents;
2058
2059	target = requested;
2060	ncgs = fs->fs_ncg;
2061
2062	header = sizeof (extent_block_t) - sizeof (extent_t);
2063	max_extents = ((fs->fs_bsize)-header) / sizeof (extent_t);
2064	cwidth = ctotal = 0;
2065	btotal = -1;
2066	bwidth = ncgs;
2067	s = e = 0;
2068	while (e < ncgs) {
2069	/* Advance the end of the window until it accommodates the target. */
2070		while (ctotal < target && e < ncgs) {
2071			ctotal += fs->fs_cs(fs, e).cs_nbfree;
2072			e++;
2073		}
2074
2075		/*
2076		 * Advance the start of the window until it no longer
2077		 * accommodates the target.
2078		 */
2079		while (ctotal >= target && s < e) {
2080			/* See if this is the smallest window so far. */
2081			cwidth = e - s;
2082			if (cwidth <= bwidth) {
2083				if (cwidth == bwidth && ctotal <= btotal)
2084					goto more;
2085				bwidth = cwidth;
2086				btotal = ctotal;
2087				bs = s;
2088			}
2089more:
2090			ctotal -= fs->fs_cs(fs, s).cs_nbfree;
2091			s++;
2092		}
2093	}
2094
2095	/*
2096	 * If we cannot allocate the minimum required or we use too many
2097	 * extents to do so, return -1.
2098	 */
2099	if (btotal < minblk || bwidth > max_extents)
2100		bs = -1;
2101
2102	return (bs);
2103}
2104