/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved  	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/bitmap.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/vfs.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/dnlc.h>
#include <sys/mode.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/acl.h>
#include <sys/var.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_acl.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_log.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <sys/swap.h>
#include <sys/cpuvar.h>
#include <sys/sysmacros.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <fs/fs_subr.h>
#include <sys/policy.h>

struct kmem_cache *inode_cache;		/* cache of free inodes */

/* UFS Inode Cache Stats -- Not protected */
struct	instats ins = {
	{ "size",		KSTAT_DATA_ULONG },
	{ "maxsize",		KSTAT_DATA_ULONG },
	{ "hits",		KSTAT_DATA_ULONG },
	{ "misses",		KSTAT_DATA_ULONG },
	{ "kmem allocs",	KSTAT_DATA_ULONG },
	{ "kmem frees",		KSTAT_DATA_ULONG },
	{ "maxsize reached",	KSTAT_DATA_ULONG },
	{ "puts at frontlist",	KSTAT_DATA_ULONG },
	{ "puts at backlist",	KSTAT_DATA_ULONG },
	{ "queues to free",	KSTAT_DATA_ULONG },
	{ "scans",		KSTAT_DATA_ULONG },
	{ "thread idles",	KSTAT_DATA_ULONG },
	{ "lookup idles",	KSTAT_DATA_ULONG },
	{ "vget idles",		KSTAT_DATA_ULONG },
	{ "cache allocs",	KSTAT_DATA_ULONG },
	{ "cache frees",	KSTAT_DATA_ULONG },
	{ "pushes at close",	KSTAT_DATA_ULONG }
};

/* kstat data */
static kstat_t		*ufs_inode_kstat = NULL;

union ihead *ihead;	/* inode LRU cache, Chris Maltby */
kmutex_t *ih_lock;	/* protect inode cache hash table */
static int ino_hashlen = 4;	/* desired average hash chain length */
int inohsz;		/* number of buckets in the hash table */

kmutex_t	ufs_scan_lock;	/* stop racing multiple ufs_scan_inodes() */
kmutex_t	ufs_iuniqtime_lock; /* protect iuniqtime */
kmutex_t	ufsvfs_mutex;
struct ufsvfs	*oldufsvfslist, *ufsvfslist;

/*
 * time to wait after ufsvfsp->vfs_iotstamp before declaring that no
 * I/Os are going on.
 */
clock_t	ufs_iowait;

/*
 * the threads that process idle inodes and free (deleted) inodes
 * have high water marks that are set in ufs_iinit().
 * These values can be no less than the minimum shown below.
 */
int	ufs_idle_max;	/* # of allowable idle inodes */
ulong_t	ufs_inode_max;	/* hard limit of allowable idle inodes */
#define	UFS_IDLE_MAX	(16)	/* min # of allowable idle inodes */

/*
 * Tunables for ufs write throttling.
 * These are validated in ufs_iinit() since improper settings
 * can lead to filesystem hangs.
 */
#define	UFS_HW_DEFAULT	(16 * 1024 * 1024)
#define	UFS_LW_DEFAULT	(8 * 1024 * 1024)
int	ufs_HW = UFS_HW_DEFAULT;
int	ufs_LW = UFS_LW_DEFAULT;

static void ihinit(void);
extern int hash2ints(int, int);

static int ufs_iget_internal(struct vfs *, ino_t, struct inode **,
    struct cred *, int);

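/*
 * kstat update routine for the "inode_cache" kstat; rejects writes and
 * refreshes the exported statistics from the kmem cache counters.
 */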
/* ARGSUSED */
static int
ufs_inode_kstat_update(kstat_t *ksp, int rw)
{
	if (rw == KSTAT_WRITE)
		return (EACCES);

	ins.in_malloc.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "slab_alloc");
	ins.in_mfree.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "slab_free");
	ins.in_kcalloc.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "alloc");
	ins.in_kcfree.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "free");
	ins.in_size.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "buf_inuse");
	ins.in_maxreached.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
	    "buf_max");
	ins.in_misses.value.ul = ins.in_kcalloc.value.ul;

	return (0);
}

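/*
 * One-time UFS initialization: validate the write-throttle tunables,
 * size the inode cache and its hash table, start the idle and hlock
 * threads, and install the inode cache kstat.
 */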
void
ufs_iinit(void)
{
	/*
	 * Validate that ufs_HW > ufs_LW.
	 * The default values for these two tunables have been increased.
	 * There is a range of ufs_HW values that was legal on previous
	 * Solaris versions but no longer is.
	 * Upgrading a machine which has an /etc/system setting for ufs_HW
	 * from that range can lead to filesystem hangs unless the values
	 * are checked here.
	 */
	if (ufs_HW <= ufs_LW) {
		cmn_err(CE_WARN,
		    "ufs_HW (%d) <= ufs_LW (%d). Check /etc/system.",
		    ufs_HW, ufs_LW);
		ufs_LW = UFS_LW_DEFAULT;
		ufs_HW = UFS_HW_DEFAULT;
		cmn_err(CE_CONT, "using defaults, ufs_HW = %d, ufs_LW = %d\n",
		    ufs_HW, ufs_LW);
	}

	/*
	 * Adjust the tunable `ufs_ninode' to a reasonable value
	 */
	if (ufs_ninode <= 0)
		ufs_ninode = ncsize;
	if (ufs_inode_max == 0)
		ufs_inode_max =
		    (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct inode));
	if (ufs_ninode > ufs_inode_max || (ufs_ninode == 0 && ncsize == 0)) {
		cmn_err(CE_NOTE, "setting ufs_ninode to max value of %ld",
		    ufs_inode_max);
		ufs_ninode = ufs_inode_max;
	}
	/*
	 * Wait till third call of ufs_update to declare that no I/Os are
	 * going on. This allows deferred access times to be flushed to disk.
	 */
	ufs_iowait = v.v_autoup * hz * 2;

	/*
	 * idle thread runs when 25% of ufs_ninode entries are on the queue
	 */
	if (ufs_idle_max == 0)
		ufs_idle_max = ufs_ninode >> 2;
	if (ufs_idle_max < UFS_IDLE_MAX)
		ufs_idle_max = UFS_IDLE_MAX;
	if (ufs_idle_max > ufs_ninode)
		ufs_idle_max = ufs_ninode;
	/*
	 * This is really a misnomer, it is ufs_queue_init
	 */
	ufs_thread_init(&ufs_idle_q, ufs_idle_max);
	ufs_thread_start(&ufs_idle_q, ufs_thread_idle, NULL);

	/*
	 * global hlock thread
	 */
	ufs_thread_init(&ufs_hlock, 1);
	ufs_thread_start(&ufs_hlock, ufs_thread_hlock, NULL);

	ihinit();
	qtinit();
	ins.in_maxsize.value.ul = ufs_ninode;
	if ((ufs_inode_kstat = kstat_create("ufs", 0, "inode_cache", "ufs",
	    KSTAT_TYPE_NAMED, sizeof (ins) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL)) != NULL) {
		ufs_inode_kstat->ks_data = (void *)&ins;
		ufs_inode_kstat->ks_update = ufs_inode_kstat_update;
		kstat_install(ufs_inode_kstat);
	}
	ufsfx_init();		/* fix-on-panic initialization */
	si_cache_init();
	ufs_directio_init();
	lufs_init();
	mutex_init(&ufs_iuniqtime_lock, NULL, MUTEX_DEFAULT, NULL);
}

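/*
 * kmem cache constructor: allocate the paired vnode and initialize the
 * locks, dnlc anchor, and condition variable that live for as long as
 * the inode stays in the cache.
 */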
/* ARGSUSED */
static int
ufs_inode_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct inode *ip = buf;
	struct vnode *vp;

	vp = ip->i_vnode = vn_alloc(kmflags);
	if (vp == NULL) {
		return (-1);
	}
	vn_setops(vp, ufs_vnodeops);
	vp->v_data = ip;

	rw_init(&ip->i_rwlock, NULL, RW_DEFAULT, NULL);
	rw_init(&ip->i_contents, NULL, RW_DEFAULT, NULL);
	mutex_init(&ip->i_tlock, NULL, MUTEX_DEFAULT, NULL);
	dnlc_dir_init(&ip->i_danchor);

	cv_init(&ip->i_wrcv, NULL, CV_DRIVER, NULL);

	return (0);
}

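/*
 * kmem cache destructor: undo the constructor; tear down the locks and
 * free the paired vnode.
 */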
/* ARGSUSED */
static void
ufs_inode_cache_destructor(void *buf, void *cdrarg)
{
	struct inode *ip = buf;
	struct vnode *vp;

	vp = ITOV(ip);

	rw_destroy(&ip->i_rwlock);
	rw_destroy(&ip->i_contents);
	mutex_destroy(&ip->i_tlock);
	if (vp->v_type == VDIR) {
		dnlc_dir_fini(&ip->i_danchor);
	}

	cv_destroy(&ip->i_wrcv);

	vn_free(vp);
}

/*
 * Initialize hash links for inodes
 * and build inode free list.
 */
void
ihinit(void)
{
	int i;
	union	ihead *ih = ihead;

	mutex_init(&ufs_scan_lock, NULL, MUTEX_DEFAULT, NULL);

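	/*
	 * Size the hash table to a power of two so that hash chains
	 * average roughly the desired ino_hashlen entries each.
	 */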
	inohsz = 1 << highbit(ufs_ninode / ino_hashlen);
	ihead = kmem_zalloc(inohsz * sizeof (union ihead), KM_SLEEP);
	ih_lock = kmem_zalloc(inohsz * sizeof (kmutex_t), KM_SLEEP);

	for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
		ih->ih_head[0] = ih;
		ih->ih_head[1] = ih;
		mutex_init(&ih_lock[i], NULL, MUTEX_DEFAULT, NULL);
	}
	inode_cache = kmem_cache_create("ufs_inode_cache",
	    sizeof (struct inode), 0, ufs_inode_cache_constructor,
	    ufs_inode_cache_destructor, ufs_inode_cache_reclaim,
	    NULL, NULL, 0);
}

/*
 * Free an inode structure
 */
void
ufs_free_inode(struct inode *ip)
{
	vn_invalid(ITOV(ip));
	kmem_cache_free(inode_cache, ip);
}

/*
 * Allocate an inode structure
 */
struct inode *
ufs_alloc_inode(ufsvfs_t *ufsvfsp, ino_t ino)
{
	struct inode *ip;
	vnode_t *vp;

	ip = kmem_cache_alloc(inode_cache, KM_SLEEP);
	/*
	 * at this point we have a newly allocated inode
	 */
	ip->i_freef = ip;
	ip->i_freeb = ip;
	ip->i_flag = IREF;
	ip->i_seq = 0xFF;	/* Unique initial value */
	ip->i_dev = ufsvfsp->vfs_dev;
	ip->i_ufsvfs = ufsvfsp;
	ip->i_devvp = ufsvfsp->vfs_devvp;
	ip->i_number = ino;
	ip->i_diroff = 0;
	ip->i_nextr = 0;
	ip->i_map = NULL;
	ip->i_rdev = 0;
	ip->i_writes = 0;
	ip->i_mode = 0;
	ip->i_delaylen = 0;
	ip->i_delayoff = 0;
	ip->i_nextrio = 0;
	ip->i_ufs_acl = NULL;
	ip->i_cflags = 0;
	ip->i_mapcnt = 0;
	ip->i_dquot = NULL;
	ip->i_cachedir = CD_ENABLED;
	ip->i_writer = NULL;

	/*
	 * the vnode for this inode was allocated by the constructor
	 */
	vp = ITOV(ip);
	vn_reinit(vp);
	if (ino == (ino_t)UFSROOTINO)
		vp->v_flag = VROOT;
	vp->v_vfsp = ufsvfsp->vfs_vfs;
	vn_exists(vp);
	return (ip);
}

/*
 * Look up an inode by device, inumber.  If it is in core (in the
 * inode structure), honor the locking protocol.  If it is not in
 * core, read it in from the specified device after freeing any pages.
 * In all cases, a pointer to a VN_HELD inode structure is returned.
 */
int
ufs_iget(struct vfs *vfsp, ino_t ino, struct inode **ipp, struct cred *cr)
{
	return (ufs_iget_internal(vfsp, ino, ipp, cr, 0));
}

/*
 * A version of ufs_iget which returns only allocated, linked inodes.
 * This is appropriate for any callers who do not expect a free inode.
 */
int
ufs_iget_alloced(struct vfs *vfsp, ino_t ino, struct inode **ipp,
    struct cred *cr)
{
	return (ufs_iget_internal(vfsp, ino, ipp, cr, 1));
}

/*
 * Set vnode attributes based on v_type; this should be called whenever
 * an inode's i_mode is changed.
 */
void
ufs_reset_vnode(vnode_t *vp)
{
	/*
	 * an old DBE hack
	 */
	if ((VTOI(vp)->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX)
		vp->v_flag |= VSWAPLIKE;
	else
		vp->v_flag &= ~VSWAPLIKE;

	/*
	 * if not swap like and it's just a regular file, we want
	 * to maintain the vnode's pages sorted by clean/modified
	 * for faster sync'ing to disk
	 */
	if (vp->v_type == VREG)
		vp->v_flag |= VMODSORT;
	else
		vp->v_flag &= ~VMODSORT;

	/*
	 * Is this an attribute hidden dir?
	 */
	if ((VTOI(vp)->i_mode & IFMT) == IFATTRDIR)
		vp->v_flag |= V_XATTRDIR;
	else
		vp->v_flag &= ~V_XATTRDIR;
}

/*
 * Shared implementation of ufs_iget and ufs_iget_alloced.  The 'validate'
 * flag is used to distinguish the two; when true, we validate that the inode
 * being retrieved looks like a linked and allocated inode.
 */
/* ARGSUSED */
static int
ufs_iget_internal(struct vfs *vfsp, ino_t ino, struct inode **ipp,
    struct cred *cr, int validate)
{
	struct inode *ip, *sp;
	union ihead *ih;
	kmutex_t *ihm;
	struct buf *bp;
	struct dinode *dp;
	struct vnode *vp;
	extern vfs_t EIO_vfs;
	int error;
	int ftype;	/* XXX - Remove later on */
	dev_t vfs_dev;
	struct ufsvfs *ufsvfsp;
	struct fs *fs;
	int hno;
	daddr_t bno;
	ulong_t ioff;

	CPU_STATS_ADD_K(sys, ufsiget, 1);

	/*
	 * Lookup inode in cache.
	 */
	vfs_dev = vfsp->vfs_dev;
	hno = INOHASH(ino);
	ih = &ihead[hno];
	ihm = &ih_lock[hno];

again:
	mutex_enter(ihm);
	for (ip = ih->ih_chain[0]; ip != (struct inode *)ih; ip = ip->i_forw) {
		if (ino != ip->i_number || vfs_dev != ip->i_dev ||
		    (ip->i_flag & ISTALE))
			continue;

		/*
		 * Found the interesting inode; hold it and drop the cache lock
		 */
		vp = ITOV(ip);	/* for locknest */
		VN_HOLD(vp);
		mutex_exit(ihm);
		rw_enter(&ip->i_contents, RW_READER);

		/*
		 * if necessary, remove from idle list
		 */
		if ((ip->i_flag & IREF) == 0) {
			if (ufs_rmidle(ip))
				VN_RELE(vp);
		}

		/*
		 * Could the inode be read from disk?
		 */
		if (ip->i_flag & ISTALE) {
			rw_exit(&ip->i_contents);
			VN_RELE(vp);
			goto again;
		}

		ins.in_hits.value.ul++;
		*ipp = ip;

		/*
		 * Reset the vnode's attribute flags
		 */
		mutex_enter(&vp->v_lock);
		ufs_reset_vnode(vp);
		mutex_exit(&vp->v_lock);

		rw_exit(&ip->i_contents);

		return (0);
	}
	mutex_exit(ihm);

	/*
	 * Inode was not in cache.
	 *
	 * Allocate a new entry
	 */
	ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	fs = ufsvfsp->vfs_fs;

	ip = ufs_alloc_inode(ufsvfsp, ino);
	vp = ITOV(ip);

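	/*
	 * Compute the on-disk location of the dinode: the disk address
	 * of the filesystem block that holds it, plus its byte offset
	 * within that block.
	 */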
	bno = fsbtodb(fs, itod(fs, ino));
	ioff = (sizeof (struct dinode)) * (itoo(fs, ino));
	ip->i_doff = (offset_t)ioff + ldbtob(bno);

	/*
	 * put a place holder in the cache (if not already there)
	 */
	mutex_enter(ihm);
	for (sp = ih->ih_chain[0]; sp != (struct inode *)ih; sp = sp->i_forw)
		if (ino == sp->i_number && vfs_dev == sp->i_dev &&
		    ((sp->i_flag & ISTALE) == 0)) {
			mutex_exit(ihm);
			ufs_free_inode(ip);
			goto again;
		}
	/*
	 * It would be nice to ASSERT(RW_READ_HELD(&ufsvfsp->vfs_dqrwlock))
	 * here, but if we do, then shadow inode allocations panic the
	 * system.  We don't have to hold vfs_dqrwlock for shadow inodes
	 * and the ufs_iget() parameters don't tell us what we are getting
	 * so we have no way of knowing this is a ufs_iget() call from
	 * a ufs_ialloc() call for a shadow inode.
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	insque(ip, ih);
	mutex_exit(ihm);
	/*
	 * read the dinode
	 */
	bp = UFS_BREAD(ufsvfsp, ip->i_dev, bno, (int)fs->fs_bsize);

	/*
	 * Check I/O errors
	 */
	error = ((bp->b_flags & B_ERROR) ? geterror(bp) : 0);
	if (error) {
		brelse(bp);
		ip->i_flag |= ISTALE;	/* in case someone is looking it up */
		rw_exit(&ip->i_contents);
		vp->v_vfsp = &EIO_vfs;
		VN_RELE(vp);
		return (error);
	}
	/*
	 * initialize the inode's dinode
	 */
	dp = (struct dinode *)(ioff + bp->b_un.b_addr);
	ip->i_ic = dp->di_ic;			/* structure assignment */
	brelse(bp);

	/*
	 * Maintain compatibility with Solaris 1.x UFS
	 */
	if (ip->i_suid != UID_LONG)
		ip->i_uid = ip->i_suid;
	if (ip->i_sgid != GID_LONG)
		ip->i_gid = ip->i_sgid;

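	/*
	 * For special files, expand the on-disk device number; the top
	 * 16 bits of i_ordev distinguish the old 16-bit encoding from
	 * the 32-bit one.
	 */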
	ftype = ip->i_mode & IFMT;
	if (ftype == IFBLK || ftype == IFCHR) {
		dev_t dv;
		uint_t top16 = ip->i_ordev & 0xffff0000u;

		if (top16 == 0 || top16 == 0xffff0000u)
			dv = expdev(ip->i_ordev);
		else
			dv = expldev(ip->i_ordev);
		vp->v_rdev = ip->i_rdev = dv;
	}

	/*
	 * if our caller only expects allocated inodes, verify that
	 * this inode looks good; throw it out if it's bad.
	 */
	if (validate) {
		if ((ftype == 0) || (ip->i_nlink <= 0)) {
			ip->i_flag |= ISTALE;
			rw_exit(&ip->i_contents);
			vp->v_vfsp = &EIO_vfs;
			VN_RELE(vp);
			cmn_err(CE_NOTE,
			    "%s: unexpected free inode %d, run fsck(1M)%s",
			    fs->fs_fsmnt, (int)ino,
			    (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
			return (EIO);
		}
	}

	/*
	 * Finish initializing the vnode.  Shadow inodes need special
	 * handling: IFTOVT() would produce a v_type of VNON, which is
	 * not what we want, so set v_type to VREG explicitly in that
	 * case.
	 */
	if (ftype == IFSHAD) {
		vp->v_type = VREG;
	} else {
		vp->v_type = IFTOVT((mode_t)ip->i_mode);
	}

	ufs_reset_vnode(vp);

	/*
	 * read the shadow
	 */
	if (ftype != 0 && ip->i_shadow != 0) {
		if ((error = ufs_si_load(ip, cr)) != 0) {
			ip->i_flag |= ISTALE;
			ip->i_ufs_acl = NULL;
			rw_exit(&ip->i_contents);
			vp->v_vfsp = &EIO_vfs;
			VN_RELE(vp);
			return (error);
		}
	}

	/*
	 * Only attach quota information if the inode has a type and if
	 * that type is not a shadow inode or an extended attribute
	 * directory.
	 */
	if (ip->i_mode && ((ip->i_mode & IFMT) != IFSHAD) &&
	    ((ip->i_mode & IFMT) != IFATTRDIR)) {
		ip->i_dquot = getinoquota(ip);
	}
	TRANS_MATA_IGET(ufsvfsp, ip);
	*ipp = ip;
	rw_exit(&ip->i_contents);

	return (0);
}

/*
 * Vnode is no longer referenced, write the inode out
 * and if necessary, truncate and deallocate the file.
 */
void
ufs_iinactive(struct inode *ip)
{
	int		front;
	struct inode	*iq;
	struct inode	*hip;
	struct ufs_q	*uq;
	struct vnode	*vp = ITOV(ip);
	struct ufsvfs   *ufsvfsp = ip->i_ufsvfs;
	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;

	/*
	 * Because the vnode type might have been changed,
	 * the dnlc_dir_purge must be called unconditionally.
	 */
	dnlc_dir_purge(&ip->i_danchor);

	/*
	 * Get exclusive access to inode data.
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	ASSERT(ip->i_flag & IREF);

	/*
	 * Make sure no one reclaimed the inode before we put it on
	 * the freelist or destroy it. We keep our 'hold' on the vnode
	 * from vn_rele until we are ready to do something with the inode.
	 *
	 * Pageout may put a VN_HOLD/VN_RELE at any time during this
	 * operation via an async putpage, so we must make sure
	 * we don't free/destroy the inode more than once. ufs_iget
	 * may also put a VN_HOLD on the inode before it grabs
	 * the i_contents lock. This is done so we don't free
	 * an inode that a thread is waiting on.
	 */
	mutex_enter(&vp->v_lock);

	if (vp->v_count > 1) {
		VN_RELE_LOCKED(vp);
		mutex_exit(&vp->v_lock);
		rw_exit(&ip->i_contents);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * For umount case: if ufsvfs ptr is NULL, the inode is unhashed
	 * and clean.  It can be safely destroyed (cyf).
	 */
	if (ip->i_ufsvfs == NULL) {
		rw_exit(&ip->i_contents);
		ufs_si_del(ip);
		ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp));
		ufs_free_inode(ip);
		return;
	}

	/*
	 * queue idle inode to appropriate thread. Will check v_count == 1
	 * prior to putting this on the appropriate queue.
	 * Stale inodes will be unhashed and freed by the ufs idle thread
	 * in ufs_idle_free()
	 */
	front = 1;
	if ((ip->i_flag & ISTALE) == 0 && ip->i_fs->fs_ronly == 0 &&
	    ip->i_mode && ip->i_nlink <= 0) {
		/*
		 * Mark the i_flag to indicate that inode is being deleted.
		 * This flag will be cleared when the deletion is complete.
		 * This prevents nfs from sneaking in via ufs_vget() while
		 * the delete is in progress (bugid 1242481).
		 */
		ip->i_flag |= IDEL;

		/*
		 * NOIDEL means that deletes are not allowed at this time;
		 * whoever resets NOIDEL will also send this inode back
		 * through ufs_iinactive.  IREF remains set.
		 */
		if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) {
			mutex_enter(&vp->v_lock);
			VN_RELE_LOCKED(vp);
			mutex_exit(&vp->v_lock);
			rw_exit(&ip->i_contents);
			return;
		}
		if (!TRANS_ISTRANS(ip->i_ufsvfs)) {
			rw_exit(&ip->i_contents);
			ufs_delete(ip->i_ufsvfs, ip, 0);
			return;
		}

		/* queue to delete thread; IREF remains set */
		ins.in_qfree.value.ul++;
		uq = &ip->i_ufsvfs->vfs_delete;

		mutex_enter(&uq->uq_mutex);

		/* add to q */
		if ((iq = uq->uq_ihead) != 0) {
			ip->i_freef = iq;
			ip->i_freeb = iq->i_freeb;
			iq->i_freeb->i_freef = ip;
			iq->i_freeb = ip;
			if (front)
				uq->uq_ihead = ip;
		} else {
			uq->uq_ihead = ip;
			ip->i_freef = ip;
			ip->i_freeb = ip;
		}

		delq_info->delq_unreclaimed_files += 1;
		delq_info->delq_unreclaimed_blocks += ip->i_blocks;
	} else {
		/*
		 * queue to idle thread
		 * Check the v_count == 1 again.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			VN_RELE_LOCKED(vp);
			mutex_exit(&vp->v_lock);
			rw_exit(&ip->i_contents);
			return;
		}
		mutex_exit(&vp->v_lock);
		uq = &ufs_idle_q;

		/*
		 * useful iff it has pages or is a fastsymlink; otherwise junk
		 */
		mutex_enter(&uq->uq_mutex);

		/* clear IREF means `on idle list' */
		ip->i_flag &= ~(IREF | IDIRECTIO);

		if (vn_has_cached_data(vp) || ip->i_flag & IFASTSYMLNK) {
			ins.in_frback.value.ul++;
			hip = (inode_t *)&ufs_useful_iq[IQHASH(ip)];
			ufs_nuseful_iq++;
		} else {
			ins.in_frfront.value.ul++;
			hip = (inode_t *)&ufs_junk_iq[IQHASH(ip)];
			ip->i_flag |= IJUNKIQ;
			ufs_njunk_iq++;
		}
		ip->i_freef = hip;
		ip->i_freeb = hip->i_freeb;
		hip->i_freeb->i_freef = ip;
		hip->i_freeb = ip;
	}

	/* wakeup thread(s) if q is overfull */
	if (++uq->uq_ne == uq->uq_lowat)
		cv_broadcast(&uq->uq_cv);

	/* all done, release the q and inode */
	mutex_exit(&uq->uq_mutex);
	rw_exit(&ip->i_contents);
}

/*
 * Check accessed and update flags on an inode structure.
 * If any are on, update the inode with the (unique) current time.
 * If waitfor is set, wait for the write to complete in order to
 * ensure I/O ordering.
 */
void
ufs_iupdat(struct inode *ip, int waitfor)
{
	struct buf	*bp;
	struct fs	*fp;
	struct dinode	*dp;
	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
	int		i;
	int		do_trans_times;
	ushort_t	flag;
	o_uid_t		suid;
	o_gid_t		sgid;

	/*
	 * This function is now safe to be called with either the reader
	 * or writer i_contents lock.
	 */
	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	/*
	 * Return if file system has been forcibly umounted.
	 */
	if (ufsvfsp == NULL)
		return;

	flag = ip->i_flag;	/* Atomic read */
	/*
	 * We better not update the disk inode from a stale inode.
	 */
	if (flag & ISTALE)
		return;

	fp = ip->i_fs;

	if ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) != 0) {
		if (fp->fs_ronly) {
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
			mutex_exit(&ip->i_tlock);
			return;
		}
		/*
		 * fs is active while metadata is being written
		 */
		mutex_enter(&ufsvfsp->vfs_lock);
		ufs_notclean(ufsvfsp);
		/*
		 * get the dinode
		 */
		bp = UFS_BREAD(ufsvfsp, ip->i_dev,
		    (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)),
		    (int)fp->fs_bsize);
		if (bp->b_flags & B_ERROR) {
			mutex_enter(&ip->i_tlock);
			ip->i_flag &=
			    ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
			mutex_exit(&ip->i_tlock);
			brelse(bp);
			return;
		}
		/*
		 * munge inode fields
		 */
		mutex_enter(&ip->i_tlock);
		ITIMES_NOLOCK(ip);
		do_trans_times = ((ip->i_flag & (IMOD|IMODACC)) == IMODACC);
		ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
		mutex_exit(&ip->i_tlock);

		/*
		 * For reads and concurrent re-writes, no deltas were
		 * entered for the access time changes - do it now.
		 */
		if (do_trans_times) {
			TRANS_INODE_TIMES(ufsvfsp, ip);
		}

		/*
		 * For SunOS 5.0->5.4, these lines below read:
		 *
		 * suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
		 * sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
		 *
		 * where MAXUID was set to 60002.  This was incorrect -
		 * the uids should have been constrained to what fitted into
		 * a 16-bit word.
		 *
		 * This means that files from 4.x filesystems that have an
		 * i_suid field larger than 60002 will have that field
		 * changed to 65535.
		 *
		 * Security note: 4.x UFS could never create a i_suid of
		 * UID_LONG since that would've corresponded to -1.
		 */
		suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ?
		    UID_LONG : ip->i_uid;
		sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ?
		    GID_LONG : ip->i_gid;

		if ((ip->i_suid != suid) || (ip->i_sgid != sgid)) {
			ip->i_suid = suid;
			ip->i_sgid = sgid;
			TRANS_INODE(ufsvfsp, ip);
		}

		if ((ip->i_mode & IFMT) == IFBLK ||
		    (ip->i_mode & IFMT) == IFCHR) {
			dev_t d = ip->i_rdev;
			dev32_t dev32;

			/*
			 * load first direct block only if special device
			 */
			if (!cmpldev(&dev32, d)) {
				/*
				 * We panic here because there's "no way"
				 * we should have been able to create a large
				 * inode with a large dev_t.  Earlier layers
				 * should've caught this.
				 */
				panic("ip %p: i_rdev too big", (void *)ip);
			}

			if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
				ip->i_ordev = dev32;	/* can't use old fmt. */
			} else {
				ip->i_ordev = cmpdev(d);
			}
		}

		/*
		 * copy inode to dinode (zero fastsymlnk in dinode)
		 */
		dp = (struct dinode *)bp->b_un.b_addr + itoo(fp, ip->i_number);
		dp->di_ic = ip->i_ic;	/* structure assignment */
		if (flag & IFASTSYMLNK) {
			for (i = 1; i < NDADDR; i++)
				dp->di_db[i] = 0;
			for (i = 0; i < NIADDR; i++)
				dp->di_ib[i] = 0;
		}
		if (TRANS_ISTRANS(ufsvfsp)) {
			/*
			 * Pass only a sector size buffer containing
			 * the inode, otherwise when the buffer is copied
			 * into a cached roll buffer then too much memory
			 * gets consumed if 8KB inode buffers are passed.
			 */
			TRANS_LOG(ufsvfsp, (caddr_t)dp, ip->i_doff,
			    sizeof (struct dinode),
			    (caddr_t)P2ALIGN((uintptr_t)dp, DEV_BSIZE),
			    DEV_BSIZE);

			brelse(bp);
		} else if (waitfor && (ip->i_ufsvfs->vfs_dio == 0)) {
			UFS_BRWRITE(ufsvfsp, bp);

			/*
			 * Synchronous write has guaranteed that inode
			 * has been written on disk so clear the flag
			 */
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IBDWRITE;
			mutex_exit(&ip->i_tlock);
		} else {
			bdrwrite(bp);

			/*
			 * This write hasn't guaranteed that inode has been
			 * written on the disk.
			 * Since all update flags on the inode are cleared,
			 * we must remember the condition in case the inode
			 * is to be updated synchronously later (e.g.,
			 * fsync()/fdatasync()) and the inode has not been
			 * modified yet.
			 */
			mutex_enter(&ip->i_tlock);
			ip->i_flag |= IBDWRITE;
			mutex_exit(&ip->i_tlock);
		}
	} else {
		/*
		 * In case previous inode update was done asynchronously
		 * (IBDWRITE) and this inode update request wants guaranteed
		 * (synchronous) disk update, flush the inode.
		 */
		if (waitfor && (flag & IBDWRITE)) {
			blkflush(ip->i_dev,
			    (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)));
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IBDWRITE;
			mutex_exit(&ip->i_tlock);
		}
	}
}

#define	SINGLE	0	/* index of single indirect block */
#define	DOUBLE	1	/* index of double indirect block */
#define	TRIPLE	2	/* index of triple indirect block */

/*
 * Release blocks associated with the inode ip and
 * stored in the indirect block bn.  Blocks are free'd
 * in LIFO order up to (but not including) lastbn.  If
 * level is greater than SINGLE, the block is an indirect
 * block and recursive calls to indirtrunc must be used to
 * cleanse other indirect blocks.
 *
 * N.B.: triple indirect blocks are untested.
 */
static long
indirtrunc(struct inode *ip, daddr_t bn, daddr_t lastbn, int level, int flags)
{
	int i;
	struct buf *bp, *copy;
	daddr32_t *bap;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;
	daddr_t nb, last;
	long factor;
	int blocksreleased = 0, nblocks;

	ASSERT(RW_WRITE_HELD(&ip->i_contents));
	/*
	 * Calculate index in current block of last
	 * block to be kept.  -1 indicates the entire
	 * block so we need not calculate the index.
	 */
	factor = 1;
	for (i = SINGLE; i < level; i++)
		factor *= NINDIR(fs);
	last = lastbn;
	if (lastbn > 0)
		last /= factor;
	nblocks = btodb(fs->fs_bsize);
	/*
	 * Get buffer of block pointers, zero those
	 * entries corresponding to blocks to be free'd,
	 * and update on disk copy first.
	 * *Unless* the root pointer has been synchronously
	 * written to disk.  If nothing points to this
	 * indirect block then don't bother zero'ing and
	 * writing it.
	 */
	bp = UFS_BREAD(ufsvfsp,
	    ip->i_dev, (daddr_t)fsbtodb(fs, bn), (int)fs->fs_bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (0);
	}
	bap = bp->b_un.b_daddr;
	if ((flags & I_CHEAP) == 0) {
		uint_t	zb;

		zb = (uint_t)((NINDIR(fs) - (last + 1)) * sizeof (daddr32_t));

		if (zb) {
			/*
			 * push any data into the log before we zero it
			 */
			if (bp->b_flags & B_DELWRI)
				TRANS_LOG(ufsvfsp, (caddr_t)bap,
				    ldbtob(bp->b_blkno), bp->b_bcount,
				    bp->b_un.b_addr, bp->b_bcount);
			copy = ngeteblk(fs->fs_bsize);
			bcopy((caddr_t)bap, (caddr_t)copy->b_un.b_daddr,
			    (uint_t)fs->fs_bsize);
			bzero((caddr_t)&bap[last + 1], zb);

			TRANS_BUF(ufsvfsp,
			    (caddr_t)&bap[last + 1] - (caddr_t)bap,
			    zb, bp, DT_ABZERO);

			UFS_BRWRITE(ufsvfsp, bp);
			bp = copy, bap = bp->b_un.b_daddr;
		}
	} else {
		/* make sure write retries are also cleared */
		bp->b_flags &= ~(B_DELWRI | B_RETRYWRI);
		bp->b_flags |= B_STALE | B_AGE;
	}

	/*
	 * Recursively free totally unused blocks.
	 */
	flags |= I_CHEAP;
	for (i = NINDIR(fs) - 1; i > last; i--) {
		nb = bap[i];
		if (nb == 0)
			continue;
		if (level > SINGLE) {
			blocksreleased +=
			    indirtrunc(ip, nb, (daddr_t)-1, level - 1, flags);
			free(ip, nb, (off_t)fs->fs_bsize, flags | I_IBLK);
		} else
			free(ip, nb, (off_t)fs->fs_bsize, flags);
		blocksreleased += nblocks;
	}
	flags &= ~I_CHEAP;

	/*
	 * Recursively free last partial block.
	 */
	if (level > SINGLE && lastbn >= 0) {
		last = lastbn % factor;
		nb = bap[i];
		if (nb != 0)
			blocksreleased +=
			    indirtrunc(ip, nb, last, level - 1, flags);
	}
	brelse(bp);
	return (blocksreleased);
}

/*
 * Truncate the inode ip to at most length size.
 * Free affected disk blocks -- the blocks of the
 * file are removed in reverse order.
 *
 * N.B.: triple indirect blocks are untested.
 */
static int i_genrand = 1234;
int
ufs_itrunc(struct inode *oip, u_offset_t length, int flags, cred_t *cr)
{
	struct fs *fs = oip->i_fs;
	struct ufsvfs *ufsvfsp = oip->i_ufsvfs;
	struct inode *ip;
	daddr_t lastblock;
	off_t bsize;
	int boff;
	daddr_t bn, lastiblock[NIADDR];
	int level;
	long nblocks, blocksreleased = 0;
	int i;
	ushort_t mode;
	struct inode tip;
	int err;
	u_offset_t maxoffset = (ufsvfsp->vfs_lfflags & UFS_LARGEFILES) ?
	    (UFS_MAXOFFSET_T) : (MAXOFF32_T);

	/*
	 * Shadow inodes do not need to hold the vfs_dqrwlock lock. Most
	 * other uses need the reader lock. opendq() holds the writer lock.
	 */
	ASSERT((oip->i_mode & IFMT) == IFSHAD ||
	    RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock));
	ASSERT(RW_WRITE_HELD(&oip->i_contents));
	/*
	 * We only allow truncation of regular files and directories
	 * to arbitrary lengths here.  In addition, we allow symbolic
	 * links to be truncated only to zero length.  Other inode
	 * types cannot have their length set here.  Disk blocks are
	 * being dealt with - especially device inodes where
	 * ip->i_ordev is actually being stored in ip->i_db[0]!
	 */
	TRANS_INODE(ufsvfsp, oip);
	mode = oip->i_mode & IFMT;
	if (flags & I_FREE) {
		i_genrand *= 16843009;  /* turns into shift and adds */
		i_genrand++;
		oip->i_gen += ((i_genrand + ddi_get_lbolt()) & 0xffff) + 1;
		oip->i_flag |= ICHG|IUPD;
		oip->i_seq++;
		if (length == oip->i_size)
			return (0);
		flags |= I_CHEAP;
	}
	if (mode == IFIFO)
		return (0);
	if (mode != IFREG && mode != IFDIR && mode != IFATTRDIR &&
	    !(mode == IFLNK && length == (offset_t)0) && mode != IFSHAD)
		return (EINVAL);
	if (length > maxoffset)
		return (EFBIG);
	if ((mode == IFDIR) || (mode == IFATTRDIR))
		flags |= I_DIR;
	if (mode == IFSHAD)
		flags |= I_SHAD;
	if (oip == ufsvfsp->vfs_qinod)
		flags |= I_QUOTA;
	if (length == oip->i_size) {
		/* update ctime and mtime to please POSIX tests */
		oip->i_flag |= ICHG|IUPD;
		oip->i_seq++;
		if (length == 0) {
			/* nothing to cache so clear the flag */
			oip->i_flag &= ~IFASTSYMLNK;
		}
		return (0);
	}
	/* wipe out fast symlink till next access */
	if (oip->i_flag & IFASTSYMLNK) {
		int j;

		ASSERT(ITOV(oip)->v_type == VLNK);

		oip->i_flag &= ~IFASTSYMLNK;

		for (j = 1; j < NDADDR; j++)
			oip->i_db[j] = 0;
		for (j = 0; j < NIADDR; j++)
			oip->i_ib[j] = 0;
	}

	boff = (int)blkoff(fs, length);

	if (length > oip->i_size) {
		/*
		 * Trunc up case.  BMAPALLOC will ensure that the right blocks
		 * are allocated.  This includes extending the old frag to a
		 * full block (if needed) in addition to doing any work
		 * needed for allocating the last block.
		 */
		if (boff == 0)
			err = BMAPALLOC(oip, length - 1, (int)fs->fs_bsize, cr);
		else
			err = BMAPALLOC(oip, length - 1, boff, cr);

		if (err == 0) {
			/*
			 * Save old size and set inode's size now
			 * so that we don't cause too much of the
			 * file to be zero'd and pushed.
			 */
			u_offset_t osize = oip->i_size;
			oip->i_size = length;
			/*
			 * Make sure we zero out the remaining bytes of
			 * the page in case a mmap scribbled on it. We
			 * can't prevent a mmap from writing beyond EOF
			 * on the last page of a file.
			 */
			if ((boff = (int)blkoff(fs, osize)) != 0) {
				bsize = (int)lblkno(fs, osize - 1) >= NDADDR ?
				    fs->fs_bsize : fragroundup(fs, boff);
				pvn_vpzero(ITOV(oip), osize,
				    (size_t)(bsize - boff));
			}
			oip->i_flag |= ICHG|IATTCHG;
			oip->i_seq++;
			ITIMES_NOLOCK(oip);
			/*
			 * MAXOFF32_T is old 2GB size limit. If
			 * this operation caused a large file to be
			 * created, turn on the superblock flag
			 * and update the superblock, if the flag
			 * is not already on.
			 */
			if ((length > (u_offset_t)MAXOFF32_T) &&
			    !(fs->fs_flags & FSLARGEFILES)) {
				ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
				mutex_enter(&ufsvfsp->vfs_lock);
				fs->fs_flags |= FSLARGEFILES;
				ufs_sbwrite(ufsvfsp);
				mutex_exit(&ufsvfsp->vfs_lock);
			}
		}

		return (err);
	}

	/*
	 * Update the pages of the file.  If the file is not being
	 * truncated to a block boundary, the contents of the
	 * pages following the end of the file must be zero'ed
	 * in case it ever becomes accessible again because
	 * of subsequent file growth.
	 */
	if (boff == 0) {
		(void) pvn_vplist_dirty(ITOV(oip), length, ufs_putapage,
		    B_INVAL | B_TRUNC, CRED());
	} else {
		/*
		 * Make sure that the last block is properly allocated.
		 * We only really have to do this if the last block is
		 * actually allocated since ufs_bmap will now handle the case
		 * of a fragment which has no block allocated.  Just to
		 * be sure, we do it now, independent of current allocation.
		 */
		err = BMAPALLOC(oip, length - 1, boff, cr);
		if (err)
			return (err);

		/*
		 * BMAPALLOC will call bmap_write which defers i_seq
		 * processing.  If the timestamps were changed, update
		 * i_seq before rdip drops i_contents or syncs the inode.
		 */
		if (oip->i_flag & (ICHG|IUPD))
			oip->i_seq++;

		/*
		 * BugId 4069932
		 * Make sure that the relevant partial page appears in
		 * the v_pages list, so that pvn_vpzero() will do its
		 * job.  Since doing this correctly requires everything
		 * in rdip() except for the uiomove(), it's easier and
		 * safer to do the uiomove() rather than duplicate the
		 * rest of rdip() here.
		 *
		 * To get here, we know that length indicates a byte
		 * that is not the first byte of a block.  (length - 1)
		 * is the last actual byte known to exist.  Deduction
		 * shows it is in the same block as byte (length).
		 * Thus, this rdip() invocation should always succeed
		 * except in the face of i/o errors, and give us the
		 * block we care about.
		 *
		 * rdip() makes the same locking assertions and
		 * assumptions as we do.  We do not acquire any locks
		 * before calling it, so we have not changed the locking
		 * situation.  Finally, there do not appear to be any
		 * paths whereby rdip() ends up invoking us again.
		 * Thus, infinite recursion is avoided.
		 */
		{
			uio_t uio;
			iovec_t iov[1];
			char buffer;

			uio.uio_iov = iov;
			uio.uio_iovcnt = 1;
			uio.uio_loffset = length - 1;
			uio.uio_resid = 1;
			uio.uio_segflg = UIO_SYSSPACE;
			uio.uio_extflg = UIO_COPY_CACHED;

			iov[0].iov_base = &buffer;
			iov[0].iov_len = 1;

			err = rdip(oip, &uio, UIO_READ, NULL);
			if (err)
				return (err);
		}

		bsize = (int)lblkno(fs, length - 1) >= NDADDR ?
		    fs->fs_bsize : fragroundup(fs, boff);
		pvn_vpzero(ITOV(oip), length, (size_t)(bsize - boff));
		/*
		 * Ensure full fs block is marked as dirty.
		 */
		(void) pvn_vplist_dirty(ITOV(oip), length + (bsize - boff),
		    ufs_putapage, B_INVAL | B_TRUNC, CRED());
	}

	/*
	 * Calculate index into inode's block list of
	 * last direct and indirect blocks (if any)
	 * which we want to keep.  Lastblock is -1 when
	 * the file is truncated to 0.
	 */
	lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1;
	lastiblock[SINGLE] = lastblock - NDADDR;
	lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs);
	lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs);
	nblocks = btodb(fs->fs_bsize);

	/*
	 * Update file and block pointers
	 * on disk before we start freeing blocks.
	 * If we crash before free'ing blocks below,
	 * the blocks will be returned to the free list.
	 * lastiblock values are also normalized to -1
	 * for calls to indirtrunc below.
	 */
	tip = *oip;			/* structure copy */
	ip = &tip;

	for (level = TRIPLE; level >= SINGLE; level--)
		if (lastiblock[level] < 0) {
			oip->i_ib[level] = 0;
			lastiblock[level] = -1;
		}
	for (i = NDADDR - 1; i > lastblock; i--) {
		oip->i_db[i] = 0;
		flags |= I_CHEAP;
	}
	oip->i_size = length;
	oip->i_flag |= ICHG|IUPD|IATTCHG;
	oip->i_seq++;
	if (!TRANS_ISTRANS(ufsvfsp))
		ufs_iupdat(oip, I_SYNC);	/* do sync inode update */

	/*
	 * Indirect blocks first.
	 */
	for (level = TRIPLE; level >= SINGLE; level--) {
		bn = ip->i_ib[level];
		if (bn != 0) {
			blocksreleased +=
			    indirtrunc(ip, bn, lastiblock[level], level, flags);
			if (lastiblock[level] < 0) {
				ip->i_ib[level] = 0;
				free(ip, bn, (off_t)fs->fs_bsize,
				    flags | I_IBLK);
				blocksreleased += nblocks;
			}
		}
		if (lastiblock[level] >= 0)
			goto done;
	}

	/*
	 * All whole direct blocks or frags.
	 */
	for (i = NDADDR - 1; i > lastblock; i--) {
		bn = ip->i_db[i];
		if (bn == 0)
			continue;
		ip->i_db[i] = 0;
		bsize = (off_t)blksize(fs, ip, i);
		free(ip, bn, bsize, flags);
		blocksreleased += btodb(bsize);
	}
	if (lastblock < 0)
		goto done;

	/*
	 * Finally, look for a change in size of the
	 * last direct block; release any frags.
	 */
	bn = ip->i_db[lastblock];
	if (bn != 0) {
		off_t oldspace, newspace;

		/*
		 * Calculate amount of space we're giving
		 * back as old block size minus new block size.
		 */
		oldspace = blksize(fs, ip, lastblock);
		UFS_SET_ISIZE(length, ip);
		newspace = blksize(fs, ip, lastblock);
		if (newspace == 0) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: newspace == 0");
			return (err);
		}
		if (oldspace - newspace > 0) {
			/*
			 * Block number of space to be free'd is
			 * the old block # plus the number of frags
			 * required for the storage we're keeping.
			 */
			bn += numfrags(fs, newspace);
			free(ip, bn, oldspace - newspace, flags);
			blocksreleased += btodb(oldspace - newspace);
		}
	}
done:
/* BEGIN PARANOIA */
	for (level = SINGLE; level <= TRIPLE; level++)
		if (ip->i_ib[level] != oip->i_ib[level]) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: indirect block");
			return (err);
		}

	for (i = 0; i < NDADDR; i++)
		if (ip->i_db[i] != oip->i_db[i]) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: direct block");
			return (err);
		}
/* END PARANOIA */
	oip->i_blocks -= blocksreleased;

	if (oip->i_blocks < 0) {		/* sanity */
		cmn_err(CE_NOTE,
		    "ufs_itrunc: %s/%d new size = %lld, blocks = %d\n",
		    fs->fs_fsmnt, (int)oip->i_number, oip->i_size,
		    (int)oip->i_blocks);
		oip->i_blocks = 0;
	}
	oip->i_flag |= ICHG|IATTCHG;
	oip->i_seq++;
	/* blocksreleased is >= zero, so this can not fail */
	(void) chkdq(oip, -blocksreleased, 0, cr, (char **)NULL,
	    (size_t *)NULL);
	return (0);
}

/*
 * Check mode permission on inode.  Mode is READ, WRITE or EXEC.
 * In the case of WRITE, the read-only status of the file system
 * is checked.  Depending on the calling user, the appropriate
 * mode bits are selected; privileges to override missing permission
 * bits are checked through secpolicy_vnode_access().
 * The i_contents lock must be held as reader here to prevent racing with
 * the acl subsystem removing/setting/changing acls on this inode.
 * The caller is responsible for indicating whether the i_contents lock
 * needs to be acquired here or is already held.
 */
int
ufs_iaccess(struct inode *ip, int mode, struct cred *cr, int dolock)
{
	int shift = 0;
	int ret = 0;

	if (dolock)
		rw_enter(&ip->i_contents, RW_READER);
	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	if (mode & IWRITE) {
		/*
		 * Disallow write attempts on read-only
		 * file systems, unless the file is a block
		 * or character device or a FIFO.
		 */
		if (ip->i_fs->fs_ronly != 0) {
			if ((ip->i_mode & IFMT) != IFCHR &&
			    (ip->i_mode & IFMT) != IFBLK &&
			    (ip->i_mode & IFMT) != IFIFO) {
				ret = EROFS;
				goto out;
			}
		}
	}
	/*
	 * If there is an acl, check the acl and return.
	 */
	if (ip->i_ufs_acl && ip->i_ufs_acl->aowner) {
		ret = ufs_acl_access(ip, mode, cr);
		goto out;
	}

	/*
	 * Access check is based on only one of owner, group, public.
	 * If not owner, then check group.
	 * If not a member of the group, then check public access.
	 */
	if (crgetuid(cr) != ip->i_uid) {
		shift += 3;
		if (!groupmember((uid_t)ip->i_gid, cr))
			shift += 3;
	}

	/* test missing privilege bits */
	ret = secpolicy_vnode_access2(cr, ITOV(ip), ip->i_uid,
	    ip->i_mode << shift, mode);
out:
	if (dolock)
		rw_exit(&ip->i_contents);
	return (ret);
}

/*
 * if necessary, remove an inode from the free list
 *	i_contents is held except at unmount
 *
 * Return 1 if the inode is taken off of the ufs_idle_q,
 * and the caller is expected to call VN_RELE.
 *
 * Return 0 otherwise.
 */
int
ufs_rmidle(struct inode *ip)
{
	int rval = 0;

	mutex_enter(&ip->i_tlock);
	if ((ip->i_flag & IREF) == 0) {
		mutex_enter(&ufs_idle_q.uq_mutex);
		ip->i_freef->i_freeb = ip->i_freeb;
		ip->i_freeb->i_freef = ip->i_freef;
		ip->i_freef = ip;
		ip->i_freeb = ip;
		ip->i_flag |= IREF;
		ufs_idle_q.uq_ne--;
		if (ip->i_flag & IJUNKIQ) {
			ufs_njunk_iq--;
			ip->i_flag &= ~IJUNKIQ;
		} else {
			ufs_nuseful_iq--;
		}
		mutex_exit(&ufs_idle_q.uq_mutex);
		rval = 1;
	}
	mutex_exit(&ip->i_tlock);
	return (rval);
}

/*
 * scan the hash of inodes and call func with the inode locked
 */
int
ufs_scan_inodes(int rwtry, int (*func)(struct inode *, void *), void *arg,
    struct ufsvfs *ufsvfsp)
{
	struct inode		*ip;		/* current inode */
	struct inode		*lip = NULL;	/* last/previous inode */
	union ihead		*ih;		/* current hash chain */
	int			error, i;
	int			saverror = 0;
	int			lip_held;	/* lip needs a VN_RELE() */

	/*
	 * If ufsvfsp is NULL, then our caller should be holding
	 * ufs_scan_lock to avoid conflicts between ufs_unmount() and
	 * ufs_update().  Otherwise, to avoid false-positives in
	 * ufs_unmount()'s v_count-based EBUSY check, we only hold
	 * those inodes that are in the file system our caller cares
	 * about.
	 *
	 * We know that ip is a valid inode in the hash chain (and thus
	 * we can trust i_ufsvfs) because the inode we chained from
	 * (lip) is still in the hash chain.  This is true because either:
	 *
	 * 1. We did not drop the hash chain lock since the last
	 *    iteration (because we were not interested in the last inode),
	 * or
	 * 2. We maintained a hold on the last inode while we
	 *    were processing it, so it could not be removed
	 *    from the hash chain.
	 *
	 * The whole reason we're dropping and re-grabbing the chain
	 * lock on every inode is so that we don't present a major
	 * choke point on throughput, particularly when we've been
	 * called on behalf of fsflush.
	 */

	for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
		mutex_enter(&ih_lock[i]);
		for (ip = ih->ih_chain[0], lip_held = 0;
		    ip != (struct inode *)ih;
		    ip = lip->i_forw) {

			ins.in_scan.value.ul++;

			/*
			 * Undo the previous iteration's VN_HOLD(), but
			 * only if one was done.
			 */
			if (lip_held)
				VN_RELE(ITOV(lip));

			lip = ip;
			if (ufsvfsp != NULL && ip->i_ufsvfs != ufsvfsp) {
				/*
				 * We're not processing all inodes, and
				 * this inode is not in the filesystem of
				 * interest, so skip it.  No need to do a
				 * VN_HOLD() since we're not dropping the
				 * hash chain lock until after we've
				 * done the i_forw traversal above.
				 */
				lip_held = 0;
				continue;
			}
			VN_HOLD(ITOV(ip));
			lip_held = 1;
			mutex_exit(&ih_lock[i]);

			/*
			 * Acquire the contents lock as writer to make
			 * sure that the inode has been initialized in
			 * the cache or removed from the idle list by
			 * ufs_iget().  This works because ufs_iget()
			 * acquires the contents lock before putting
			 * the inode into the cache.  If we can lock
			 * it, then ufs_iget() is done with it.
			 */

			if (rwtry) {
				if (!rw_tryenter(&ip->i_contents, RW_WRITER)) {
					mutex_enter(&ih_lock[i]);
					continue;
				}
			} else {
				rw_enter(&ip->i_contents, RW_WRITER);
			}

			rw_exit(&ip->i_contents);

			/*
			 * ISTALE means the inode couldn't be read
			 *
			 * We don't have to hold the i_contents lock
			 * for this check for a couple of
			 * reasons. First, if ISTALE is set then the
			 * flag cannot be cleared until the inode is
			 * removed from the cache and that cannot
			 * happen until after we VN_RELE() it.
			 * Second, if ISTALE is not set, then the
			 * inode is in the cache and does not need to
			 * be read from disk so ISTALE cannot be set
			 * while we are not looking.
			 */
			if ((ip->i_flag & ISTALE) == 0) {
				if ((error = (*func)(ip, arg)) != 0)
					saverror = error;
			}

			mutex_enter(&ih_lock[i]);
		}
		if (lip_held)
			VN_RELE(ITOV(lip));
		mutex_exit(&ih_lock[i]);
	}
	return (saverror);
}

/*
 * Mark inode with the current time, plus a unique increment.
 *
 * Since we only keep 32-bit time on disk, if UFS is still alive
 * beyond 2038, filesystem times will simply stick at the last
 * possible second of 32-bit time. Not ideal, but probably better
 * than going into the remote past, or confusing applications with
 * negative time.
 */
void
ufs_imark(struct inode *ip)
{
	timestruc_t now;
	int32_t usec, nsec;

	/*
	 * The update of i_seq may have been deferred, increase i_seq here
	 * to make sure it is in sync with the timestamps.
	 */
	if (ip->i_flag & ISEQ) {
		ASSERT(ip->i_flag & (IUPD|ICHG));
		ip->i_seq++;
		ip->i_flag &= ~ISEQ;
	}

	gethrestime(&now);

	/*
	 * Fast algorithm to convert nsec to usec -- the shift-and-add
	 * chain below approximates nsec / 1000 without a divide; see
	 * hrt2ts() in common/os/timers.c for a full description.
	 */
	nsec = now.tv_nsec;
	usec = nsec + (nsec >> 2);
	usec = nsec + (usec >> 1);
	usec = nsec + (usec >> 2);
	usec = nsec + (usec >> 4);
	usec = nsec - (usec >> 3);
	usec = nsec + (usec >> 2);
	usec = nsec + (usec >> 3);
	usec = nsec + (usec >> 4);
	usec = nsec + (usec >> 1);
	usec = nsec + (usec >> 6);
	usec = usec >> 10;

	mutex_enter(&ufs_iuniqtime_lock);
	if (now.tv_sec > (time_t)iuniqtime.tv_sec ||
	    usec > iuniqtime.tv_usec) {
		if (now.tv_sec < TIME32_MAX) {
			iuniqtime.tv_sec = (time32_t)now.tv_sec;
			iuniqtime.tv_usec = usec;
		}
	} else {
		if (iuniqtime.tv_sec < TIME32_MAX) {
			iuniqtime.tv_usec++;
			/* Check for usec overflow */
			if (iuniqtime.tv_usec >= MICROSEC) {
				iuniqtime.tv_sec++;
				iuniqtime.tv_usec = 0;
			}
		}
	}

	if ((ip->i_flag & IACC) && !(ip->i_ufsvfs->vfs_noatime)) {
		ip->i_atime = iuniqtime;
	}
	if (ip->i_flag & IUPD) {
		ip->i_mtime = iuniqtime;
		ip->i_flag |= IMODTIME;
	}
	if (ip->i_flag & ICHG) {
		ip->i_diroff = 0;
		ip->i_ctime = iuniqtime;
	}
	mutex_exit(&ufs_iuniqtime_lock);
}

/*
 * Update timestamps in inode.
 */
void
ufs_itimes_nolock(struct inode *ip)
{

	/*
	 * if noatime is set and the inode access time is the only field that
	 * must be changed, exit immediately.
	 */
	if (((ip->i_flag & (IUPD|IACC|ICHG)) == IACC) &&
	    (ip->i_ufsvfs->vfs_noatime)) {
		return;
	}

	if (ip->i_flag & (IUPD|IACC|ICHG)) {
		if (ip->i_flag & ICHG)
			ip->i_flag |= IMOD;
		else
			ip->i_flag |= IMODACC;
		ufs_imark(ip);
		ip->i_flag &= ~(IACC|IUPD|ICHG);
	}
}