xref: /illumos-gate/usr/src/uts/common/fs/ufs/ufs_lockfs.c (revision d3d50737)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/t_lock.h>
28 #include <sys/param.h>
29 #include <sys/time.h>
30 #include <sys/systm.h>
31 #include <sys/sysmacros.h>
32 #include <sys/resource.h>
33 #include <sys/signal.h>
34 #include <sys/cred.h>
35 #include <sys/user.h>
36 #include <sys/buf.h>
37 #include <sys/vfs.h>
38 #include <sys/vnode.h>
39 #include <sys/proc.h>
40 #include <sys/disp.h>
41 #include <sys/file.h>
42 #include <sys/fcntl.h>
43 #include <sys/flock.h>
44 #include <sys/atomic.h>
45 #include <sys/kmem.h>
46 #include <sys/uio.h>
47 #include <sys/conf.h>
48 #include <sys/mman.h>
49 #include <sys/pathname.h>
50 #include <sys/debug.h>
51 #include <sys/vmmeter.h>
52 #include <sys/vmsystm.h>
53 #include <sys/cmn_err.h>
54 #include <sys/acct.h>
55 #include <sys/dnlc.h>
56 #include <sys/swap.h>
57 
58 #include <sys/fs/ufs_fs.h>
59 #include <sys/fs/ufs_inode.h>
60 #include <sys/fs/ufs_fsdir.h>
61 #include <sys/fs/ufs_trans.h>
62 #include <sys/fs/ufs_panic.h>
63 #include <sys/fs/ufs_mount.h>
64 #include <sys/fs/ufs_bio.h>
65 #include <sys/fs/ufs_log.h>
66 #include <sys/fs/ufs_quota.h>
67 #include <sys/dirent.h>		/* must be AFTER <sys/fs/fsdir.h>! */
68 #include <sys/errno.h>
69 #include <sys/sysinfo.h>
70 
71 #include <vm/hat.h>
72 #include <vm/pvn.h>
73 #include <vm/as.h>
74 #include <vm/seg.h>
75 #include <vm/seg_map.h>
76 #include <vm/seg_vn.h>
77 #include <vm/rm.h>
78 #include <vm/anon.h>
79 #include <sys/swap.h>
80 #include <sys/dnlc.h>
81 
/*
 * Local prototype for common_specvp().  Within this file it is called by
 * ufs_flush() on ufsvfsp->vfs_devvp to obtain the vnode that is handed to
 * VOP_PUTPAGE().
 */
extern struct vnode *common_specvp(struct vnode *vp);
83 
/*
 * Error lock status.  ufs__fiolfs() derives one of these from the old and
 * the requested lock type and passes it to ufs_reconcile() and to its own
 * error-lock cleanup:
 *	UN_ERRLCK	an error lock is being released (old lock was an
 *			error lock, new lock is an unlock)
 *	SET_ERRLCK	an error lock is being applied where none was held
 *	RE_ERRLCK	an error lock is being applied while an error lock
 *			(or read-only error lock) was already held
 *	NO_ERRLCK	the request does not involve an error lock
 */
#define	UN_ERRLCK	(-1)
#define	SET_ERRLCK	1
#define	RE_ERRLCK	2
#define	NO_ERRLCK	0
89 
/*
 * Index to be used in TSD for storing lockfs data.
 * The value fetched with tsd_get() under this key is used in this file as
 * the head of a list of ulockfs_info_t entries.
 */
uint_t ufs_lockfs_key;
94 
/*
 * One entry of the per-thread lockfs list kept in TSD under ufs_lockfs_key.
 */
typedef struct _ulockfs_info {
	struct _ulockfs_info *next;	/* next entry in the list, or NULL */
	struct ulockfs *ulp;		/* lock this entry is for; NULL is */
					/* treated as a free entry by IS_REC_VOP */
	uint_t flags;			/* ULOCK_INFO_* bits */
} ulockfs_info_t;

#define	ULOCK_INFO_FALLOCATE	0x00000001	/* fallocate thread */
102 
/*
 * Check in TSD whether we are already doing any VOP on this filesystem.
 *
 * Walks the ulockfs_info_t list that starts at `head`.  When the macro
 * finishes:
 *	found	is 1 if an entry whose ulp member equals `ulockp` was seen,
 *		otherwise 0
 *	free	points at the first entry with a NULL ulp member that was
 *		seen before the walk stopped, or is NULL if there was none
 *
 * `found` and `free` must be assignable lvalues.  `head` and `ulockp` are
 * evaluated more than once, so they must not have side effects.
 *
 * The third parameter is deliberately not called `ulp`: a macro parameter
 * of that name is also substituted into the `_curr->ulp` member reference,
 * which made the macro compile only when the caller's argument happened to
 * be the bare identifier `ulp`.
 */
#define	IS_REC_VOP(found, head, ulockp, free)		\
{							\
	ulockfs_info_t *_curr;				\
							\
	for (found = 0, free = NULL, _curr = (head);	\
	    _curr != NULL; _curr = _curr->next) {	\
		if ((free == NULL) &&			\
		    (_curr->ulp == NULL))		\
			free = _curr;			\
		if (_curr->ulp == (ulockp)) {		\
			found = 1;			\
			break;				\
		}					\
	}						\
}
121 
/*
 * Get the lockfs data from TSD so that lockfs handles the recursive VOP
 * properly.
 *
 * Walks the ulockfs_info_t list that starts at `head` and stores in `info`
 * the first entry whose ulp member equals `ulockp`, or NULL if no entry
 * matches (including when `head` is NULL).
 *
 * `info` must be an assignable lvalue.  `ulockp` is evaluated once per list
 * entry, so it must not have side effects.
 *
 * The second parameter is deliberately not called `ulp`: a macro parameter
 * of that name is also substituted into the `_curr->ulp` member reference,
 * which made the macro compile only when the caller's argument happened to
 * be the bare identifier `ulp`.
 */
#define	SEARCH_ULOCKFSP(head, ulockp, info)		\
{							\
	ulockfs_info_t *_curr;				\
							\
	for (_curr = (head); _curr != NULL;		\
	    _curr = _curr->next) {			\
		if (_curr->ulp == (ulockp)) {		\
			break;				\
		}					\
	}						\
							\
	info = _curr;					\
}
139 
140 /*
141  * Validate lockfs request
142  */
143 static int
144 ufs_getlfd(
145 	struct lockfs *lockfsp,		/* new lock request */
146 	struct lockfs *ul_lockfsp)	/* old lock state */
147 {
148 	int	error = 0;
149 
150 	/*
151 	 * no input flags defined
152 	 */
153 	if (lockfsp->lf_flags != 0) {
154 		error = EINVAL;
155 		goto errout;
156 	}
157 
158 	/*
159 	 * check key
160 	 */
161 	if (!LOCKFS_IS_ULOCK(ul_lockfsp))
162 		if (lockfsp->lf_key != ul_lockfsp->lf_key) {
163 			error = EINVAL;
164 			goto errout;
165 	}
166 
167 	lockfsp->lf_key = ul_lockfsp->lf_key + 1;
168 
169 errout:
170 	return (error);
171 }
172 
/*
 * ufs_checkaccton
 *	check if accounting is turned on on this fs
 *
 * Returns EDEADLK when acct_fs_in_use() reports true for vp, else 0.
 */
int
ufs_checkaccton(struct vnode *vp)
{
	return (acct_fs_in_use(vp) ? EDEADLK : 0);
}
185 
186 /*
187  * ufs_checkswapon
188  *	check if local swapping is to file on this fs
189  */
190 int
191 ufs_checkswapon(struct vnode *vp)
192 {
193 	struct swapinfo	*sip;
194 
195 	mutex_enter(&swapinfo_lock);
196 	for (sip = swapinfo; sip; sip = sip->si_next)
197 		if (sip->si_vp->v_vfsp == vp->v_vfsp) {
198 			mutex_exit(&swapinfo_lock);
199 			return (EDEADLK);
200 		}
201 	mutex_exit(&swapinfo_lock);
202 	return (0);
203 }
204 
205 /*
206  * ufs_freeze
207  *	pend future accesses for current lock and desired lock
208  */
209 void
210 ufs_freeze(struct ulockfs *ulp, struct lockfs *lockfsp)
211 {
212 	/*
213 	 * set to new lock type
214 	 */
215 	ulp->ul_lockfs.lf_lock = lockfsp->lf_lock;
216 	ulp->ul_lockfs.lf_key = lockfsp->lf_key;
217 	ulp->ul_lockfs.lf_comlen = lockfsp->lf_comlen;
218 	ulp->ul_lockfs.lf_comment = lockfsp->lf_comment;
219 
220 	ulp->ul_fs_lock = (1 << ulp->ul_lockfs.lf_lock);
221 }
222 
/*
 * All callers of ufs_quiesce() atomically increment ufs_quiesce_pend before
 * starting ufs_quiesce() protocol and decrement it only when a file system no
 * longer has to be in quiescent state. This allows ufs_pageio() to detect
 * that another thread wants to quiesce a file system. See more comments in
 * ufs_pageio().
 *
 * In this file ufs__fiolfs() does both the increment and the decrement with
 * atomic_add_long(), and ufs_quiesce() asserts that the count is non-zero.
 */
ulong_t ufs_quiesce_pend = 0;
231 
/*
 * ufs_quiesce
 *	wait for outstanding accesses to finish
 *
 * Sets the soft lock on ulp and then waits until both ul_vnops_cnt and
 * ul_falloc_cnt are zero.  The soft lock is cleared again on every exit
 * path.  ul_lock is handed to the timed wait as its mutex, so the caller
 * is expected to hold it on entry (ufs__fiolfs() does).
 *
 * Returns 0 when the wait ends normally, or EINTR when
 * cv_reltimedwait_sig() returns 0.
 */
int
ufs_quiesce(struct ulockfs *ulp)
{
	int error = 0;
	ulockfs_info_t *head;
	ulockfs_info_t *info;
	klwp_t *lwp = ttolwp(curthread);

	/* find this thread's TSD entry for ulp, if any (NULL otherwise) */
	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
	SEARCH_ULOCKFSP(head, ulp, info);

	/*
	 * We have to keep /proc away from stopping us after we applied
	 * the softlock but before we got a chance to clear it again.
	 * prstop() may pagefault and become stuck on the softlock still
	 * pending.
	 */
	if (lwp != NULL)
		lwp->lwp_nostop++;

	/*
	 * Set a softlock to suspend future ufs_vnops so that
	 * this lockfs request will not be starved
	 */
	ULOCKFS_SET_SLOCK(ulp);
	ASSERT(ufs_quiesce_pend);

	/* check if there is any outstanding ufs vnodeops calls */
	while (ulp->ul_vnops_cnt || ulp->ul_falloc_cnt) {
		/*
		 * use timed version of cv_wait_sig() to make sure we don't
		 * miss a wake up call from ufs_pageio() when it doesn't use
		 * ul_lock.
		 *
		 * when a fallocate thread comes in, the only way it returns
		 * from this function is if there are no other vnode operations
		 * going on (remember fallocate threads are tracked using
		 * ul_falloc_cnt not ul_vnops_cnt), and another fallocate thread
		 * hasn't already grabbed the fs write lock.
		 */
		if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
			if (!ulp->ul_vnops_cnt && !ULOCKFS_IS_FWLOCK(ulp))
				goto out;
		}
		/* wait for at most hz ticks, then re-check the counters */
		if (!cv_reltimedwait_sig(&ulp->ul_cv, &ulp->ul_lock, hz,
		    TR_CLOCK_TICK)) {
			error = EINTR;
			goto out;
		}
	}

out:
	/*
	 * unlock the soft lock
	 */
	ULOCKFS_CLR_SLOCK(ulp);

	/* balance the lwp_nostop increment made above */
	if (lwp != NULL)
		lwp->lwp_nostop--;

	return (error);
}
298 
299 /*
300  * ufs_flush_inode
301  */
302 int
303 ufs_flush_inode(struct inode *ip, void *arg)
304 {
305 	int	error;
306 	int	saverror	= 0;
307 
308 	/*
309 	 * wrong file system; keep looking
310 	 */
311 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
312 		return (0);
313 
314 	/*
315 	 * asynchronously push all the dirty pages
316 	 */
317 	if (((error = TRANS_SYNCIP(ip, B_ASYNC, 0, TOP_SYNCIP_FLUSHI)) != 0) &&
318 	    (error != EAGAIN))
319 		saverror = error;
320 	/*
321 	 * wait for io and discard all mappings
322 	 */
323 	if (error = TRANS_SYNCIP(ip, B_INVAL, 0, TOP_SYNCIP_FLUSHI))
324 		saverror = error;
325 
326 	if (ITOV(ip)->v_type == VDIR) {
327 		dnlc_dir_purge(&ip->i_danchor);
328 	}
329 
330 	return (saverror);
331 }
332 
/*
 * ufs_flush
 *	Flush everything that is currently dirty; this includes invalidating
 *	any mappings.
 *
 * The caller must hold the vfs lock (asserted).  Errors from the inode
 * scan and from VOP_PUTPAGE() on the device vnode are remembered in
 * saverror, which is the return value; the remaining steps still run after
 * such an error.  ufs_checkclean() is only called when no error has been
 * recorded up to that point.
 */
int
ufs_flush(struct vfs *vfsp)
{
	int		error;
	int		saverror = 0;
	struct ufsvfs	*ufsvfsp	= (struct ufsvfs *)vfsp->vfs_data;
	struct fs	*fs		= ufsvfsp->vfs_fs;
	int		tdontblock = 0;

	ASSERT(vfs_lock_held(vfsp));

	/*
	 * purge dnlc
	 */
	(void) dnlc_purge_vfsp(vfsp, 0);

	/*
	 * drain the delete and idle threads
	 */
	ufs_delete_drain(vfsp, 0, 0);
	ufs_idle_drain(vfsp);

	/*
	 * flush and invalidate quota records
	 */
	(void) qsync(ufsvfsp);

	/*
	 * flush w/invalidate the inodes for vfsp
	 */
	if (error = ufs_scan_inodes(0, ufs_flush_inode, ufsvfsp, ufsvfsp))
		saverror = error;

	/*
	 * synchronously flush superblock and summary info
	 */
	if (fs->fs_ronly == 0 && fs->fs_fmod) {
		fs->fs_fmod = 0;
		TRANS_SBUPDATE(ufsvfsp, vfsp, TOP_SBUPDATE_FLUSH);
	}
	/*
	 * flush w/invalidate block device pages and buf cache
	 */
	if ((error = VOP_PUTPAGE(common_specvp(ufsvfsp->vfs_devvp),
	    (offset_t)0, 0, B_INVAL, CRED(), NULL)) > 0)
		saverror = error;

	(void) bflush((dev_t)vfsp->vfs_dev);
	(void) bfinval((dev_t)vfsp->vfs_dev, 0);

	/*
	 * drain the delete and idle threads again
	 */
	ufs_delete_drain(vfsp, 0, 0);
	ufs_idle_drain(vfsp);

	/*
	 * play with the clean flag
	 */
	if (saverror == 0)
		ufs_checkclean(vfsp);

	/*
	 * Flush any outstanding transactions and roll the log
	 * only if we are supposed to do, i.e. LDL_NOROLL not set.
	 * We can not simply check for fs_ronly here since fsck also may
	 * use this code to roll the log on a read-only filesystem, e.g.
	 * root during early stages of boot, if other than a sanity check is
	 * done, it will clear LDL_NOROLL before.
	 * In addition we assert that the deltamap does not contain any deltas
	 * in case LDL_NOROLL is set since this is not supposed to happen.
	 */
	if (TRANS_ISTRANS(ufsvfsp)) {
		ml_unit_t	*ul	= ufsvfsp->vfs_log;
		mt_map_t	*mtm	= ul->un_deltamap;

		if (ul->un_flags & LDL_NOROLL) {
			ASSERT(mtm->mtm_nme == 0);
		} else {
			/*
			 * Do not set T_DONTBLOCK if there is a
			 * transaction opened by caller.
			 * tdontblock remembers that the flag was already
			 * set so that it is left alone afterwards.
			 */
			if (curthread->t_flag & T_DONTBLOCK)
				tdontblock = 1;
			else
				curthread->t_flag |= T_DONTBLOCK;

			TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_FLUSH,
			    TOP_COMMIT_SIZE, error);

			/*
			 * NOTE(review): saverror is handed to
			 * TRANS_END_SYNC; presumably the macro stores its
			 * result there -- confirm against ufs_trans.h.
			 */
			if (!error) {
				TRANS_END_SYNC(ufsvfsp, saverror,
				    TOP_COMMIT_FLUSH, TOP_COMMIT_SIZE);
			}

			if (tdontblock == 0)
				curthread->t_flag &= ~T_DONTBLOCK;

			logmap_roll_dev(ufsvfsp->vfs_log);
		}
	}

	return (saverror);
}
443 
444 /*
445  * ufs_thaw_wlock
446  *	special processing when thawing down to wlock
447  */
448 static int
449 ufs_thaw_wlock(struct inode *ip, void *arg)
450 {
451 	/*
452 	 * wrong file system; keep looking
453 	 */
454 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
455 		return (0);
456 
457 	/*
458 	 * iupdat refuses to clear flags if the fs is read only.  The fs
459 	 * may become read/write during the lock and we wouldn't want
460 	 * these inodes being written to disk.  So clear the flags.
461 	 */
462 	rw_enter(&ip->i_contents, RW_WRITER);
463 	ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG);
464 	rw_exit(&ip->i_contents);
465 
466 	/*
467 	 * pages are mlocked -- fail wlock
468 	 */
469 	if (ITOV(ip)->v_type != VCHR && vn_has_cached_data(ITOV(ip)))
470 		return (EBUSY);
471 
472 	return (0);
473 }
474 
475 /*
476  * ufs_thaw_hlock
477  *	special processing when thawing down to hlock or elock
478  */
479 static int
480 ufs_thaw_hlock(struct inode *ip, void *arg)
481 {
482 	struct vnode	*vp	= ITOV(ip);
483 
484 	/*
485 	 * wrong file system; keep looking
486 	 */
487 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
488 		return (0);
489 
490 	/*
491 	 * blow away all pages - even if they are mlocked
492 	 */
493 	do {
494 		(void) TRANS_SYNCIP(ip, B_INVAL | B_FORCE, 0, TOP_SYNCIP_HLOCK);
495 	} while ((vp->v_type != VCHR) && vn_has_cached_data(vp));
496 	rw_enter(&ip->i_contents, RW_WRITER);
497 	ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG);
498 	rw_exit(&ip->i_contents);
499 
500 	return (0);
501 }
502 
/*
 * ufs_thaw
 *	thaw file system lock down to current value
 *
 * Applies the side effects of the lock type currently recorded in ulp and
 * then wakes every waiter on ul_cv (on all paths, including errors).
 *
 * Returns:
 *	write lock	the error of the second ufs_flush(), of bfinval(),
 *			or of the ufs_thaw_wlock() inode scan, else 0
 *	hard/error lock	always 0
 *	other locks	the error of ufs_flush() if one was needed, else 0
 */
int
ufs_thaw(struct vfs *vfsp, struct ufsvfs *ufsvfsp, struct ulockfs *ulp)
{
	int		error	= 0;
	/* remember whether deletes were being held off before this call */
	int		noidel	= (int)(ulp->ul_flag & ULOCKFS_NOIDEL);

	/*
	 * if wlock or hlock or elock
	 */
	if (ULOCKFS_IS_WLOCK(ulp) || ULOCKFS_IS_HLOCK(ulp) ||
	    ULOCKFS_IS_ELOCK(ulp)) {

		/*
		 * don't keep access times
		 * don't free deleted files
		 * if superblock writes are allowed, limit them to me for now
		 */
		ulp->ul_flag |= (ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
		if (ulp->ul_sbowner != (kthread_id_t)-1)
			ulp->ul_sbowner = curthread;

		/*
		 * wait for writes for deleted files and superblock updates
		 */
		(void) ufs_flush(vfsp);

		/*
		 * now make sure the quota file is up-to-date
		 *	expensive; but effective
		 */
		error = ufs_flush(vfsp);
		/*
		 * no one can write the superblock
		 */
		ulp->ul_sbowner = (kthread_id_t)-1;

		/*
		 * special processing for wlock/hlock/elock
		 */
		if (ULOCKFS_IS_WLOCK(ulp)) {
			if (error)
				goto errout;
			error = bfinval(ufsvfsp->vfs_dev, 0);
			if (error)
				goto errout;
			error = ufs_scan_inodes(0, ufs_thaw_wlock,
			    (void *)ufsvfsp, ufsvfsp);
			if (error)
				goto errout;
		}
		if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp)) {
			/* flush errors are ignored for hard and error locks */
			error = 0;
			(void) ufs_scan_inodes(0, ufs_thaw_hlock,
			    (void *)ufsvfsp, ufsvfsp);
			(void) bfinval(ufsvfsp->vfs_dev, 1);
		}
	} else {

		/*
		 * okay to keep access times
		 * okay to free deleted files
		 * okay to write the superblock
		 */
		ulp->ul_flag &= ~(ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
		ulp->ul_sbowner = NULL;

		/*
		 * flush in case deleted files are in memory
		 */
		if (noidel) {
			if (error = ufs_flush(vfsp))
				goto errout;
		}
	}

errout:
	/* reached on success as well: always wake the waiters */
	cv_broadcast(&ulp->ul_cv);
	return (error);
}
586 
/*
 * ufs_reconcile_fs
 *	reconcile incore superblock with ondisk superblock
 *
 * errlck is one of the *_ERRLCK values; only UN_ERRLCK changes the
 * behavior here.
 *
 * Returns:
 *	0	the in-memory superblock was updated
 *	EIO	the superblock or the summary info could not be read
 *	EAGAIN	UN_ERRLCK and the on-disk fs_clean is FSFIX
 *	EBUSY	UN_ERRLCK and the on-disk fs_clean is not yet the expected
 *		finished value (FSLOG when logging, else FSCLEAN)
 *	EACCES	one of the fields that must not change differs, or the
 *		on-disk superblock is bad/inconsistent while the in-memory
 *		fs_clean is FSLOG
 */
int
ufs_reconcile_fs(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck)
{
	struct fs	*mfs; 	/* in-memory superblock */
	struct fs	*dfs;	/* on-disk   superblock */
	struct buf	*bp;	/* on-disk   superblock buf */
	int		 needs_unlock;
	/* only assigned and read when errlck == UN_ERRLCK */
	char		 finished_fsclean;

	mfs = ufsvfsp->vfs_fs;

	/*
	 * get the on-disk copy of the superblock
	 */
	bp = UFS_BREAD(ufsvfsp, vfsp->vfs_dev, SBLOCK, SBSIZE);
	bp->b_flags |= (B_STALE|B_AGE);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (EIO);
	}
	dfs = bp->b_un.b_fs;

	/* error locks may only unlock after the fs has been made consistent */
	if (errlck == UN_ERRLCK) {
		if (dfs->fs_clean == FSFIX) {	/* being repaired */
			brelse(bp);
			return (EAGAIN);
		}
		/* repair not yet started? */
		finished_fsclean = TRANS_ISTRANS(ufsvfsp)? FSLOG: FSCLEAN;
		if (dfs->fs_clean != finished_fsclean) {
			brelse(bp);
			return (EBUSY);
		}
	}

	/*
	 * if superblock has changed too much, abort
	 */
	if ((mfs->fs_sblkno		!= dfs->fs_sblkno) ||
	    (mfs->fs_cblkno		!= dfs->fs_cblkno) ||
	    (mfs->fs_iblkno		!= dfs->fs_iblkno) ||
	    (mfs->fs_dblkno		!= dfs->fs_dblkno) ||
	    (mfs->fs_cgoffset		!= dfs->fs_cgoffset) ||
	    (mfs->fs_cgmask		!= dfs->fs_cgmask) ||
	    (mfs->fs_bsize		!= dfs->fs_bsize) ||
	    (mfs->fs_fsize		!= dfs->fs_fsize) ||
	    (mfs->fs_frag		!= dfs->fs_frag) ||
	    (mfs->fs_bmask		!= dfs->fs_bmask) ||
	    (mfs->fs_fmask		!= dfs->fs_fmask) ||
	    (mfs->fs_bshift		!= dfs->fs_bshift) ||
	    (mfs->fs_fshift		!= dfs->fs_fshift) ||
	    (mfs->fs_fragshift		!= dfs->fs_fragshift) ||
	    (mfs->fs_fsbtodb		!= dfs->fs_fsbtodb) ||
	    (mfs->fs_sbsize		!= dfs->fs_sbsize) ||
	    (mfs->fs_nindir		!= dfs->fs_nindir) ||
	    (mfs->fs_nspf		!= dfs->fs_nspf) ||
	    (mfs->fs_trackskew		!= dfs->fs_trackskew) ||
	    (mfs->fs_cgsize		!= dfs->fs_cgsize) ||
	    (mfs->fs_ntrak		!= dfs->fs_ntrak) ||
	    (mfs->fs_nsect		!= dfs->fs_nsect) ||
	    (mfs->fs_spc		!= dfs->fs_spc) ||
	    (mfs->fs_cpg		!= dfs->fs_cpg) ||
	    (mfs->fs_ipg		!= dfs->fs_ipg) ||
	    (mfs->fs_fpg		!= dfs->fs_fpg) ||
	    (mfs->fs_postblformat	!= dfs->fs_postblformat) ||
	    (mfs->fs_magic		!= dfs->fs_magic)) {
		brelse(bp);
		return (EACCES);
	}
	/*
	 * an on-disk superblock that is bad or fails the state/time check
	 * is refused while the in-memory superblock says FSLOG
	 */
	if (dfs->fs_clean == FSBAD || FSOKAY != dfs->fs_state + dfs->fs_time)
		if (mfs->fs_clean == FSLOG) {
			brelse(bp);
			return (EACCES);
		}

	/*
	 * get new summary info
	 */
	if (ufs_getsummaryinfo(vfsp->vfs_dev, ufsvfsp, dfs)) {
		brelse(bp);
		return (EIO);
	}

	/*
	 * release old summary info and update in-memory superblock
	 * (freed with the old fs_cssize, before that field is refreshed below)
	 */
	kmem_free(mfs->fs_u.fs_csp, mfs->fs_cssize);
	mfs->fs_u.fs_csp = dfs->fs_u.fs_csp;	/* Only entry 0 used */

	/*
	 * update fields allowed to change
	 */
	mfs->fs_size		= dfs->fs_size;
	mfs->fs_dsize		= dfs->fs_dsize;
	mfs->fs_ncg		= dfs->fs_ncg;
	mfs->fs_minfree		= dfs->fs_minfree;
	mfs->fs_rotdelay	= dfs->fs_rotdelay;
	mfs->fs_rps		= dfs->fs_rps;
	mfs->fs_maxcontig	= dfs->fs_maxcontig;
	mfs->fs_maxbpg		= dfs->fs_maxbpg;
	mfs->fs_csmask		= dfs->fs_csmask;
	mfs->fs_csshift		= dfs->fs_csshift;
	mfs->fs_optim		= dfs->fs_optim;
	mfs->fs_csaddr		= dfs->fs_csaddr;
	mfs->fs_cssize		= dfs->fs_cssize;
	mfs->fs_ncyl		= dfs->fs_ncyl;
	mfs->fs_cstotal		= dfs->fs_cstotal;
	mfs->fs_reclaim		= dfs->fs_reclaim;

	/* a pending or running reclaim is (re)started as FS_RECLAIMING */
	if (mfs->fs_reclaim & (FS_RECLAIM|FS_RECLAIMING)) {
		mfs->fs_reclaim &= ~FS_RECLAIM;
		mfs->fs_reclaim |=  FS_RECLAIMING;
		ufs_thread_start(&ufsvfsp->vfs_reclaim,
		    ufs_thread_reclaim, vfsp);
	}

	/* XXX What to do about sparecon? */

	/* XXX need to copy volume label */

	/*
	 * ondisk clean flag overrides inmemory clean flag iff == FSBAD
	 * or if error-locked and ondisk is now clean
	 *
	 * vfs_lock is taken here only if this thread does not already
	 * hold it.
	 */
	needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock);
	if (needs_unlock)
		mutex_enter(&ufsvfsp->vfs_lock);

	if (errlck == UN_ERRLCK) {
		if (finished_fsclean == dfs->fs_clean)
			mfs->fs_clean = finished_fsclean;
		else
			mfs->fs_clean = FSBAD;
		mfs->fs_state = FSOKAY - dfs->fs_time;
	}

	if (FSOKAY != dfs->fs_state + dfs->fs_time ||
	    (dfs->fs_clean == FSBAD))
		mfs->fs_clean = FSBAD;

	if (needs_unlock)
		mutex_exit(&ufsvfsp->vfs_lock);

	brelse(bp);

	return (0);
}
739 
740 /*
741  * ufs_reconcile_inode
742  *	reconcile ondisk inode with incore inode
743  */
744 static int
745 ufs_reconcile_inode(struct inode *ip, void *arg)
746 {
747 	int		i;
748 	int		ndaddr;
749 	int		niaddr;
750 	struct dinode	*dp;		/* ondisk inode */
751 	struct buf	*bp	= NULL;
752 	uid_t		d_uid;
753 	gid_t		d_gid;
754 	int		error = 0;
755 	struct fs	*fs;
756 
757 	/*
758 	 * not an inode we care about
759 	 */
760 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
761 		return (0);
762 
763 	fs = ip->i_fs;
764 
765 	/*
766 	 * Inode reconciliation fails: we made the filesystem quiescent
767 	 * and we did a ufs_flush() before calling ufs_reconcile_inode()
768 	 * and thus the inode should not have been changed inbetween.
769 	 * Any discrepancies indicate a logic error and a pretty
770 	 * significant run-state inconsistency we should complain about.
771 	 */
772 	if (ip->i_flag & (IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG)) {
773 		cmn_err(CE_WARN, "%s: Inode reconciliation failed for"
774 		    "inode %llu", fs->fs_fsmnt, (u_longlong_t)ip->i_number);
775 		return (EINVAL);
776 	}
777 
778 	/*
779 	 * get the dinode
780 	 */
781 	bp = UFS_BREAD(ip->i_ufsvfs,
782 	    ip->i_dev, (daddr_t)fsbtodb(fs, itod(fs, ip->i_number)),
783 	    (int)fs->fs_bsize);
784 	if (bp->b_flags & B_ERROR) {
785 		brelse(bp);
786 		return (EIO);
787 	}
788 	dp  = bp->b_un.b_dino;
789 	dp += itoo(fs, ip->i_number);
790 
791 	/*
792 	 * handle Sun's implementation of EFT
793 	 */
794 	d_uid = (dp->di_suid == UID_LONG) ? dp->di_uid : (uid_t)dp->di_suid;
795 	d_gid = (dp->di_sgid == GID_LONG) ? dp->di_gid : (uid_t)dp->di_sgid;
796 
797 	rw_enter(&ip->i_contents, RW_WRITER);
798 
799 	/*
800 	 * some fields are not allowed to change
801 	 */
802 	if ((ip->i_mode  != dp->di_mode) ||
803 	    (ip->i_gen   != dp->di_gen) ||
804 	    (ip->i_uid   != d_uid) ||
805 	    (ip->i_gid   != d_gid)) {
806 		error = EACCES;
807 		goto out;
808 	}
809 
810 	/*
811 	 * and some are allowed to change
812 	 */
813 	ip->i_size		= dp->di_size;
814 	ip->i_ic.ic_flags	= dp->di_ic.ic_flags;
815 	ip->i_blocks		= dp->di_blocks;
816 	ip->i_nlink		= dp->di_nlink;
817 	if (ip->i_flag & IFASTSYMLNK) {
818 		ndaddr = 1;
819 		niaddr = 0;
820 	} else {
821 		ndaddr = NDADDR;
822 		niaddr = NIADDR;
823 	}
824 	for (i = 0; i < ndaddr; ++i)
825 		ip->i_db[i] = dp->di_db[i];
826 	for (i = 0; i < niaddr; ++i)
827 		ip->i_ib[i] = dp->di_ib[i];
828 
829 out:
830 	rw_exit(&ip->i_contents);
831 	brelse(bp);
832 	return (error);
833 }
834 
835 /*
836  * ufs_reconcile
837  *	reconcile ondisk superblock/inodes with any incore
838  */
839 static int
840 ufs_reconcile(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck)
841 {
842 	int	error = 0;
843 
844 	/*
845 	 * get rid of as much inmemory data as possible
846 	 */
847 	(void) ufs_flush(vfsp);
848 
849 	/*
850 	 * reconcile the superblock and inodes
851 	 */
852 	if (error = ufs_reconcile_fs(vfsp, ufsvfsp, errlck))
853 		return (error);
854 	if (error = ufs_scan_inodes(0, ufs_reconcile_inode, ufsvfsp, ufsvfsp))
855 		return (error);
856 	/*
857 	 * allocation blocks may be incorrect; get rid of them
858 	 */
859 	(void) ufs_flush(vfsp);
860 
861 	return (error);
862 }
863 
/*
 * File system locking
 *
 * Forwards to ufs__fiolfs() with from_user set to 1 and returns its result.
 */
int
ufs_fiolfs(struct vnode *vp, struct lockfs *lockfsp, int from_log)
{
	const int	from_user = 1;

	return (ufs__fiolfs(vp, lockfsp, from_user, from_log));
}
872 
/*
 * kernel-internal interface, also used by fix-on-panic
 *
 * Changes the lock state of the file system that vp belongs to, to the
 * state requested in lockfsp.
 *
 *	vp		any vnode on the file system
 *	lockfsp		the request; lf_key is updated by ufs_getlfd()
 *	from_user	non-zero makes a newly set error lock call
 *			ufs_fault() (unless the on-error-panic flag is set)
 *	from_log	non-zero skips the accounting / swap file checks
 *
 * Returns 0 on success.  Otherwise:
 *	EINVAL	bad request (NULL, lock type out of range, flags or key
 *		rejected by ufs_getlfd(), or a read-only error lock)
 *	EIO	no usable vfs, file system unmounted, or hard locked
 *	EDEADLK	accounting or a swap file is active on the file system
 *	EBUSY	another lockfs request is in progress, or the transition
 *		is not allowed from the current lock
 *	or the error from ufs_quiesce(), ufs_reconcile(), ufs_flush() or
 *	ufs_thaw(); in those cases the previous lock state is restored
 *	unless the file system has become hard locked.
 */
int
ufs__fiolfs(
	struct vnode *vp,
	struct lockfs *lockfsp,
	int from_user,
	int from_log)
{
	struct ulockfs	*ulp;
	struct lockfs	lfs;		/* copy of the previous lock state */
	int		error;
	struct vfs	*vfsp;
	struct ufsvfs	*ufsvfsp;
	int		 errlck		= NO_ERRLCK;
	int		 poll_events	= POLLPRI;
	extern struct pollhead ufs_pollhd;
	ulockfs_info_t *head;
	ulockfs_info_t *info;
	int signal = 0;			/* set when ufs_quiesce() failed */

	/* check valid lock type */
	if (!lockfsp || lockfsp->lf_lock > LOCKFS_MAXLOCK)
		return (EINVAL);

	if (!vp || !vp->v_vfsp || !vp->v_vfsp->vfs_data)
		return (EIO);

	vfsp = vp->v_vfsp;

	if (vfsp->vfs_flag & VFS_UNMOUNTED) /* has been unmounted */
		return (EIO);

	/* take the lock and check again */
	vfs_lock_wait(vfsp);
	if (vfsp->vfs_flag & VFS_UNMOUNTED) {
		vfs_unlock(vfsp);
		return (EIO);
	}

	/*
	 * Can't wlock or ro/elock fs with accounting or local swap file
	 * We need to check for this before we grab the ul_lock to avoid
	 * deadlocks with the accounting framework.
	 */
	if ((LOCKFS_IS_WLOCK(lockfsp) || LOCKFS_IS_ELOCK(lockfsp) ||
	    LOCKFS_IS_ROELOCK(lockfsp)) && !from_log) {
		if (ufs_checkaccton(vp) || ufs_checkswapon(vp)) {
			vfs_unlock(vfsp);
			return (EDEADLK);
		}
	}

	ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	ulp = &ufsvfsp->vfs_ulockfs;
	/* this thread's TSD entry for ulp, or NULL if it has none */
	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
	SEARCH_ULOCKFSP(head, ulp, info);

	/*
	 * Suspend both the reclaim thread and the delete thread.
	 * This must be done outside the lockfs locking protocol.
	 */
	ufs_thread_suspend(&ufsvfsp->vfs_reclaim);
	ufs_thread_suspend(&ufsvfsp->vfs_delete);

	mutex_enter(&ulp->ul_lock);
	atomic_add_long(&ufs_quiesce_pend, 1);

	/*
	 * Quit if there is another lockfs request in progress
	 * that is waiting for existing ufs_vnops to complete.
	 */
	if (ULOCKFS_IS_BUSY(ulp)) {
		error = EBUSY;
		goto errexit;
	}

	/* cannot unlock or downgrade a hard lock */
	if (ULOCKFS_IS_HLOCK(ulp)) {
		error = EIO;
		goto errexit;
	}

	/* an error lock may be unlocked or relocked, only */
	if (ULOCKFS_IS_ELOCK(ulp)) {
		if (!LOCKFS_IS_ULOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
			error = EBUSY;
			goto errexit;
		}
	}

	/*
	 * a read-only error lock may only be upgraded to an
	 * error lock or hard lock
	 */
	if (ULOCKFS_IS_ROELOCK(ulp)) {
		if (!LOCKFS_IS_HLOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
			error = EBUSY;
			goto errexit;
		}
	}

	/*
	 * until read-only error locks are fully implemented
	 * just return EINVAL
	 */
	if (LOCKFS_IS_ROELOCK(lockfsp)) {
		error = EINVAL;
		goto errexit;
	}

	/*
	 * an error lock may only be applied if the file system is
	 * unlocked or already error locked.
	 * (this is to prevent the case where a fs gets changed out from
	 * underneath a fs that is locked for backup,
	 * that is, name/delete/write-locked.)
	 */
	if ((!ULOCKFS_IS_ULOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp) &&
	    !ULOCKFS_IS_ROELOCK(ulp)) &&
	    (LOCKFS_IS_ELOCK(lockfsp) || LOCKFS_IS_ROELOCK(lockfsp))) {
		error = EBUSY;
		goto errexit;
	}

	/* get and validate the input lockfs request */
	if (error = ufs_getlfd(lockfsp, &ulp->ul_lockfs))
		goto errexit;

	/*
	 * save current ulockfs struct
	 * (restored at errout if the request fails)
	 */
	bcopy(&ulp->ul_lockfs, &lfs, sizeof (struct lockfs));

	/*
	 * Freeze the file system (pend future accesses)
	 */
	ufs_freeze(ulp, lockfsp);

	/*
	 * Set locking in progress because ufs_quiesce may free the
	 * ul_lock mutex.
	 */
	ULOCKFS_SET_BUSY(ulp);
	/* update the ioctl copy */
	LOCKFS_SET_BUSY(&ulp->ul_lockfs);

	/*
	 * We need to unset FWLOCK status before we call ufs_quiesce
	 * so that the thread doesn't get suspended. We do this only if
	 * this (fallocate) thread requested an unlock operation.
	 */
	if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
		if (!ULOCKFS_IS_WLOCK(ulp))
			ULOCKFS_CLR_FWLOCK(ulp);
	}

	/*
	 * Quiesce (wait for outstanding accesses to finish)
	 */
	if (error = ufs_quiesce(ulp)) {
		/*
		 * Interrupted due to signal. There could still be
		 * pending vnops.
		 */
		signal = 1;

		/*
		 * We do broadcast because lock-status
		 * could be reverted to old status.
		 */
		cv_broadcast(&ulp->ul_cv);
		goto errout;
	}

	/*
	 * If the fallocate thread requested a write fs lock operation
	 * then we set fwlock status in the ulp.
	 */
	if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
		if (ULOCKFS_IS_WLOCK(ulp))
			ULOCKFS_SET_FWLOCK(ulp);
	}

	/*
	 * save error lock status to pass down to reconciliation
	 * routines and for later cleanup
	 */
	if (LOCKFS_IS_ELOCK(&lfs) && ULOCKFS_IS_ULOCK(ulp))
		errlck = UN_ERRLCK;

	if (ULOCKFS_IS_ELOCK(ulp) || ULOCKFS_IS_ROELOCK(ulp)) {
		int needs_unlock;
		int needs_sbwrite;

		poll_events |= POLLERR;
		errlck = LOCKFS_IS_ELOCK(&lfs) || LOCKFS_IS_ROELOCK(&lfs) ?
		    RE_ERRLCK : SET_ERRLCK;

		/* take vfs_lock only if this thread does not hold it yet */
		needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock);
		if (needs_unlock)
			mutex_enter(&ufsvfsp->vfs_lock);

		/* disable delayed i/o */
		needs_sbwrite = 0;

		if (errlck == SET_ERRLCK) {
			ufsvfsp->vfs_fs->fs_clean = FSBAD;
			needs_sbwrite = 1;
		}

		needs_sbwrite |= ufsvfsp->vfs_dio;
		ufsvfsp->vfs_dio = 0;

		if (needs_unlock)
			mutex_exit(&ufsvfsp->vfs_lock);

		if (needs_sbwrite) {
			ulp->ul_sbowner = curthread;
			TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_STABLE);

			if (needs_unlock)
				mutex_enter(&ufsvfsp->vfs_lock);

			ufsvfsp->vfs_fs->fs_fmod = 0;

			if (needs_unlock)
				mutex_exit(&ufsvfsp->vfs_lock);
		}
	}

	/*
	 * reconcile superblock and inodes if was wlocked
	 * (or error locked)
	 */
	if (LOCKFS_IS_WLOCK(&lfs) || LOCKFS_IS_ELOCK(&lfs)) {
		if (error = ufs_reconcile(vfsp, ufsvfsp, errlck))
			goto errout;
		/*
		 * in case the fs grew; reset the metadata map for logging tests
		 */
		TRANS_MATA_UMOUNT(ufsvfsp);
		TRANS_MATA_MOUNT(ufsvfsp);
		TRANS_MATA_SI(ufsvfsp, ufsvfsp->vfs_fs);
	}

	/*
	 * At least everything *currently* dirty goes out.
	 * A flush error is tolerated when applying a hard or error lock.
	 */

	if ((error = ufs_flush(vfsp)) != 0 && !ULOCKFS_IS_HLOCK(ulp) &&
	    !ULOCKFS_IS_ELOCK(ulp))
		goto errout;

	/*
	 * thaw file system and wakeup pended processes
	 * (a thaw error is likewise tolerated for hard and error locks)
	 */
	if (error = ufs_thaw(vfsp, ufsvfsp, ulp))
		if (!ULOCKFS_IS_HLOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp))
			goto errout;

	/*
	 * reset modified flag if not already write locked
	 */
	if (!LOCKFS_IS_WLOCK(&lfs))
		ULOCKFS_CLR_MOD(ulp);

	/*
	 * idle the lock struct
	 */
	ULOCKFS_CLR_BUSY(ulp);
	/* update the ioctl copy */
	LOCKFS_CLR_BUSY(&ulp->ul_lockfs);

	/*
	 * free current comment
	 * (the comment of the previous lock state, saved in lfs)
	 */
	if (lfs.lf_comment && lfs.lf_comlen != 0) {
		kmem_free(lfs.lf_comment, lfs.lf_comlen);
		lfs.lf_comment = NULL;
		lfs.lf_comlen = 0;
	}

	/* do error lock cleanup */
	if (errlck == UN_ERRLCK)
		ufsfx_unlockfs(ufsvfsp);

	else if (errlck == RE_ERRLCK)
		ufsfx_lockfs(ufsvfsp);

	/* don't allow error lock from user to invoke panic */
	else if (from_user && errlck == SET_ERRLCK &&
	    !(ufsvfsp->vfs_fsfx.fx_flags & (UFSMNT_ONERROR_PANIC >> 4)))
		(void) ufs_fault(ufsvfsp->vfs_root,
		    ulp->ul_lockfs.lf_comment && ulp->ul_lockfs.lf_comlen > 0 ?
		    ulp->ul_lockfs.lf_comment: "user-applied error lock");

	atomic_add_long(&ufs_quiesce_pend, -1);
	mutex_exit(&ulp->ul_lock);
	vfs_unlock(vfsp);

	if (ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs))
		poll_events |= POLLERR;

	pollwakeup(&ufs_pollhd, poll_events);

	/*
	 * Allow both the delete thread and the reclaim thread to
	 * continue.
	 */
	ufs_thread_continue(&ufsvfsp->vfs_delete);
	ufs_thread_continue(&ufsvfsp->vfs_reclaim);

	return (0);

errout:
	/*
	 * Lock failed. Reset the old lock in ufsvfs if not hard locked.
	 */
	if (!LOCKFS_IS_HLOCK(&ulp->ul_lockfs)) {
		bcopy(&lfs, &ulp->ul_lockfs, sizeof (struct lockfs));
		ulp->ul_fs_lock = (1 << lfs.lf_lock);
	}

	/*
	 * Don't call ufs_thaw() when there's a signal during
	 * ufs quiesce operation as it can lead to deadlock
	 * with getpage.
	 */
	if (signal == 0)
		(void) ufs_thaw(vfsp, ufsvfsp, ulp);

	ULOCKFS_CLR_BUSY(ulp);
	LOCKFS_CLR_BUSY(&ulp->ul_lockfs);

errexit:
	/* common exit for every failure after ul_lock was taken */
	atomic_add_long(&ufs_quiesce_pend, -1);
	mutex_exit(&ulp->ul_lock);
	vfs_unlock(vfsp);

	/*
	 * Allow both the delete thread and the reclaim thread to
	 * continue.
	 */
	ufs_thread_continue(&ufsvfsp->vfs_delete);
	ufs_thread_continue(&ufsvfsp->vfs_reclaim);

	return (error);
}
1220 
1221 /*
1222  * fiolfss
1223  * 	return the current file system locking state info
1224  */
1225 int
1226 ufs_fiolfss(struct vnode *vp, struct lockfs *lockfsp)
1227 {
1228 	struct ulockfs	*ulp;
1229 
1230 	if (!vp || !vp->v_vfsp || !VTOI(vp))
1231 		return (EINVAL);
1232 
1233 	/* file system has been forcibly unmounted */
1234 	if (VTOI(vp)->i_ufsvfs == NULL)
1235 		return (EIO);
1236 
1237 	ulp = VTOUL(vp);
1238 
1239 	if (ULOCKFS_IS_HLOCK(ulp)) {
1240 		*lockfsp = ulp->ul_lockfs;	/* structure assignment */
1241 		return (0);
1242 	}
1243 
1244 	mutex_enter(&ulp->ul_lock);
1245 
1246 	*lockfsp = ulp->ul_lockfs;	/* structure assignment */
1247 
1248 	if (ULOCKFS_IS_MOD(ulp))
1249 		lockfsp->lf_flags |= LOCKFS_MOD;
1250 
1251 	mutex_exit(&ulp->ul_lock);
1252 
1253 	return (0);
1254 }
1255 
1256 /*
1257  * ufs_check_lockfs
1258  *	check whether a ufs_vnops conflicts with the file system lock
1259  */
1260 int
1261 ufs_check_lockfs(struct ufsvfs *ufsvfsp, struct ulockfs *ulp, ulong_t mask)
1262 {
1263 	k_sigset_t	smask;
1264 	int		sig, slock;
1265 
1266 	ASSERT(MUTEX_HELD(&ulp->ul_lock));
1267 
1268 	while (ulp->ul_fs_lock & mask) {
1269 		slock = (int)ULOCKFS_IS_SLOCK(ulp);
1270 		if ((curthread->t_flag & T_DONTPEND) && !slock) {
1271 			curthread->t_flag |= T_WOULDBLOCK;
1272 			return (EAGAIN);
1273 		}
1274 		curthread->t_flag &= ~T_WOULDBLOCK;
1275 
1276 		/*
1277 		 * In the case of an onerr umount of the fs, threads could
1278 		 * have blocked before coming into ufs_check_lockfs and
1279 		 * need to check for the special case of ELOCK and
1280 		 * vfs_dontblock being set which would indicate that the fs
1281 		 * is on its way out and will not return therefore making
1282 		 * EIO the appropriate response.
1283 		 */
1284 		if (ULOCKFS_IS_HLOCK(ulp) ||
1285 		    (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
1286 			return (EIO);
1287 
1288 		/*
1289 		 * wait for lock status to change
1290 		 */
1291 		if (slock || ufsvfsp->vfs_nointr) {
1292 			cv_wait(&ulp->ul_cv, &ulp->ul_lock);
1293 		} else {
1294 			sigintr(&smask, 1);
1295 			sig = cv_wait_sig(&ulp->ul_cv, &ulp->ul_lock);
1296 			sigunintr(&smask);
1297 			if ((!sig && (ulp->ul_fs_lock & mask)) ||
1298 			    ufsvfsp->vfs_dontblock)
1299 				return (EINTR);
1300 		}
1301 	}
1302 
1303 	if (mask & ULOCKFS_FWLOCK) {
1304 		atomic_add_long(&ulp->ul_falloc_cnt, 1);
1305 		ULOCKFS_SET_FALLOC(ulp);
1306 	} else {
1307 		atomic_add_long(&ulp->ul_vnops_cnt, 1);
1308 	}
1309 
1310 	return (0);
1311 }
1312 
1313 /*
1314  * Check whether we came across the handcrafted lockfs protocol path. We can't
1315  * simply check for T_DONTBLOCK here as one would assume since this can also
1316  * falsely catch recursive VOP's going to a different filesystem, instead we
1317  * check if we already hold the ulockfs->ul_lock mutex.
1318  */
1319 static int
1320 ufs_lockfs_is_under_rawlockfs(struct ulockfs *ulp)
1321 {
1322 	return ((mutex_owner(&ulp->ul_lock) != curthread) ? 0 : 1);
1323 }
1324 
1325 /*
1326  * ufs_lockfs_begin - start the lockfs locking protocol
1327  */
1328 int
1329 ufs_lockfs_begin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask)
1330 {
1331 	int 		error;
1332 	int		rec_vop;
1333 	ushort_t	op_cnt_incremented = 0;
1334 	ulong_t		*ctr;
1335 	struct ulockfs *ulp;
1336 	ulockfs_info_t	*ulockfs_info;
1337 	ulockfs_info_t	*ulockfs_info_free;
1338 	ulockfs_info_t	*ulockfs_info_temp;
1339 
1340 	/*
1341 	 * file system has been forcibly unmounted
1342 	 */
1343 	if (ufsvfsp == NULL)
1344 		return (EIO);
1345 
1346 	*ulpp = ulp = &ufsvfsp->vfs_ulockfs;
1347 
1348 	/*
1349 	 * Do lockfs protocol
1350 	 */
1351 	ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1352 	IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);
1353 
1354 	/*
1355 	 * Detect recursive VOP call or handcrafted internal lockfs protocol
1356 	 * path and bail out in that case.
1357 	 */
1358 	if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
1359 		*ulpp = NULL;
1360 		return (0);
1361 	} else {
1362 		if (ulockfs_info_free == NULL) {
1363 			if ((ulockfs_info_temp = (ulockfs_info_t *)
1364 			    kmem_zalloc(sizeof (ulockfs_info_t),
1365 			    KM_NOSLEEP)) == NULL) {
1366 				*ulpp = NULL;
1367 				return (ENOMEM);
1368 			}
1369 		}
1370 	}
1371 
1372 	/*
1373 	 * First time VOP call
1374 	 *
1375 	 * Increment the ctr irrespective of the lockfs state. If the lockfs
1376 	 * state is not ULOCKFS_ULOCK, we can decrement it later. However,
1377 	 * before incrementing we need to check if there is a pending quiesce
1378 	 * request because if we have a continuous stream of ufs_lockfs_begin
1379 	 * requests pounding on a few cpu's then the ufs_quiesce thread might
1380 	 * never see the value of zero for ctr - a livelock kind of scenario.
1381 	 */
1382 	ctr = (mask & ULOCKFS_FWLOCK) ?
1383 	    &ulp->ul_falloc_cnt : &ulp->ul_vnops_cnt;
1384 	if (!ULOCKFS_IS_SLOCK(ulp)) {
1385 		atomic_add_long(ctr, 1);
1386 		op_cnt_incremented++;
1387 	}
1388 
1389 	/*
1390 	 * If the lockfs state (indicated by ul_fs_lock) is not just
1391 	 * ULOCKFS_ULOCK, then we will be routed through ufs_check_lockfs
1392 	 * where there is a check with an appropriate mask to selectively allow
1393 	 * operations permitted for that kind of lockfs state.
1394 	 *
1395 	 * Even these selective operations should not be allowed to go through
1396 	 * if a lockfs request is in progress because that could result in inode
1397 	 * modifications during a quiesce and could hence result in inode
1398 	 * reconciliation failures. ULOCKFS_SLOCK alone would not be sufficient,
1399 	 * so make use of ufs_quiesce_pend to disallow vnode operations when a
1400 	 * quiesce is in progress.
1401 	 */
1402 	if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
1403 		if (op_cnt_incremented)
1404 			if (!atomic_add_long_nv(ctr, -1))
1405 				cv_broadcast(&ulp->ul_cv);
1406 		mutex_enter(&ulp->ul_lock);
1407 		error = ufs_check_lockfs(ufsvfsp, ulp, mask);
1408 		mutex_exit(&ulp->ul_lock);
1409 		if (error) {
1410 			if (ulockfs_info_free == NULL)
1411 				kmem_free(ulockfs_info_temp,
1412 				    sizeof (ulockfs_info_t));
1413 			return (error);
1414 		}
1415 	} else {
1416 		/*
1417 		 * This is the common case of file system in a unlocked state.
1418 		 *
1419 		 * If a file system is unlocked, we would expect the ctr to have
1420 		 * been incremented by now. But this will not be true when a
1421 		 * quiesce is winding up - SLOCK was set when we checked before
1422 		 * incrementing the ctr, but by the time we checked for
1423 		 * ULOCKFS_IS_JUSTULOCK, the quiesce thread was gone. It is okay
1424 		 * to take ul_lock and go through the slow path in this uncommon
1425 		 * case.
1426 		 */
1427 		if (op_cnt_incremented == 0) {
1428 			mutex_enter(&ulp->ul_lock);
1429 			error = ufs_check_lockfs(ufsvfsp, ulp, mask);
1430 			if (error) {
1431 				mutex_exit(&ulp->ul_lock);
1432 				if (ulockfs_info_free == NULL)
1433 					kmem_free(ulockfs_info_temp,
1434 					    sizeof (ulockfs_info_t));
1435 				return (error);
1436 			}
1437 			if (mask & ULOCKFS_FWLOCK)
1438 				ULOCKFS_SET_FALLOC(ulp);
1439 			mutex_exit(&ulp->ul_lock);
1440 		} else if (mask & ULOCKFS_FWLOCK) {
1441 			mutex_enter(&ulp->ul_lock);
1442 			ULOCKFS_SET_FALLOC(ulp);
1443 			mutex_exit(&ulp->ul_lock);
1444 		}
1445 	}
1446 
1447 	if (ulockfs_info_free != NULL) {
1448 		ulockfs_info_free->ulp = ulp;
1449 		if (mask & ULOCKFS_FWLOCK)
1450 			ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE;
1451 	} else {
1452 		ulockfs_info_temp->ulp = ulp;
1453 		ulockfs_info_temp->next = ulockfs_info;
1454 		if (mask & ULOCKFS_FWLOCK)
1455 			ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE;
1456 		ASSERT(ufs_lockfs_key != 0);
1457 		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
1458 	}
1459 
1460 	curthread->t_flag |= T_DONTBLOCK;
1461 	return (0);
1462 }
1463 
1464 /*
1465  * Check whether we are returning from the top level VOP.
1466  */
1467 static int
1468 ufs_lockfs_top_vop_return(ulockfs_info_t *head)
1469 {
1470 	ulockfs_info_t *info;
1471 	int result = 1;
1472 
1473 	for (info = head; info != NULL; info = info->next) {
1474 		if (info->ulp != NULL) {
1475 			result = 0;
1476 			break;
1477 		}
1478 	}
1479 
1480 	return (result);
1481 }
1482 
1483 /*
1484  * ufs_lockfs_end - terminate the lockfs locking protocol
1485  */
1486 void
1487 ufs_lockfs_end(struct ulockfs *ulp)
1488 {
1489 	ulockfs_info_t *info;
1490 	ulockfs_info_t *head;
1491 
1492 	/*
1493 	 * end-of-VOP protocol
1494 	 */
1495 	if (ulp == NULL)
1496 		return;
1497 
1498 	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1499 	SEARCH_ULOCKFSP(head, ulp, info);
1500 
1501 	/*
1502 	 * If we're called from a first level VOP, we have to have a
1503 	 * valid ulockfs record in the TSD.
1504 	 */
1505 	ASSERT(info != NULL);
1506 
1507 	/*
1508 	 * Invalidate the ulockfs record.
1509 	 */
1510 	info->ulp = NULL;
1511 
1512 	if (ufs_lockfs_top_vop_return(head))
1513 		curthread->t_flag &= ~T_DONTBLOCK;
1514 
1515 	/* fallocate thread */
1516 	if (ULOCKFS_IS_FALLOC(ulp) && info->flags & ULOCK_INFO_FALLOCATE) {
1517 		/* Clear the thread's fallocate state */
1518 		info->flags &= ~ULOCK_INFO_FALLOCATE;
1519 		if (!atomic_add_long_nv(&ulp->ul_falloc_cnt, -1)) {
1520 			mutex_enter(&ulp->ul_lock);
1521 			ULOCKFS_CLR_FALLOC(ulp);
1522 			cv_broadcast(&ulp->ul_cv);
1523 			mutex_exit(&ulp->ul_lock);
1524 		}
1525 	} else  { /* normal thread */
1526 		if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1))
1527 			cv_broadcast(&ulp->ul_cv);
1528 	}
1529 }
1530 
1531 /*
1532  * ufs_lockfs_trybegin - try to start the lockfs locking protocol without
1533  * blocking.
1534  */
1535 int
1536 ufs_lockfs_trybegin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask)
1537 {
1538 	int 		error = 0;
1539 	int		rec_vop;
1540 	ushort_t	op_cnt_incremented = 0;
1541 	ulong_t		*ctr;
1542 	struct ulockfs *ulp;
1543 	ulockfs_info_t	*ulockfs_info;
1544 	ulockfs_info_t	*ulockfs_info_free;
1545 	ulockfs_info_t	*ulockfs_info_temp;
1546 
1547 	/*
1548 	 * file system has been forcibly unmounted
1549 	 */
1550 	if (ufsvfsp == NULL)
1551 		return (EIO);
1552 
1553 	*ulpp = ulp = &ufsvfsp->vfs_ulockfs;
1554 
1555 	/*
1556 	 * Do lockfs protocol
1557 	 */
1558 	ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1559 	IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);
1560 
1561 	/*
1562 	 * Detect recursive VOP call or handcrafted internal lockfs protocol
1563 	 * path and bail out in that case.
1564 	 */
1565 	if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
1566 		*ulpp = NULL;
1567 		return (0);
1568 	} else {
1569 		if (ulockfs_info_free == NULL) {
1570 			if ((ulockfs_info_temp = (ulockfs_info_t *)
1571 			    kmem_zalloc(sizeof (ulockfs_info_t),
1572 			    KM_NOSLEEP)) == NULL) {
1573 				*ulpp = NULL;
1574 				return (ENOMEM);
1575 			}
1576 		}
1577 	}
1578 
1579 	/*
1580 	 * First time VOP call
1581 	 *
1582 	 * Increment the ctr irrespective of the lockfs state. If the lockfs
1583 	 * state is not ULOCKFS_ULOCK, we can decrement it later. However,
1584 	 * before incrementing we need to check if there is a pending quiesce
1585 	 * request because if we have a continuous stream of ufs_lockfs_begin
1586 	 * requests pounding on a few cpu's then the ufs_quiesce thread might
1587 	 * never see the value of zero for ctr - a livelock kind of scenario.
1588 	 */
1589 	ctr = (mask & ULOCKFS_FWLOCK) ?
1590 	    &ulp->ul_falloc_cnt : &ulp->ul_vnops_cnt;
1591 	if (!ULOCKFS_IS_SLOCK(ulp)) {
1592 		atomic_add_long(ctr, 1);
1593 		op_cnt_incremented++;
1594 	}
1595 
1596 	if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
1597 		/*
1598 		 * Non-blocking version of ufs_check_lockfs() code.
1599 		 *
1600 		 * If the file system is not hard locked or error locked
1601 		 * and if ulp->ul_fs_lock allows this operation, increment
1602 		 * the appropriate counter and proceed (For eg., In case the
1603 		 * file system is delete locked, a mmap can still go through).
1604 		 */
1605 		if (op_cnt_incremented)
1606 			if (!atomic_add_long_nv(ctr, -1))
1607 				cv_broadcast(&ulp->ul_cv);
1608 		mutex_enter(&ulp->ul_lock);
1609 		if (ULOCKFS_IS_HLOCK(ulp) ||
1610 		    (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
1611 			error = EIO;
1612 		else if (ulp->ul_fs_lock & mask)
1613 			error = EAGAIN;
1614 
1615 		if (error) {
1616 			mutex_exit(&ulp->ul_lock);
1617 			if (ulockfs_info_free == NULL)
1618 				kmem_free(ulockfs_info_temp,
1619 				    sizeof (ulockfs_info_t));
1620 			return (error);
1621 		}
1622 		atomic_add_long(ctr, 1);
1623 		if (mask & ULOCKFS_FWLOCK)
1624 			ULOCKFS_SET_FALLOC(ulp);
1625 		mutex_exit(&ulp->ul_lock);
1626 	} else {
1627 		/*
1628 		 * This is the common case of file system in a unlocked state.
1629 		 *
1630 		 * If a file system is unlocked, we would expect the ctr to have
1631 		 * been incremented by now. But this will not be true when a
1632 		 * quiesce is winding up - SLOCK was set when we checked before
1633 		 * incrementing the ctr, but by the time we checked for
1634 		 * ULOCKFS_IS_JUSTULOCK, the quiesce thread was gone. Take
1635 		 * ul_lock and go through the non-blocking version of
1636 		 * ufs_check_lockfs() code.
1637 		 */
1638 		if (op_cnt_incremented == 0) {
1639 			mutex_enter(&ulp->ul_lock);
1640 			if (ULOCKFS_IS_HLOCK(ulp) ||
1641 			    (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
1642 				error = EIO;
1643 			else if (ulp->ul_fs_lock & mask)
1644 				error = EAGAIN;
1645 
1646 			if (error) {
1647 				mutex_exit(&ulp->ul_lock);
1648 				if (ulockfs_info_free == NULL)
1649 					kmem_free(ulockfs_info_temp,
1650 					    sizeof (ulockfs_info_t));
1651 				return (error);
1652 			}
1653 			atomic_add_long(ctr, 1);
1654 			if (mask & ULOCKFS_FWLOCK)
1655 				ULOCKFS_SET_FALLOC(ulp);
1656 			mutex_exit(&ulp->ul_lock);
1657 		} else if (mask & ULOCKFS_FWLOCK) {
1658 			mutex_enter(&ulp->ul_lock);
1659 			ULOCKFS_SET_FALLOC(ulp);
1660 			mutex_exit(&ulp->ul_lock);
1661 		}
1662 	}
1663 
1664 	if (ulockfs_info_free != NULL) {
1665 		ulockfs_info_free->ulp = ulp;
1666 		if (mask & ULOCKFS_FWLOCK)
1667 			ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE;
1668 	} else {
1669 		ulockfs_info_temp->ulp = ulp;
1670 		ulockfs_info_temp->next = ulockfs_info;
1671 		if (mask & ULOCKFS_FWLOCK)
1672 			ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE;
1673 		ASSERT(ufs_lockfs_key != 0);
1674 		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
1675 	}
1676 
1677 	curthread->t_flag |= T_DONTBLOCK;
1678 	return (0);
1679 }
1680 
1681 /*
1682  * specialized version of ufs_lockfs_begin() called by ufs_getpage().
1683  */
1684 int
1685 ufs_lockfs_begin_getpage(
1686 	struct ufsvfs	*ufsvfsp,
1687 	struct ulockfs	**ulpp,
1688 	struct seg	*seg,
1689 	int		read_access,
1690 	uint_t		*protp)
1691 {
1692 	ulong_t			mask;
1693 	int 			error;
1694 	int			rec_vop;
1695 	struct ulockfs		*ulp;
1696 	ulockfs_info_t		*ulockfs_info;
1697 	ulockfs_info_t		*ulockfs_info_free;
1698 	ulockfs_info_t		*ulockfs_info_temp;
1699 
1700 	/*
1701 	 * file system has been forcibly unmounted
1702 	 */
1703 	if (ufsvfsp == NULL)
1704 		return (EIO);
1705 
1706 	*ulpp = ulp = &ufsvfsp->vfs_ulockfs;
1707 
1708 	/*
1709 	 * Do lockfs protocol
1710 	 */
1711 	ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1712 	IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);
1713 
1714 	/*
1715 	 * Detect recursive VOP call or handcrafted internal lockfs protocol
1716 	 * path and bail out in that case.
1717 	 */
1718 	if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
1719 		*ulpp = NULL;
1720 		return (0);
1721 	} else {
1722 		if (ulockfs_info_free == NULL) {
1723 			if ((ulockfs_info_temp = (ulockfs_info_t *)
1724 			    kmem_zalloc(sizeof (ulockfs_info_t),
1725 			    KM_NOSLEEP)) == NULL) {
1726 				*ulpp = NULL;
1727 				return (ENOMEM);
1728 			}
1729 		}
1730 	}
1731 
1732 	/*
1733 	 * First time VOP call
1734 	 */
1735 	atomic_add_long(&ulp->ul_vnops_cnt, 1);
1736 	if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
1737 		if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1))
1738 			cv_broadcast(&ulp->ul_cv);
1739 		mutex_enter(&ulp->ul_lock);
1740 		if (seg->s_ops == &segvn_ops &&
1741 		    ((struct segvn_data *)seg->s_data)->type != MAP_SHARED) {
1742 			mask = (ulong_t)ULOCKFS_GETREAD_MASK;
1743 		} else if (protp && read_access) {
1744 			/*
1745 			 * Restrict the mapping to readonly.
1746 			 * Writes to this mapping will cause
1747 			 * another fault which will then
1748 			 * be suspended if fs is write locked
1749 			 */
1750 			*protp &= ~PROT_WRITE;
1751 			mask = (ulong_t)ULOCKFS_GETREAD_MASK;
1752 		} else
1753 			mask = (ulong_t)ULOCKFS_GETWRITE_MASK;
1754 
1755 		/*
1756 		 * will sleep if this fs is locked against this VOP
1757 		 */
1758 		error = ufs_check_lockfs(ufsvfsp, ulp, mask);
1759 		mutex_exit(&ulp->ul_lock);
1760 		if (error) {
1761 			if (ulockfs_info_free == NULL)
1762 				kmem_free(ulockfs_info_temp,
1763 				    sizeof (ulockfs_info_t));
1764 			return (error);
1765 		}
1766 	}
1767 
1768 	if (ulockfs_info_free != NULL) {
1769 		ulockfs_info_free->ulp = ulp;
1770 	} else {
1771 		ulockfs_info_temp->ulp = ulp;
1772 		ulockfs_info_temp->next = ulockfs_info;
1773 		ASSERT(ufs_lockfs_key != 0);
1774 		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
1775 	}
1776 
1777 	curthread->t_flag |= T_DONTBLOCK;
1778 	return (0);
1779 }
1780 
1781 void
1782 ufs_lockfs_tsd_destructor(void *head)
1783 {
1784 	ulockfs_info_t *curr = (ulockfs_info_t *)head;
1785 	ulockfs_info_t *temp;
1786 
1787 	for (; curr != NULL; ) {
1788 		/*
1789 		 * The TSD destructor is being called when the thread exits
1790 		 * (via thread_exit()). At that time it must have cleaned up
1791 		 * all VOPs via ufs_lockfs_end() and there must not be a
1792 		 * valid ulockfs record exist while a thread is exiting.
1793 		 */
1794 		temp = curr;
1795 		curr = curr->next;
1796 		ASSERT(temp->ulp == NULL);
1797 		kmem_free(temp, sizeof (ulockfs_info_t));
1798 	}
1799 }
1800