1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1989, 1993
5 *	The Regents of the University of California.  All rights reserved.
6 * (c) UNIX System Laboratories, Inc.
7 * All or some portions of this file are derived from material licensed
8 * to the University of California by American Telephone and Telegraph
9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10 * the permission of UNIX System Laboratories, Inc.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 *    notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 *    notice, this list of conditions and the following disclaimer in the
19 *    documentation and/or other materials provided with the distribution.
20 * 3. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
37 */
38
39/*
40 * External virtual filesystem routines
41 */
42
43#include <sys/cdefs.h>
44__FBSDID("$FreeBSD$");
45
46#include "opt_ddb.h"
47#include "opt_watchdog.h"
48
49#include <sys/param.h>
50#include <sys/systm.h>
51#include <sys/bio.h>
52#include <sys/buf.h>
53#include <sys/capsicum.h>
54#include <sys/condvar.h>
55#include <sys/conf.h>
56#include <sys/counter.h>
57#include <sys/dirent.h>
58#include <sys/event.h>
59#include <sys/eventhandler.h>
60#include <sys/extattr.h>
61#include <sys/file.h>
62#include <sys/fcntl.h>
63#include <sys/jail.h>
64#include <sys/kdb.h>
65#include <sys/kernel.h>
66#include <sys/kthread.h>
67#include <sys/ktr.h>
68#include <sys/lockf.h>
69#include <sys/malloc.h>
70#include <sys/mount.h>
71#include <sys/namei.h>
72#include <sys/pctrie.h>
73#include <sys/priv.h>
74#include <sys/reboot.h>
75#include <sys/refcount.h>
76#include <sys/rwlock.h>
77#include <sys/sched.h>
78#include <sys/sleepqueue.h>
79#include <sys/smr.h>
80#include <sys/smp.h>
81#include <sys/stat.h>
82#include <sys/sysctl.h>
83#include <sys/syslog.h>
84#include <sys/vmmeter.h>
85#include <sys/vnode.h>
86#include <sys/watchdog.h>
87
88#include <machine/stdarg.h>
89
90#include <security/mac/mac_framework.h>
91
92#include <vm/vm.h>
93#include <vm/vm_object.h>
94#include <vm/vm_extern.h>
95#include <vm/pmap.h>
96#include <vm/vm_map.h>
97#include <vm/vm_page.h>
98#include <vm/vm_kern.h>
99#include <vm/uma.h>
100
101#ifdef DDB
102#include <ddb/ddb.h>
103#endif
104
105static void	delmntque(struct vnode *vp);
106static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
107		    int slpflag, int slptimeo);
108static void	syncer_shutdown(void *arg, int howto);
109static int	vtryrecycle(struct vnode *vp);
110static void	v_init_counters(struct vnode *);
111static void	vgonel(struct vnode *);
112static void	vfs_knllock(void *arg);
113static void	vfs_knlunlock(void *arg);
114static void	vfs_knl_assert_locked(void *arg);
115static void	vfs_knl_assert_unlocked(void *arg);
116static void	destroy_vpollinfo(struct vpollinfo *vi);
117static int	v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo,
118		    daddr_t startlbn, daddr_t endlbn);
119static void	vnlru_recalc(void);
120
121/*
122 * These fences are intended for cases where some synchronization is
123 * needed between access of v_iflags and lockless vnode refcount (v_holdcnt
124 * and v_usecount) updates.  Access to v_iflags is generally synchronized
125 * by the interlock, but we have some internal assertions that check vnode
126 * flags without acquiring the lock.  Thus, these fences are INVARIANTS-only
127 * for now.
128 */
129#ifdef INVARIANTS
130#define	VNODE_REFCOUNT_FENCE_ACQ()	atomic_thread_fence_acq()
131#define	VNODE_REFCOUNT_FENCE_REL()	atomic_thread_fence_rel()
132#else
133#define	VNODE_REFCOUNT_FENCE_ACQ()
134#define	VNODE_REFCOUNT_FENCE_REL()
135#endif
136
137/*
138 * Number of vnodes in existence.  Increased whenever getnewvnode()
139 * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode.
140 */
141static u_long __exclusive_cache_line numvnodes;
142
143SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
144    "Number of vnodes in existence");
145
146static counter_u64_t vnodes_created;
147SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
148    "Number of vnodes created by getnewvnode");
149
150/*
151 * Conversion tables for conversion from vnode types to inode formats
152 * and back.
153 */
/*
 * iftovt_tab: 16-entry table yielding a vnode type; slots that do not
 * correspond to a file format hold VNON.
 * NOTE(review): presumably indexed by the S_IFMT bits of a mode shifted
 * down to the range 0..15 (see IFTOVT in sys/vnode.h) -- confirm.
 */
enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
};
/*
 * vttoif_tab: 10-entry table yielding S_IF* format bits; the first slot
 * is 0 and the last two are S_IFMT.
 * NOTE(review): presumably indexed by enum vtype in declaration order
 * (see VTTOIF in sys/vnode.h) -- confirm.
 */
int vttoif_tab[10] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
};
162
163/*
 * List of allocated vnodes in the system.
165 */
166static TAILQ_HEAD(freelst, vnode) vnode_list;
167static struct vnode *vnode_list_free_marker;
168static struct vnode *vnode_list_reclaim_marker;
169
170/*
171 * "Free" vnode target.  Free vnodes are rarely completely free, but are
172 * just ones that are cheap to recycle.  Usually they are for files which
173 * have been stat'd but not read; these usually have inode and namecache
174 * data attached to them.  This target is the preferred minimum size of a
175 * sub-cache consisting mostly of such files. The system balances the size
176 * of this sub-cache with its complement to try to prevent either from
177 * thrashing while the other is relatively inactive.  The targets express
178 * a preference for the best balance.
179 *
 * "Above" this target there are 2 further targets (watermarks) related
 * to recycling of free vnodes.  In the best-operating case, the cache is
182 * exactly full, the free list has size between vlowat and vhiwat above the
183 * free target, and recycling from it and normal use maintains this state.
184 * Sometimes the free list is below vlowat or even empty, but this state
185 * is even better for immediate use provided the cache is not full.
186 * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
187 * ones) to reach one of these states.  The watermarks are currently hard-
188 * coded as 4% and 9% of the available space higher.  These and the default
189 * of 25% for wantfreevnodes are too large if the memory size is large.
190 * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
191 * whenever vnlru_proc() becomes active.
192 */
193static long wantfreevnodes;
194static long __exclusive_cache_line freevnodes;
195SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD,
196    &freevnodes, 0, "Number of \"free\" vnodes");
197static long freevnodes_old;
198
199static counter_u64_t recycles_count;
200SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count,
201    "Number of vnodes recycled to meet vnode cache targets");
202
203static counter_u64_t recycles_free_count;
204SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD, &recycles_free_count,
205    "Number of free vnodes recycled to meet vnode cache targets");
206
207static counter_u64_t deferred_inact;
208SYSCTL_COUNTER_U64(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD, &deferred_inact,
209    "Number of times inactive processing was deferred");
210
211/* To keep more than one thread at a time from running vfs_getnewfsid */
212static struct mtx mntid_mtx;
213
214/*
215 * Lock for any access to the following:
216 *	vnode_list
217 *	numvnodes
218 *	freevnodes
219 */
220static struct mtx __exclusive_cache_line vnode_list_mtx;
221
222/* Publicly exported FS */
223struct nfs_public nfs_pub;
224
225static uma_zone_t buf_trie_zone;
226static smr_t buf_trie_smr;
227
228/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
229static uma_zone_t vnode_zone;
230static uma_zone_t vnodepoll_zone;
231
232__read_frequently smr_t vfs_smr;
233
234/*
235 * The workitem queue.
236 *
237 * It is useful to delay writes of file data and filesystem metadata
238 * for tens of seconds so that quickly created and deleted files need
239 * not waste disk bandwidth being created and removed. To realize this,
240 * we append vnodes to a "workitem" queue. When running with a soft
241 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds. Thus, writes to mounted block
 * devices are delayed only about half the time that file data is delayed.
244 * Similarly, directory updates are more critical, so are only delayed
245 * about a third the time that file data is delayed. Thus, there are
246 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
247 * one each second (driven off the filesystem syncer process). The
248 * syncer_delayno variable indicates the next queue that is to be processed.
249 * Items that need to be processed soon are placed in this queue:
250 *
251 *	syncer_workitem_pending[syncer_delayno]
252 *
253 * A delay of fifteen seconds is done by placing the request fifteen
254 * entries later in the queue:
255 *
256 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
257 *
258 */
259static int syncer_delayno;
260static long syncer_mask;
261LIST_HEAD(synclist, bufobj);
262static struct synclist *syncer_workitem_pending;
263/*
264 * The sync_mtx protects:
265 *	bo->bo_synclist
266 *	sync_vnode_count
267 *	syncer_delayno
268 *	syncer_state
269 *	syncer_workitem_pending
270 *	syncer_worklist_len
271 *	rushjob
272 */
273static struct mtx sync_mtx;
274static struct cv sync_wakeup;
275
276#define SYNCER_MAXDELAY		32
277static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
278static int syncdelay = 30;		/* max time to delay syncing data */
279static int filedelay = 30;		/* time to delay syncing files */
280SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
281    "Time to delay syncing files (in seconds)");
282static int dirdelay = 29;		/* time to delay syncing directories */
283SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
284    "Time to delay syncing directories (in seconds)");
285static int metadelay = 28;		/* time to delay syncing metadata */
286SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
287    "Time to delay syncing metadata (in seconds)");
288static int rushjob;		/* number of slots to run ASAP */
289static int stat_rush_requests;	/* number of times I/O speeded up */
290SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
291    "Number of times I/O speeded up (rush requests)");
292
/* Number of vnode pointers held in one struct vdbatch. */
#define	VDBATCH_SIZE 8
/*
 * A small batch of vnode pointers.  One instance is defined per CPU by the
 * DPCPU_DEFINE_STATIC() below; vntblinit() zeroes each instance and
 * initializes its mutex.
 * NOTE(review): the fields are consumed by the vdbatch_*() routines, which
 * are not visible here -- confirm the per-field notes against them.
 */
struct vdbatch {
	u_int index;		/* presumably the number of used tab[] slots */
	long freevnodes;	/* presumably a per-CPU freevnodes delta */
	struct mtx lock;	/* named "vdbatch" by vntblinit() */
	struct vnode *tab[VDBATCH_SIZE];	/* the batched vnodes */
};
DPCPU_DEFINE_STATIC(struct vdbatch, vd);
301
302static void	vdbatch_dequeue(struct vnode *vp);
303
304/*
305 * When shutting down the syncer, run it at four times normal speed.
306 */
307#define SYNCER_SHUTDOWN_SPEEDUP		4
308static int sync_vnode_count;
309static int syncer_worklist_len;
310static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
311    syncer_state;
312
313/* Target for maximum number of vnodes. */
314u_long desiredvnodes;
315static u_long gapvnodes;		/* gap between wanted and desired */
316static u_long vhiwat;		/* enough extras after expansion */
317static u_long vlowat;		/* minimal extras before expansion */
318static u_long vstir;		/* nonzero to stir non-free vnodes */
319static volatile int vsmalltrigger = 8;	/* pref to keep if > this many pages */
320
321static u_long vnlru_read_freevnodes(void);
322
323/*
324 * Note that no attempt is made to sanitize these parameters.
325 */
326static int
327sysctl_maxvnodes(SYSCTL_HANDLER_ARGS)
328{
329	u_long val;
330	int error;
331
332	val = desiredvnodes;
333	error = sysctl_handle_long(oidp, &val, 0, req);
334	if (error != 0 || req->newptr == NULL)
335		return (error);
336
337	if (val == desiredvnodes)
338		return (0);
339	mtx_lock(&vnode_list_mtx);
340	desiredvnodes = val;
341	wantfreevnodes = desiredvnodes / 4;
342	vnlru_recalc();
343	mtx_unlock(&vnode_list_mtx);
344	/*
345	 * XXX There is no protection against multiple threads changing
346	 * desiredvnodes at the same time. Locking above only helps vnlru and
347	 * getnewvnode.
348	 */
349	vfs_hash_changesize(desiredvnodes);
350	cache_changesize(desiredvnodes);
351	return (0);
352}
353
354SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
355    CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes,
356    "LU", "Target for maximum number of vnodes");
357
358static int
359sysctl_wantfreevnodes(SYSCTL_HANDLER_ARGS)
360{
361	u_long val;
362	int error;
363
364	val = wantfreevnodes;
365	error = sysctl_handle_long(oidp, &val, 0, req);
366	if (error != 0 || req->newptr == NULL)
367		return (error);
368
369	if (val == wantfreevnodes)
370		return (0);
371	mtx_lock(&vnode_list_mtx);
372	wantfreevnodes = val;
373	vnlru_recalc();
374	mtx_unlock(&vnode_list_mtx);
375	return (0);
376}
377
378SYSCTL_PROC(_vfs, OID_AUTO, wantfreevnodes,
379    CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes,
380    "LU", "Target for minimum number of \"free\" vnodes");
381
382SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
383    &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)");
384static int vnlru_nowhere;
385SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
386    &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
387
388static int
389sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS)
390{
391	struct vnode *vp;
392	struct nameidata nd;
393	char *buf;
394	unsigned long ndflags;
395	int error;
396
397	if (req->newptr == NULL)
398		return (EINVAL);
399	if (req->newlen >= PATH_MAX)
400		return (E2BIG);
401
402	buf = malloc(PATH_MAX, M_TEMP, M_WAITOK);
403	error = SYSCTL_IN(req, buf, req->newlen);
404	if (error != 0)
405		goto out;
406
407	buf[req->newlen] = '\0';
408
409	ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1 | SAVENAME;
410	NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf, curthread);
411	if ((error = namei(&nd)) != 0)
412		goto out;
413	vp = nd.ni_vp;
414
415	if (VN_IS_DOOMED(vp)) {
416		/*
417		 * This vnode is being recycled.  Return != 0 to let the caller
418		 * know that the sysctl had no effect.  Return EAGAIN because a
419		 * subsequent call will likely succeed (since namei will create
420		 * a new vnode if necessary)
421		 */
422		error = EAGAIN;
423		goto putvnode;
424	}
425
426	counter_u64_add(recycles_count, 1);
427	vgone(vp);
428putvnode:
429	NDFREE(&nd, 0);
430out:
431	free(buf, M_TEMP);
432	return (error);
433}
434
435static int
436sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS)
437{
438	struct thread *td = curthread;
439	struct vnode *vp;
440	struct file *fp;
441	int error;
442	int fd;
443
444	if (req->newptr == NULL)
445		return (EBADF);
446
447        error = sysctl_handle_int(oidp, &fd, 0, req);
448        if (error != 0)
449                return (error);
450	error = getvnode(curthread, fd, &cap_fcntl_rights, &fp);
451	if (error != 0)
452		return (error);
453	vp = fp->f_vnode;
454
455	error = vn_lock(vp, LK_EXCLUSIVE);
456	if (error != 0)
457		goto drop;
458
459	counter_u64_add(recycles_count, 1);
460	vgone(vp);
461	VOP_UNLOCK(vp);
462drop:
463	fdrop(fp, td);
464	return (error);
465}
466
467SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode,
468    CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
469    sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname");
470SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode,
471    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
472    sysctl_ftry_reclaim_vnode, "I",
473    "Try to reclaim a vnode by its file descriptor");
474
475/* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
476static int vnsz2log;
477
478/*
479 * Support for the bufobj clean & dirty pctrie.
480 */
481static void *
482buf_trie_alloc(struct pctrie *ptree)
483{
484	return (uma_zalloc_smr(buf_trie_zone, M_NOWAIT));
485}
486
487static void
488buf_trie_free(struct pctrie *ptree, void *node)
489{
490	uma_zfree_smr(buf_trie_zone, node);
491}
492PCTRIE_DEFINE_SMR(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free,
493    buf_trie_smr);
494
495/*
496 * Initialize the vnode management data structures.
497 *
498 * Reevaluate the following cap on the number of vnodes after the physical
499 * memory size exceeds 512GB.  In the limit, as the physical memory size
500 * grows, the ratio of the memory size in KB to vnodes approaches 64:1.
501 */
502#ifndef	MAXVNODES_MAX
503#define	MAXVNODES_MAX	(512UL * 1024 * 1024 / 64)	/* 8M */
504#endif
505
506static MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
507
508static struct vnode *
509vn_alloc_marker(struct mount *mp)
510{
511	struct vnode *vp;
512
513	vp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
514	vp->v_type = VMARKER;
515	vp->v_mount = mp;
516
517	return (vp);
518}
519
520static void
521vn_free_marker(struct vnode *vp)
522{
523
524	MPASS(vp->v_type == VMARKER);
525	free(vp, M_VNODE_MARKER);
526}
527
528/*
529 * Initialize a vnode as it first enters the zone.
530 */
531static int
532vnode_init(void *mem, int size, int flags)
533{
534	struct vnode *vp;
535
536	vp = mem;
537	bzero(vp, size);
538	/*
539	 * Setup locks.
540	 */
541	vp->v_vnlock = &vp->v_lock;
542	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
543	/*
544	 * By default, don't allow shared locks unless filesystems opt-in.
545	 */
546	lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT,
547	    LK_NOSHARE | LK_IS_VNODE);
548	/*
549	 * Initialize bufobj.
550	 */
551	bufobj_init(&vp->v_bufobj, vp);
552	/*
553	 * Initialize namecache.
554	 */
555	cache_vnode_init(vp);
556	/*
557	 * Initialize rangelocks.
558	 */
559	rangelock_init(&vp->v_rl);
560
561	vp->v_dbatchcpu = NOCPU;
562
563	mtx_lock(&vnode_list_mtx);
564	TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist);
565	mtx_unlock(&vnode_list_mtx);
566	return (0);
567}
568
569/*
570 * Free a vnode when it is cleared from the zone.
571 */
572static void
573vnode_fini(void *mem, int size)
574{
575	struct vnode *vp;
576	struct bufobj *bo;
577
578	vp = mem;
579	vdbatch_dequeue(vp);
580	mtx_lock(&vnode_list_mtx);
581	TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
582	mtx_unlock(&vnode_list_mtx);
583	rangelock_destroy(&vp->v_rl);
584	lockdestroy(vp->v_vnlock);
585	mtx_destroy(&vp->v_interlock);
586	bo = &vp->v_bufobj;
587	rw_destroy(BO_LOCKPTR(bo));
588}
589
590/*
591 * Provide the size of NFS nclnode and NFS fh for calculation of the
592 * vnode memory consumption.  The size is specified directly to
593 * eliminate dependency on NFS-private header.
594 *
595 * Other filesystems may use bigger or smaller (like UFS and ZFS)
596 * private inode data, but the NFS-based estimation is ample enough.
597 * Still, we care about differences in the size between 64- and 32-bit
598 * platforms.
599 *
600 * Namecache structure size is heuristically
601 * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1.
602 */
603#ifdef _LP64
604#define	NFS_NCLNODE_SZ	(528 + 64)
605#define	NC_SZ		148
606#else
607#define	NFS_NCLNODE_SZ	(360 + 32)
608#define	NC_SZ		92
609#endif
610
/*
 * Boot-time initialization of the vnode subsystem, registered through the
 * SYSINIT() that follows the function.  It sizes the vnode cache targets,
 * then sets up the global locks, the vnode list and its two markers, the
 * UMA zones, the statistics counters, the syncer work queue and the
 * per-CPU vdbatch state.
 */
static void
vntblinit(void *dummy __unused)
{
	struct vdbatch *vd;
	int cpu, physvnodes, virtvnodes;
	u_int i;

	/*
	 * Desiredvnodes is a function of the physical memory size and the
	 * kernel's heap size.  Generally speaking, it scales with the
	 * physical memory size.  The ratio of desiredvnodes to the physical
	 * memory size is 1:16 until desiredvnodes exceeds 98,304.
	 * Thereafter, the
	 * marginal ratio of desiredvnodes to the physical memory size is
	 * 1:64.  However, desiredvnodes is limited by the kernel's heap
	 * size.  The memory required by desiredvnodes vnodes and vm objects
	 * must not exceed 1/10th of the kernel's heap size.
	 */
	physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 +
	    3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64;
	virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) +
	    sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ));
	desiredvnodes = min(physvnodes, virtvnodes);
	/* Apply the hard cap, which is overridable at build time. */
	if (desiredvnodes > MAXVNODES_MAX) {
		if (bootverbose)
			printf("Reducing kern.maxvnodes %lu -> %lu\n",
			    desiredvnodes, MAXVNODES_MAX);
		desiredvnodes = MAXVNODES_MAX;
	}
	/* Same 1/4 ratio that sysctl_maxvnodes() applies on update. */
	wantfreevnodes = desiredvnodes / 4;
	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
	TAILQ_INIT(&vnode_list);
	mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF);
	/*
	 * The lock is taken to appease WITNESS.
	 */
	mtx_lock(&vnode_list_mtx);
	vnlru_recalc();
	mtx_unlock(&vnode_list_mtx);
	/*
	 * Both markers are inserted at the head, so the reclaim marker ends
	 * up ahead of the free marker.  They must exist before vnode_zone
	 * is created, because vnode_init() inserts new vnodes relative to
	 * vnode_list_free_marker.
	 */
	vnode_list_free_marker = vn_alloc_marker(NULL);
	TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist);
	vnode_list_reclaim_marker = vn_alloc_marker(NULL);
	TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist);
	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
	    vnode_init, vnode_fini, UMA_ALIGN_PTR, 0);
	/* NOTE(review): vfs_smr is not assigned in this function -- confirm
	 * it has been set up before this point. */
	uma_zone_set_smr(vnode_zone, vfs_smr);
	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	/*
	 * Preallocate enough nodes to support one-per buf so that
	 * we can not fail an insert.  reassignbuf() callers can not
	 * tolerate the insertion failure.
	 */
	buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
	    NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
	    UMA_ZONE_NOFREE | UMA_ZONE_SMR);
	buf_trie_smr = uma_zone_get_smr(buf_trie_zone);
	uma_prealloc(buf_trie_zone, nbuf);

	/* Counters exported through the sysctls declared earlier. */
	vnodes_created = counter_u64_alloc(M_WAITOK);
	recycles_count = counter_u64_alloc(M_WAITOK);
	recycles_free_count = counter_u64_alloc(M_WAITOK);
	deferred_inact = counter_u64_alloc(M_WAITOK);

	/*
	 * Initialize the filesystem syncer.
	 */
	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
	    &syncer_mask);
	/*
	 * Adopt the table size actually produced.
	 * NOTE(review): presumably hashinit() picks a power-of-two size and
	 * returns size - 1 in syncer_mask -- confirm.
	 */
	syncer_maxdelay = syncer_mask + 1;
	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
	cv_init(&sync_wakeup, "syncer");
	/*
	 * The loop counts the powers of two that do not exceed
	 * sizeof(struct vnode); after the decrement vnsz2log is the floor
	 * of log2 of that size.
	 */
	for (i = 1; i <= sizeof(struct vnode); i <<= 1)
		vnsz2log++;
	vnsz2log--;

	/* Zero each CPU's vdbatch and initialize its mutex. */
	CPU_FOREACH(cpu) {
		vd = DPCPU_ID_PTR((cpu), vd);
		bzero(vd, sizeof(*vd));
		mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF);
	}
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
694
695/*
696 * Mark a mount point as busy. Used to synchronize access and to delay
697 * unmounting. Eventually, mountlist_mtx is not released on failure.
698 *
699 * vfs_busy() is a custom lock, it can block the caller.
700 * vfs_busy() only sleeps if the unmount is active on the mount point.
701 * For a mountpoint mp, vfs_busy-enforced lock is before lock of any
702 * vnode belonging to mp.
703 *
704 * Lookup uses vfs_busy() to traverse mount points.
705 * root fs			var fs
706 * / vnode lock		A	/ vnode lock (/var)		D
707 * /var vnode lock	B	/log vnode lock(/var/log)	E
708 * vfs_busy lock	C	vfs_busy lock			F
709 *
710 * Within each file system, the lock order is C->A->B and F->D->E.
711 *
712 * When traversing across mounts, the system follows that lock order:
713 *
714 *        C->A->B
715 *              |
716 *              +->F->D->E
717 *
718 * The lookup() process for namei("/var") illustrates the process:
719 *  VOP_LOOKUP() obtains B while A is held
720 *  vfs_busy() obtains a shared lock on F while A and B are held
721 *  vput() releases lock on B
722 *  vput() releases lock on A
723 *  VFS_ROOT() obtains lock on D while shared lock on F is held
724 *  vfs_unbusy() releases shared lock on F
725 *  vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
726 *    Attempt to lock A (instead of vp_crossmp) while D is held would
727 *    violate the global order, causing deadlocks.
728 *
729 * dounmount() locks B while F is drained.
730 */
int
vfs_busy(struct mount *mp, int flags)
{

	/*
	 * Returns 0 with the mount point busied, or ENOENT when an unmount
	 * is in progress and either MBF_NOWAIT was given or MNTK_REFEXPIRE
	 * is set.  With MBF_MNTLSTLOCK, mountlist_mtx is dropped on every
	 * successful return; the ENOENT path returns without dropping it.
	 */
	MPASS((flags & ~MBF_MASK) == 0);
	CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);

	/*
	 * Fast path: vfs_op_thread_enter() succeeded, so take the reference
	 * and the busy count through vfs_mp_count_add_pcpu() without
	 * acquiring the mount interlock.
	 */
	if (vfs_op_thread_enter(mp)) {
		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
		MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0);
		MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0);
		vfs_mp_count_add_pcpu(mp, ref, 1);
		vfs_mp_count_add_pcpu(mp, lockref, 1);
		vfs_op_thread_exit(mp);
		if (flags & MBF_MNTLSTLOCK)
			mtx_unlock(&mountlist_mtx);
		return (0);
	}

	/* Slow path: everything below runs under the mount interlock. */
	MNT_ILOCK(mp);
	vfs_assert_mount_counters(mp);
	MNT_REF(mp);
	/*
	 * If mount point is currently being unmounted, sleep until the
	 * mount point fate is decided.  If thread doing the unmounting fails,
	 * it will clear MNTK_UNMOUNT flag before waking us up, indicating
	 * that this mount point has survived the unmount attempt and vfs_busy
	 * should retry.  Otherwise the unmounter thread will set MNTK_REFEXPIRE
	 * flag in addition to MNTK_UNMOUNT, indicating that mount point is
	 * about to be really destroyed.  vfs_busy needs to release its
	 * reference on the mount point in this case and return with ENOENT,
	 * telling the caller that the mount it tried to busy is no longer
	 * valid.
	 */
	while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
			MNT_REL(mp);
			MNT_IUNLOCK(mp);
			CTR1(KTR_VFS, "%s: failed busying before sleeping",
			    __func__);
			return (ENOENT);
		}
		/* mountlist_mtx is not held across the sleep. */
		if (flags & MBF_MNTLSTLOCK)
			mtx_unlock(&mountlist_mtx);
		mp->mnt_kern_flag |= MNTK_MWAIT;
		/* PDROP: the interlock is released by msleep(). */
		msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
		/* Retake mountlist_mtx first, then the interlock. */
		if (flags & MBF_MNTLSTLOCK)
			mtx_lock(&mountlist_mtx);
		MNT_ILOCK(mp);
	}
	if (flags & MBF_MNTLSTLOCK)
		mtx_unlock(&mountlist_mtx);
	mp->mnt_lockref++;
	MNT_IUNLOCK(mp);
	return (0);
}
787
/*
 * Free a busy filesystem.
 *
 * Undoes a successful vfs_busy(): drops one busy count (lockref) and one
 * reference on mp.  When the last busy count goes away while
 * MNTK_DRAINING is set, the flag is cleared and sleepers on
 * &mp->mnt_lockref are woken.
 */
void
vfs_unbusy(struct mount *mp)
{
	int c;

	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);

	/*
	 * Fast path: vfs_op_thread_enter() succeeded, so drop both counts
	 * through vfs_mp_count_sub_pcpu() without the mount interlock.
	 */
	if (vfs_op_thread_enter(mp)) {
		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
		vfs_mp_count_sub_pcpu(mp, lockref, 1);
		vfs_mp_count_sub_pcpu(mp, ref, 1);
		vfs_op_thread_exit(mp);
		return;
	}

	/* Slow path under the mount interlock. */
	MNT_ILOCK(mp);
	vfs_assert_mount_counters(mp);
	MNT_REL(mp);
	c = --mp->mnt_lockref;
	/*
	 * With mnt_vfs_ops == 0 no drain can be pending (asserted below),
	 * so there is nobody to wake up.
	 */
	if (mp->mnt_vfs_ops == 0) {
		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
		MNT_IUNLOCK(mp);
		return;
	}
	/* A negative count means unbalanced busy/unbusy; dump the state. */
	if (c < 0)
		vfs_dump_mount_counters(mp);
	if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
		MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
		CTR1(KTR_VFS, "%s: waking up waiters", __func__);
		mp->mnt_kern_flag &= ~MNTK_DRAINING;
		wakeup(&mp->mnt_lockref);
	}
	MNT_IUNLOCK(mp);
}
825
826/*
827 * Lookup a mount point by filesystem identifier.
828 */
829struct mount *
830vfs_getvfs(fsid_t *fsid)
831{
832	struct mount *mp;
833
834	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
835	mtx_lock(&mountlist_mtx);
836	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
837		if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) {
838			vfs_ref(mp);
839			mtx_unlock(&mountlist_mtx);
840			return (mp);
841		}
842	}
843	mtx_unlock(&mountlist_mtx);
844	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
845	return ((struct mount *) 0);
846}
847
848/*
849 * Lookup a mount point by filesystem identifier, busying it before
850 * returning.
851 *
852 * To avoid congestion on mountlist_mtx, implement simple direct-mapped
853 * cache for popular filesystem identifiers.  The cache is lockess, using
854 * the fact that struct mount's are never freed.  In worst case we may
855 * get pointer to unmounted or even different filesystem, so we have to
856 * check what we got, and go slow way if so.
857 */
858struct mount *
859vfs_busyfs(fsid_t *fsid)
860{
861#define	FSID_CACHE_SIZE	256
862	typedef struct mount * volatile vmp_t;
863	static vmp_t cache[FSID_CACHE_SIZE];
864	struct mount *mp;
865	int error;
866	uint32_t hash;
867
868	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
869	hash = fsid->val[0] ^ fsid->val[1];
870	hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1);
871	mp = cache[hash];
872	if (mp == NULL || fsidcmp(&mp->mnt_stat.f_fsid, fsid) != 0)
873		goto slow;
874	if (vfs_busy(mp, 0) != 0) {
875		cache[hash] = NULL;
876		goto slow;
877	}
878	if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0)
879		return (mp);
880	else
881	    vfs_unbusy(mp);
882
883slow:
884	mtx_lock(&mountlist_mtx);
885	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
886		if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) {
887			error = vfs_busy(mp, MBF_MNTLSTLOCK);
888			if (error) {
889				cache[hash] = NULL;
890				mtx_unlock(&mountlist_mtx);
891				return (NULL);
892			}
893			cache[hash] = mp;
894			return (mp);
895		}
896	}
897	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
898	mtx_unlock(&mountlist_mtx);
899	return ((struct mount *) 0);
900}
901
902/*
903 * Check if a user can access privileged mount options.
904 */
905int
906vfs_suser(struct mount *mp, struct thread *td)
907{
908	int error;
909
910	if (jailed(td->td_ucred)) {
911		/*
912		 * If the jail of the calling thread lacks permission for
913		 * this type of file system, deny immediately.
914		 */
915		if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag))
916			return (EPERM);
917
918		/*
919		 * If the file system was mounted outside the jail of the
920		 * calling thread, deny immediately.
921		 */
922		if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
923			return (EPERM);
924	}
925
926	/*
927	 * If file system supports delegated administration, we don't check
928	 * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified
929	 * by the file system itself.
930	 * If this is not the user that did original mount, we check for
931	 * the PRIV_VFS_MOUNT_OWNER privilege.
932	 */
933	if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
934	    mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
935		if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
936			return (error);
937	}
938	return (0);
939}
940
941/*
942 * Get a new unique fsid.  Try to make its val[0] unique, since this value
943 * will be used to create fake device numbers for stat().  Also try (but
944 * not so hard) make its val[0] unique mod 2^16, since some emulators only
945 * support 16-bit device numbers.  We end up with unique val[0]'s for the
946 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
947 *
948 * Keep in mind that several mounts may be running in parallel.  Starting
949 * the search one past where the previous search terminated is both a
950 * micro-optimization and a defense against returning the same fsid to
951 * different mounts.
952 */
953void
954vfs_getnewfsid(struct mount *mp)
955{
956	static uint16_t mntid_base;
957	struct mount *nmp;
958	fsid_t tfsid;
959	int mtype;
960
961	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
962	mtx_lock(&mntid_mtx);
963	mtype = mp->mnt_vfc->vfc_typenum;
964	tfsid.val[1] = mtype;
965	mtype = (mtype & 0xFF) << 24;
966	for (;;) {
967		tfsid.val[0] = makedev(255,
968		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
969		mntid_base++;
970		if ((nmp = vfs_getvfs(&tfsid)) == NULL)
971			break;
972		vfs_rel(nmp);
973	}
974	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
975	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
976	mtx_unlock(&mntid_mtx);
977}
978
/*
 * Knob to control the precision of file timestamps:
 *
 *   0 = seconds only; nanoseconds zeroed.
 *   1 = seconds and nanoseconds, accurate within 1/HZ.
 *   2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 *
 * The enum constants below name the values 0 through 3 in that order.
 * The knob defaults to TSP_USEC and is exported read-write through the
 * sysctl declared below; vfs_timestamp() is its consumer.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

static int timestamp_precision = TSP_USEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
    &timestamp_precision, 0, "File timestamp precision (0: seconds, "
    "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, "
    "3+: sec + ns (max. precision))");
994
995/*
996 * Get a current timestamp.
997 */
998void
999vfs_timestamp(struct timespec *tsp)
1000{
1001	struct timeval tv;
1002
1003	switch (timestamp_precision) {
1004	case TSP_SEC:
1005		tsp->tv_sec = time_second;
1006		tsp->tv_nsec = 0;
1007		break;
1008	case TSP_HZ:
1009		getnanotime(tsp);
1010		break;
1011	case TSP_USEC:
1012		microtime(&tv);
1013		TIMEVAL_TO_TIMESPEC(&tv, tsp);
1014		break;
1015	case TSP_NSEC:
1016	default:
1017		nanotime(tsp);
1018		break;
1019	}
1020}
1021
1022/*
1023 * Set vnode attributes to VNOVAL
1024 */
1025void
1026vattr_null(struct vattr *vap)
1027{
1028
1029	vap->va_type = VNON;
1030	vap->va_size = VNOVAL;
1031	vap->va_bytes = VNOVAL;
1032	vap->va_mode = VNOVAL;
1033	vap->va_nlink = VNOVAL;
1034	vap->va_uid = VNOVAL;
1035	vap->va_gid = VNOVAL;
1036	vap->va_fsid = VNOVAL;
1037	vap->va_fileid = VNOVAL;
1038	vap->va_blocksize = VNOVAL;
1039	vap->va_rdev = VNOVAL;
1040	vap->va_atime.tv_sec = VNOVAL;
1041	vap->va_atime.tv_nsec = VNOVAL;
1042	vap->va_mtime.tv_sec = VNOVAL;
1043	vap->va_mtime.tv_nsec = VNOVAL;
1044	vap->va_ctime.tv_sec = VNOVAL;
1045	vap->va_ctime.tv_nsec = VNOVAL;
1046	vap->va_birthtime.tv_sec = VNOVAL;
1047	vap->va_birthtime.tv_nsec = VNOVAL;
1048	vap->va_flags = VNOVAL;
1049	vap->va_gen = VNOVAL;
1050	vap->va_vaflags = 0;
1051}
1052
/*
 * Try to reduce the total number of vnodes.
 *
 * This routine (and its user) are buggy in at least the following ways:
 * - all parameters were picked years ago when RAM sizes were significantly
 *   smaller
 * - it can pick vnodes based on pages used by the vm object, but filesystems
 *   like ZFS don't use it making the pick broken
 * - since ZFS has its own aging policy it gets partially combated by this one
 * - a dedicated method should be provided for filesystems to let them decide
 *   whether the vnode should be recycled
 *
 * This routine is called when we have too many vnodes.  It attempts
 * to free <count> vnodes and will potentially free vnodes that still
 * have VM backing store (VM backing store is typically the cause
 * of a vnode blowout so we want to do this).  Therefore, this operation
 * is not considered cheap.
 *
 * A number of conditions may prevent a vnode from being reclaimed.
 * The buffer cache may have references on the vnode, a directory
 * vnode may still have references due to the namei cache representing
 * underlying files, or the vnode may be in active use.   It is not
 * desirable to reuse such vnodes.  These conditions may cause the
 * number of vnodes to reach some minimum value regardless of what
 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
 *
 * The caller must hold vnode_list_mtx.  The mutex is dropped and
 * re-acquired while a candidate is processed or when yielding, and is
 * held again when the function returns.
 *
 * @param reclaim_nc_src Only reclaim directories with outgoing namecache
 *			 entries if this argument is true.
 * @param trigger	 Only reclaim vnodes with at most this many resident
 *			 pages.
 * @param target	 How many vnodes to reclaim.
 * @return		 The number of vnodes that were reclaimed.
 */
static int
vlrureclaim(bool reclaim_nc_src, int trigger, u_long target)
{
	struct vnode *vp, *mvp;
	struct mount *mp;
	struct vm_object *object;
	u_long done;
	bool retried;

	mtx_assert(&vnode_list_mtx, MA_OWNED);

	retried = false;
	done = 0;

	/* The scan position is tracked with a global marker vnode. */
	mvp = vnode_list_reclaim_marker;
restart:
	vp = mvp;
	while (done < target) {
		vp = TAILQ_NEXT(vp, v_vnodelist);
		if (__predict_false(vp == NULL))
			break;

		/* Skip marker entries; they are not real vnodes. */
		if (__predict_false(vp->v_type == VMARKER))
			continue;

		/*
		 * If it's been deconstructed already, it's still
		 * referenced, or it exceeds the trigger, skip it.
		 * Also skip free vnodes.  We are trying to make space
		 * to expand the free list, not reduce it.
		 *
		 * These first checks are done without the vnode interlock
		 * and are repeated below once it is held.
		 */
		if (vp->v_usecount > 0 || vp->v_holdcnt == 0 ||
		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)))
			goto next_iter;

		if (vp->v_type == VBAD || vp->v_type == VNON)
			goto next_iter;

		/* Never block on the interlock while holding the list lock. */
		if (!VI_TRYLOCK(vp))
			goto next_iter;

		/* Re-validate the candidate now that the interlock is held. */
		if (vp->v_usecount > 0 || vp->v_holdcnt == 0 ||
		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
		    VN_IS_DOOMED(vp) || vp->v_type == VNON) {
			VI_UNLOCK(vp);
			goto next_iter;
		}

		/*
		 * Vnodes without a VM object, or with more resident pages
		 * than the trigger, are not candidates.
		 */
		object = atomic_load_ptr(&vp->v_object);
		if (object == NULL || object->resident_page_count > trigger) {
			VI_UNLOCK(vp);
			goto next_iter;
		}

		/*
		 * Hold the vnode and park the marker right after it so the
		 * scan can resume from here once the list lock is dropped.
		 */
		vholdl(vp);
		VI_UNLOCK(vp);
		TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
		TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
		mtx_unlock(&vnode_list_mtx);

		/* Both acquisitions below are non-blocking; give up on failure. */
		if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
			vdrop(vp);
			goto next_iter_unlocked;
		}
		if (VOP_LOCK(vp, LK_EXCLUSIVE|LK_NOWAIT) != 0) {
			vdrop(vp);
			vn_finished_write(mp);
			goto next_iter_unlocked;
		}

		/*
		 * The vnode state may have changed while no lock was held;
		 * check once more before destroying it.
		 */
		VI_LOCK(vp);
		if (vp->v_usecount > 0 ||
		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
		    (vp->v_object != NULL &&
		    vp->v_object->resident_page_count > trigger)) {
			VOP_UNLOCK(vp);
			vdropl(vp);
			vn_finished_write(mp);
			goto next_iter_unlocked;
		}
		counter_u64_add(recycles_count, 1);
		vgonel(vp);
		VOP_UNLOCK(vp);
		vdropl(vp);
		vn_finished_write(mp);
		done++;
next_iter_unlocked:
		/* Reached without the list lock; retake it and rescan from the marker. */
		if (should_yield())
			kern_yield(PRI_USER);
		mtx_lock(&vnode_list_mtx);
		goto restart;
next_iter:
		/* Reached with the list lock held and the vnode interlock released. */
		MPASS(vp->v_type != VMARKER);
		if (!should_yield())
			continue;
		/* Remember the position before dropping the lock to yield. */
		TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
		TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
		mtx_unlock(&vnode_list_mtx);
		kern_yield(PRI_USER);
		mtx_lock(&vnode_list_mtx);
		goto restart;
	}
	/*
	 * If nothing was reclaimed, move the marker to the head of the list
	 * and make exactly one more pass over it.
	 */
	if (done == 0 && !retried) {
		TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
		TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist);
		retried = true;
		goto restart;
	}
	return (done);
}
1196
/*
 * Upper bound applied by vnlru_free_locked() to the number of vnodes it
 * will try to free in a single call.  Tunable at run time through the
 * sysctl declared below.
 */
static int max_vnlru_free = 10000; /* limit on vnode free requests per call */
SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free,
    0,
    "limit on vnode free requests per call to the vnlru_free routine");
1201
/*
 * Attempt to reduce the free list by the requested amount.
 *
 * The caller must hold vnode_list_mtx; it is dropped around each recycle
 * attempt and held again on return.  The request is clamped to
 * max_vnlru_free.  When mnt_op is not NULL, vnodes whose mount uses a
 * different vfsops are skipped.
 *
 * Returns the number of vnodes for which a recycle was attempted.  The
 * result of vtryrecycle() is not checked, so this may exceed the number
 * of vnodes actually recycled.
 */
static int
vnlru_free_locked(int count, struct vfsops *mnt_op)
{
	struct vnode *vp, *mvp;
	struct mount *mp;
	int ocount;

	mtx_assert(&vnode_list_mtx, MA_OWNED);
	if (count > max_vnlru_free)
		count = max_vnlru_free;
	ocount = count;
	/* The scan position is tracked with a global marker vnode. */
	mvp = vnode_list_free_marker;
restart:
	vp = mvp;
	while (count > 0) {
		vp = TAILQ_NEXT(vp, v_vnodelist);
		if (__predict_false(vp == NULL)) {
			/* End of the list reached: park the marker at the tail. */
			TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
			TAILQ_INSERT_TAIL(&vnode_list, mvp, v_vnodelist);
			break;
		}
		if (__predict_false(vp->v_type == VMARKER))
			continue;

		/*
		 * Don't recycle a vnode that is held.
		 * Don't recycle if our vnode is from different type
		 * of mount point.  Note that mp is type-safe, the
		 * check does not reach unmapped address even if
		 * vnode is reclaimed.
		 * Don't recycle if we can't get the interlock without
		 * blocking.
		 */
		if (vp->v_holdcnt > 0 || (mnt_op != NULL && (mp = vp->v_mount) != NULL &&
		    mp->mnt_op != mnt_op) || !VI_TRYLOCK(vp)) {
			continue;
		}
		/* Move the marker past this vnode so a restart resumes after it. */
		TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
		TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
		if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) {
			VI_UNLOCK(vp);
			continue;
		}
		/*
		 * Hold the vnode across the unlocked section.  The attempt is
		 * counted before its outcome is known.
		 */
		vholdl(vp);
		count--;
		mtx_unlock(&vnode_list_mtx);
		VI_UNLOCK(vp);
		vtryrecycle(vp);
		vdrop(vp);
		mtx_lock(&vnode_list_mtx);
		goto restart;
	}
	return (ocount - count);
}
1258
1259void
1260vnlru_free(int count, struct vfsops *mnt_op)
1261{
1262
1263	mtx_lock(&vnode_list_mtx);
1264	vnlru_free_locked(count, mnt_op);
1265	mtx_unlock(&vnode_list_mtx);
1266}
1267
/*
 * Recompute the values derived from desiredvnodes and wantfreevnodes:
 * gapvnodes (their difference, but at least 100) and the two watermarks
 * vhiwat and vlowat.  Must be called with vnode_list_mtx held.
 */
static void
vnlru_recalc(void)
{

	mtx_assert(&vnode_list_mtx, MA_OWNED);
	gapvnodes = imax(desiredvnodes - wantfreevnodes, 100);
	vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */
	/* The low watermark is half of the high one. */
	vlowat = vhiwat / 2;
}
1277
/*
 * Attempt to recycle vnodes in a context that is always safe to block.
 * Calling vlrureclaim() from the bowels of filesystem code has some
 * interesting deadlock problems.
 *
 * vnlruproc is the process running vnlru_proc().  vnlruproc_sig is set
 * to 1 by vnlru_kick() when that process has been asked to run and is
 * reset to 0 by vnlru_proc() just before it goes to sleep; threads
 * waiting for free vnodes sleep on its address.
 */
static struct proc *vnlruproc;
static int vnlruproc_sig;
1285
/*
 * The main freevnodes counter is only updated when threads requeue their vnode
 * batches. CPUs are conditionally walked to compute a more accurate total.
 *
 * Limit how much slop we are willing to tolerate. Note: the actual value
 * at any given moment can still exceed the slop, but it should not do so by
 * a significant margin in practice.
 */
#define VNLRU_FREEVNODES_SLOP 128
1295
1296static __inline void
1297vn_freevnodes_inc(void)
1298{
1299	struct vdbatch *vd;
1300
1301	critical_enter();
1302	vd = DPCPU_PTR(vd);
1303	vd->freevnodes++;
1304	critical_exit();
1305}
1306
1307static __inline void
1308vn_freevnodes_dec(void)
1309{
1310	struct vdbatch *vd;
1311
1312	critical_enter();
1313	vd = DPCPU_PTR(vd);
1314	vd->freevnodes--;
1315	critical_exit();
1316}
1317
1318static u_long
1319vnlru_read_freevnodes(void)
1320{
1321	struct vdbatch *vd;
1322	long slop;
1323	int cpu;
1324
1325	mtx_assert(&vnode_list_mtx, MA_OWNED);
1326	if (freevnodes > freevnodes_old)
1327		slop = freevnodes - freevnodes_old;
1328	else
1329		slop = freevnodes_old - freevnodes;
1330	if (slop < VNLRU_FREEVNODES_SLOP)
1331		return (freevnodes >= 0 ? freevnodes : 0);
1332	freevnodes_old = freevnodes;
1333	CPU_FOREACH(cpu) {
1334		vd = DPCPU_ID_PTR((cpu), vd);
1335		freevnodes_old += vd->freevnodes;
1336	}
1337	return (freevnodes_old >= 0 ? freevnodes_old : 0);
1338}
1339
1340static bool
1341vnlru_under(u_long rnumvnodes, u_long limit)
1342{
1343	u_long rfreevnodes, space;
1344
1345	if (__predict_false(rnumvnodes > desiredvnodes))
1346		return (true);
1347
1348	space = desiredvnodes - rnumvnodes;
1349	if (space < limit) {
1350		rfreevnodes = vnlru_read_freevnodes();
1351		if (rfreevnodes > wantfreevnodes)
1352			space += rfreevnodes - wantfreevnodes;
1353	}
1354	return (space < limit);
1355}
1356
1357static bool
1358vnlru_under_unlocked(u_long rnumvnodes, u_long limit)
1359{
1360	long rfreevnodes, space;
1361
1362	if (__predict_false(rnumvnodes > desiredvnodes))
1363		return (true);
1364
1365	space = desiredvnodes - rnumvnodes;
1366	if (space < limit) {
1367		rfreevnodes = atomic_load_long(&freevnodes);
1368		if (rfreevnodes > wantfreevnodes)
1369			space += rfreevnodes - wantfreevnodes;
1370	}
1371	return (space < limit);
1372}
1373
1374static void
1375vnlru_kick(void)
1376{
1377
1378	mtx_assert(&vnode_list_mtx, MA_OWNED);
1379	if (vnlruproc_sig == 0) {
1380		vnlruproc_sig = 1;
1381		wakeup(vnlruproc);
1382	}
1383}
1384
/*
 * Main loop of the vnlru process.  It never returns.
 *
 * Each iteration trims the free list when there are more vnodes than
 * desiredvnodes, sleeps when there is enough room, and otherwise calls
 * vlrureclaim().  The local "force" level escalates when a pass reclaims
 * nothing:
 *   0    - no pressure; sleep unless below the low watermark.
 *   1    - reclaim using the small trigger (vsmalltrigger).
 *   2    - reclaim using the computed, much larger trigger.
 *   3    - additionally reclaim directories with outgoing namecache
 *          entries.
 * After an unsuccessful pass at level 3 the failure is counted in
 * vnlru_nowhere and the process pauses before starting over.
 */
static void
vnlru_proc(void)
{
	u_long rnumvnodes, rfreevnodes, target;
	unsigned long onumvnodes;
	int done, force, trigger, usevnodes;
	bool reclaim_nc_src, want_reread;

	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc,
	    SHUTDOWN_PRI_FIRST);

	force = 0;
	want_reread = false;
	for (;;) {
		kproc_suspend_check(vnlruproc);
		mtx_lock(&vnode_list_mtx);
		rnumvnodes = atomic_load_long(&numvnodes);

		/*
		 * After a reclaim pass, decide afresh whether pressure
		 * remains, using the high watermark.
		 */
		if (want_reread) {
			force = vnlru_under(numvnodes, vhiwat) ? 1 : 0;
			want_reread = false;
		}

		/*
		 * If numvnodes is too large (due to desiredvnodes being
		 * adjusted using its sysctl, or emergency growth), first
		 * try to reduce it by discarding from the free list.
		 */
		if (rnumvnodes > desiredvnodes) {
			vnlru_free_locked(rnumvnodes - desiredvnodes, NULL);
			rnumvnodes = atomic_load_long(&numvnodes);
		}
		/*
		 * Sleep if the vnode cache is in a good state.  This is
		 * when it is not over-full and has space for about a 4%
		 * or 9% expansion (by growing its size or inexcessively
		 * reducing its free list).  Otherwise, try to reclaim
		 * space for a 10% expansion.
		 */
		/* vstir is a one-shot request, set by vn_alloc_hard(), to run a pass. */
		if (vstir && force == 0) {
			force = 1;
			vstir = 0;
		}
		if (force == 0 && !vnlru_under(rnumvnodes, vlowat)) {
			/*
			 * Nothing to do: clear the kick flag, wake threads
			 * waiting for vnodes and sleep for up to a second.
			 * PDROP makes msleep() return with the mutex released.
			 */
			vnlruproc_sig = 0;
			wakeup(&vnlruproc_sig);
			msleep(vnlruproc, &vnode_list_mtx,
			    PVFS|PDROP, "vlruwt", hz);
			continue;
		}
		rfreevnodes = vnlru_read_freevnodes();

		onumvnodes = rnumvnodes;
		/*
		 * Calculate parameters for recycling.  These are the same
		 * throughout the loop to give some semblance of fairness.
		 * The trigger point is to avoid recycling vnodes with lots
		 * of resident pages.  We aren't trying to free memory; we
		 * are trying to recycle or at least free vnodes.
		 */
		if (rnumvnodes <= desiredvnodes)
			usevnodes = rnumvnodes - rfreevnodes;
		else
			usevnodes = rnumvnodes;
		/* Guard the division below. */
		if (usevnodes <= 0)
			usevnodes = 1;
		/*
		 * The trigger value is chosen to give a conservatively
		 * large value to ensure that it alone doesn't prevent
		 * making progress.  The value can easily be so large that
		 * it is effectively infinite in some congested and
		 * misconfigured cases, and this is necessary.  Normally
		 * it is about 8 to 100 (pages), which is quite large.
		 */
		trigger = vm_cnt.v_page_count * 2 / usevnodes;
		if (force < 2)
			trigger = vsmalltrigger;
		reclaim_nc_src = force >= 3;
		/* Aim for a tenth of the gap, scaled by how full the cache is. */
		target = rnumvnodes * (int64_t)gapvnodes / imax(desiredvnodes, 1);
		target = target / 10 + 1;
		done = vlrureclaim(reclaim_nc_src, trigger, target);
		mtx_unlock(&vnode_list_mtx);
		/* Drain UMA once the count has dropped back to the limit. */
		if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes)
			uma_reclaim(UMA_RECLAIM_DRAIN);
		if (done == 0) {
			/* Nothing reclaimed: escalate, retrying immediately. */
			if (force == 0 || force == 1) {
				force = 2;
				continue;
			}
			if (force == 2) {
				force = 3;
				continue;
			}
			/* Even the most aggressive pass failed; back off. */
			want_reread = true;
			force = 0;
			vnlru_nowhere++;
			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
		} else {
			want_reread = true;
			kern_yield(PRI_USER);
		}
	}
}
1488
/*
 * Description of the vnlru kernel process, handed to kproc_start() at
 * boot through SYSINIT: its name, its main function and the variable
 * that refers to the process (presumably filled in by kproc_start();
 * confirm against kproc(9)).
 */
static struct kproc_desc vnlru_kp = {
	"vnlru",
	vnlru_proc,
	&vnlruproc
};
SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
    &vnlru_kp);
1496
1497/*
1498 * Routines having to do with the management of the vnode table.
1499 */
1500
/*
 * Try to recycle a freed vnode.  We abort if anyone picks up a reference
 * before we actually vgone().  This function must be called with the vnode
 * held to prevent the vnode from being returned to the free list midway
 * through vgone().
 *
 * Returns 0 if the vnode was recycled here or was already doomed,
 * EWOULDBLOCK if the vnode lock could not be taken without sleeping, and
 * EBUSY if a write could not be started on its filesystem or the vnode
 * has gained a use reference.  All locks taken here are released before
 * returning; the caller's hold is left in place.
 */
static int
vtryrecycle(struct vnode *vp)
{
	struct mount *vnmp;

	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
	VNASSERT(vp->v_holdcnt, vp,
	    ("vtryrecycle: Recycling vp %p without a reference.", vp));
	/*
	 * This vnode may be found and locked via some other list; if so
	 * we can't recycle it yet.
	 */
	if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
		CTR2(KTR_VFS,
		    "%s: impossible to recycle, vp %p lock is already held",
		    __func__, vp);
		return (EWOULDBLOCK);
	}
	/*
	 * Don't recycle if its filesystem is being suspended.
	 */
	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
		VOP_UNLOCK(vp);
		CTR2(KTR_VFS,
		    "%s: impossible to recycle, cannot start the write for %p",
		    __func__, vp);
		return (EBUSY);
	}
	/*
	 * If we got this far, we need to acquire the interlock and see if
	 * anyone picked up this vnode from another list.  If not, we will
	 * mark it with DOOMED via vgonel() so that anyone who does find it
	 * will skip over it.
	 */
	VI_LOCK(vp);
	if (vp->v_usecount) {
		VOP_UNLOCK(vp);
		VI_UNLOCK(vp);
		vn_finished_write(vnmp);
		CTR2(KTR_VFS,
		    "%s: impossible to recycle, %p is already referenced",
		    __func__, vp);
		return (EBUSY);
	}
	/* A vnode that is already doomed needs no further work. */
	if (!VN_IS_DOOMED(vp)) {
		counter_u64_add(recycles_free_count, 1);
		vgonel(vp);
	}
	VOP_UNLOCK(vp);
	VI_UNLOCK(vp);
	vn_finished_write(vnmp);
	return (0);
}
1560
/*
 * Allocate a new vnode.
 *
 * The operation never returns an error. Returning an error was disabled
 * in r145385 (dated 2005) with the following comment:
 *
 * XXX Not all VFS_VGET/ffs_vget callers check returns.
 *
 * Given the age of this commit (almost 15 years at the time of writing this
 * comment) restoring the ability to fail requires a significant audit of
 * all codepaths.
 *
 * The routine can try to free a vnode or stall for up to 1 second waiting for
 * vnlru to clear things up, but ultimately always performs a M_WAITOK allocation.
 */

/*
 * Counts consecutive slow-path allocations made while at the vnode limit.
 * While it is non-zero vn_alloc() skips its fast path.
 */
static u_long vn_alloc_cyclecount;

/*
 * Slow path of vn_alloc(), taken when the vnode count is at or near its
 * limit.  mp may be NULL; when it refers to a mount with MNTK_SUSPEND set
 * the function does not wait for vnlru.
 */
static struct vnode * __noinline
vn_alloc_hard(struct mount *mp)
{
	u_long rnumvnodes, rfreevnodes;

	mtx_lock(&vnode_list_mtx);
	rnumvnodes = atomic_load_long(&numvnodes);
	/* Still room below the limit: allocate right away. */
	if (rnumvnodes + 1 < desiredvnodes) {
		vn_alloc_cyclecount = 0;
		goto alloc;
	}
	rfreevnodes = vnlru_read_freevnodes();
	/*
	 * Once as many slow-path allocations as there are free vnodes have
	 * been made, ask vnlru_proc() for a pass by setting vstir.
	 */
	if (vn_alloc_cyclecount++ >= rfreevnodes) {
		vn_alloc_cyclecount = 0;
		vstir = 1;
	}
	/*
	 * Grow the vnode cache if it will not be above its target max
	 * after growing.  Otherwise, if the free list is nonempty, try
	 * to reclaim 1 item from it before growing the cache (possibly
	 * above its target max if the reclamation failed or is delayed).
	 * Otherwise, wait for some space.  In all cases, schedule
	 * vnlru_proc() if we are getting short of space.  The watermarks
	 * should be chosen so that we never wait or even reclaim from
	 * the free list to below its target minimum.
	 */
	if (vnlru_free_locked(1, NULL) > 0)
		goto alloc;
	if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
		/*
		 * Wait for space for a new vnode.
		 */
		vnlru_kick();
		msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz);
		/* Still over the limit after waiting: try once more. */
		if (atomic_load_long(&numvnodes) + 1 > desiredvnodes &&
		    vnlru_read_freevnodes() > 1)
			vnlru_free_locked(1, NULL);
	}
alloc:
	/* Account for the new vnode, then allocate outside the list lock. */
	rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1;
	if (vnlru_under(rnumvnodes, vlowat))
		vnlru_kick();
	mtx_unlock(&vnode_list_mtx);
	return (uma_zalloc_smr(vnode_zone, M_WAITOK));
}
1623
1624static struct vnode *
1625vn_alloc(struct mount *mp)
1626{
1627	u_long rnumvnodes;
1628
1629	if (__predict_false(vn_alloc_cyclecount != 0))
1630		return (vn_alloc_hard(mp));
1631	rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1;
1632	if (__predict_false(vnlru_under_unlocked(rnumvnodes, vlowat))) {
1633		atomic_subtract_long(&numvnodes, 1);
1634		return (vn_alloc_hard(mp));
1635	}
1636
1637	return (uma_zalloc_smr(vnode_zone, M_WAITOK));
1638}
1639
/*
 * Counterpart of vn_alloc(): take the vnode out of the numvnodes count
 * and give its memory back to the vnode zone.
 */
static void
vn_free(struct vnode *vp)
{

	atomic_subtract_long(&numvnodes, 1);
	uma_zfree_smr(vnode_zone, vp);
}
1647
/*
 * Hand out a new vnode.
 *
 * The vnode comes from the calling thread's reservation (see
 * getnewvnode_reserve()) when there is one, otherwise from vn_alloc().
 * It is set up with type VNON and the given operations vector, its lock
 * is named after tag, and, when mp is not NULL, a few per-mount
 * properties are copied from mp.  The vnode is not inserted into the
 * mount's vnode list here.
 *
 * The result is stored in *vpp.  The function always returns 0.
 */
int
getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
    struct vnode **vpp)
{
	struct vnode *vp;
	struct thread *td;
	struct lock_object *lo;

	CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);

	KASSERT(vops->registered,
	    ("%s: not registered vector op %p\n", __func__, vops));

	/* Consume the thread's reserved vnode, if any, before allocating. */
	td = curthread;
	if (td->td_vp_reserved != NULL) {
		vp = td->td_vp_reserved;
		td->td_vp_reserved = NULL;
	} else {
		vp = vn_alloc(mp);
	}
	counter_u64_add(vnodes_created, 1);
	/*
	 * Locks are given the generic name "vnode" when created.
	 * Follow the historic practice of using the filesystem
	 * name when they are allocated, e.g., "zfs", "ufs", "nfs", etc.
	 *
	 * Locks live in a witness group keyed on their name. Thus,
	 * when a lock is renamed, it must also move from the witness
	 * group of its old name to the witness group of its new name.
	 *
	 * The change only needs to be made when the vnode moves
	 * from one filesystem type to another. We ensure that each
	 * filesystem uses a single static name pointer for its tag so
	 * that we can compare pointers rather than doing a strcmp().
	 *
	 * Without WITNESS the name is assigned unconditionally.
	 */
	lo = &vp->v_vnlock->lock_object;
#ifdef WITNESS
	if (lo->lo_name != tag) {
#endif
		lo->lo_name = tag;
#ifdef WITNESS
		WITNESS_DESTROY(lo);
		WITNESS_INIT(lo, tag);
	}
#endif
	/*
	 * By default, don't allow shared locks unless filesystems opt-in.
	 */
	vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE;
	/*
	 * Finalize various vnode identity bits.
	 */
	KASSERT(vp->v_object == NULL, ("stale v_object %p", vp));
	KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp));
	KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp));
	vp->v_type = VNON;
	vp->v_op = vops;
	v_init_counters(vp);
	vp->v_bufobj.bo_ops = &buf_ops_bio;
#ifdef DIAGNOSTIC
	/* Only vnodes using dead_vnodeops are expected to have no mount. */
	if (mp == NULL && vops != &dead_vnodeops)
		printf("NULL mp in getnewvnode(9), tag %s\n", tag);
#endif
#ifdef MAC
	mac_vnode_init(vp);
	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
		mac_vnode_associate_singlelabel(mp, vp);
#endif
	if (mp != NULL) {
		/* Inherit the I/O size and the no-knote setting from the mount. */
		vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize;
		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
			vp->v_vflag |= VV_NOKNOTE;
	}

	/*
	 * For the filesystems which do not use vfs_hash_insert(),
	 * still initialize v_hash to have vfs_hash_index() useful.
	 * E.g., nullfs uses vfs_hash_index() on the lower vnode for
	 * its own hashing.
	 */
	vp->v_hash = (uintptr_t)vp >> vnsz2log;

	*vpp = vp;
	return (0);
}
1736
1737void
1738getnewvnode_reserve(void)
1739{
1740	struct thread *td;
1741
1742	td = curthread;
1743	MPASS(td->td_vp_reserved == NULL);
1744	td->td_vp_reserved = vn_alloc(NULL);
1745}
1746
1747void
1748getnewvnode_drop_reserve(void)
1749{
1750	struct thread *td;
1751
1752	td = curthread;
1753	if (td->td_vp_reserved != NULL) {
1754		vn_free(td->td_vp_reserved);
1755		td->td_vp_reserved = NULL;
1756	}
1757}
1758
/*
 * Final destruction of a vnode: verify that it has been completely
 * cleaned, reset its fields and return it to the zone through vn_free().
 *
 * The vnode interlock is released here without having been taken in this
 * function, so the caller is expected to enter with it held.
 */
static void __noinline
freevnode(struct vnode *vp)
{
	struct bufobj *bo;

	/*
	 * The vnode has been marked for destruction, so free it.
	 *
	 * The vnode will be returned to the zone where it will
	 * normally remain until it is needed for another vnode. We
	 * need to cleanup (or verify that the cleanup has already
	 * been done) any residual data left from its current use
	 * so as not to contaminate the freshly allocated vnode.
	 */
	CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
	/*
	 * Paired with vgone.
	 */
	vn_seqc_write_end_locked(vp);
	VNPASS(vp->v_seqc_users == 0, vp);

	/* Nothing may still be attached to the vnode at this point. */
	bo = &vp->v_bufobj;
	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
	VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp);
	VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
	VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
	VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
	    ("clean blk trie not empty"));
	VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
	VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
	    ("dirty blk trie not empty"));
	VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
	VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
	VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
	VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp,
	    ("Dangling rangelock waiters"));
	VI_UNLOCK(vp);
#ifdef MAC
	mac_vnode_destroy(vp);
#endif
	if (vp->v_pollinfo != NULL) {
		destroy_vpollinfo(vp->v_pollinfo);
		vp->v_pollinfo = NULL;
	}
#ifdef INVARIANTS
	/* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
	vp->v_op = NULL;
#endif
	/* Reset the remaining fields for the next user of this memory. */
	vp->v_mountedhere = NULL;
	vp->v_unpcb = NULL;
	vp->v_rdev = NULL;
	vp->v_fifoinfo = NULL;
	vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
	vp->v_irflag = 0;
	vp->v_iflag = 0;
	vp->v_vflag = 0;
	bo->bo_flag = 0;
	vn_free(vp);
}
1820
/*
 * Delete from old mount point vnode list, if on one.
 *
 * Does nothing when the vnode has no mount.  Otherwise, under the mount
 * interlock, the vnode's v_mount is cleared (with the vnode interlock
 * held), the vnode is unlinked from the mount's vnode list, the list
 * size is decremented and a mount reference is released.  The vnode
 * must not be on the mount's lazy list.
 */
static void
delmntque(struct vnode *vp)
{
	struct mount *mp;

	VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp);

	mp = vp->v_mount;
	if (mp == NULL)
		return;
	MNT_ILOCK(mp);
	VI_LOCK(vp);
	vp->v_mount = NULL;
	VI_UNLOCK(vp);
	VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
		("bad mount point vnode list size"));
	TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
	mp->mnt_nvnodelistsize--;
	/* Presumably pairs with the MNT_REF() taken when the vnode was inserted. */
	MNT_REL(mp);
	MNT_IUNLOCK(mp);
}
1845
/*
 * Default destructor passed by insmntque() to insmntque1(); it is
 * invoked when the vnode cannot be inserted into the mount's list.
 * Detaches the filesystem private data, switches the vnode to
 * dead_vnodeops, then destroys it with vgone() and releases it with
 * vput().  dtr_arg is unused.
 */
static void
insmntque_stddtr(struct vnode *vp, void *dtr_arg)
{

	vp->v_data = NULL;
	vp->v_op = &dead_vnodeops;
	vgone(vp);
	vput(vp);
}
1855
/*
 * Insert into list of vnodes for the new mount point, if available.
 *
 * The vnode must be exclusively locked, must not already belong to a
 * mount, and mp must not be NULL.
 *
 * Insertion is refused with EBUSY when the mount is being unmounted and
 * either the unmount is forced or the mount has no vnodes left, unless
 * the vnode has VV_FORCEINSMQ set.  On refusal dtr, when not NULL, is
 * called as dtr(vp, dtr_arg) after all locks taken here are dropped.
 *
 * On success the vnode is appended to the mount's vnode list, a mount
 * reference is taken and 0 is returned.
 */
int
insmntque1(struct vnode *vp, struct mount *mp,
	void (*dtr)(struct vnode *, void *), void *dtr_arg)
{

	KASSERT(vp->v_mount == NULL,
		("insmntque: vnode already on per mount vnode list"));
	VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
	ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp");

	/*
	 * We acquire the vnode interlock early to ensure that the
	 * vnode cannot be recycled by another process releasing a
	 * holdcnt on it before we get it on both the vnode list
	 * and the active vnode list. The mount mutex protects only
	 * manipulation of the vnode list and the vnode freelist
	 * mutex protects only manipulation of the active vnode list.
	 * Hence the need to hold the vnode interlock throughout.
	 */
	MNT_ILOCK(mp);
	VI_LOCK(vp);
	if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 &&
	    ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
	    mp->mnt_nvnodelistsize == 0)) &&
	    (vp->v_vflag & VV_FORCEINSMQ) == 0) {
		VI_UNLOCK(vp);
		MNT_IUNLOCK(mp);
		/* Let the caller-supplied destructor dispose of the vnode. */
		if (dtr != NULL)
			dtr(vp, dtr_arg);
		return (EBUSY);
	}
	vp->v_mount = mp;
	MNT_REF(mp);
	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
	VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
		("neg mount point vnode list size"));
	mp->mnt_nvnodelistsize++;
	VI_UNLOCK(vp);
	MNT_IUNLOCK(mp);
	return (0);
}
1900
1901int
1902insmntque(struct vnode *vp, struct mount *mp)
1903{
1904
1905	return (insmntque1(vp, mp, insmntque_stddtr, NULL));
1906}
1907
/*
 * Flush out and invalidate all buffers associated with a bufobj
 * Called with the underlying object locked.
 *
 * With V_SAVE, pending writes are waited for and dirty buffers are
 * synced before anything is invalidated.  The clean list is then
 * flushed, and the dirty list too unless V_CLEANONLY is given.  After
 * outstanding I/O has drained, the pages of the backing VM object are
 * removed unless one of V_ALT, V_NORMAL, V_CLEANONLY or V_VMIO is set.
 *
 * slpflag and slptimeo are passed on to the sleeps done by
 * bufobj_wwait() and flushbuflist().
 *
 * Returns 0 on success, or the first error other than EAGAIN reported by
 * bufobj_wwait(), BO_SYNC() or flushbuflist().
 */
int
bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
{
	int error;

	BO_LOCK(bo);
	if (flags & V_SAVE) {
		error = bufobj_wwait(bo, slpflag, slptimeo);
		if (error) {
			BO_UNLOCK(bo);
			return (error);
		}
		if (bo->bo_dirty.bv_cnt > 0) {
			/* The sync is done without the bufobj lock. */
			BO_UNLOCK(bo);
			if ((error = BO_SYNC(bo, MNT_WAIT)) != 0)
				return (error);
			/*
			 * XXX We could save a lock/unlock if this was only
			 * enabled under INVARIANTS
			 */
			BO_LOCK(bo);
			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
				panic("vinvalbuf: dirty bufs");
		}
	}
	/*
	 * If you alter this loop please notice that interlock is dropped and
	 * reacquired in flushbuflist.  Special care is needed to ensure that
	 * no race conditions occur from this.
	 */
	do {
		error = flushbuflist(&bo->bo_clean,
		    flags, bo, slpflag, slptimeo);
		if (error == 0 && !(flags & V_CLEANONLY))
			error = flushbuflist(&bo->bo_dirty,
			    flags, bo, slpflag, slptimeo);
		/* EAGAIN means "scan again"; anything else is fatal. */
		if (error != 0 && error != EAGAIN) {
			BO_UNLOCK(bo);
			return (error);
		}
	} while (error != 0);

	/*
	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
	 * have write I/O in-progress but if there is a VM object then the
	 * VM object can also have read-I/O in-progress.
	 */
	do {
		bufobj_wwait(bo, 0, 0);
		if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) {
			BO_UNLOCK(bo);
			vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx");
			BO_LOCK(bo);
		}
	} while (bo->bo_numoutput > 0);
	BO_UNLOCK(bo);

	/*
	 * Destroy the copy in the VM cache, too.
	 * With V_SAVE only clean pages are removed.
	 */
	if (bo->bo_object != NULL &&
	    (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) {
		VM_OBJECT_WLOCK(bo->bo_object);
		vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
		    OBJPR_CLEANONLY : 0);
		VM_OBJECT_WUNLOCK(bo->bo_object);
	}

#ifdef INVARIANTS
	/* A full invalidation must have left no buffers behind. */
	BO_LOCK(bo);
	if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO |
	    V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 ||
	    bo->bo_clean.bv_cnt > 0))
		panic("vinvalbuf: flush failed");
	if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 &&
	    bo->bo_dirty.bv_cnt > 0)
		panic("vinvalbuf: flush dirty failed");
	BO_UNLOCK(bo);
#endif
	return (0);
}
1993
1994/*
1995 * Flush out and invalidate all buffers associated with a vnode.
1996 * Called with the underlying object locked.
1997 */
1998int
1999vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
2000{
2001
2002	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
2003	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
2004	if (vp->v_object != NULL && vp->v_object->handle != vp)
2005		return (0);
2006	return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
2007}
2008
/*
 * Flush out buffers on the specified list.
 *
 * Walks every buffer on bufv (one of bo's buffer lists), skipping those
 * excluded by the V_NORMAL/V_ALT selection in flags.  Each remaining
 * buffer is locked and then either written out asynchronously (a buffer
 * that is still B_DELWRI, not B_INVAL, with V_SAVE requested) or marked
 * B_INVAL | B_RELBUF and released.
 *
 * The bufobj lock must be write-held on entry (asserted).  It is passed to
 * BUF_TIMELOCK() as the interlock and re-taken with BO_LOCK() afterwards,
 * so it is held again on every return path.
 *
 * Returns 0 if no buffer was selected, EAGAIN once at least one buffer has
 * been selected (the caller loops while the result is non-zero), or the
 * error reported by BUF_TIMELOCK() with ENOLCK mapped to EAGAIN.
 */
static int
flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
    int slptimeo)
{
	struct buf *bp, *nbp;
	int retval, error;
	daddr_t lblkno;
	b_xflags_t xflags;

	ASSERT_BO_WLOCKED(bo);

	retval = 0;
	TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
		/*
		 * If we are flushing both V_NORMAL and V_ALT buffers then
		 * do not skip any buffers. If we are flushing only V_NORMAL
		 * buffers then skip buffers marked as BX_ALTDATA. If we are
		 * flushing only V_ALT buffers then skip buffers not marked
		 * as BX_ALTDATA.
		 */
		if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) &&
		   (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) ||
		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) {
			continue;
		}
		/*
		 * Record the successor's block number and list membership
		 * so it can be looked up and validated again after the
		 * bufobj lock has been dropped and re-taken below.
		 */
		if (nbp != NULL) {
			lblkno = nbp->b_lblkno;
			xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN);
		}
		/* A buffer was selected: report that another pass is needed. */
		retval = EAGAIN;
		error = BUF_TIMELOCK(bp,
		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo),
		    "flushbuf", slpflag, slptimeo);
		if (error) {
			/* Return with the bufobj lock held, as on entry. */
			BO_LOCK(bo);
			return (error != ENOLCK ? error : EAGAIN);
		}
		KASSERT(bp->b_bufobj == bo,
		    ("bp %p wrong b_bufobj %p should be %p",
		    bp, bp->b_bufobj, bo));
		/*
		 * XXX Since there are no node locks for NFS, I
		 * believe there is a slight chance that a delayed
		 * write will occur while sleeping just above, so
		 * check for it.
		 */
		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
		    (flags & V_SAVE)) {
			bremfree(bp);
			bp->b_flags |= B_ASYNC;
			bwrite(bp);
			BO_LOCK(bo);
			return (EAGAIN);	/* XXX: why not loop ? */
		}
		/* Discard the buffer's contents and release it. */
		bremfree(bp);
		bp->b_flags |= (B_INVAL | B_RELBUF);
		bp->b_flags &= ~B_ASYNC;
		brelse(bp);
		BO_LOCK(bo);
		if (nbp == NULL)
			break;
		/*
		 * Look the successor up again by block number; end this
		 * pass if it is gone or its clean/dirty membership changed.
		 */
		nbp = gbincore(bo, lblkno);
		if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
		    != xflags)
			break;			/* nbp invalid */
	}
	return (retval);
}
2081
/*
 * Release every buffer on bufv whose logical block number lies in
 * [startn, endn), marking it B_RELBUF and, for B_VMIO buffers, B_NOREUSE.
 *
 * The bufobj lock must be held on entry (asserted).  It is passed to
 * BUF_TIMELOCK() as the interlock for each buffer and re-taken with
 * BO_RLOCK() afterwards, so it is held again on every return path.
 *
 * Returns 0 when the range has been walked, or the error reported by
 * BUF_TIMELOCK() other than ENOLCK (on ENOLCK the lookup is retried).
 */
int
bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn)
{
	struct buf *bp;
	int error;
	daddr_t lblkno;

	ASSERT_BO_LOCKED(bo);

	for (lblkno = startn;;) {
again:
		/* Find the first buffer at or after the current position. */
		bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno);
		if (bp == NULL || bp->b_lblkno >= endn ||
		    bp->b_lblkno < startn)
			break;
		error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
		    LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0);
		if (error != 0) {
			BO_RLOCK(bo);
			/* ENOLCK: redo the lookup from the same position. */
			if (error == ENOLCK)
				goto again;
			return (error);
		}
		KASSERT(bp->b_bufobj == bo,
		    ("bp %p wrong b_bufobj %p should be %p",
		    bp, bp->b_bufobj, bo));
		/* Resume the next lookup just past this buffer. */
		lblkno = bp->b_lblkno + 1;
		if ((bp->b_flags & B_MANAGED) == 0)
			bremfree(bp);
		bp->b_flags |= B_RELBUF;
		/*
		 * In the VMIO case, use the B_NOREUSE flag to hint that the
		 * pages backing each buffer in the range are unlikely to be
		 * reused.  Dirty buffers will have the hint applied once
		 * they've been written.
		 */
		if ((bp->b_flags & B_VMIO) != 0)
			bp->b_flags |= B_NOREUSE;
		brelse(bp);
		BO_RLOCK(bo);
	}
	return (0);
}
2125
/*
 * Truncate a file's buffer and pages to a specified length.  This
 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 * sync activity.
 *
 * vp must be locked by the caller (asserted).  length is the new file
 * size in bytes and blksize the block size used to convert it to a
 * logical block number.  Buffers at or beyond the first block past the
 * new length are invalidated; when length is non-zero, dirty buffers with
 * a logical block number <= 0 are written out asynchronously.  The
 * function then waits for outstanding writes on the bufobj and sets the
 * pager size to length.  Always returns 0.
 */
int
vtruncbuf(struct vnode *vp, off_t length, int blksize)
{
	struct buf *bp, *nbp;
	struct bufobj *bo;
	daddr_t startlbn;

	CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__,
	    vp, blksize, (uintmax_t)length);

	/*
	 * Round up to the *next* lbn.
	 */
	startlbn = howmany(length, blksize);

	ASSERT_VOP_LOCKED(vp, "vtruncbuf");

	bo = &vp->v_bufobj;
restart_unlocked:
	BO_LOCK(bo);

	/* Repeat until the helper stops asking for a retry. */
	while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN)
		;

	if (length > 0) {
restartsync:
		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
			/*
			 * Only buffers with block number <= 0 are pushed
			 * here.  NOTE(review): presumably these are
			 * metadata blocks kept at non-positive lbns by the
			 * filesystem -- confirm against the callers.
			 */
			if (bp->b_lblkno > 0)
				continue;
			/*
			 * Since we hold the vnode lock this should only
			 * fail if we're racing with the buf daemon.
			 */
			if (BUF_LOCK(bp,
			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
			    BO_LOCKPTR(bo)) == ENOLCK)
				goto restart_unlocked;

			VNASSERT((bp->b_flags & B_DELWRI), vp,
			    ("buf(%p) on dirty queue without DELWRI", bp));

			bremfree(bp);
			bawrite(bp);
			/* Re-take the bufobj lock and rescan from the head. */
			BO_LOCK(bo);
			goto restartsync;
		}
	}

	/* Wait for writes in progress before resizing the pager. */
	bufobj_wwait(bo, 0, 0);
	BO_UNLOCK(bo);
	vnode_pager_setsize(vp, length);

	return (0);
}
2185
2186/*
2187 * Invalidate the cached pages of a file's buffer within the range of block
2188 * numbers [startlbn, endlbn).
2189 */
2190void
2191v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn,
2192    int blksize)
2193{
2194	struct bufobj *bo;
2195	off_t start, end;
2196
2197	ASSERT_VOP_LOCKED(vp, "v_inval_buf_range");
2198
2199	start = blksize * startlbn;
2200	end = blksize * endlbn;
2201
2202	bo = &vp->v_bufobj;
2203	BO_LOCK(bo);
2204	MPASS(blksize == bo->bo_bsize);
2205
2206	while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN)
2207		;
2208
2209	BO_UNLOCK(bo);
2210	vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1));
2211}
2212
/*
 * Invalidate every buffer on bo's clean and dirty lists whose logical
 * block number lies in [startlbn, endlbn).
 *
 * Both the vnode lock and the bufobj lock must be held on entry
 * (asserted).  The bufobj lock is passed to BUF_LOCK() as the interlock
 * for each buffer and re-taken afterwards, so it is held on return.
 *
 * Returns 0 once a complete pass over both lists released nothing.
 * Returns EAGAIN when a buffer lock attempt reported ENOLCK, or when the
 * saved successor no longer looks like a member of the list being walked
 * after the bufobj lock was re-taken; callers retry in that case.
 */
static int
v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo,
    daddr_t startlbn, daddr_t endlbn)
{
	struct buf *bp, *nbp;
	bool anyfreed;

	ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked");
	ASSERT_BO_LOCKED(bo);

	do {
		anyfreed = false;
		/* Pass over the clean list. */
		TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
			if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn)
				continue;
			if (BUF_LOCK(bp,
			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
			    BO_LOCKPTR(bo)) == ENOLCK) {
				BO_LOCK(bo);
				return (EAGAIN);
			}

			bremfree(bp);
			bp->b_flags |= B_INVAL | B_RELBUF;
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
			anyfreed = true;

			BO_LOCK(bo);
			/*
			 * The successor must still be a clean, non-DELWRI
			 * buffer of this vnode; otherwise restart.
			 */
			if (nbp != NULL &&
			    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
			    nbp->b_vp != vp ||
			    (nbp->b_flags & B_DELWRI) != 0))
				return (EAGAIN);
		}

		/* Pass over the dirty list. */
		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
			if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn)
				continue;
			if (BUF_LOCK(bp,
			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
			    BO_LOCKPTR(bo)) == ENOLCK) {
				BO_LOCK(bo);
				return (EAGAIN);
			}
			bremfree(bp);
			bp->b_flags |= B_INVAL | B_RELBUF;
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
			anyfreed = true;

			BO_LOCK(bo);
			/*
			 * The successor must still be a dirty, DELWRI
			 * buffer of this vnode; otherwise restart.
			 */
			if (nbp != NULL &&
			    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
			    (nbp->b_vp != vp) ||
			    (nbp->b_flags & B_DELWRI) == 0))
				return (EAGAIN);
		}
	} while (anyfreed);
	return (0);
}
2274
2275static void
2276buf_vlist_remove(struct buf *bp)
2277{
2278	struct bufv *bv;
2279	b_xflags_t flags;
2280
2281	flags = bp->b_xflags;
2282
2283	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
2284	ASSERT_BO_WLOCKED(bp->b_bufobj);
2285	KASSERT((flags & (BX_VNDIRTY | BX_VNCLEAN)) != 0 &&
2286	    (flags & (BX_VNDIRTY | BX_VNCLEAN)) != (BX_VNDIRTY | BX_VNCLEAN),
2287	    ("%s: buffer %p has invalid queue state", __func__, bp));
2288
2289	if ((flags & BX_VNDIRTY) != 0)
2290		bv = &bp->b_bufobj->bo_dirty;
2291	else
2292		bv = &bp->b_bufobj->bo_clean;
2293	BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno);
2294	TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
2295	bv->bv_cnt--;
2296	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
2297}
2298
2299/*
2300 * Add the buffer to the sorted clean or dirty block list.
2301 *
2302 * NOTE: xflags is passed as a constant, optimizing this inline function!
2303 */
2304static void
2305buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
2306{
2307	struct bufv *bv;
2308	struct buf *n;
2309	int error;
2310
2311	ASSERT_BO_WLOCKED(bo);
2312	KASSERT((bo->bo_flag & BO_NOBUFS) == 0,
2313	    ("buf_vlist_add: bo %p does not allow bufs", bo));
2314	KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0,
2315	    ("dead bo %p", bo));
2316	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
2317	    ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
2318	bp->b_xflags |= xflags;
2319	if (xflags & BX_VNDIRTY)
2320		bv = &bo->bo_dirty;
2321	else
2322		bv = &bo->bo_clean;
2323
2324	/*
2325	 * Keep the list ordered.  Optimize empty list insertion.  Assume
2326	 * we tend to grow at the tail so lookup_le should usually be cheaper
2327	 * than _ge.
2328	 */
2329	if (bv->bv_cnt == 0 ||
2330	    bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno)
2331		TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
2332	else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL)
2333		TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs);
2334	else
2335		TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs);
2336	error = BUF_PCTRIE_INSERT(&bv->bv_root, bp);
2337	if (error)
2338		panic("buf_vlist_add:  Preallocated nodes insufficient.");
2339	bv->bv_cnt++;
2340}
2341
2342/*
2343 * Look up a buffer using the buffer tries.
2344 */
2345struct buf *
2346gbincore(struct bufobj *bo, daddr_t lblkno)
2347{
2348	struct buf *bp;
2349
2350	ASSERT_BO_LOCKED(bo);
2351	bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno);
2352	if (bp != NULL)
2353		return (bp);
2354	return (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno));
2355}
2356
2357/*
2358 * Look up a buf using the buffer tries, without the bufobj lock.  This relies
2359 * on SMR for safe lookup, and bufs being in a no-free zone to provide type
2360 * stability of the result.  Like other lockless lookups, the found buf may
2361 * already be invalid by the time this function returns.
2362 */
2363struct buf *
2364gbincore_unlocked(struct bufobj *bo, daddr_t lblkno)
2365{
2366	struct buf *bp;
2367
2368	ASSERT_BO_UNLOCKED(bo);
2369	bp = BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_clean.bv_root, lblkno);
2370	if (bp != NULL)
2371		return (bp);
2372	return (BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_dirty.bv_root, lblkno));
2373}
2374
2375/*
2376 * Associate a buffer with a vnode.
2377 */
2378void
2379bgetvp(struct vnode *vp, struct buf *bp)
2380{
2381	struct bufobj *bo;
2382
2383	bo = &vp->v_bufobj;
2384	ASSERT_BO_WLOCKED(bo);
2385	VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
2386
2387	CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
2388	VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
2389	    ("bgetvp: bp already attached! %p", bp));
2390
2391	vhold(vp);
2392	bp->b_vp = vp;
2393	bp->b_bufobj = bo;
2394	/*
2395	 * Insert onto list for new vnode.
2396	 */
2397	buf_vlist_add(bp, bo, BX_VNCLEAN);
2398}
2399
2400/*
2401 * Disassociate a buffer from a vnode.
2402 */
2403void
2404brelvp(struct buf *bp)
2405{
2406	struct bufobj *bo;
2407	struct vnode *vp;
2408
2409	CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
2410	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
2411
2412	/*
2413	 * Delete from old vnode list, if on one.
2414	 */
2415	vp = bp->b_vp;		/* XXX */
2416	bo = bp->b_bufobj;
2417	BO_LOCK(bo);
2418	buf_vlist_remove(bp);
2419	if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
2420		bo->bo_flag &= ~BO_ONWORKLST;
2421		mtx_lock(&sync_mtx);
2422		LIST_REMOVE(bo, bo_synclist);
2423		syncer_worklist_len--;
2424		mtx_unlock(&sync_mtx);
2425	}
2426	bp->b_vp = NULL;
2427	bp->b_bufobj = NULL;
2428	BO_UNLOCK(bo);
2429	vdrop(vp);
2430}
2431
2432/*
2433 * Add an item to the syncer work queue.
2434 */
2435static void
2436vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
2437{
2438	int slot;
2439
2440	ASSERT_BO_WLOCKED(bo);
2441
2442	mtx_lock(&sync_mtx);
2443	if (bo->bo_flag & BO_ONWORKLST)
2444		LIST_REMOVE(bo, bo_synclist);
2445	else {
2446		bo->bo_flag |= BO_ONWORKLST;
2447		syncer_worklist_len++;
2448	}
2449
2450	if (delay > syncer_maxdelay - 2)
2451		delay = syncer_maxdelay - 2;
2452	slot = (syncer_delayno + delay) & syncer_mask;
2453
2454	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
2455	mtx_unlock(&sync_mtx);
2456}
2457
2458static int
2459sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
2460{
2461	int error, len;
2462
2463	mtx_lock(&sync_mtx);
2464	len = syncer_worklist_len - sync_vnode_count;
2465	mtx_unlock(&sync_mtx);
2466	error = SYSCTL_OUT(req, &len, sizeof(len));
2467	return (error);
2468}
2469
/*
 * Read-only integer sysctl "worklist_len" under the _vfs node, served by
 * sysctl_vfs_worklist_len().
 */
SYSCTL_PROC(_vfs, OID_AUTO, worklist_len,
    CTLTYPE_INT | CTLFLAG_MPSAFE| CTLFLAG_RD, NULL, 0,
    sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");

/*
 * Syncer kernel process registration.  updateproc is also what
 * syncer_suspend() and syncer_resume() hand to the kproc routines.
 * NOTE(review): the three up_kp initializers are presumably the process
 * name, its entry point and where the created process pointer is stored --
 * confirm against the definition of struct kproc_desc.
 */
static struct proc *updateproc;
static void sched_sync(void);
static struct kproc_desc up_kp = {
	"syncer",
	sched_sync,
	&updateproc
};
SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
2482
/*
 * Try to sync the vnode owning the first bufobj on worklist slot slp.
 *
 * Called with sync_mtx held; the mutex is dropped while the vnode is being
 * synced and is held again on every return path.  *bo is set to the first
 * bufobj on the slot (NULL if the slot is empty).
 *
 * Returns 0 if the slot was empty or the vnode was processed.  Returns 1
 * if the vnode could not be processed now, because it is locked or its
 * interlock could not be taken; the caller then moves *bo to the next
 * slot.  If vn_start_write() fails, the result is 1 only when *bo is still
 * at the head of the slot.
 */
static int
sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
{
	struct vnode *vp;
	struct mount *mp;

	*bo = LIST_FIRST(slp);
	if (*bo == NULL)
		return (0);
	vp = bo2vnode(*bo);
	/* Never block here: skip vnodes that are locked or contended. */
	if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
		return (1);
	/*
	 * We use vhold in case the vnode does not
	 * successfully sync.  vhold prevents the vnode from
	 * going away when we unlock the sync_mtx so that
	 * we can acquire the vnode interlock.
	 */
	vholdl(vp);
	mtx_unlock(&sync_mtx);
	VI_UNLOCK(vp);
	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
		vdrop(vp);
		mtx_lock(&sync_mtx);
		/* Ask for a requeue only if *bo is still first in line. */
		return (*bo == LIST_FIRST(slp));
	}
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	/* The result of the lazy fsync is deliberately ignored. */
	(void) VOP_FSYNC(vp, MNT_LAZY, td);
	VOP_UNLOCK(vp);
	vn_finished_write(mp);
	BO_LOCK(*bo);
	if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
		/*
		 * Put us back on the worklist.  The worklist
		 * routine will remove us from our current
		 * position and then add us back in at a later
		 * position.
		 */
		vn_syncer_add_to_worklist(*bo, syncdelay);
	}
	BO_UNLOCK(*bo);
	vdrop(vp);
	mtx_lock(&sync_mtx);
	return (0);
}
2528
/*
 * Non-zero until the "Syncing disks" banner has been printed; reset to 1
 * by syncer_resume().
 */
static int first_printf = 1;

/*
 * System filesystem synchronizer daemon.
 *
 * Body of the "syncer" kernel process; it never returns.  Each iteration
 * of the main loop advances syncer_delayno by one worklist slot, syncs
 * the vnodes queued on the slot just left via sync_vnode(), and then
 * sleeps on sync_wakeup: for up to hz ticks while running (only when the
 * pass completed within the same time_uptime second), or for
 * hz / SYNCER_SHUTDOWN_SPEEDUP ticks in the shutdown states.  A positive
 * rushjob skips the sleep for that iteration.
 *
 * sync_mtx is held throughout except where explicitly dropped.
 */
static void
sched_sync(void)
{
	struct synclist *next, *slp;
	struct bufobj *bo;
	long starttime;
	struct thread *td = curthread;
	int last_work_seen;
	int net_worklist_len;
	int syncer_final_iter;
	int error;

	last_work_seen = 0;
	syncer_final_iter = 0;
	syncer_state = SYNCER_RUNNING;
	starttime = time_uptime;
	td->td_pflags |= TDP_NORUNNINGBUF;

	/* Have syncer_shutdown() invoked on the shutdown_pre_sync event. */
	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
	    SHUTDOWN_PRI_LAST);

	mtx_lock(&sync_mtx);
	for (;;) {
		/* Final delay has run out: honour a pending suspension. */
		if (syncer_state == SYNCER_FINAL_DELAY &&
		    syncer_final_iter == 0) {
			mtx_unlock(&sync_mtx);
			kproc_suspend_check(td->td_proc);
			mtx_lock(&sync_mtx);
		}
		/* Worklist length not counting the syncer vnodes. */
		net_worklist_len = syncer_worklist_len - sync_vnode_count;
		/* While shutting down, report progress once per second. */
		if (syncer_state != SYNCER_RUNNING &&
		    starttime != time_uptime) {
			if (first_printf) {
				printf("\nSyncing disks, vnodes remaining... ");
				first_printf = 0;
			}
			printf("%d ", net_worklist_len);
		}
		starttime = time_uptime;

		/*
		 * Push files whose dirty time has expired.  Be careful
		 * of interrupt race on slp queue.
		 *
		 * Skip over empty worklist slots when shutting down.
		 */
		do {
			slp = &syncer_workitem_pending[syncer_delayno];
			syncer_delayno += 1;
			if (syncer_delayno == syncer_maxdelay)
				syncer_delayno = 0;
			next = &syncer_workitem_pending[syncer_delayno];
			/*
			 * If the worklist has wrapped since it was
			 * emptied of all but syncer vnodes,
			 * switch to the FINAL_DELAY state and run
			 * for one more second.
			 */
			if (syncer_state == SYNCER_SHUTTING_DOWN &&
			    net_worklist_len == 0 &&
			    last_work_seen == syncer_delayno) {
				syncer_state = SYNCER_FINAL_DELAY;
				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
			}
		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
		    syncer_worklist_len > 0);

		/*
		 * Keep track of the last time there was anything
		 * on the worklist other than syncer vnodes.
		 * Return to the SHUTTING_DOWN state if any
		 * new work appears.
		 */
		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
			last_work_seen = syncer_delayno;
		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
			syncer_state = SYNCER_SHUTTING_DOWN;
		/* Drain the slot; sync_vnode() returns with sync_mtx held. */
		while (!LIST_EMPTY(slp)) {
			error = sync_vnode(slp, &bo, td);
			if (error == 1) {
				/* Could not sync now: defer to the next slot. */
				LIST_REMOVE(bo, bo_synclist);
				LIST_INSERT_HEAD(next, bo, bo_synclist);
				continue;
			}

			if (first_printf == 0) {
				/*
				 * Drop the sync mutex, because some watchdog
				 * drivers need to sleep while patting
				 */
				mtx_unlock(&sync_mtx);
				wdog_kern_pat(WD_LASTVAL);
				mtx_lock(&sync_mtx);
			}
		}
		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
			syncer_final_iter--;
		/*
		 * The variable rushjob allows the kernel to speed up the
		 * processing of the filesystem syncer process. A rushjob
		 * value of N tells the filesystem syncer to process the next
		 * N seconds worth of work on its queue ASAP. Currently rushjob
		 * is used by the soft update code to speed up the filesystem
		 * syncer process when the incore state is getting so far
		 * ahead of the disk that the kernel memory pool is being
		 * threatened with exhaustion.
		 */
		if (rushjob > 0) {
			rushjob -= 1;
			continue;
		}
		/*
		 * Just sleep for a short period of time between
		 * iterations when shutting down to allow some I/O
		 * to happen.
		 *
		 * If it has taken us less than a second to process the
		 * current work, then wait. Otherwise start right over
		 * again. We can still lose time if any single round
		 * takes more than two seconds, but it does not really
		 * matter as we are just trying to generally pace the
		 * filesystem activity.
		 */
		if (syncer_state != SYNCER_RUNNING ||
		    time_uptime == starttime) {
			thread_lock(td);
			sched_prio(td, PPAUSE);
			thread_unlock(td);
		}
		if (syncer_state != SYNCER_RUNNING)
			cv_timedwait(&sync_wakeup, &sync_mtx,
			    hz / SYNCER_SHUTDOWN_SPEEDUP);
		else if (time_uptime == starttime)
			cv_timedwait(&sync_wakeup, &sync_mtx, hz);
	}
}
2670
2671/*
2672 * Request the syncer daemon to speed up its work.
2673 * We never push it to speed up more than half of its
2674 * normal turn time, otherwise it could take over the cpu.
2675 */
2676int
2677speedup_syncer(void)
2678{
2679	int ret = 0;
2680
2681	mtx_lock(&sync_mtx);
2682	if (rushjob < syncdelay / 2) {
2683		rushjob += 1;
2684		stat_rush_requests += 1;
2685		ret = 1;
2686	}
2687	mtx_unlock(&sync_mtx);
2688	cv_broadcast(&sync_wakeup);
2689	return (ret);
2690}
2691
2692/*
2693 * Tell the syncer to speed up its work and run though its work
2694 * list several times, then tell it to shut down.
2695 */
2696static void
2697syncer_shutdown(void *arg, int howto)
2698{
2699
2700	if (howto & RB_NOSYNC)
2701		return;
2702	mtx_lock(&sync_mtx);
2703	syncer_state = SYNCER_SHUTTING_DOWN;
2704	rushjob = 0;
2705	mtx_unlock(&sync_mtx);
2706	cv_broadcast(&sync_wakeup);
2707	kproc_shutdown(arg, howto);
2708}
2709
/*
 * Suspend the syncer by running the shutdown sequence on the syncer
 * process (updateproc) with a howto of 0, i.e. without RB_NOSYNC.
 */
void
syncer_suspend(void)
{

	syncer_shutdown(updateproc, 0);
}
2716
2717void
2718syncer_resume(void)
2719{
2720
2721	mtx_lock(&sync_mtx);
2722	first_printf = 1;
2723	syncer_state = SYNCER_RUNNING;
2724	mtx_unlock(&sync_mtx);
2725	cv_broadcast(&sync_wakeup);
2726	kproc_resume(updateproc);
2727}
2728
2729/*
2730 * Move the buffer between the clean and dirty lists of its vnode.
2731 */
2732void
2733reassignbuf(struct buf *bp)
2734{
2735	struct vnode *vp;
2736	struct bufobj *bo;
2737	int delay;
2738#ifdef INVARIANTS
2739	struct bufv *bv;
2740#endif
2741
2742	vp = bp->b_vp;
2743	bo = bp->b_bufobj;
2744
2745	KASSERT((bp->b_flags & B_PAGING) == 0,
2746	    ("%s: cannot reassign paging buffer %p", __func__, bp));
2747
2748	CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
2749	    bp, bp->b_vp, bp->b_flags);
2750
2751	BO_LOCK(bo);
2752	buf_vlist_remove(bp);
2753
2754	/*
2755	 * If dirty, put on list of dirty buffers; otherwise insert onto list
2756	 * of clean buffers.
2757	 */
2758	if (bp->b_flags & B_DELWRI) {
2759		if ((bo->bo_flag & BO_ONWORKLST) == 0) {
2760			switch (vp->v_type) {
2761			case VDIR:
2762				delay = dirdelay;
2763				break;
2764			case VCHR:
2765				delay = metadelay;
2766				break;
2767			default:
2768				delay = filedelay;
2769			}
2770			vn_syncer_add_to_worklist(bo, delay);
2771		}
2772		buf_vlist_add(bp, bo, BX_VNDIRTY);
2773	} else {
2774		buf_vlist_add(bp, bo, BX_VNCLEAN);
2775
2776		if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
2777			mtx_lock(&sync_mtx);
2778			LIST_REMOVE(bo, bo_synclist);
2779			syncer_worklist_len--;
2780			mtx_unlock(&sync_mtx);
2781			bo->bo_flag &= ~BO_ONWORKLST;
2782		}
2783	}
2784#ifdef INVARIANTS
2785	bv = &bo->bo_clean;
2786	bp = TAILQ_FIRST(&bv->bv_hd);
2787	KASSERT(bp == NULL || bp->b_bufobj == bo,
2788	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2789	bp = TAILQ_LAST(&bv->bv_hd, buflists);
2790	KASSERT(bp == NULL || bp->b_bufobj == bo,
2791	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2792	bv = &bo->bo_dirty;
2793	bp = TAILQ_FIRST(&bv->bv_hd);
2794	KASSERT(bp == NULL || bp->b_bufobj == bo,
2795	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2796	bp = TAILQ_LAST(&bv->bv_hd, buflists);
2797	KASSERT(bp == NULL || bp->b_bufobj == bo,
2798	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2799#endif
2800	BO_UNLOCK(bo);
2801}
2802
2803static void
2804v_init_counters(struct vnode *vp)
2805{
2806
2807	VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0,
2808	    vp, ("%s called for an initialized vnode", __FUNCTION__));
2809	ASSERT_VI_UNLOCKED(vp, __FUNCTION__);
2810
2811	refcount_init(&vp->v_holdcnt, 1);
2812	refcount_init(&vp->v_usecount, 1);
2813}
2814
2815/*
2816 * Grab a particular vnode from the free list, increment its
2817 * reference count and lock it.  VIRF_DOOMED is set if the vnode
2818 * is being destroyed.  Only callers who specify LK_RETRY will
2819 * see doomed vnodes.  If inactive processing was delayed in
2820 * vput try to do it here.
2821 *
2822 * usecount is manipulated using atomics without holding any locks.
2823 *
2824 * holdcnt can be manipulated using atomics without holding any locks,
2825 * except when transitioning 1<->0, in which case the interlock is held.
2826 *
2827 * Consumers which don't guarantee liveness of the vnode can use SMR to
2828 * try to get a reference. Note this operation can fail since the vnode
2829 * may be awaiting getting freed by the time they get to it.
2830 */
2831enum vgetstate
2832vget_prep_smr(struct vnode *vp)
2833{
2834	enum vgetstate vs;
2835
2836	VFS_SMR_ASSERT_ENTERED();
2837
2838	if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
2839		vs = VGET_USECOUNT;
2840	} else {
2841		if (vhold_smr(vp))
2842			vs = VGET_HOLDCNT;
2843		else
2844			vs = VGET_NONE;
2845	}
2846	return (vs);
2847}
2848
2849enum vgetstate
2850vget_prep(struct vnode *vp)
2851{
2852	enum vgetstate vs;
2853
2854	if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
2855		vs = VGET_USECOUNT;
2856	} else {
2857		vhold(vp);
2858		vs = VGET_HOLDCNT;
2859	}
2860	return (vs);
2861}
2862
2863void
2864vget_abort(struct vnode *vp, enum vgetstate vs)
2865{
2866
2867	switch (vs) {
2868	case VGET_USECOUNT:
2869		vrele(vp);
2870		break;
2871	case VGET_HOLDCNT:
2872		vdrop(vp);
2873		break;
2874	default:
2875		__assert_unreachable();
2876	}
2877}
2878
/*
 * Reference and lock a vnode: vget_prep() followed by vget_finish() with
 * the given lock flags.  Returns whatever vget_finish() returns.
 */
int
vget(struct vnode *vp, int flags)
{

	return (vget_finish(vp, flags, vget_prep(vp)));
}
2887
2888int
2889vget_finish(struct vnode *vp, int flags, enum vgetstate vs)
2890{
2891	int error;
2892
2893	if ((flags & LK_INTERLOCK) != 0)
2894		ASSERT_VI_LOCKED(vp, __func__);
2895	else
2896		ASSERT_VI_UNLOCKED(vp, __func__);
2897	VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp);
2898	VNPASS(vp->v_holdcnt > 0, vp);
2899	VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp);
2900
2901	error = vn_lock(vp, flags);
2902	if (__predict_false(error != 0)) {
2903		vget_abort(vp, vs);
2904		CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
2905		    vp);
2906		return (error);
2907	}
2908
2909	vget_finish_ref(vp, vs);
2910	return (0);
2911}
2912
2913void
2914vget_finish_ref(struct vnode *vp, enum vgetstate vs)
2915{
2916	int old;
2917
2918	VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp);
2919	VNPASS(vp->v_holdcnt > 0, vp);
2920	VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp);
2921
2922	if (vs == VGET_USECOUNT)
2923		return;
2924
2925	/*
2926	 * We hold the vnode. If the usecount is 0 it will be utilized to keep
2927	 * the vnode around. Otherwise someone else lended their hold count and
2928	 * we have to drop ours.
2929	 */
2930	old = atomic_fetchadd_int(&vp->v_usecount, 1);
2931	VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old));
2932	if (old != 0) {
2933#ifdef INVARIANTS
2934		old = atomic_fetchadd_int(&vp->v_holdcnt, -1);
2935		VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old));
2936#else
2937		refcount_release(&vp->v_holdcnt);
2938#endif
2939	}
2940}
2941
2942void
2943vref(struct vnode *vp)
2944{
2945	enum vgetstate vs;
2946
2947	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2948	vs = vget_prep(vp);
2949	vget_finish_ref(vp, vs);
2950}
2951
2952void
2953vrefact(struct vnode *vp)
2954{
2955
2956	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2957#ifdef INVARIANTS
2958	int old = atomic_fetchadd_int(&vp->v_usecount, 1);
2959	VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old));
2960#else
2961	refcount_acquire(&vp->v_usecount);
2962#endif
2963}
2964
2965void
2966vlazy(struct vnode *vp)
2967{
2968	struct mount *mp;
2969
2970	VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__));
2971
2972	if ((vp->v_mflag & VMP_LAZYLIST) != 0)
2973		return;
2974	/*
2975	 * We may get here for inactive routines after the vnode got doomed.
2976	 */
2977	if (VN_IS_DOOMED(vp))
2978		return;
2979	mp = vp->v_mount;
2980	mtx_lock(&mp->mnt_listmtx);
2981	if ((vp->v_mflag & VMP_LAZYLIST) == 0) {
2982		vp->v_mflag |= VMP_LAZYLIST;
2983		TAILQ_INSERT_TAIL(&mp->mnt_lazyvnodelist, vp, v_lazylist);
2984		mp->mnt_lazyvnodelistsize++;
2985	}
2986	mtx_unlock(&mp->mnt_listmtx);
2987}
2988
2989/*
2990 * This routine is only meant to be called from vgonel prior to dooming
2991 * the vnode.
2992 */
2993static void
2994vunlazy_gone(struct vnode *vp)
2995{
2996	struct mount *mp;
2997
2998	ASSERT_VOP_ELOCKED(vp, __func__);
2999	ASSERT_VI_LOCKED(vp, __func__);
3000	VNPASS(!VN_IS_DOOMED(vp), vp);
3001
3002	if (vp->v_mflag & VMP_LAZYLIST) {
3003		mp = vp->v_mount;
3004		mtx_lock(&mp->mnt_listmtx);
3005		VNPASS(vp->v_mflag & VMP_LAZYLIST, vp);
3006		vp->v_mflag &= ~VMP_LAZYLIST;
3007		TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist);
3008		mp->mnt_lazyvnodelistsize--;
3009		mtx_unlock(&mp->mnt_listmtx);
3010	}
3011}
3012
3013static void
3014vdefer_inactive(struct vnode *vp)
3015{
3016
3017	ASSERT_VI_LOCKED(vp, __func__);
3018	VNASSERT(vp->v_holdcnt > 0, vp,
3019	    ("%s: vnode without hold count", __func__));
3020	if (VN_IS_DOOMED(vp)) {
3021		vdropl(vp);
3022		return;
3023	}
3024	if (vp->v_iflag & VI_DEFINACT) {
3025		VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count"));
3026		vdropl(vp);
3027		return;
3028	}
3029	if (vp->v_usecount > 0) {
3030		vp->v_iflag &= ~VI_OWEINACT;
3031		vdropl(vp);
3032		return;
3033	}
3034	vlazy(vp);
3035	vp->v_iflag |= VI_DEFINACT;
3036	VI_UNLOCK(vp);
3037	counter_u64_add(deferred_inact, 1);
3038}
3039
3040static void
3041vdefer_inactive_unlocked(struct vnode *vp)
3042{
3043
3044	VI_LOCK(vp);
3045	if ((vp->v_iflag & VI_OWEINACT) == 0) {
3046		vdropl(vp);
3047		return;
3048	}
3049	vdefer_inactive(vp);
3050}
3051
/*
 * Identifies which release routine is calling vput_final(): vrele()
 * passes VRELE, vput() passes VPUT and vunref() passes VUNREF.
 */
enum vput_op { VRELE, VPUT, VUNREF };
3053
/*
 * Handle ->v_usecount transitioning to 0.
 *
 * By releasing the last usecount we take ownership of the hold count which
 * provides liveness of the vnode, meaning we have to vdrop.
 *
 * For all vnodes we may need to perform inactive processing. It requires an
 * exclusive lock on the vnode, while it is legal to call here with only a
 * shared lock (or no locks). If locking the vnode in an expected manner fails,
 * inactive processing gets deferred to the syncer.
 *
 * XXX Some filesystems pass in an exclusively locked vnode and strongly depend
 * on the lock being held all the way until VOP_INACTIVE. This in particular
 * happens with UFS which adds half-constructed vnodes to the hash, where they
 * can be found by other code.
 *
 * func selects the caller's locking contract: VRELE may arrive with any
 * lock state, VPUT unlocks the vnode before returning, and VUNREF leaves
 * the vnode lock held.  The interlock is taken here and is released by
 * vdropl() or vdefer_inactive() on the way out.
 */
static void
vput_final(struct vnode *vp, enum vput_op func)
{
	int error;
	bool want_unlock;

	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
	VNPASS(vp->v_holdcnt > 0, vp);

	VI_LOCK(vp);

	/*
	 * By the time we got here someone else might have transitioned
	 * the count back to > 0.
	 */
	if (vp->v_usecount > 0)
		goto out;

	/*
	 * If the vnode is doomed vgone already performed inactive processing
	 * (if needed).
	 */
	if (VN_IS_DOOMED(vp))
		goto out;

	/* Common case: the filesystem reports nothing to do. */
	if (__predict_true(VOP_NEED_INACTIVE(vp) == 0))
		goto out;

	/* Inactive processing is already in progress. */
	if (vp->v_iflag & VI_DOINGINACT)
		goto out;

	/*
	 * Locking operations here will drop the interlock and possibly the
	 * vnode lock, opening a window where the vnode can get doomed all the
	 * while ->v_usecount is 0. Set VI_OWEINACT to let vgone know to
	 * perform inactive.
	 */
	vp->v_iflag |= VI_OWEINACT;
	want_unlock = false;
	error = 0;
	switch (func) {
	case VRELE:
		switch (VOP_ISLOCKED(vp)) {
		case LK_EXCLUSIVE:
			/* Already exclusively locked by us; use it as is. */
			break;
		case LK_EXCLOTHER:
		case 0:
			/* Not locked by us: take the lock, release it later. */
			want_unlock = true;
			error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
			VI_LOCK(vp);
			break;
		default:
			/*
			 * The lock has at least one sharer, but we have no way
			 * to conclude whether this is us. Play it safe and
			 * defer processing.
			 */
			error = EAGAIN;
			break;
		}
		break;
	case VPUT:
		/* vput() always returns with the vnode unlocked. */
		want_unlock = true;
		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
			error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK |
			    LK_NOWAIT);
			VI_LOCK(vp);
		}
		break;
	case VUNREF:
		/* vunref() keeps the vnode locked; only try to upgrade. */
		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
			error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK);
			VI_LOCK(vp);
		}
		break;
	}
	if (error == 0) {
		vinactive(vp);
		if (want_unlock)
			VOP_UNLOCK(vp);
		vdropl(vp);
	} else {
		/* Could not get the exclusive lock: defer the processing. */
		vdefer_inactive(vp);
	}
	return;
out:
	/* No inactive processing: honour the unlock contract and vdrop. */
	if (func == VPUT)
		VOP_UNLOCK(vp);
	vdropl(vp);
}
3160
3161/*
3162 * Decrement ->v_usecount for a vnode.
3163 *
3164 * Releasing the last use count requires additional processing, see vput_final
3165 * above for details.
3166 *
3167 * Comment above each variant denotes lock state on entry and exit.
3168 */
3169
3170/*
3171 * in: any
3172 * out: same as passed in
3173 */
3174void
3175vrele(struct vnode *vp)
3176{
3177
3178	ASSERT_VI_UNLOCKED(vp, __func__);
3179	if (!refcount_release(&vp->v_usecount))
3180		return;
3181	vput_final(vp, VRELE);
3182}
3183
3184/*
3185 * in: locked
3186 * out: unlocked
3187 */
3188void
3189vput(struct vnode *vp)
3190{
3191
3192	ASSERT_VOP_LOCKED(vp, __func__);
3193	ASSERT_VI_UNLOCKED(vp, __func__);
3194	if (!refcount_release(&vp->v_usecount)) {
3195		VOP_UNLOCK(vp);
3196		return;
3197	}
3198	vput_final(vp, VPUT);
3199}
3200
3201/*
3202 * in: locked
3203 * out: locked
3204 */
3205void
3206vunref(struct vnode *vp)
3207{
3208
3209	ASSERT_VOP_LOCKED(vp, __func__);
3210	ASSERT_VI_UNLOCKED(vp, __func__);
3211	if (!refcount_release(&vp->v_usecount))
3212		return;
3213	vput_final(vp, VUNREF);
3214}
3215
3216void
3217vhold(struct vnode *vp)
3218{
3219	int old;
3220
3221	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3222	old = atomic_fetchadd_int(&vp->v_holdcnt, 1);
3223	VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp,
3224	    ("%s: wrong hold count %d", __func__, old));
3225	if (old == 0)
3226		vn_freevnodes_dec();
3227}
3228
3229void
3230vholdnz(struct vnode *vp)
3231{
3232
3233	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3234#ifdef INVARIANTS
3235	int old = atomic_fetchadd_int(&vp->v_holdcnt, 1);
3236	VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp,
3237	    ("%s: wrong hold count %d", __func__, old));
3238#else
3239	atomic_add_int(&vp->v_holdcnt, 1);
3240#endif
3241}
3242
3243/*
3244 * Grab a hold count unless the vnode is freed.
3245 *
3246 * Only use this routine if vfs smr is the only protection you have against
3247 * freeing the vnode.
3248 *
3249 * The code loops trying to add a hold count as long as the VHOLD_NO_SMR flag
3250 * is not set.  After the flag is set the vnode becomes immutable to anyone but
3251 * the thread which managed to set the flag.
3252 *
3253 * It may be tempting to replace the loop with:
3254 * count = atomic_fetchadd_int(&vp->v_holdcnt, 1);
3255 * if (count & VHOLD_NO_SMR) {
3256 *     backpedal and error out;
3257 * }
3258 *
3259 * However, while this is more performant, it hinders debugging by eliminating
3260 * the previously mentioned invariant.
3261 */
3262bool
3263vhold_smr(struct vnode *vp)
3264{
3265	int count;
3266
3267	VFS_SMR_ASSERT_ENTERED();
3268
3269	count = atomic_load_int(&vp->v_holdcnt);
3270	for (;;) {
3271		if (count & VHOLD_NO_SMR) {
3272			VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp,
3273			    ("non-zero hold count with flags %d\n", count));
3274			return (false);
3275		}
3276
3277		VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count));
3278		if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) {
3279			if (count == 0)
3280				vn_freevnodes_dec();
3281			return (true);
3282		}
3283	}
3284}
3285
3286static void __noinline
3287vdbatch_process(struct vdbatch *vd)
3288{
3289	struct vnode *vp;
3290	int i;
3291
3292	mtx_assert(&vd->lock, MA_OWNED);
3293	MPASS(curthread->td_pinned > 0);
3294	MPASS(vd->index == VDBATCH_SIZE);
3295
3296	mtx_lock(&vnode_list_mtx);
3297	critical_enter();
3298	freevnodes += vd->freevnodes;
3299	for (i = 0; i < VDBATCH_SIZE; i++) {
3300		vp = vd->tab[i];
3301		TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
3302		TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist);
3303		MPASS(vp->v_dbatchcpu != NOCPU);
3304		vp->v_dbatchcpu = NOCPU;
3305	}
3306	mtx_unlock(&vnode_list_mtx);
3307	vd->freevnodes = 0;
3308	bzero(vd->tab, sizeof(vd->tab));
3309	vd->index = 0;
3310	critical_exit();
3311}
3312
3313static void
3314vdbatch_enqueue(struct vnode *vp)
3315{
3316	struct vdbatch *vd;
3317
3318	ASSERT_VI_LOCKED(vp, __func__);
3319	VNASSERT(!VN_IS_DOOMED(vp), vp,
3320	    ("%s: deferring requeue of a doomed vnode", __func__));
3321
3322	if (vp->v_dbatchcpu != NOCPU) {
3323		VI_UNLOCK(vp);
3324		return;
3325	}
3326
3327	sched_pin();
3328	vd = DPCPU_PTR(vd);
3329	mtx_lock(&vd->lock);
3330	MPASS(vd->index < VDBATCH_SIZE);
3331	MPASS(vd->tab[vd->index] == NULL);
3332	/*
3333	 * A hack: we depend on being pinned so that we know what to put in
3334	 * ->v_dbatchcpu.
3335	 */
3336	vp->v_dbatchcpu = curcpu;
3337	vd->tab[vd->index] = vp;
3338	vd->index++;
3339	VI_UNLOCK(vp);
3340	if (vd->index == VDBATCH_SIZE)
3341		vdbatch_process(vd);
3342	mtx_unlock(&vd->lock);
3343	sched_unpin();
3344}
3345
3346/*
3347 * This routine must only be called for vnodes which are about to be
3348 * deallocated. Supporting dequeue for arbitrary vndoes would require
3349 * validating that the locked batch matches.
3350 */
3351static void
3352vdbatch_dequeue(struct vnode *vp)
3353{
3354	struct vdbatch *vd;
3355	int i;
3356	short cpu;
3357
3358	VNASSERT(vp->v_type == VBAD || vp->v_type == VNON, vp,
3359	    ("%s: called for a used vnode\n", __func__));
3360
3361	cpu = vp->v_dbatchcpu;
3362	if (cpu == NOCPU)
3363		return;
3364
3365	vd = DPCPU_ID_PTR(cpu, vd);
3366	mtx_lock(&vd->lock);
3367	for (i = 0; i < vd->index; i++) {
3368		if (vd->tab[i] != vp)
3369			continue;
3370		vp->v_dbatchcpu = NOCPU;
3371		vd->index--;
3372		vd->tab[i] = vd->tab[vd->index];
3373		vd->tab[vd->index] = NULL;
3374		break;
3375	}
3376	mtx_unlock(&vd->lock);
3377	/*
3378	 * Either we dequeued the vnode above or the target CPU beat us to it.
3379	 */
3380	MPASS(vp->v_dbatchcpu == NOCPU);
3381}
3382
3383/*
3384 * Drop the hold count of the vnode.  If this is the last reference to
3385 * the vnode we place it on the free list unless it has been vgone'd
3386 * (marked VIRF_DOOMED) in which case we will free it.
3387 *
3388 * Because the vnode vm object keeps a hold reference on the vnode if
3389 * there is at least one resident non-cached page, the vnode cannot
3390 * leave the active list without the page cleanup done.
3391 */
3392static void
3393vdrop_deactivate(struct vnode *vp)
3394{
3395	struct mount *mp;
3396
3397	ASSERT_VI_LOCKED(vp, __func__);
3398	/*
3399	 * Mark a vnode as free: remove it from its active list
3400	 * and put it up for recycling on the freelist.
3401	 */
3402	VNASSERT(!VN_IS_DOOMED(vp), vp,
3403	    ("vdrop: returning doomed vnode"));
3404	VNASSERT(vp->v_op != NULL, vp,
3405	    ("vdrop: vnode already reclaimed."));
3406	VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
3407	    ("vnode with VI_OWEINACT set"));
3408	VNASSERT((vp->v_iflag & VI_DEFINACT) == 0, vp,
3409	    ("vnode with VI_DEFINACT set"));
3410	if (vp->v_mflag & VMP_LAZYLIST) {
3411		mp = vp->v_mount;
3412		mtx_lock(&mp->mnt_listmtx);
3413		VNASSERT(vp->v_mflag & VMP_LAZYLIST, vp, ("lost VMP_LAZYLIST"));
3414		/*
3415		 * Don't remove the vnode from the lazy list if another thread
3416		 * has increased the hold count. It may have re-enqueued the
3417		 * vnode to the lazy list and is now responsible for its
3418		 * removal.
3419		 */
3420		if (vp->v_holdcnt == 0) {
3421			vp->v_mflag &= ~VMP_LAZYLIST;
3422			TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist);
3423			mp->mnt_lazyvnodelistsize--;
3424		}
3425		mtx_unlock(&mp->mnt_listmtx);
3426	}
3427	vdbatch_enqueue(vp);
3428}
3429
3430static void __noinline
3431vdropl_final(struct vnode *vp)
3432{
3433
3434	ASSERT_VI_LOCKED(vp, __func__);
3435	VNPASS(VN_IS_DOOMED(vp), vp);
3436	/*
3437	 * Set the VHOLD_NO_SMR flag.
3438	 *
3439	 * We may be racing against vhold_smr. If they win we can just pretend
3440	 * we never got this far, they will vdrop later.
3441	 */
3442	if (__predict_false(!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR))) {
3443		vn_freevnodes_inc();
3444		VI_UNLOCK(vp);
3445		/*
3446		 * We lost the aforementioned race. Any subsequent access is
3447		 * invalid as they might have managed to vdropl on their own.
3448		 */
3449		return;
3450	}
3451	/*
3452	 * Don't bump freevnodes as this one is going away.
3453	 */
3454	freevnode(vp);
3455}
3456
3457void
3458vdrop(struct vnode *vp)
3459{
3460
3461	ASSERT_VI_UNLOCKED(vp, __func__);
3462	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3463	if (refcount_release_if_not_last(&vp->v_holdcnt))
3464		return;
3465	VI_LOCK(vp);
3466	vdropl(vp);
3467}
3468
3469void
3470vdropl(struct vnode *vp)
3471{
3472
3473	ASSERT_VI_LOCKED(vp, __func__);
3474	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3475	if (!refcount_release(&vp->v_holdcnt)) {
3476		VI_UNLOCK(vp);
3477		return;
3478	}
3479	if (!VN_IS_DOOMED(vp)) {
3480		vn_freevnodes_inc();
3481		vdrop_deactivate(vp);
3482		/*
3483		 * Also unlocks the interlock. We can't assert on it as we
3484		 * released our hold and by now the vnode might have been
3485		 * freed.
3486		 */
3487		return;
3488	}
3489	vdropl_final(vp);
3490}
3491
3492/*
3493 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
3494 * flags.  DOINGINACT prevents us from recursing in calls to vinactive.
3495 */
3496static void
3497vinactivef(struct vnode *vp)
3498{
3499	struct vm_object *obj;
3500
3501	ASSERT_VOP_ELOCKED(vp, "vinactive");
3502	ASSERT_VI_LOCKED(vp, "vinactive");
3503	VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
3504	    ("vinactive: recursed on VI_DOINGINACT"));
3505	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3506	vp->v_iflag |= VI_DOINGINACT;
3507	vp->v_iflag &= ~VI_OWEINACT;
3508	VI_UNLOCK(vp);
3509	/*
3510	 * Before moving off the active list, we must be sure that any
3511	 * modified pages are converted into the vnode's dirty
3512	 * buffers, since these will no longer be checked once the
3513	 * vnode is on the inactive list.
3514	 *
3515	 * The write-out of the dirty pages is asynchronous.  At the
3516	 * point that VOP_INACTIVE() is called, there could still be
3517	 * pending I/O and dirty pages in the object.
3518	 */
3519	if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 &&
3520	    vm_object_mightbedirty(obj)) {
3521		VM_OBJECT_WLOCK(obj);
3522		vm_object_page_clean(obj, 0, 0, 0);
3523		VM_OBJECT_WUNLOCK(obj);
3524	}
3525	VOP_INACTIVE(vp, curthread);
3526	VI_LOCK(vp);
3527	VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
3528	    ("vinactive: lost VI_DOINGINACT"));
3529	vp->v_iflag &= ~VI_DOINGINACT;
3530}
3531
3532void
3533vinactive(struct vnode *vp)
3534{
3535
3536	ASSERT_VOP_ELOCKED(vp, "vinactive");
3537	ASSERT_VI_LOCKED(vp, "vinactive");
3538	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3539
3540	if ((vp->v_iflag & VI_OWEINACT) == 0)
3541		return;
3542	if (vp->v_iflag & VI_DOINGINACT)
3543		return;
3544	if (vp->v_usecount > 0) {
3545		vp->v_iflag &= ~VI_OWEINACT;
3546		return;
3547	}
3548	vinactivef(vp);
3549}
3550
3551/*
3552 * Remove any vnodes in the vnode table belonging to mount point mp.
3553 *
3554 * If FORCECLOSE is not specified, there should not be any active ones,
3555 * return error if any are found (nb: this is a user error, not a
3556 * system error). If FORCECLOSE is specified, detach any active vnodes
3557 * that are found.
3558 *
3559 * If WRITECLOSE is set, only flush out regular file vnodes open for
3560 * writing.
3561 *
3562 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
3563 *
 * `rootrefs' specifies the base reference count for the root vnode
 * of this filesystem. The root vnode is considered busy if its
 * v_usecount exceeds this value. On a successful return, vflush()
 * will call vrele() on the root vnode exactly rootrefs times.
 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
 * be zero.
3570 */
#ifdef DIAGNOSTIC
/*
 * debug.busyprt: read-write knob, only compiled into DIAGNOSTIC kernels.
 * When non-zero, vflush() prints every vnode it finds busy.
 */
static int busyprt = 0;		/* print out busy vnodes */
SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes");
#endif
3575
/*
 * Walk all vnodes of the mount point and vgone the ones which are unused
 * (or all of them with FORCECLOSE).
 *
 * Returns 0 on success, EBUSY if busy vnodes remain, or the error reported
 * by VFS_ROOT() or VOP_FSYNC().
 */
int
vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
{
	struct vnode *vp, *mvp, *rootvp = NULL;
	struct vattr vattr;
	int busy = 0, error;

	CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
	    rootrefs, flags);
	if (rootrefs > 0) {
		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
		    ("vflush: bad args"));
		/*
		 * Get the filesystem root vnode. We can vput() it
		 * immediately, since with rootrefs > 0, it won't go away.
		 */
		if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
			CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
			    __func__, error);
			return (error);
		}
		vput(rootvp);
	}
loop:
	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
		/*
		 * NOTE(review): the iterator presumably hands over each vnode
		 * with its interlock held; vholdl() and the LK_INTERLOCK
		 * below rely on that.
		 *
		 * Keep a hold on the vnode for as long as we work on it.  If
		 * the lock cannot be obtained, drop the hold and rescan the
		 * mount point from the beginning.
		 */
		vholdl(vp);
		error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
		if (error) {
			vdrop(vp);
			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
			goto loop;
		}
		/*
		 * Skip over vnodes marked VV_SYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
			VOP_UNLOCK(vp);
			vdrop(vp);
			continue;
		}
		/*
		 * If WRITECLOSE is set, flush out unlinked but still open
		 * files (even if open only for reading) and regular file
		 * vnodes open for writing.
		 */
		if (flags & WRITECLOSE) {
			if (vp->v_object != NULL) {
				VM_OBJECT_WLOCK(vp->v_object);
				vm_object_page_clean(vp->v_object, 0, 0, 0);
				VM_OBJECT_WUNLOCK(vp->v_object);
			}
			error = VOP_FSYNC(vp, MNT_WAIT, td);
			if (error != 0) {
				VOP_UNLOCK(vp);
				vdrop(vp);
				MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
				return (error);
			}
			error = VOP_GETATTR(vp, &vattr, td->td_ucred);
			VI_LOCK(vp);

			/*
			 * Leave the vnode alone when it is either VNON or
			 * known to still have links, and it is not a regular
			 * file with writers.
			 */
			if ((vp->v_type == VNON ||
			    (error == 0 && vattr.va_nlink > 0)) &&
			    (vp->v_writecount <= 0 || vp->v_type != VREG)) {
				VOP_UNLOCK(vp);
				vdropl(vp);
				continue;
			}
		} else
			VI_LOCK(vp);
		/*
		 * With v_usecount == 0, all we need to do is clear out the
		 * vnode data structures and we are done.
		 *
		 * If FORCECLOSE is set, forcibly close the vnode.
		 */
		if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
			vgonel(vp);
		} else {
			busy++;
#ifdef DIAGNOSTIC
			if (busyprt)
				vn_printf(vp, "vflush: busy vnode ");
#endif
		}
		/* The interlock is held here on both paths; vdropl drops it. */
		VOP_UNLOCK(vp);
		vdropl(vp);
	}
	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
		/*
		 * If just the root vnode is busy, and if its refcount
		 * is equal to `rootrefs', then go ahead and kill it.
		 */
		VI_LOCK(rootvp);
		KASSERT(busy > 0, ("vflush: not busy"));
		VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
		    ("vflush: usecount %d < rootrefs %d",
		     rootvp->v_usecount, rootrefs));
		if (busy == 1 && rootvp->v_usecount == rootrefs) {
			VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
			vgone(rootvp);
			VOP_UNLOCK(rootvp);
			busy = 0;
		} else
			VI_UNLOCK(rootvp);
	}
	if (busy) {
		CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__,
		    busy);
		return (EBUSY);
	}
	/* Success: release the caller's references on the root vnode. */
	for (; rootrefs > 0; rootrefs--)
		vrele(rootvp);
	return (0);
}
3691
3692/*
3693 * Recycle an unused vnode to the front of the free list.
3694 */
3695int
3696vrecycle(struct vnode *vp)
3697{
3698	int recycled;
3699
3700	VI_LOCK(vp);
3701	recycled = vrecyclel(vp);
3702	VI_UNLOCK(vp);
3703	return (recycled);
3704}
3705
3706/*
3707 * vrecycle, with the vp interlock held.
3708 */
3709int
3710vrecyclel(struct vnode *vp)
3711{
3712	int recycled;
3713
3714	ASSERT_VOP_ELOCKED(vp, __func__);
3715	ASSERT_VI_LOCKED(vp, __func__);
3716	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3717	recycled = 0;
3718	if (vp->v_usecount == 0) {
3719		recycled = 1;
3720		vgonel(vp);
3721	}
3722	return (recycled);
3723}
3724
3725/*
3726 * Eliminate all activity associated with a vnode
3727 * in preparation for reuse.
3728 */
3729void
3730vgone(struct vnode *vp)
3731{
3732	VI_LOCK(vp);
3733	vgonel(vp);
3734	VI_UNLOCK(vp);
3735}
3736
3737static void
3738notify_lowervp_vfs_dummy(struct mount *mp __unused,
3739    struct vnode *lowervp __unused)
3740{
3741}
3742
3743/*
3744 * Notify upper mounts about reclaimed or unlinked vnode.
3745 */
3746void
3747vfs_notify_upper(struct vnode *vp, int event)
3748{
3749	static struct vfsops vgonel_vfsops = {
3750		.vfs_reclaim_lowervp = notify_lowervp_vfs_dummy,
3751		.vfs_unlink_lowervp = notify_lowervp_vfs_dummy,
3752	};
3753	struct mount *mp, *ump, *mmp;
3754
3755	mp = vp->v_mount;
3756	if (mp == NULL)
3757		return;
3758	if (TAILQ_EMPTY(&mp->mnt_uppers))
3759		return;
3760
3761	mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO);
3762	mmp->mnt_op = &vgonel_vfsops;
3763	mmp->mnt_kern_flag |= MNTK_MARKER;
3764	MNT_ILOCK(mp);
3765	mp->mnt_kern_flag |= MNTK_VGONE_UPPER;
3766	for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) {
3767		if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) {
3768			ump = TAILQ_NEXT(ump, mnt_upper_link);
3769			continue;
3770		}
3771		TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link);
3772		MNT_IUNLOCK(mp);
3773		switch (event) {
3774		case VFS_NOTIFY_UPPER_RECLAIM:
3775			VFS_RECLAIM_LOWERVP(ump, vp);
3776			break;
3777		case VFS_NOTIFY_UPPER_UNLINK:
3778			VFS_UNLINK_LOWERVP(ump, vp);
3779			break;
3780		default:
3781			KASSERT(0, ("invalid event %d", event));
3782			break;
3783		}
3784		MNT_ILOCK(mp);
3785		ump = TAILQ_NEXT(mmp, mnt_upper_link);
3786		TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link);
3787	}
3788	free(mmp, M_TEMP);
3789	mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER;
3790	if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) {
3791		mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER;
3792		wakeup(&mp->mnt_uppers);
3793	}
3794	MNT_IUNLOCK(mp);
3795}
3796
3797/*
3798 * vgone, with the vp interlock held.
3799 */
3800static void
3801vgonel(struct vnode *vp)
3802{
3803	struct thread *td;
3804	struct mount *mp;
3805	vm_object_t object;
3806	bool active, oweinact;
3807
3808	ASSERT_VOP_ELOCKED(vp, "vgonel");
3809	ASSERT_VI_LOCKED(vp, "vgonel");
3810	VNASSERT(vp->v_holdcnt, vp,
3811	    ("vgonel: vp %p has no reference.", vp));
3812	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3813	td = curthread;
3814
3815	/*
3816	 * Don't vgonel if we're already doomed.
3817	 */
3818	if (vp->v_irflag & VIRF_DOOMED)
3819		return;
3820	/*
3821	 * Paired with freevnode.
3822	 */
3823	vn_seqc_write_begin_locked(vp);
3824	vunlazy_gone(vp);
3825	vp->v_irflag |= VIRF_DOOMED;
3826
3827	/*
3828	 * Check to see if the vnode is in use.  If so, we have to call
3829	 * VOP_CLOSE() and VOP_INACTIVE().
3830	 */
3831	active = vp->v_usecount > 0;
3832	oweinact = (vp->v_iflag & VI_OWEINACT) != 0;
3833	/*
3834	 * If we need to do inactive VI_OWEINACT will be set.
3835	 */
3836	if (vp->v_iflag & VI_DEFINACT) {
3837		VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count"));
3838		vp->v_iflag &= ~VI_DEFINACT;
3839		vdropl(vp);
3840	} else {
3841		VNASSERT(vp->v_holdcnt > 0, vp, ("vnode without hold count"));
3842		VI_UNLOCK(vp);
3843	}
3844	cache_purge_vgone(vp);
3845	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);
3846
3847	/*
3848	 * If purging an active vnode, it must be closed and
3849	 * deactivated before being reclaimed.
3850	 */
3851	if (active)
3852		VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
3853	if (oweinact || active) {
3854		VI_LOCK(vp);
3855		vinactivef(vp);
3856		VI_UNLOCK(vp);
3857	}
3858	if (vp->v_type == VSOCK)
3859		vfs_unp_reclaim(vp);
3860
3861	/*
3862	 * Clean out any buffers associated with the vnode.
3863	 * If the flush fails, just toss the buffers.
3864	 */
3865	mp = NULL;
3866	if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
3867		(void) vn_start_secondary_write(vp, &mp, V_WAIT);
3868	if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) {
3869		while (vinvalbuf(vp, 0, 0, 0) != 0)
3870			;
3871	}
3872
3873	BO_LOCK(&vp->v_bufobj);
3874	KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) &&
3875	    vp->v_bufobj.bo_dirty.bv_cnt == 0 &&
3876	    TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) &&
3877	    vp->v_bufobj.bo_clean.bv_cnt == 0,
3878	    ("vp %p bufobj not invalidated", vp));
3879
3880	/*
3881	 * For VMIO bufobj, BO_DEAD is set later, or in
3882	 * vm_object_terminate() after the object's page queue is
3883	 * flushed.
3884	 */
3885	object = vp->v_bufobj.bo_object;
3886	if (object == NULL)
3887		vp->v_bufobj.bo_flag |= BO_DEAD;
3888	BO_UNLOCK(&vp->v_bufobj);
3889
3890	/*
3891	 * Handle the VM part.  Tmpfs handles v_object on its own (the
3892	 * OBJT_VNODE check).  Nullfs or other bypassing filesystems
3893	 * should not touch the object borrowed from the lower vnode
3894	 * (the handle check).
3895	 */
3896	if (object != NULL && object->type == OBJT_VNODE &&
3897	    object->handle == vp)
3898		vnode_destroy_vobject(vp);
3899
3900	/*
3901	 * Reclaim the vnode.
3902	 */
3903	if (VOP_RECLAIM(vp))
3904		panic("vgone: cannot reclaim");
3905	if (mp != NULL)
3906		vn_finished_secondary_write(mp);
3907	VNASSERT(vp->v_object == NULL, vp,
3908	    ("vop_reclaim left v_object vp=%p", vp));
3909	/*
3910	 * Clear the advisory locks and wake up waiting threads.
3911	 */
3912	(void)VOP_ADVLOCKPURGE(vp);
3913	vp->v_lockf = NULL;
3914	/*
3915	 * Delete from old mount point vnode list.
3916	 */
3917	delmntque(vp);
3918	/*
3919	 * Done with purge, reset to the standard lock and invalidate
3920	 * the vnode.
3921	 */
3922	VI_LOCK(vp);
3923	vp->v_vnlock = &vp->v_lock;
3924	vp->v_op = &dead_vnodeops;
3925	vp->v_type = VBAD;
3926}
3927
3928/*
3929 * Print out a description of a vnode.
3930 */
3931static const char * const typename[] =
3932{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
3933 "VMARKER"};
3934
3935_Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0,
3936    "new hold count flag not added to vn_printf");
3937
3938void
3939vn_printf(struct vnode *vp, const char *fmt, ...)
3940{
3941	va_list ap;
3942	char buf[256], buf2[16];
3943	u_long flags;
3944	u_int holdcnt;
3945
3946	va_start(ap, fmt);
3947	vprintf(fmt, ap);
3948	va_end(ap);
3949	printf("%p: ", (void *)vp);
3950	printf("type %s\n", typename[vp->v_type]);
3951	holdcnt = atomic_load_int(&vp->v_holdcnt);
3952	printf("    usecount %d, writecount %d, refcount %d seqc users %d",
3953	    vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS,
3954	    vp->v_seqc_users);
3955	switch (vp->v_type) {
3956	case VDIR:
3957		printf(" mountedhere %p\n", vp->v_mountedhere);
3958		break;
3959	case VCHR:
3960		printf(" rdev %p\n", vp->v_rdev);
3961		break;
3962	case VSOCK:
3963		printf(" socket %p\n", vp->v_unpcb);
3964		break;
3965	case VFIFO:
3966		printf(" fifoinfo %p\n", vp->v_fifoinfo);
3967		break;
3968	default:
3969		printf("\n");
3970		break;
3971	}
3972	buf[0] = '\0';
3973	buf[1] = '\0';
3974	if (holdcnt & VHOLD_NO_SMR)
3975		strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf));
3976	printf("    hold count flags (%s)\n", buf + 1);
3977
3978	buf[0] = '\0';
3979	buf[1] = '\0';
3980	if (vp->v_irflag & VIRF_DOOMED)
3981		strlcat(buf, "|VIRF_DOOMED", sizeof(buf));
3982	if (vp->v_irflag & VIRF_PGREAD)
3983		strlcat(buf, "|VIRF_PGREAD", sizeof(buf));
3984	flags = vp->v_irflag & ~(VIRF_DOOMED | VIRF_PGREAD);
3985	if (flags != 0) {
3986		snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags);
3987		strlcat(buf, buf2, sizeof(buf));
3988	}
3989	if (vp->v_vflag & VV_ROOT)
3990		strlcat(buf, "|VV_ROOT", sizeof(buf));
3991	if (vp->v_vflag & VV_ISTTY)
3992		strlcat(buf, "|VV_ISTTY", sizeof(buf));
3993	if (vp->v_vflag & VV_NOSYNC)
3994		strlcat(buf, "|VV_NOSYNC", sizeof(buf));
3995	if (vp->v_vflag & VV_ETERNALDEV)
3996		strlcat(buf, "|VV_ETERNALDEV", sizeof(buf));
3997	if (vp->v_vflag & VV_CACHEDLABEL)
3998		strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
3999	if (vp->v_vflag & VV_VMSIZEVNLOCK)
4000		strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf));
4001	if (vp->v_vflag & VV_COPYONWRITE)
4002		strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
4003	if (vp->v_vflag & VV_SYSTEM)
4004		strlcat(buf, "|VV_SYSTEM", sizeof(buf));
4005	if (vp->v_vflag & VV_PROCDEP)
4006		strlcat(buf, "|VV_PROCDEP", sizeof(buf));
4007	if (vp->v_vflag & VV_NOKNOTE)
4008		strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
4009	if (vp->v_vflag & VV_DELETED)
4010		strlcat(buf, "|VV_DELETED", sizeof(buf));
4011	if (vp->v_vflag & VV_MD)
4012		strlcat(buf, "|VV_MD", sizeof(buf));
4013	if (vp->v_vflag & VV_FORCEINSMQ)
4014		strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf));
4015	if (vp->v_vflag & VV_READLINK)
4016		strlcat(buf, "|VV_READLINK", sizeof(buf));
4017	flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV |
4018	    VV_CACHEDLABEL | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP |
4019	    VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ);
4020	if (flags != 0) {
4021		snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
4022		strlcat(buf, buf2, sizeof(buf));
4023	}
4024	if (vp->v_iflag & VI_TEXT_REF)
4025		strlcat(buf, "|VI_TEXT_REF", sizeof(buf));
4026	if (vp->v_iflag & VI_MOUNT)
4027		strlcat(buf, "|VI_MOUNT", sizeof(buf));
4028	if (vp->v_iflag & VI_DOINGINACT)
4029		strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
4030	if (vp->v_iflag & VI_OWEINACT)
4031		strlcat(buf, "|VI_OWEINACT", sizeof(buf));
4032	if (vp->v_iflag & VI_DEFINACT)
4033		strlcat(buf, "|VI_DEFINACT", sizeof(buf));
4034	flags = vp->v_iflag & ~(VI_TEXT_REF | VI_MOUNT | VI_DOINGINACT |
4035	    VI_OWEINACT | VI_DEFINACT);
4036	if (flags != 0) {
4037		snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
4038		strlcat(buf, buf2, sizeof(buf));
4039	}
4040	if (vp->v_mflag & VMP_LAZYLIST)
4041		strlcat(buf, "|VMP_LAZYLIST", sizeof(buf));
4042	flags = vp->v_mflag & ~(VMP_LAZYLIST);
4043	if (flags != 0) {
4044		snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags);
4045		strlcat(buf, buf2, sizeof(buf));
4046	}
4047	printf("    flags (%s)\n", buf + 1);
4048	if (mtx_owned(VI_MTX(vp)))
4049		printf(" VI_LOCKed");
4050	if (vp->v_object != NULL)
4051		printf("    v_object %p ref %d pages %d "
4052		    "cleanbuf %d dirtybuf %d\n",
4053		    vp->v_object, vp->v_object->ref_count,
4054		    vp->v_object->resident_page_count,
4055		    vp->v_bufobj.bo_clean.bv_cnt,
4056		    vp->v_bufobj.bo_dirty.bv_cnt);
4057	printf("    ");
4058	lockmgr_printinfo(vp->v_vnlock);
4059	if (vp->v_data != NULL)
4060		VOP_PRINT(vp);
4061}
4062
4063#ifdef DDB
4064/*
4065 * List all of the locked vnodes in the system.
4066 * Called when debugging the kernel.
4067 */
4068DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
4069{
4070	struct mount *mp;
4071	struct vnode *vp;
4072
4073	/*
4074	 * Note: because this is DDB, we can't obey the locking semantics
4075	 * for these structures, which means we could catch an inconsistent
4076	 * state and dereference a nasty pointer.  Not much to be done
4077	 * about that.
4078	 */
4079	db_printf("Locked vnodes\n");
4080	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
4081		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
4082			if (vp->v_type != VMARKER && VOP_ISLOCKED(vp))
4083				vn_printf(vp, "vnode ");
4084		}
4085	}
4086}
4087
4088/*
4089 * Show details about the given vnode.
4090 */
4091DB_SHOW_COMMAND(vnode, db_show_vnode)
4092{
4093	struct vnode *vp;
4094
4095	if (!have_addr)
4096		return;
4097	vp = (struct vnode *)addr;
4098	vn_printf(vp, "vnode ");
4099}
4100
4101/*
4102 * Show details about the given mount point.
4103 */
4104DB_SHOW_COMMAND(mount, db_show_mount)
4105{
4106	struct mount *mp;
4107	struct vfsopt *opt;
4108	struct statfs *sp;
4109	struct vnode *vp;
4110	char buf[512];
4111	uint64_t mflags;
4112	u_int flags;
4113
4114	if (!have_addr) {
4115		/* No address given, print short info about all mount points. */
4116		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
4117			db_printf("%p %s on %s (%s)\n", mp,
4118			    mp->mnt_stat.f_mntfromname,
4119			    mp->mnt_stat.f_mntonname,
4120			    mp->mnt_stat.f_fstypename);
4121			if (db_pager_quit)
4122				break;
4123		}
4124		db_printf("\nMore info: show mount <addr>\n");
4125		return;
4126	}
4127
4128	mp = (struct mount *)addr;
4129	db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
4130	    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
4131
4132	buf[0] = '\0';
4133	mflags = mp->mnt_flag;
4134#define	MNT_FLAG(flag)	do {						\
4135	if (mflags & (flag)) {						\
4136		if (buf[0] != '\0')					\
4137			strlcat(buf, ", ", sizeof(buf));		\
4138		strlcat(buf, (#flag) + 4, sizeof(buf));			\
4139		mflags &= ~(flag);					\
4140	}								\
4141} while (0)
4142	MNT_FLAG(MNT_RDONLY);
4143	MNT_FLAG(MNT_SYNCHRONOUS);
4144	MNT_FLAG(MNT_NOEXEC);
4145	MNT_FLAG(MNT_NOSUID);
4146	MNT_FLAG(MNT_NFS4ACLS);
4147	MNT_FLAG(MNT_UNION);
4148	MNT_FLAG(MNT_ASYNC);
4149	MNT_FLAG(MNT_SUIDDIR);
4150	MNT_FLAG(MNT_SOFTDEP);
4151	MNT_FLAG(MNT_NOSYMFOLLOW);
4152	MNT_FLAG(MNT_GJOURNAL);
4153	MNT_FLAG(MNT_MULTILABEL);
4154	MNT_FLAG(MNT_ACLS);
4155	MNT_FLAG(MNT_NOATIME);
4156	MNT_FLAG(MNT_NOCLUSTERR);
4157	MNT_FLAG(MNT_NOCLUSTERW);
4158	MNT_FLAG(MNT_SUJ);
4159	MNT_FLAG(MNT_EXRDONLY);
4160	MNT_FLAG(MNT_EXPORTED);
4161	MNT_FLAG(MNT_DEFEXPORTED);
4162	MNT_FLAG(MNT_EXPORTANON);
4163	MNT_FLAG(MNT_EXKERB);
4164	MNT_FLAG(MNT_EXPUBLIC);
4165	MNT_FLAG(MNT_LOCAL);
4166	MNT_FLAG(MNT_QUOTA);
4167	MNT_FLAG(MNT_ROOTFS);
4168	MNT_FLAG(MNT_USER);
4169	MNT_FLAG(MNT_IGNORE);
4170	MNT_FLAG(MNT_UPDATE);
4171	MNT_FLAG(MNT_DELEXPORT);
4172	MNT_FLAG(MNT_RELOAD);
4173	MNT_FLAG(MNT_FORCE);
4174	MNT_FLAG(MNT_SNAPSHOT);
4175	MNT_FLAG(MNT_BYFSID);
4176#undef MNT_FLAG
4177	if (mflags != 0) {
4178		if (buf[0] != '\0')
4179			strlcat(buf, ", ", sizeof(buf));
4180		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
4181		    "0x%016jx", mflags);
4182	}
4183	db_printf("    mnt_flag = %s\n", buf);
4184
4185	buf[0] = '\0';
4186	flags = mp->mnt_kern_flag;
4187#define	MNT_KERN_FLAG(flag)	do {					\
4188	if (flags & (flag)) {						\
4189		if (buf[0] != '\0')					\
4190			strlcat(buf, ", ", sizeof(buf));		\
4191		strlcat(buf, (#flag) + 5, sizeof(buf));			\
4192		flags &= ~(flag);					\
4193	}								\
4194} while (0)
4195	MNT_KERN_FLAG(MNTK_UNMOUNTF);
4196	MNT_KERN_FLAG(MNTK_ASYNC);
4197	MNT_KERN_FLAG(MNTK_SOFTDEP);
4198	MNT_KERN_FLAG(MNTK_DRAINING);
4199	MNT_KERN_FLAG(MNTK_REFEXPIRE);
4200	MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
4201	MNT_KERN_FLAG(MNTK_SHARED_WRITES);
4202	MNT_KERN_FLAG(MNTK_NO_IOPF);
4203	MNT_KERN_FLAG(MNTK_VGONE_UPPER);
4204	MNT_KERN_FLAG(MNTK_VGONE_WAITER);
4205	MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
4206	MNT_KERN_FLAG(MNTK_MARKER);
4207	MNT_KERN_FLAG(MNTK_USES_BCACHE);
4208	MNT_KERN_FLAG(MNTK_FPLOOKUP);
4209	MNT_KERN_FLAG(MNTK_NOASYNC);
4210	MNT_KERN_FLAG(MNTK_UNMOUNT);
4211	MNT_KERN_FLAG(MNTK_MWAIT);
4212	MNT_KERN_FLAG(MNTK_SUSPEND);
4213	MNT_KERN_FLAG(MNTK_SUSPEND2);
4214	MNT_KERN_FLAG(MNTK_SUSPENDED);
4215	MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
4216	MNT_KERN_FLAG(MNTK_NOKNOTE);
4217#undef MNT_KERN_FLAG
4218	if (flags != 0) {
4219		if (buf[0] != '\0')
4220			strlcat(buf, ", ", sizeof(buf));
4221		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
4222		    "0x%08x", flags);
4223	}
4224	db_printf("    mnt_kern_flag = %s\n", buf);
4225
4226	db_printf("    mnt_opt = ");
4227	opt = TAILQ_FIRST(mp->mnt_opt);
4228	if (opt != NULL) {
4229		db_printf("%s", opt->name);
4230		opt = TAILQ_NEXT(opt, link);
4231		while (opt != NULL) {
4232			db_printf(", %s", opt->name);
4233			opt = TAILQ_NEXT(opt, link);
4234		}
4235	}
4236	db_printf("\n");
4237
4238	sp = &mp->mnt_stat;
4239	db_printf("    mnt_stat = { version=%u type=%u flags=0x%016jx "
4240	    "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
4241	    "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
4242	    "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
4243	    (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
4244	    (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
4245	    (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
4246	    (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
4247	    (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
4248	    (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
4249	    (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
4250	    (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
4251
4252	db_printf("    mnt_cred = { uid=%u ruid=%u",
4253	    (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
4254	if (jailed(mp->mnt_cred))
4255		db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
4256	db_printf(" }\n");
4257	db_printf("    mnt_ref = %d (with %d in the struct)\n",
4258	    vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref);
4259	db_printf("    mnt_gen = %d\n", mp->mnt_gen);
4260	db_printf("    mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
4261	db_printf("    mnt_lazyvnodelistsize = %d\n",
4262	    mp->mnt_lazyvnodelistsize);
4263	db_printf("    mnt_writeopcount = %d (with %d in the struct)\n",
4264	    vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount);
4265	db_printf("    mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
4266	db_printf("    mnt_iosize_max = %d\n", mp->mnt_iosize_max);
4267	db_printf("    mnt_hashseed = %u\n", mp->mnt_hashseed);
4268	db_printf("    mnt_lockref = %d (with %d in the struct)\n",
4269	    vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref);
4270	db_printf("    mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
4271	db_printf("    mnt_secondary_accwrites = %d\n",
4272	    mp->mnt_secondary_accwrites);
4273	db_printf("    mnt_gjprovider = %s\n",
4274	    mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
4275	db_printf("    mnt_vfs_ops = %d\n", mp->mnt_vfs_ops);
4276
4277	db_printf("\n\nList of active vnodes\n");
4278	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
4279		if (vp->v_type != VMARKER && vp->v_holdcnt > 0) {
4280			vn_printf(vp, "vnode ");
4281			if (db_pager_quit)
4282				break;
4283		}
4284	}
4285	db_printf("\n\nList of inactive vnodes\n");
4286	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
4287		if (vp->v_type != VMARKER && vp->v_holdcnt == 0) {
4288			vn_printf(vp, "vnode ");
4289			if (db_pager_quit)
4290				break;
4291		}
4292	}
4293}
4294#endif	/* DDB */
4295
4296/*
4297 * Fill in a struct xvfsconf based on a struct vfsconf.
4298 */
4299static int
4300vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp)
4301{
4302	struct xvfsconf xvfsp;
4303
4304	bzero(&xvfsp, sizeof(xvfsp));
4305	strcpy(xvfsp.vfc_name, vfsp->vfc_name);
4306	xvfsp.vfc_typenum = vfsp->vfc_typenum;
4307	xvfsp.vfc_refcount = vfsp->vfc_refcount;
4308	xvfsp.vfc_flags = vfsp->vfc_flags;
4309	/*
4310	 * These are unused in userland, we keep them
4311	 * to not break binary compatibility.
4312	 */
4313	xvfsp.vfc_vfsops = NULL;
4314	xvfsp.vfc_next = NULL;
4315	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
4316}
4317
4318#ifdef COMPAT_FREEBSD32
/*
 * Record layout emitted by vfsconf2x32() for requests that have
 * SCTL_MASK32 set.  The vfc_vfsops and vfc_next members, which
 * vfsconf2x() exports as NULL pointers, are fixed 32-bit integers here.
 */
struct xvfsconf32 {
	uint32_t	vfc_vfsops;	/* left zero by vfsconf2x32() */
	char		vfc_name[MFSNAMELEN];
	int32_t		vfc_typenum;
	int32_t		vfc_refcount;
	int32_t		vfc_flags;
	uint32_t	vfc_next;	/* left zero by vfsconf2x32() */
};
4327
4328static int
4329vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp)
4330{
4331	struct xvfsconf32 xvfsp;
4332
4333	bzero(&xvfsp, sizeof(xvfsp));
4334	strcpy(xvfsp.vfc_name, vfsp->vfc_name);
4335	xvfsp.vfc_typenum = vfsp->vfc_typenum;
4336	xvfsp.vfc_refcount = vfsp->vfc_refcount;
4337	xvfsp.vfc_flags = vfsp->vfc_flags;
4338	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
4339}
4340#endif
4341
4342/*
4343 * Top level filesystem related information gathering.
4344 */
4345static int
4346sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
4347{
4348	struct vfsconf *vfsp;
4349	int error;
4350
4351	error = 0;
4352	vfsconf_slock();
4353	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
4354#ifdef COMPAT_FREEBSD32
4355		if (req->flags & SCTL_MASK32)
4356			error = vfsconf2x32(req, vfsp);
4357		else
4358#endif
4359			error = vfsconf2x(req, vfsp);
4360		if (error)
4361			break;
4362	}
4363	vfsconf_sunlock();
4364	return (error);
4365}
4366
/*
 * vfs.conflist: read-only, MP-safe opaque node served by
 * sysctl_vfs_conflist(); the declared format is "S,xvfsconf".
 */
SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD |
    CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist,
    "S,xvfsconf", "List of all configured filesystems");
4370
4371#ifndef BURN_BRIDGES
4372static int	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
4373
4374static int
4375vfs_sysctl(SYSCTL_HANDLER_ARGS)
4376{
4377	int *name = (int *)arg1 - 1;	/* XXX */
4378	u_int namelen = arg2 + 1;	/* XXX */
4379	struct vfsconf *vfsp;
4380
4381	log(LOG_WARNING, "userland calling deprecated sysctl, "
4382	    "please rebuild world\n");
4383
4384#if 1 || defined(COMPAT_PRELITE2)
4385	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
4386	if (namelen == 1)
4387		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
4388#endif
4389
4390	switch (name[1]) {
4391	case VFS_MAXTYPENUM:
4392		if (namelen != 2)
4393			return (ENOTDIR);
4394		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
4395	case VFS_CONF:
4396		if (namelen != 3)
4397			return (ENOTDIR);	/* overloaded */
4398		vfsconf_slock();
4399		TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
4400			if (vfsp->vfc_typenum == name[2])
4401				break;
4402		}
4403		vfsconf_sunlock();
4404		if (vfsp == NULL)
4405			return (EOPNOTSUPP);
4406#ifdef COMPAT_FREEBSD32
4407		if (req->flags & SCTL_MASK32)
4408			return (vfsconf2x32(req, vfsp));
4409		else
4410#endif
4411			return (vfsconf2x(req, vfsp));
4412	}
4413	return (EOPNOTSUPP);
4414}
4415
/*
 * vfs.generic (numeric id VFS_GENERIC): read-only node flagged
 * CTLFLAG_SKIP, with vfs_sysctl() as its handler.
 */
static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP |
    CTLFLAG_MPSAFE, vfs_sysctl,
    "Generic filesystem");
4419
4420#if 1 || defined(COMPAT_PRELITE2)
4421
4422static int
4423sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
4424{
4425	int error;
4426	struct vfsconf *vfsp;
4427	struct ovfsconf ovfs;
4428
4429	vfsconf_slock();
4430	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
4431		bzero(&ovfs, sizeof(ovfs));
4432		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
4433		strcpy(ovfs.vfc_name, vfsp->vfc_name);
4434		ovfs.vfc_index = vfsp->vfc_typenum;
4435		ovfs.vfc_refcount = vfsp->vfc_refcount;
4436		ovfs.vfc_flags = vfsp->vfc_flags;
4437		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
4438		if (error != 0) {
4439			vfsconf_sunlock();
4440			return (error);
4441		}
4442	}
4443	vfsconf_sunlock();
4444	return (0);
4445}
4446
4447#endif /* 1 || COMPAT_PRELITE2 */
4448#endif /* !BURN_BRIDGES */
4449
4450#define KINFO_VNODESLOP		10
4451#ifdef notyet
4452/*
4453 * Dump vnode list (via sysctl).
4454 */
4455/* ARGSUSED */
4456static int
4457sysctl_vnode(SYSCTL_HANDLER_ARGS)
4458{
4459	struct xvnode *xvn;
4460	struct mount *mp;
4461	struct vnode *vp;
4462	int error, len, n;
4463
4464	/*
4465	 * Stale numvnodes access is not fatal here.
4466	 */
4467	req->lock = 0;
4468	len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
4469	if (!req->oldptr)
4470		/* Make an estimate */
4471		return (SYSCTL_OUT(req, 0, len));
4472
4473	error = sysctl_wire_old_buffer(req, 0);
4474	if (error != 0)
4475		return (error);
4476	xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
4477	n = 0;
4478	mtx_lock(&mountlist_mtx);
4479	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
4480		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
4481			continue;
4482		MNT_ILOCK(mp);
4483		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
4484			if (n == len)
4485				break;
4486			vref(vp);
4487			xvn[n].xv_size = sizeof *xvn;
4488			xvn[n].xv_vnode = vp;
4489			xvn[n].xv_id = 0;	/* XXX compat */
4490#define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
4491			XV_COPY(usecount);
4492			XV_COPY(writecount);
4493			XV_COPY(holdcnt);
4494			XV_COPY(mount);
4495			XV_COPY(numoutput);
4496			XV_COPY(type);
4497#undef XV_COPY
4498			xvn[n].xv_flag = vp->v_vflag;
4499
4500			switch (vp->v_type) {
4501			case VREG:
4502			case VDIR:
4503			case VLNK:
4504				break;
4505			case VBLK:
4506			case VCHR:
4507				if (vp->v_rdev == NULL) {
4508					vrele(vp);
4509					continue;
4510				}
4511				xvn[n].xv_dev = dev2udev(vp->v_rdev);
4512				break;
4513			case VSOCK:
4514				xvn[n].xv_socket = vp->v_socket;
4515				break;
4516			case VFIFO:
4517				xvn[n].xv_fifo = vp->v_fifoinfo;
4518				break;
4519			case VNON:
4520			case VBAD:
4521			default:
4522				/* shouldn't happen? */
4523				vrele(vp);
4524				continue;
4525			}
4526			vrele(vp);
4527			++n;
4528		}
4529		MNT_IUNLOCK(mp);
4530		mtx_lock(&mountlist_mtx);
4531		vfs_unbusy(mp);
4532		if (n == len)
4533			break;
4534	}
4535	mtx_unlock(&mountlist_mtx);
4536
4537	error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
4538	free(xvn, M_TEMP);
4539	return (error);
4540}
4541
/*
 * kern.vnode (numeric id KERN_VNODE): read-only, MP-safe opaque node
 * served by sysctl_vnode(); registered with an empty description.
 */
SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode",
    "");
4545#endif
4546
4547static void
4548unmount_or_warn(struct mount *mp)
4549{
4550	int error;
4551
4552	error = dounmount(mp, MNT_FORCE, curthread);
4553	if (error != 0) {
4554		printf("unmount of %s failed (", mp->mnt_stat.f_mntonname);
4555		if (error == EBUSY)
4556			printf("BUSY)\n");
4557		else
4558			printf("%d)\n", error);
4559	}
4560}
4561
4562/*
4563 * Unmount all filesystems. The list is traversed in reverse order
4564 * of mounting to avoid dependencies.
4565 */
4566void
4567vfs_unmountall(void)
4568{
4569	struct mount *mp, *tmp;
4570
4571	CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
4572
4573	/*
4574	 * Since this only runs when rebooting, it is not interlocked.
4575	 */
4576	TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) {
4577		vfs_ref(mp);
4578
4579		/*
4580		 * Forcibly unmounting "/dev" before "/" would prevent clean
4581		 * unmount of the latter.
4582		 */
4583		if (mp == rootdevmp)
4584			continue;
4585
4586		unmount_or_warn(mp);
4587	}
4588
4589	if (rootdevmp != NULL)
4590		unmount_or_warn(rootdevmp);
4591}
4592
4593static void
4594vfs_deferred_inactive(struct vnode *vp, int lkflags)
4595{
4596
4597	ASSERT_VI_LOCKED(vp, __func__);
4598	VNASSERT((vp->v_iflag & VI_DEFINACT) == 0, vp, ("VI_DEFINACT still set"));
4599	if ((vp->v_iflag & VI_OWEINACT) == 0) {
4600		vdropl(vp);
4601		return;
4602	}
4603	if (vn_lock(vp, lkflags) == 0) {
4604		VI_LOCK(vp);
4605		vinactive(vp);
4606		VOP_UNLOCK(vp);
4607		vdropl(vp);
4608		return;
4609	}
4610	vdefer_inactive_unlocked(vp);
4611}
4612
4613static int
4614vfs_periodic_inactive_filter(struct vnode *vp, void *arg)
4615{
4616
4617	return (vp->v_iflag & VI_DEFINACT);
4618}
4619
4620static void __noinline
4621vfs_periodic_inactive(struct mount *mp, int flags)
4622{
4623	struct vnode *vp, *mvp;
4624	int lkflags;
4625
4626	lkflags = LK_EXCLUSIVE | LK_INTERLOCK;
4627	if (flags != MNT_WAIT)
4628		lkflags |= LK_NOWAIT;
4629
4630	MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_inactive_filter, NULL) {
4631		if ((vp->v_iflag & VI_DEFINACT) == 0) {
4632			VI_UNLOCK(vp);
4633			continue;
4634		}
4635		vp->v_iflag &= ~VI_DEFINACT;
4636		vfs_deferred_inactive(vp, lkflags);
4637	}
4638}
4639
4640static inline bool
4641vfs_want_msync(struct vnode *vp)
4642{
4643	struct vm_object *obj;
4644
4645	/*
4646	 * This test may be performed without any locks held.
4647	 * We rely on vm_object's type stability.
4648	 */
4649	if (vp->v_vflag & VV_NOSYNC)
4650		return (false);
4651	obj = vp->v_object;
4652	return (obj != NULL && vm_object_mightbedirty(obj));
4653}
4654
4655static int
4656vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused)
4657{
4658
4659	if (vp->v_vflag & VV_NOSYNC)
4660		return (false);
4661	if (vp->v_iflag & VI_DEFINACT)
4662		return (true);
4663	return (vfs_want_msync(vp));
4664}
4665
4666static void __noinline
4667vfs_periodic_msync_inactive(struct mount *mp, int flags)
4668{
4669	struct vnode *vp, *mvp;
4670	struct vm_object *obj;
4671	int lkflags, objflags;
4672	bool seen_defer;
4673
4674	lkflags = LK_EXCLUSIVE | LK_INTERLOCK;
4675	if (flags != MNT_WAIT) {
4676		lkflags |= LK_NOWAIT;
4677		objflags = OBJPC_NOSYNC;
4678	} else {
4679		objflags = OBJPC_SYNC;
4680	}
4681
4682	MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) {
4683		seen_defer = false;
4684		if (vp->v_iflag & VI_DEFINACT) {
4685			vp->v_iflag &= ~VI_DEFINACT;
4686			seen_defer = true;
4687		}
4688		if (!vfs_want_msync(vp)) {
4689			if (seen_defer)
4690				vfs_deferred_inactive(vp, lkflags);
4691			else
4692				VI_UNLOCK(vp);
4693			continue;
4694		}
4695		if (vget(vp, lkflags) == 0) {
4696			obj = vp->v_object;
4697			if (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0) {
4698				VM_OBJECT_WLOCK(obj);
4699				vm_object_page_clean(obj, 0, 0, objflags);
4700				VM_OBJECT_WUNLOCK(obj);
4701			}
4702			vput(vp);
4703			if (seen_defer)
4704				vdrop(vp);
4705		} else {
4706			if (seen_defer)
4707				vdefer_inactive_unlocked(vp);
4708		}
4709	}
4710}
4711
4712void
4713vfs_periodic(struct mount *mp, int flags)
4714{
4715
4716	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
4717
4718	if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0)
4719		vfs_periodic_inactive(mp, flags);
4720	else
4721		vfs_periodic_msync_inactive(mp, flags);
4722}
4723
4724static void
4725destroy_vpollinfo_free(struct vpollinfo *vi)
4726{
4727
4728	knlist_destroy(&vi->vpi_selinfo.si_note);
4729	mtx_destroy(&vi->vpi_lock);
4730	uma_zfree(vnodepoll_zone, vi);
4731}
4732
4733static void
4734destroy_vpollinfo(struct vpollinfo *vi)
4735{
4736
4737	knlist_clear(&vi->vpi_selinfo.si_note, 1);
4738	seldrain(&vi->vpi_selinfo);
4739	destroy_vpollinfo_free(vi);
4740}
4741
4742/*
4743 * Initialize per-vnode helper structure to hold poll-related state.
4744 */
4745void
4746v_addpollinfo(struct vnode *vp)
4747{
4748	struct vpollinfo *vi;
4749
4750	if (vp->v_pollinfo != NULL)
4751		return;
4752	vi = uma_zalloc(vnodepoll_zone, M_WAITOK | M_ZERO);
4753	mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
4754	knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
4755	    vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked);
4756	VI_LOCK(vp);
4757	if (vp->v_pollinfo != NULL) {
4758		VI_UNLOCK(vp);
4759		destroy_vpollinfo_free(vi);
4760		return;
4761	}
4762	vp->v_pollinfo = vi;
4763	VI_UNLOCK(vp);
4764}
4765
4766/*
4767 * Record a process's interest in events which might happen to
4768 * a vnode.  Because poll uses the historic select-style interface
4769 * internally, this routine serves as both the ``check for any
4770 * pending events'' and the ``record my interest in future events''
4771 * functions.  (These are done together, while the lock is held,
4772 * to avoid race conditions.)
4773 */
4774int
4775vn_pollrecord(struct vnode *vp, struct thread *td, int events)
4776{
4777
4778	v_addpollinfo(vp);
4779	mtx_lock(&vp->v_pollinfo->vpi_lock);
4780	if (vp->v_pollinfo->vpi_revents & events) {
4781		/*
4782		 * This leaves events we are not interested
4783		 * in available for the other process which
4784		 * which presumably had requested them
4785		 * (otherwise they would never have been
4786		 * recorded).
4787		 */
4788		events &= vp->v_pollinfo->vpi_revents;
4789		vp->v_pollinfo->vpi_revents &= ~events;
4790
4791		mtx_unlock(&vp->v_pollinfo->vpi_lock);
4792		return (events);
4793	}
4794	vp->v_pollinfo->vpi_events |= events;
4795	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
4796	mtx_unlock(&vp->v_pollinfo->vpi_lock);
4797	return (0);
4798}
4799
4800/*
4801 * Routine to create and manage a filesystem syncer vnode.
4802 */
4803#define sync_close ((int (*)(struct  vop_close_args *))nullop)
4804static int	sync_fsync(struct  vop_fsync_args *);
4805static int	sync_inactive(struct  vop_inactive_args *);
4806static int	sync_reclaim(struct  vop_reclaim_args *);
4807
/*
 * Vnode operations vector used for syncer vnodes (it is the vector
 * passed to getnewvnode() by vfs_allocate_syncvnode()).  Only close,
 * fsync, inactive, need_inactive, reclaim and the locking operations are
 * given explicit handlers; vop_bypass is set to VOP_EOPNOTSUPP, which
 * presumably makes every other operation fail with EOPNOTSUPP.
 */
static struct vop_vector sync_vnodeops = {
	.vop_bypass =	VOP_EOPNOTSUPP,
	.vop_close =	sync_close,		/* close */
	.vop_fsync =	sync_fsync,		/* fsync */
	.vop_inactive =	sync_inactive,	/* inactive */
	.vop_need_inactive = vop_stdneed_inactive, /* need_inactive */
	.vop_reclaim =	sync_reclaim,	/* reclaim */
	.vop_lock1 =	vop_stdlock,	/* lock */
	.vop_unlock =	vop_stdunlock,	/* unlock */
	.vop_islocked =	vop_stdislocked,	/* islocked */
};
VFS_VOP_VECTOR_REGISTER(sync_vnodeops);
4820
/*
 * Create a new filesystem syncer vnode for the specified mount point.
 *
 * A fresh vnode using sync_vnodeops is allocated, inserted on the mount's
 * vnode queue and placed on the syncer worklist.  It is then recorded in
 * mp->mnt_syncer unless the mount already has a syncer vnode, in which
 * case the new vnode is disposed of again.  Failure of getnewvnode() or
 * insmntque() panics.
 */
void
vfs_allocate_syncvnode(struct mount *mp)
{
	struct vnode *vp;
	struct bufobj *bo;
	/* Worklist slot scattering state, kept across calls. */
	static long start, incr, next;
	int error;

	/* Allocate a new vnode */
	error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
	if (error != 0)
		panic("vfs_allocate_syncvnode: getnewvnode() failed");
	vp->v_type = VNON;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	/* VV_FORCEINSMQ is set only for the duration of insmntque(). */
	vp->v_vflag |= VV_FORCEINSMQ;
	error = insmntque(vp, mp);
	if (error != 0)
		panic("vfs_allocate_syncvnode: insmntque() failed");
	vp->v_vflag &= ~VV_FORCEINSMQ;
	VOP_UNLOCK(vp);
	/*
	 * Place the vnode onto the syncer worklist. We attempt to
	 * scatter them about on the list so that they will go off
	 * at evenly distributed times even if all the filesystems
	 * are mounted at once.
	 */
	next += incr;
	if (next == 0 || next > syncer_maxdelay) {
		/* Halve the start and stride; reseed once start hits zero. */
		start /= 2;
		incr /= 2;
		if (start == 0) {
			start = syncer_maxdelay / 2;
			incr = syncer_maxdelay;
		}
		next = start;
	}
	bo = &vp->v_bufobj;
	BO_LOCK(bo);
	/* The syncdelay > 0 test avoids a modulo by zero. */
	vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
	/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
	mtx_lock(&sync_mtx);
	sync_vnode_count++;
	if (mp->mnt_syncer == NULL) {
		/* Installed; clearing vp skips the disposal below. */
		mp->mnt_syncer = vp;
		vp = NULL;
	}
	mtx_unlock(&sync_mtx);
	BO_UNLOCK(bo);
	if (vp != NULL) {
		/* The mount already had a syncer vnode: discard the new one. */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		vgone(vp);
		vput(vp);
	}
}
4878
4879void
4880vfs_deallocate_syncvnode(struct mount *mp)
4881{
4882	struct vnode *vp;
4883
4884	mtx_lock(&sync_mtx);
4885	vp = mp->mnt_syncer;
4886	if (vp != NULL)
4887		mp->mnt_syncer = NULL;
4888	mtx_unlock(&sync_mtx);
4889	if (vp != NULL)
4890		vrele(vp);
4891}
4892
4893/*
4894 * Do a lazy sync of the filesystem.
4895 */
4896static int
4897sync_fsync(struct vop_fsync_args *ap)
4898{
4899	struct vnode *syncvp = ap->a_vp;
4900	struct mount *mp = syncvp->v_mount;
4901	int error, save;
4902	struct bufobj *bo;
4903
4904	/*
4905	 * We only need to do something if this is a lazy evaluation.
4906	 */
4907	if (ap->a_waitfor != MNT_LAZY)
4908		return (0);
4909
4910	/*
4911	 * Move ourselves to the back of the sync list.
4912	 */
4913	bo = &syncvp->v_bufobj;
4914	BO_LOCK(bo);
4915	vn_syncer_add_to_worklist(bo, syncdelay);
4916	BO_UNLOCK(bo);
4917
4918	/*
4919	 * Walk the list of vnodes pushing all that are dirty and
4920	 * not already on the sync list.
4921	 */
4922	if (vfs_busy(mp, MBF_NOWAIT) != 0)
4923		return (0);
4924	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
4925		vfs_unbusy(mp);
4926		return (0);
4927	}
4928	save = curthread_pflags_set(TDP_SYNCIO);
4929	/*
4930	 * The filesystem at hand may be idle with free vnodes stored in the
4931	 * batch.  Return them instead of letting them stay there indefinitely.
4932	 */
4933	vfs_periodic(mp, MNT_NOWAIT);
4934	error = VFS_SYNC(mp, MNT_LAZY);
4935	curthread_pflags_restore(save);
4936	vn_finished_write(mp);
4937	vfs_unbusy(mp);
4938	return (error);
4939}
4940
4941/*
4942 * The syncer vnode is no referenced.
4943 */
4944static int
4945sync_inactive(struct vop_inactive_args *ap)
4946{
4947
4948	vgone(ap->a_vp);
4949	return (0);
4950}
4951
4952/*
4953 * The syncer vnode is no longer needed and is being decommissioned.
4954 *
4955 * Modifications to the worklist must be protected by sync_mtx.
4956 */
4957static int
4958sync_reclaim(struct vop_reclaim_args *ap)
4959{
4960	struct vnode *vp = ap->a_vp;
4961	struct bufobj *bo;
4962
4963	bo = &vp->v_bufobj;
4964	BO_LOCK(bo);
4965	mtx_lock(&sync_mtx);
4966	if (vp->v_mount->mnt_syncer == vp)
4967		vp->v_mount->mnt_syncer = NULL;
4968	if (bo->bo_flag & BO_ONWORKLST) {
4969		LIST_REMOVE(bo, bo_synclist);
4970		syncer_worklist_len--;
4971		sync_vnode_count--;
4972		bo->bo_flag &= ~BO_ONWORKLST;
4973	}
4974	mtx_unlock(&sync_mtx);
4975	BO_UNLOCK(bo);
4976
4977	return (0);
4978}
4979
4980int
4981vn_need_pageq_flush(struct vnode *vp)
4982{
4983	struct vm_object *obj;
4984	int need;
4985
4986	MPASS(mtx_owned(VI_MTX(vp)));
4987	need = 0;
4988	if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 &&
4989	    vm_object_mightbedirty(obj))
4990		need = 1;
4991	return (need);
4992}
4993
4994/*
4995 * Check if vnode represents a disk device
4996 */
4997bool
4998vn_isdisk_error(struct vnode *vp, int *errp)
4999{
5000	int error;
5001
5002	if (vp->v_type != VCHR) {
5003		error = ENOTBLK;
5004		goto out;
5005	}
5006	error = 0;
5007	dev_lock();
5008	if (vp->v_rdev == NULL)
5009		error = ENXIO;
5010	else if (vp->v_rdev->si_devsw == NULL)
5011		error = ENXIO;
5012	else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
5013		error = ENOTBLK;
5014	dev_unlock();
5015out:
5016	*errp = error;
5017	return (error == 0);
5018}
5019
5020bool
5021vn_isdisk(struct vnode *vp)
5022{
5023	int error;
5024
5025	return (vn_isdisk_error(vp, &error));
5026}
5027
5028/*
5029 * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see
5030 * the comment above cache_fplookup for details.
5031 */
5032int
5033vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred)
5034{
5035	int error;
5036
5037	VFS_SMR_ASSERT_ENTERED();
5038
5039	/* Check the owner. */
5040	if (cred->cr_uid == file_uid) {
5041		if (file_mode & S_IXUSR)
5042			return (0);
5043		goto out_error;
5044	}
5045
5046	/* Otherwise, check the groups (first match) */
5047	if (groupmember(file_gid, cred)) {
5048		if (file_mode & S_IXGRP)
5049			return (0);
5050		goto out_error;
5051	}
5052
5053	/* Otherwise, check everyone else. */
5054	if (file_mode & S_IXOTH)
5055		return (0);
5056out_error:
5057	/*
5058	 * Permission check failed, but it is possible denial will get overwritten
5059	 * (e.g., when root is traversing through a 700 directory owned by someone
5060	 * else).
5061	 *
5062	 * vaccess() calls priv_check_cred which in turn can descent into MAC
5063	 * modules overriding this result. It's quite unclear what semantics
5064	 * are allowed for them to operate, thus for safety we don't call them
5065	 * from within the SMR section. This also means if any such modules
5066	 * are present, we have to let the regular lookup decide.
5067	 */
5068	error = priv_check_cred_vfs_lookup_nomac(cred);
5069	switch (error) {
5070	case 0:
5071		return (0);
5072	case EAGAIN:
5073		/*
5074		 * MAC modules present.
5075		 */
5076		return (EAGAIN);
5077	case EPERM:
5078		return (EACCES);
5079	default:
5080		return (error);
5081	}
5082}
5083
5084/*
5085 * Common filesystem object access control check routine.  Accepts a
5086 * vnode's type, "mode", uid and gid, requested access mode, and credentials.
5087 * Returns 0 on success, or an errno on failure.
5088 */
5089int
5090vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
5091    accmode_t accmode, struct ucred *cred)
5092{
5093	accmode_t dac_granted;
5094	accmode_t priv_granted;
5095
5096	KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
5097	    ("invalid bit in accmode"));
5098	KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
5099	    ("VAPPEND without VWRITE"));
5100
5101	/*
5102	 * Look for a normal, non-privileged way to access the file/directory
5103	 * as requested.  If it exists, go with that.
5104	 */
5105
5106	dac_granted = 0;
5107
5108	/* Check the owner. */
5109	if (cred->cr_uid == file_uid) {
5110		dac_granted |= VADMIN;
5111		if (file_mode & S_IXUSR)
5112			dac_granted |= VEXEC;
5113		if (file_mode & S_IRUSR)
5114			dac_granted |= VREAD;
5115		if (file_mode & S_IWUSR)
5116			dac_granted |= (VWRITE | VAPPEND);
5117
5118		if ((accmode & dac_granted) == accmode)
5119			return (0);
5120
5121		goto privcheck;
5122	}
5123
5124	/* Otherwise, check the groups (first match) */
5125	if (groupmember(file_gid, cred)) {
5126		if (file_mode & S_IXGRP)
5127			dac_granted |= VEXEC;
5128		if (file_mode & S_IRGRP)
5129			dac_granted |= VREAD;
5130		if (file_mode & S_IWGRP)
5131			dac_granted |= (VWRITE | VAPPEND);
5132
5133		if ((accmode & dac_granted) == accmode)
5134			return (0);
5135
5136		goto privcheck;
5137	}
5138
5139	/* Otherwise, check everyone else. */
5140	if (file_mode & S_IXOTH)
5141		dac_granted |= VEXEC;
5142	if (file_mode & S_IROTH)
5143		dac_granted |= VREAD;
5144	if (file_mode & S_IWOTH)
5145		dac_granted |= (VWRITE | VAPPEND);
5146	if ((accmode & dac_granted) == accmode)
5147		return (0);
5148
5149privcheck:
5150	/*
5151	 * Build a privilege mask to determine if the set of privileges
5152	 * satisfies the requirements when combined with the granted mask
5153	 * from above.  For each privilege, if the privilege is required,
5154	 * bitwise or the request type onto the priv_granted mask.
5155	 */
5156	priv_granted = 0;
5157
5158	if (type == VDIR) {
5159		/*
5160		 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
5161		 * requests, instead of PRIV_VFS_EXEC.
5162		 */
5163		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
5164		    !priv_check_cred(cred, PRIV_VFS_LOOKUP))
5165			priv_granted |= VEXEC;
5166	} else {
5167		/*
5168		 * Ensure that at least one execute bit is on. Otherwise,
5169		 * a privileged user will always succeed, and we don't want
5170		 * this to happen unless the file really is executable.
5171		 */
5172		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
5173		    (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
5174		    !priv_check_cred(cred, PRIV_VFS_EXEC))
5175			priv_granted |= VEXEC;
5176	}
5177
5178	if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
5179	    !priv_check_cred(cred, PRIV_VFS_READ))
5180		priv_granted |= VREAD;
5181
5182	if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
5183	    !priv_check_cred(cred, PRIV_VFS_WRITE))
5184		priv_granted |= (VWRITE | VAPPEND);
5185
5186	if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
5187	    !priv_check_cred(cred, PRIV_VFS_ADMIN))
5188		priv_granted |= VADMIN;
5189
5190	if ((accmode & (priv_granted | dac_granted)) == accmode) {
5191		return (0);
5192	}
5193
5194	return ((accmode & VADMIN) ? EPERM : EACCES);
5195}
5196
5197/*
5198 * Credential check based on process requesting service, and per-attribute
5199 * permissions.
5200 */
5201int
5202extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
5203    struct thread *td, accmode_t accmode)
5204{
5205
5206	/*
5207	 * Kernel-invoked always succeeds.
5208	 */
5209	if (cred == NOCRED)
5210		return (0);
5211
5212	/*
5213	 * Do not allow privileged processes in jail to directly manipulate
5214	 * system attributes.
5215	 */
5216	switch (attrnamespace) {
5217	case EXTATTR_NAMESPACE_SYSTEM:
5218		/* Potentially should be: return (EPERM); */
5219		return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM));
5220	case EXTATTR_NAMESPACE_USER:
5221		return (VOP_ACCESS(vp, accmode, cred, td));
5222	default:
5223		return (EPERM);
5224	}
5225}
5226
5227#ifdef DEBUG_VFS_LOCKS
/*
 * This only exists to suppress warnings from unlocked specfs accesses.  It is
 * no longer ok to have an unlocked VFS.
 *
 * Evaluates true when the kernel has panicked, when vp is NULL, or when
 * vp is of type VCHR or VBAD; the assert_vop_*() checks below do nothing
 * in that case.  The argument is evaluated more than once.
 */
#define	IGNORE_LOCK(vp) (KERNEL_PANICKED() || (vp) == NULL ||		\
	(vp)->v_type == VCHR ||	(vp)->v_type == VBAD)
5234
/*
 * Run-time knobs, exported read-write under the debug sysctl tree and all
 * defaulting to 1, that select what happens on a vnode locking violation.
 */
int vfs_badlock_ddb = 1;	/* Drop into debugger on violation. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
    "Drop into debugger on lock violation");

int vfs_badlock_mutex = 1;	/* Check for interlock across VOPs. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
    0, "Check for interlock across VOPs");

int vfs_badlock_print = 1;	/* Print lock violations. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
    0, "Print lock violations");

int vfs_badlock_vnode = 1;	/* Print vnode details on lock violations. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode,
    0, "Print vnode details on lock violations");

/* The backtrace knob only exists in kernels built with KDB. */
#ifdef KDB
int vfs_badlock_backtrace = 1;	/* Print backtrace at lock violations. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
    &vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
#endif
5256
5257static void
5258vfs_badlock(const char *msg, const char *str, struct vnode *vp)
5259{
5260
5261#ifdef KDB
5262	if (vfs_badlock_backtrace)
5263		kdb_backtrace();
5264#endif
5265	if (vfs_badlock_vnode)
5266		vn_printf(vp, "vnode ");
5267	if (vfs_badlock_print)
5268		printf("%s: %p %s\n", str, (void *)vp, msg);
5269	if (vfs_badlock_ddb)
5270		kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
5271}
5272
5273void
5274assert_vi_locked(struct vnode *vp, const char *str)
5275{
5276
5277	if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
5278		vfs_badlock("interlock is not locked but should be", str, vp);
5279}
5280
5281void
5282assert_vi_unlocked(struct vnode *vp, const char *str)
5283{
5284
5285	if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
5286		vfs_badlock("interlock is locked but should not be", str, vp);
5287}
5288
5289void
5290assert_vop_locked(struct vnode *vp, const char *str)
5291{
5292	int locked;
5293
5294	if (!IGNORE_LOCK(vp)) {
5295		locked = VOP_ISLOCKED(vp);
5296		if (locked == 0 || locked == LK_EXCLOTHER)
5297			vfs_badlock("is not locked but should be", str, vp);
5298	}
5299}
5300
5301void
5302assert_vop_unlocked(struct vnode *vp, const char *str)
5303{
5304
5305	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
5306		vfs_badlock("is locked but should not be", str, vp);
5307}
5308
5309void
5310assert_vop_elocked(struct vnode *vp, const char *str)
5311{
5312
5313	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
5314		vfs_badlock("is not exclusive locked but should be", str, vp);
5315}
5316#endif /* DEBUG_VFS_LOCKS */
5317
5318void
5319vop_rename_fail(struct vop_rename_args *ap)
5320{
5321
5322	if (ap->a_tvp != NULL)
5323		vput(ap->a_tvp);
5324	if (ap->a_tdvp == ap->a_tvp)
5325		vrele(ap->a_tdvp);
5326	else
5327		vput(ap->a_tdvp);
5328	vrele(ap->a_fdvp);
5329	vrele(ap->a_fvp);
5330}
5331
5332void
5333vop_rename_pre(void *ap)
5334{
5335	struct vop_rename_args *a = ap;
5336
5337#ifdef DEBUG_VFS_LOCKS
5338	if (a->a_tvp)
5339		ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
5340	ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
5341	ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
5342	ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
5343
5344	/* Check the source (from). */
5345	if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
5346	    (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
5347		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
5348	if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
5349		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
5350
5351	/* Check the target. */
5352	if (a->a_tvp)
5353		ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
5354	ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
5355#endif
5356	/*
5357	 * It may be tempting to add vn_seqc_write_begin/end calls here and
5358	 * in vop_rename_post but that's not going to work out since some
5359	 * filesystems relookup vnodes mid-rename. This is probably a bug.
5360	 *
5361	 * For now filesystems are expected to do the relevant calls after they
5362	 * decide what vnodes to operate on.
5363	 */
5364	if (a->a_tdvp != a->a_fdvp)
5365		vhold(a->a_fdvp);
5366	if (a->a_tvp != a->a_fvp)
5367		vhold(a->a_fvp);
5368	vhold(a->a_tdvp);
5369	if (a->a_tvp)
5370		vhold(a->a_tvp);
5371}
5372
5373#ifdef DEBUG_VFS_LOCKS
/*
 * Debug pre-hook for VOP_FPLOOKUP_VEXEC: asserts that the caller is
 * inside the VFS SMR section.  The argument structure is not examined.
 */
void
vop_fplookup_vexec_debugpre(void *ap __unused)
{

	VFS_SMR_ASSERT_ENTERED();
}
5380
/*
 * Debug post-hook for VOP_FPLOOKUP_VEXEC: asserts that the VFS SMR
 * section is still entered after the operation.  Neither the argument
 * structure nor the return code is examined.
 */
void
vop_fplookup_vexec_debugpost(void *ap __unused, int rc __unused)
{

	VFS_SMR_ASSERT_ENTERED();
}
5387
5388void
5389vop_strategy_debugpre(void *ap)
5390{
5391	struct vop_strategy_args *a;
5392	struct buf *bp;
5393
5394	a = ap;
5395	bp = a->a_bp;
5396
5397	/*
5398	 * Cluster ops lock their component buffers but not the IO container.
5399	 */
5400	if ((bp->b_flags & B_CLUSTER) != 0)
5401		return;
5402
5403	if (!KERNEL_PANICKED() && !BUF_ISLOCKED(bp)) {
5404		if (vfs_badlock_print)
5405			printf(
5406			    "VOP_STRATEGY: bp is not locked but should be\n");
5407		if (vfs_badlock_ddb)
5408			kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
5409	}
5410}
5411
5412void
5413vop_lock_debugpre(void *ap)
5414{
5415	struct vop_lock1_args *a = ap;
5416
5417	if ((a->a_flags & LK_INTERLOCK) == 0)
5418		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
5419	else
5420		ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
5421}
5422
5423void
5424vop_lock_debugpost(void *ap, int rc)
5425{
5426	struct vop_lock1_args *a = ap;
5427
5428	ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
5429	if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0)
5430		ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
5431}
5432
5433void
5434vop_unlock_debugpre(void *ap)
5435{
5436	struct vop_unlock_args *a = ap;
5437
5438	ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
5439}
5440
5441void
5442vop_need_inactive_debugpre(void *ap)
5443{
5444	struct vop_need_inactive_args *a = ap;
5445
5446	ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE");
5447}
5448
5449void
5450vop_need_inactive_debugpost(void *ap, int rc)
5451{
5452	struct vop_need_inactive_args *a = ap;
5453
5454	ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE");
5455}
5456#endif
5457
5458void
5459vop_create_pre(void *ap)
5460{
5461	struct vop_create_args *a;
5462	struct vnode *dvp;
5463
5464	a = ap;
5465	dvp = a->a_dvp;
5466	vn_seqc_write_begin(dvp);
5467}
5468
5469void
5470vop_create_post(void *ap, int rc)
5471{
5472	struct vop_create_args *a;
5473	struct vnode *dvp;
5474
5475	a = ap;
5476	dvp = a->a_dvp;
5477	vn_seqc_write_end(dvp);
5478	if (!rc)
5479		VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
5480}
5481
5482void
5483vop_whiteout_pre(void *ap)
5484{
5485	struct vop_whiteout_args *a;
5486	struct vnode *dvp;
5487
5488	a = ap;
5489	dvp = a->a_dvp;
5490	vn_seqc_write_begin(dvp);
5491}
5492
5493void
5494vop_whiteout_post(void *ap, int rc)
5495{
5496	struct vop_whiteout_args *a;
5497	struct vnode *dvp;
5498
5499	a = ap;
5500	dvp = a->a_dvp;
5501	vn_seqc_write_end(dvp);
5502}
5503
5504void
5505vop_deleteextattr_pre(void *ap)
5506{
5507	struct vop_deleteextattr_args *a;
5508	struct vnode *vp;
5509
5510	a = ap;
5511	vp = a->a_vp;
5512	vn_seqc_write_begin(vp);
5513}
5514
5515void
5516vop_deleteextattr_post(void *ap, int rc)
5517{
5518	struct vop_deleteextattr_args *a;
5519	struct vnode *vp;
5520
5521	a = ap;
5522	vp = a->a_vp;
5523	vn_seqc_write_end(vp);
5524	if (!rc)
5525		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
5526}
5527
5528void
5529vop_link_pre(void *ap)
5530{
5531	struct vop_link_args *a;
5532	struct vnode *vp, *tdvp;
5533
5534	a = ap;
5535	vp = a->a_vp;
5536	tdvp = a->a_tdvp;
5537	vn_seqc_write_begin(vp);
5538	vn_seqc_write_begin(tdvp);
5539}
5540
5541void
5542vop_link_post(void *ap, int rc)
5543{
5544	struct vop_link_args *a;
5545	struct vnode *vp, *tdvp;
5546
5547	a = ap;
5548	vp = a->a_vp;
5549	tdvp = a->a_tdvp;
5550	vn_seqc_write_end(vp);
5551	vn_seqc_write_end(tdvp);
5552	if (!rc) {
5553		VFS_KNOTE_LOCKED(vp, NOTE_LINK);
5554		VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE);
5555	}
5556}
5557
5558void
5559vop_mkdir_pre(void *ap)
5560{
5561	struct vop_mkdir_args *a;
5562	struct vnode *dvp;
5563
5564	a = ap;
5565	dvp = a->a_dvp;
5566	vn_seqc_write_begin(dvp);
5567}
5568
5569void
5570vop_mkdir_post(void *ap, int rc)
5571{
5572	struct vop_mkdir_args *a;
5573	struct vnode *dvp;
5574
5575	a = ap;
5576	dvp = a->a_dvp;
5577	vn_seqc_write_end(dvp);
5578	if (!rc)
5579		VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK);
5580}
5581
5582void
5583vop_mknod_pre(void *ap)
5584{
5585	struct vop_mknod_args *a;
5586	struct vnode *dvp;
5587
5588	a = ap;
5589	dvp = a->a_dvp;
5590	vn_seqc_write_begin(dvp);
5591}
5592
5593void
5594vop_mknod_post(void *ap, int rc)
5595{
5596	struct vop_mknod_args *a;
5597	struct vnode *dvp;
5598
5599	a = ap;
5600	dvp = a->a_dvp;
5601	vn_seqc_write_end(dvp);
5602	if (!rc)
5603		VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
5604}
5605
5606void
5607vop_reclaim_post(void *ap, int rc)
5608{
5609	struct vop_reclaim_args *a;
5610	struct vnode *vp;
5611
5612	a = ap;
5613	vp = a->a_vp;
5614	ASSERT_VOP_IN_SEQC(vp);
5615	if (!rc)
5616		VFS_KNOTE_LOCKED(vp, NOTE_REVOKE);
5617}
5618
5619void
5620vop_remove_pre(void *ap)
5621{
5622	struct vop_remove_args *a;
5623	struct vnode *dvp, *vp;
5624
5625	a = ap;
5626	dvp = a->a_dvp;
5627	vp = a->a_vp;
5628	vn_seqc_write_begin(dvp);
5629	vn_seqc_write_begin(vp);
5630}
5631
5632void
5633vop_remove_post(void *ap, int rc)
5634{
5635	struct vop_remove_args *a;
5636	struct vnode *dvp, *vp;
5637
5638	a = ap;
5639	dvp = a->a_dvp;
5640	vp = a->a_vp;
5641	vn_seqc_write_end(dvp);
5642	vn_seqc_write_end(vp);
5643	if (!rc) {
5644		VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
5645		VFS_KNOTE_LOCKED(vp, NOTE_DELETE);
5646	}
5647}
5648
5649void
5650vop_rename_post(void *ap, int rc)
5651{
5652	struct vop_rename_args *a = ap;
5653	long hint;
5654
5655	if (!rc) {
5656		hint = NOTE_WRITE;
5657		if (a->a_fdvp == a->a_tdvp) {
5658			if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR)
5659				hint |= NOTE_LINK;
5660			VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);
5661			VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
5662		} else {
5663			hint |= NOTE_EXTEND;
5664			if (a->a_fvp->v_type == VDIR)
5665				hint |= NOTE_LINK;
5666			VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);
5667
5668			if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL &&
5669			    a->a_tvp->v_type == VDIR)
5670				hint &= ~NOTE_LINK;
5671			VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
5672		}
5673
5674		VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
5675		if (a->a_tvp)
5676			VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
5677	}
5678	if (a->a_tdvp != a->a_fdvp)
5679		vdrop(a->a_fdvp);
5680	if (a->a_tvp != a->a_fvp)
5681		vdrop(a->a_fvp);
5682	vdrop(a->a_tdvp);
5683	if (a->a_tvp)
5684		vdrop(a->a_tvp);
5685}
5686
5687void
5688vop_rmdir_pre(void *ap)
5689{
5690	struct vop_rmdir_args *a;
5691	struct vnode *dvp, *vp;
5692
5693	a = ap;
5694	dvp = a->a_dvp;
5695	vp = a->a_vp;
5696	vn_seqc_write_begin(dvp);
5697	vn_seqc_write_begin(vp);
5698}
5699
5700void
5701vop_rmdir_post(void *ap, int rc)
5702{
5703	struct vop_rmdir_args *a;
5704	struct vnode *dvp, *vp;
5705
5706	a = ap;
5707	dvp = a->a_dvp;
5708	vp = a->a_vp;
5709	vn_seqc_write_end(dvp);
5710	vn_seqc_write_end(vp);
5711	if (!rc) {
5712		VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK);
5713		VFS_KNOTE_LOCKED(vp, NOTE_DELETE);
5714	}
5715}
5716
5717void
5718vop_setattr_pre(void *ap)
5719{
5720	struct vop_setattr_args *a;
5721	struct vnode *vp;
5722
5723	a = ap;
5724	vp = a->a_vp;
5725	vn_seqc_write_begin(vp);
5726}
5727
5728void
5729vop_setattr_post(void *ap, int rc)
5730{
5731	struct vop_setattr_args *a;
5732	struct vnode *vp;
5733
5734	a = ap;
5735	vp = a->a_vp;
5736	vn_seqc_write_end(vp);
5737	if (!rc)
5738		VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB);
5739}
5740
5741void
5742vop_setacl_pre(void *ap)
5743{
5744	struct vop_setacl_args *a;
5745	struct vnode *vp;
5746
5747	a = ap;
5748	vp = a->a_vp;
5749	vn_seqc_write_begin(vp);
5750}
5751
5752void
5753vop_setacl_post(void *ap, int rc __unused)
5754{
5755	struct vop_setacl_args *a;
5756	struct vnode *vp;
5757
5758	a = ap;
5759	vp = a->a_vp;
5760	vn_seqc_write_end(vp);
5761}
5762
5763void
5764vop_setextattr_pre(void *ap)
5765{
5766	struct vop_setextattr_args *a;
5767	struct vnode *vp;
5768
5769	a = ap;
5770	vp = a->a_vp;
5771	vn_seqc_write_begin(vp);
5772}
5773
5774void
5775vop_setextattr_post(void *ap, int rc)
5776{
5777	struct vop_setextattr_args *a;
5778	struct vnode *vp;
5779
5780	a = ap;
5781	vp = a->a_vp;
5782	vn_seqc_write_end(vp);
5783	if (!rc)
5784		VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB);
5785}
5786
5787void
5788vop_symlink_pre(void *ap)
5789{
5790	struct vop_symlink_args *a;
5791	struct vnode *dvp;
5792
5793	a = ap;
5794	dvp = a->a_dvp;
5795	vn_seqc_write_begin(dvp);
5796}
5797
5798void
5799vop_symlink_post(void *ap, int rc)
5800{
5801	struct vop_symlink_args *a;
5802	struct vnode *dvp;
5803
5804	a = ap;
5805	dvp = a->a_dvp;
5806	vn_seqc_write_end(dvp);
5807	if (!rc)
5808		VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
5809}
5810
5811void
5812vop_open_post(void *ap, int rc)
5813{
5814	struct vop_open_args *a = ap;
5815
5816	if (!rc)
5817		VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN);
5818}
5819
5820void
5821vop_close_post(void *ap, int rc)
5822{
5823	struct vop_close_args *a = ap;
5824
5825	if (!rc && (a->a_cred != NOCRED || /* filter out revokes */
5826	    !VN_IS_DOOMED(a->a_vp))) {
5827		VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ?
5828		    NOTE_CLOSE_WRITE : NOTE_CLOSE);
5829	}
5830}
5831
5832void
5833vop_read_post(void *ap, int rc)
5834{
5835	struct vop_read_args *a = ap;
5836
5837	if (!rc)
5838		VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
5839}
5840
5841void
5842vop_read_pgcache_post(void *ap, int rc)
5843{
5844	struct vop_read_pgcache_args *a = ap;
5845
5846	if (!rc)
5847		VFS_KNOTE_UNLOCKED(a->a_vp, NOTE_READ);
5848}
5849
5850void
5851vop_readdir_post(void *ap, int rc)
5852{
5853	struct vop_readdir_args *a = ap;
5854
5855	if (!rc)
5856		VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
5857}
5858
/*
 * List of knotes attached through fs_filtops; vfs_event_signal() posts
 * events to it.
 */
static struct knlist fs_knlist;

/*
 * SYSINIT callback that initializes fs_knlist with knlist_init_mtx().  A
 * NULL lock argument is passed; the callback argument is unused.
 */
static void
vfs_event_init(void *arg)
{
	knlist_init_mtx(&fs_knlist, NULL);
}
/* XXX - correct order? */
SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
5868
/*
 * Post 'event' as the hint to every knote on fs_knlist.  Neither 'fsid' nor
 * 'data' is used here, so the notification itself does not identify the
 * filesystem it concerns.
 */
void
vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused)
{

	KNOTE_UNLOCKED(&fs_knlist, event);
}
5875
/* Callbacks referenced by fs_filtops; defined below. */
static int	filt_fsattach(struct knote *kn);
static void	filt_fsdetach(struct knote *kn);
static int	filt_fsevent(struct knote *kn, long hint);

/*
 * Filter operations for knotes kept on fs_knlist.
 * NOTE(review): f_isfd = 0 presumably marks the filter as not being bound
 * to a file descriptor -- confirm against sys/event.h.
 */
struct filterops fs_filtops = {
	.f_isfd = 0,
	.f_attach = filt_fsattach,
	.f_detach = filt_fsdetach,
	.f_event = filt_fsevent
};
5886
/*
 * Attach callback of fs_filtops: forces EV_CLEAR on the knote and adds it
 * to fs_knlist.  Always returns 0.
 */
static int
filt_fsattach(struct knote *kn)
{

	kn->kn_flags |= EV_CLEAR;
	knlist_add(&fs_knlist, kn, 0);
	return (0);
}
5895
/*
 * Detach callback of fs_filtops: removes the knote from fs_knlist.
 */
static void
filt_fsdetach(struct knote *kn)
{

	knlist_remove(&fs_knlist, kn, 0);
}
5902
/*
 * Event callback of fs_filtops: accumulates the hint bits into kn_fflags
 * and reports the knote as active whenever any bit is set.  The hint is
 * not masked against kn_sfflags, so every posted event bit is recorded.
 */
static int
filt_fsevent(struct knote *kn, long hint)
{

	kn->kn_fflags |= hint;
	return (kn->kn_fflags != 0);
}
5910
5911static int
5912sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
5913{
5914	struct vfsidctl vc;
5915	int error;
5916	struct mount *mp;
5917
5918	error = SYSCTL_IN(req, &vc, sizeof(vc));
5919	if (error)
5920		return (error);
5921	if (vc.vc_vers != VFS_CTL_VERS1)
5922		return (EINVAL);
5923	mp = vfs_getvfs(&vc.vc_fsid);
5924	if (mp == NULL)
5925		return (ENOENT);
5926	/* ensure that a specific sysctl goes to the right filesystem. */
5927	if (strcmp(vc.vc_fstypename, "*") != 0 &&
5928	    strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
5929		vfs_rel(mp);
5930		return (EINVAL);
5931	}
5932	VCTLTOREQ(&vc, req);
5933	error = VFS_SYSCTL(mp, vc.vc_op, req);
5934	vfs_rel(mp);
5935	return (error);
5936}
5937
5938SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_WR,
5939    NULL, 0, sysctl_vfs_ctl, "",
5940    "Sysctl by fsid");
5941
5942/*
5943 * Function to initialize a va_filerev field sensibly.
5944 * XXX: Wouldn't a random number make a lot more sense ??
5945 */
5946u_quad_t
5947init_va_filerev(void)
5948{
5949	struct bintime bt;
5950
5951	getbinuptime(&bt);
5952	return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
5953}
5954
/* Callbacks referenced by the vnode filter tables; defined below. */
static int	filt_vfsread(struct knote *kn, long hint);
static int	filt_vfswrite(struct knote *kn, long hint);
static int	filt_vfsvnode(struct knote *kn, long hint);
static void	filt_vfsdetach(struct knote *kn);
/*
 * Filter operations selected by vfs_kqfilter() for EVFILT_READ,
 * EVFILT_WRITE and EVFILT_VNODE respectively.  None has an f_attach
 * callback: vfs_kqfilter() adds the knote to the vnode's knlist itself.
 * All three share filt_vfsdetach().
 */
static struct filterops vfsread_filtops = {
	.f_isfd = 1,
	.f_detach = filt_vfsdetach,
	.f_event = filt_vfsread
};
static struct filterops vfswrite_filtops = {
	.f_isfd = 1,
	.f_detach = filt_vfsdetach,
	.f_event = filt_vfswrite
};
static struct filterops vfsvnode_filtops = {
	.f_isfd = 1,
	.f_detach = filt_vfsdetach,
	.f_event = filt_vfsvnode
};
5974
5975static void
5976vfs_knllock(void *arg)
5977{
5978	struct vnode *vp = arg;
5979
5980	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
5981}
5982
/*
 * knlist unlock callback: 'arg' is the vnode, which is unlocked with
 * VOP_UNLOCK().
 */
static void
vfs_knlunlock(void *arg)
{
	struct vnode *node;

	node = arg;
	VOP_UNLOCK(node);
}
5990
/*
 * knlist "assert locked" callback: with DEBUG_VFS_LOCKS, asserts that the
 * vnode passed as 'arg' is locked; otherwise does nothing.
 */
static void
vfs_knl_assert_locked(void *arg)
{
#ifdef DEBUG_VFS_LOCKS
	struct vnode *node;

	node = arg;
	ASSERT_VOP_LOCKED(node, "vfs_knl_assert_locked");
#endif
}
6000
/*
 * knlist "assert unlocked" callback: with DEBUG_VFS_LOCKS, asserts that the
 * vnode passed as 'arg' is not locked; otherwise does nothing.
 */
static void
vfs_knl_assert_unlocked(void *arg)
{
#ifdef DEBUG_VFS_LOCKS
	struct vnode *node;

	node = arg;
	ASSERT_VOP_UNLOCKED(node, "vfs_knl_assert_unlocked");
#endif
}
6010
6011int
6012vfs_kqfilter(struct vop_kqfilter_args *ap)
6013{
6014	struct vnode *vp = ap->a_vp;
6015	struct knote *kn = ap->a_kn;
6016	struct knlist *knl;
6017
6018	switch (kn->kn_filter) {
6019	case EVFILT_READ:
6020		kn->kn_fop = &vfsread_filtops;
6021		break;
6022	case EVFILT_WRITE:
6023		kn->kn_fop = &vfswrite_filtops;
6024		break;
6025	case EVFILT_VNODE:
6026		kn->kn_fop = &vfsvnode_filtops;
6027		break;
6028	default:
6029		return (EINVAL);
6030	}
6031
6032	kn->kn_hook = (caddr_t)vp;
6033
6034	v_addpollinfo(vp);
6035	if (vp->v_pollinfo == NULL)
6036		return (ENOMEM);
6037	knl = &vp->v_pollinfo->vpi_selinfo.si_note;
6038	vhold(vp);
6039	knlist_add(knl, kn, 0);
6040
6041	return (0);
6042}
6043
6044/*
6045 * Detach knote from vnode
6046 */
6047static void
6048filt_vfsdetach(struct knote *kn)
6049{
6050	struct vnode *vp = (struct vnode *)kn->kn_hook;
6051
6052	KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
6053	knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
6054	vdrop(vp);
6055}
6056
6057/*ARGSUSED*/
6058static int
6059filt_vfsread(struct knote *kn, long hint)
6060{
6061	struct vnode *vp = (struct vnode *)kn->kn_hook;
6062	struct vattr va;
6063	int res;
6064
6065	/*
6066	 * filesystem is gone, so set the EOF flag and schedule
6067	 * the knote for deletion.
6068	 */
6069	if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
6070		VI_LOCK(vp);
6071		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
6072		VI_UNLOCK(vp);
6073		return (1);
6074	}
6075
6076	if (VOP_GETATTR(vp, &va, curthread->td_ucred))
6077		return (0);
6078
6079	VI_LOCK(vp);
6080	kn->kn_data = va.va_size - kn->kn_fp->f_offset;
6081	res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0;
6082	VI_UNLOCK(vp);
6083	return (res);
6084}
6085
6086/*ARGSUSED*/
6087static int
6088filt_vfswrite(struct knote *kn, long hint)
6089{
6090	struct vnode *vp = (struct vnode *)kn->kn_hook;
6091
6092	VI_LOCK(vp);
6093
6094	/*
6095	 * filesystem is gone, so set the EOF flag and schedule
6096	 * the knote for deletion.
6097	 */
6098	if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD))
6099		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
6100
6101	kn->kn_data = 0;
6102	VI_UNLOCK(vp);
6103	return (1);
6104}
6105
6106static int
6107filt_vfsvnode(struct knote *kn, long hint)
6108{
6109	struct vnode *vp = (struct vnode *)kn->kn_hook;
6110	int res;
6111
6112	VI_LOCK(vp);
6113	if (kn->kn_sfflags & hint)
6114		kn->kn_fflags |= hint;
6115	if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
6116		kn->kn_flags |= EV_EOF;
6117		VI_UNLOCK(vp);
6118		return (1);
6119	}
6120	res = (kn->kn_fflags != 0);
6121	VI_UNLOCK(vp);
6122	return (res);
6123}
6124
6125/*
6126 * Returns whether the directory is empty or not.
6127 * If it is empty, the return value is 0; otherwise
6128 * the return value is an error value (which may
6129 * be ENOTEMPTY).
6130 */
6131int
6132vfs_emptydir(struct vnode *vp)
6133{
6134	struct uio uio;
6135	struct iovec iov;
6136	struct dirent *dirent, *dp, *endp;
6137	int error, eof;
6138
6139	error = 0;
6140	eof = 0;
6141
6142	ASSERT_VOP_LOCKED(vp, "vfs_emptydir");
6143
6144	dirent = malloc(sizeof(struct dirent), M_TEMP, M_WAITOK);
6145	iov.iov_base = dirent;
6146	iov.iov_len = sizeof(struct dirent);
6147
6148	uio.uio_iov = &iov;
6149	uio.uio_iovcnt = 1;
6150	uio.uio_offset = 0;
6151	uio.uio_resid = sizeof(struct dirent);
6152	uio.uio_segflg = UIO_SYSSPACE;
6153	uio.uio_rw = UIO_READ;
6154	uio.uio_td = curthread;
6155
6156	while (eof == 0 && error == 0) {
6157		error = VOP_READDIR(vp, &uio, curthread->td_ucred, &eof,
6158		    NULL, NULL);
6159		if (error != 0)
6160			break;
6161		endp = (void *)((uint8_t *)dirent +
6162		    sizeof(struct dirent) - uio.uio_resid);
6163		for (dp = dirent; dp < endp;
6164		     dp = (void *)((uint8_t *)dp + GENERIC_DIRSIZ(dp))) {
6165			if (dp->d_type == DT_WHT)
6166				continue;
6167			if (dp->d_namlen == 0)
6168				continue;
6169			if (dp->d_type != DT_DIR &&
6170			    dp->d_type != DT_UNKNOWN) {
6171				error = ENOTEMPTY;
6172				break;
6173			}
6174			if (dp->d_namlen > 2) {
6175				error = ENOTEMPTY;
6176				break;
6177			}
6178			if (dp->d_namlen == 1 &&
6179			    dp->d_name[0] != '.') {
6180				error = ENOTEMPTY;
6181				break;
6182			}
6183			if (dp->d_namlen == 2 &&
6184			    dp->d_name[1] != '.') {
6185				error = ENOTEMPTY;
6186				break;
6187			}
6188			uio.uio_resid = sizeof(struct dirent);
6189		}
6190	}
6191	free(dirent, M_TEMP);
6192	return (error);
6193}
6194
6195int
6196vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
6197{
6198	int error;
6199
6200	if (dp->d_reclen > ap->a_uio->uio_resid)
6201		return (ENAMETOOLONG);
6202	error = uiomove(dp, dp->d_reclen, ap->a_uio);
6203	if (error) {
6204		if (ap->a_ncookies != NULL) {
6205			if (ap->a_cookies != NULL)
6206				free(ap->a_cookies, M_TEMP);
6207			ap->a_cookies = NULL;
6208			*ap->a_ncookies = 0;
6209		}
6210		return (error);
6211	}
6212	if (ap->a_ncookies == NULL)
6213		return (0);
6214
6215	KASSERT(ap->a_cookies,
6216	    ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
6217
6218	*ap->a_cookies = realloc(*ap->a_cookies,
6219	    (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
6220	(*ap->a_cookies)[*ap->a_ncookies] = off;
6221	*ap->a_ncookies += 1;
6222	return (0);
6223}
6224
6225/*
6226 * The purpose of this routine is to remove granularity from accmode_t,
6227 * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
6228 * VADMIN and VAPPEND.
6229 *
6230 * If it returns 0, the caller is supposed to continue with the usual
6231 * access checks using 'accmode' as modified by this routine.  If it
6232 * returns nonzero value, the caller is supposed to return that value
6233 * as errno.
6234 *
6235 * Note that after this routine runs, accmode may be zero.
6236 */
6237int
6238vfs_unixify_accmode(accmode_t *accmode)
6239{
6240	/*
6241	 * There is no way to specify explicit "deny" rule using
6242	 * file mode or POSIX.1e ACLs.
6243	 */
6244	if (*accmode & VEXPLICIT_DENY) {
6245		*accmode = 0;
6246		return (0);
6247	}
6248
6249	/*
6250	 * None of these can be translated into usual access bits.
6251	 * Also, the common case for NFSv4 ACLs is to not contain
6252	 * either of these bits. Caller should check for VWRITE
6253	 * on the containing directory instead.
6254	 */
6255	if (*accmode & (VDELETE_CHILD | VDELETE))
6256		return (EPERM);
6257
6258	if (*accmode & VADMIN_PERMS) {
6259		*accmode &= ~VADMIN_PERMS;
6260		*accmode |= VADMIN;
6261	}
6262
6263	/*
6264	 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
6265	 * or VSYNCHRONIZE using file mode or POSIX.1e ACL.
6266	 */
6267	*accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
6268
6269	return (0);
6270}
6271
/*
 * Clear out a doomed vnode (if any) and replace it with a new one as long
 * as the fs is not being unmounted. Return the root vnode to the caller.
 *
 * Slow path of vfs_cache_root().  On success *vpp holds the root vnode,
 * locked according to 'flags', and 0 is returned; otherwise the
 * VFS_CACHEDROOT() error is returned.
 */
static int __noinline
vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp)
{
	struct vnode *vp;
	int error;

restart:
	/* Unlocked peek; the pointer is re-read under the mount interlock. */
	if (mp->mnt_rootvnode != NULL) {
		MNT_ILOCK(mp);
		vp = mp->mnt_rootvnode;
		if (vp != NULL) {
			if (!VN_IS_DOOMED(vp)) {
				/*
				 * Usable cached vnode: take a reference, drop
				 * the interlock and lock the vnode.  If locking
				 * fails, give the reference back and start over.
				 */
				vrefact(vp);
				MNT_IUNLOCK(mp);
				error = vn_lock(vp, flags);
				if (error == 0) {
					*vpp = vp;
					return (0);
				}
				vrele(vp);
				goto restart;
			}
			/*
			 * Clear the old one.
			 */
			mp->mnt_rootvnode = NULL;
		}
		MNT_IUNLOCK(mp);
		if (vp != NULL) {
			/*
			 * Release the reference the cache held on the doomed
			 * vnode.  NOTE(review): the barrier presumably waits
			 * for threads in vfs_cache_root() that may still be
			 * using the old pointer -- confirm.
			 */
			vfs_op_barrier_wait(mp);
			vrele(vp);
		}
	}
	/* Nothing usable is cached: ask the filesystem for its root vnode. */
	error = VFS_CACHEDROOT(mp, flags, vpp);
	if (error != 0)
		return (error);
	/*
	 * Only touch the cache while mnt_vfs_ops is zero; the counter is
	 * checked again once the interlock is held.
	 */
	if (mp->mnt_vfs_ops == 0) {
		MNT_ILOCK(mp);
		if (mp->mnt_vfs_ops != 0) {
			MNT_IUNLOCK(mp);
			return (0);
		}
		if (mp->mnt_rootvnode == NULL) {
			/* The cache keeps its own reference on the vnode. */
			vrefact(*vpp);
			mp->mnt_rootvnode = *vpp;
		} else {
			/*
			 * A vnode was cached in the meantime.  It has to be
			 * the one just obtained unless it is already doomed.
			 */
			if (mp->mnt_rootvnode != *vpp) {
				if (!VN_IS_DOOMED(mp->mnt_rootvnode)) {
					panic("%s: mismatch between vnode returned "
					    " by VFS_CACHEDROOT and the one cached "
					    " (%p != %p)",
					    __func__, *vpp, mp->mnt_rootvnode);
				}
			}
		}
		MNT_IUNLOCK(mp);
	}
	return (0);
}
6335
6336int
6337vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp)
6338{
6339	struct vnode *vp;
6340	int error;
6341
6342	if (!vfs_op_thread_enter(mp))
6343		return (vfs_cache_root_fallback(mp, flags, vpp));
6344	vp = atomic_load_ptr(&mp->mnt_rootvnode);
6345	if (vp == NULL || VN_IS_DOOMED(vp)) {
6346		vfs_op_thread_exit(mp);
6347		return (vfs_cache_root_fallback(mp, flags, vpp));
6348	}
6349	vrefact(vp);
6350	vfs_op_thread_exit(mp);
6351	error = vn_lock(vp, flags);
6352	if (error != 0) {
6353		vrele(vp);
6354		return (vfs_cache_root_fallback(mp, flags, vpp));
6355	}
6356	*vpp = vp;
6357	return (0);
6358}
6359
6360struct vnode *
6361vfs_cache_root_clear(struct mount *mp)
6362{
6363	struct vnode *vp;
6364
6365	/*
6366	 * ops > 0 guarantees there is nobody who can see this vnode
6367	 */
6368	MPASS(mp->mnt_vfs_ops > 0);
6369	vp = mp->mnt_rootvnode;
6370	if (vp != NULL)
6371		vn_seqc_write_begin(vp);
6372	mp->mnt_rootvnode = NULL;
6373	return (vp);
6374}
6375
/*
 * Install 'vp' as the cached root vnode of 'mp'.  An additional reference
 * is taken on the vnode for the cache.  mnt_vfs_ops must be positive.
 */
void
vfs_cache_root_set(struct mount *mp, struct vnode *vp)
{

	MPASS(mp->mnt_vfs_ops > 0);
	vrefact(vp);
	mp->mnt_rootvnode = vp;
}
6384
6385/*
6386 * These are helper functions for filesystems to traverse all
6387 * their vnodes.  See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
6388 *
6389 * This interface replaces MNT_VNODE_FOREACH.
6390 */
6391
/*
 * Advance the iteration over the vnodes of 'mp'.  *mvp is the marker vnode
 * recording the current position in the mount's vnode list.
 *
 * Returns the next vnode after the marker that is neither a marker nor
 * doomed, with its interlock held, after moving the marker right behind it.
 * At the end of the list the marker is released through
 * __mnt_vnode_markerfree_all() and NULL is returned.  The mount interlock
 * is not held on return in either case.
 */
struct vnode *
__mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
{
	struct vnode *vp;

	if (should_yield())
		kern_yield(PRI_USER);
	MNT_ILOCK(mp);
	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
	for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL;
	    vp = TAILQ_NEXT(vp, v_nmntvnodes)) {
		/* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */
		if (vp->v_type == VMARKER || VN_IS_DOOMED(vp))
			continue;
		VI_LOCK(vp);
		/* Re-check now that the interlock is held. */
		if (VN_IS_DOOMED(vp)) {
			VI_UNLOCK(vp);
			continue;
		}
		/* Found one; its interlock stays held for the caller. */
		break;
	}
	if (vp == NULL) {
		__mnt_vnode_markerfree_all(mvp, mp);
		/* MNT_IUNLOCK(mp); -- done in above function */
		mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
		return (NULL);
	}
	/* Move the marker right after the vnode being returned. */
	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
	MNT_IUNLOCK(mp);
	return (vp);
}
6424
6425struct