120280f1imp/*-
24736ccfpfg * SPDX-License-Identifier: BSD-3-Clause
34736ccfpfg *
48fb65cergrimes * Copyright (c) 1989, 1993
58fb65cergrimes *	The Regents of the University of California.  All rights reserved.
68fb65cergrimes * (c) UNIX System Laboratories, Inc.
78fb65cergrimes * All or some portions of this file are derived from material licensed
88fb65cergrimes * to the University of California by American Telephone and Telegraph
98fb65cergrimes * Co. or Unix System Laboratories, Inc. and are reproduced herein with
108fb65cergrimes * the permission of UNIX System Laboratories, Inc.
118fb65cergrimes *
128fb65cergrimes * Redistribution and use in source and binary forms, with or without
138fb65cergrimes * modification, are permitted provided that the following conditions
148fb65cergrimes * are met:
158fb65cergrimes * 1. Redistributions of source code must retain the above copyright
168fb65cergrimes *    notice, this list of conditions and the following disclaimer.
178fb65cergrimes * 2. Redistributions in binary form must reproduce the above copyright
188fb65cergrimes *    notice, this list of conditions and the following disclaimer in the
198fb65cergrimes *    documentation and/or other materials provided with the distribution.
2000b67b1emaste * 3. Neither the name of the University nor the names of its contributors
218fb65cergrimes *    may be used to endorse or promote products derived from this software
228fb65cergrimes *    without specific prior written permission.
238fb65cergrimes *
248fb65cergrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
258fb65cergrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
268fb65cergrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
278fb65cergrimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
288fb65cergrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
298fb65cergrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
308fb65cergrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
318fb65cergrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
328fb65cergrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
338fb65cergrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
348fb65cergrimes * SUCH DAMAGE.
358fb65cergrimes *
3610f666adyson *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
378fb65cergrimes */
388fb65cergrimes
398fb65cergrimes/*
408fb65cergrimes * External virtual filesystem routines
418fb65cergrimes */
423b8fff9obrien
433b8fff9obrien#include <sys/cdefs.h>
443b8fff9obrien__FBSDID("$FreeBSD$");
453b8fff9obrien
4639d3a9awollman#include "opt_ddb.h"
47d685681attilio#include "opt_watchdog.h"
488fb65cergrimes
498fb65cergrimes#include <sys/param.h>
508fb65cergrimes#include <sys/systm.h>
5136c3965phk#include <sys/bio.h>
52e100d44luoqi#include <sys/buf.h>
53e0dae39asomers#include <sys/capsicum.h>
54faa0cdded#include <sys/condvar.h>
55e100d44luoqi#include <sys/conf.h>
569412199mjg#include <sys/counter.h>
574e50b9ephk#include <sys/dirent.h>
58bbaa6c3alfred#include <sys/event.h>
59e100d44luoqi#include <sys/eventhandler.h>
6055be95dphk#include <sys/extattr.h>
610835f7bssouhlal#include <sys/file.h>
6285c5587bde#include <sys/fcntl.h>
63ad49fbepjd#include <sys/jail.h>
64c20ced5marcel#include <sys/kdb.h>
65a7a3961bde#include <sys/kernel.h>
666cb5fe6peter#include <sys/kthread.h>
673038f1acem#include <sys/ktr.h>
685224340kib#include <sys/lockf.h>
69a5dcc1frwatson#include <sys/malloc.h>
708fb65cergrimes#include <sys/mount.h>
71767bad2eivind#include <sys/namei.h>
721cfa4a3jeff#include <sys/pctrie.h>
7310d0d9crwatson#include <sys/priv.h>
7454d23a3truckman#include <sys/reboot.h>
7528fa5eemjg#include <sys/refcount.h>
76658534eattilio#include <sys/rwlock.h>
773a7fd5fjhb#include <sys/sched.h>
78d25301cjhb#include <sys/sleepqueue.h>
794074673mjg#include <sys/smr.h>
802818562kib#include <sys/smp.h>
818fb65cergrimes#include <sys/stat.h>
82e100d44luoqi#include <sys/sysctl.h>
831750942dillon#include <sys/syslog.h>
84cd67bb8dyson#include <sys/vmmeter.h>
85e100d44luoqi#include <sys/vnode.h>
86d685681attilio#include <sys/watchdog.h>
878fb65cergrimes
88dc1cfeaphk#include <machine/stdarg.h>
89dc1cfeaphk
907beaaf5rwatson#include <security/mac/mac_framework.h>
917beaaf5rwatson
928fb65cergrimes#include <vm/vm.h>
93c30f46cdg#include <vm/vm_object.h>
94c30f46cdg#include <vm/vm_extern.h>
956bd1f74dyson#include <vm/pmap.h>
966bd1f74dyson#include <vm/vm_map.h>
97df24433dillon#include <vm/vm_page.h>
9853638c7alc#include <vm/vm_kern.h>
99318cbeejeff#include <vm/uma.h>
1008fb65cergrimes
10112baf6epjd#ifdef DDB
10212baf6epjd#include <ddb/ddb.h>
10312baf6epjd#endif
10412baf6epjd
105070a613phkstatic void	delmntque(struct vnode *vp);
10666dfd63phkstatic int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
107649a01ephk		    int slpflag, int slptimeo);
1089ed03e6truckmanstatic void	syncer_shutdown(void *arg, int howto);
1096b8324ejeffstatic int	vtryrecycle(struct vnode *vp);
11028fa5eemjgstatic void	v_init_counters(struct vnode *);
111ca07a9fjeffstatic void	vgonel(struct vnode *);
112efe31cdssouhlalstatic void	vfs_knllock(void *arg);
113efe31cdssouhlalstatic void	vfs_knlunlock(void *arg);
114e1cb294kibstatic void	vfs_knl_assert_locked(void *arg);
115e1cb294kibstatic void	vfs_knl_assert_unlocked(void *arg);
11686b5e61kibstatic void	destroy_vpollinfo(struct vpollinfo *vi);
1173c6bfc0asomersstatic int	v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo,
1183c6bfc0asomers		    daddr_t startlbn, daddr_t endlbn);
119f378822mjgstatic void	vnlru_recalc(void);
1208e9b2cdeivind
1218e9b2cdeivind/*
1224e68a99markj * These fences are intended for cases where some synchronization is
1234e68a99markj * needed between access of v_iflags and lockless vnode refcount (v_holdcnt
1244e68a99markj * and v_usecount) updates.  Access to v_iflags is generally synchronized
1254e68a99markj * by the interlock, but we have some internal assertions that check vnode
126d1a00acmarkj * flags without acquiring the lock.  Thus, these fences are INVARIANTS-only
1274e68a99markj * for now.
1284e68a99markj */
1294e68a99markj#ifdef INVARIANTS
1304e68a99markj#define	VNODE_REFCOUNT_FENCE_ACQ()	atomic_thread_fence_acq()
1314e68a99markj#define	VNODE_REFCOUNT_FENCE_REL()	atomic_thread_fence_rel()
1324e68a99markj#else
1334e68a99markj#define	VNODE_REFCOUNT_FENCE_ACQ()
1344e68a99markj#define	VNODE_REFCOUNT_FENCE_REL()
1354e68a99markj#endif
1364e68a99markj
1374e68a99markj/*
1388e9b2cdeivind * Number of vnodes in existence.  Increased whenever getnewvnode()
139bcfa67amjg * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode.
1408e9b2cdeivind */
1416facf4dmjgstatic u_long __exclusive_cache_line numvnodes;
14205b2183dillon
143f6a71a4mdfSYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
144d3c1b43brucec    "Number of vnodes in existence");
1453089ef0bde
1469412199mjgstatic counter_u64_t vnodes_created;
1479412199mjgSYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
1489412199mjg    "Number of vnodes created by getnewvnode");
1494247c4fjhb
1508e9b2cdeivind/*
1518e9b2cdeivind * Conversion tables for conversion from vnode types to inode formats
1528e9b2cdeivind * and back.
1538e9b2cdeivind */
1548fb65cergrimesenum vtype iftovt_tab[16] = {
1558fb65cergrimes	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
15651739d2mckusick	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
1578fb65cergrimes};
158d344c11teggeint vttoif_tab[10] = {
1598fb65cergrimes	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
160d344c11tegge	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
1618fb65cergrimes};
1628fb65cergrimes
1638dd2b8deivind/*
 * List of allocated vnodes in the system.
1658dd2b8deivind */
1669b80414mjgstatic TAILQ_HEAD(freelst, vnode) vnode_list;
1679b80414mjgstatic struct vnode *vnode_list_free_marker;
1682fbceb1mjgstatic struct vnode *vnode_list_reclaim_marker;
169d9d8bf6dyson
1708e9b2cdeivind/*
171896302bkib * "Free" vnode target.  Free vnodes are rarely completely free, but are
172896302bkib * just ones that are cheap to recycle.  Usually they are for files which
173896302bkib * have been stat'd but not read; these usually have inode and namecache
174896302bkib * data attached to them.  This target is the preferred minimum size of a
175896302bkib * sub-cache consisting mostly of such files. The system balances the size
176896302bkib * of this sub-cache with its complement to try to prevent either from
177896302bkib * thrashing while the other is relatively inactive.  The targets express
178896302bkib * a preference for the best balance.
179896302bkib *
180896302bkib * "Above" this target there are 2 further targets (watermarks) related
 * to recycling of free vnodes.  In the best-operating case, the cache is
182896302bkib * exactly full, the free list has size between vlowat and vhiwat above the
183896302bkib * free target, and recycling from it and normal use maintains this state.
184896302bkib * Sometimes the free list is below vlowat or even empty, but this state
185896302bkib * is even better for immediate use provided the cache is not full.
186896302bkib * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
187896302bkib * ones) to reach one of these states.  The watermarks are currently hard-
188896302bkib * coded as 4% and 9% of the available space higher.  These and the default
189896302bkib * of 25% for wantfreevnodes are too large if the memory size is large.
190896302bkib * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
191896302bkib * whenever vnlru_proc() becomes active.
1928e9b2cdeivind */
1935acc96fmjgstatic long wantfreevnodes;
1945acc96fmjgstatic long __exclusive_cache_line freevnodes;
195896302bkibSYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD,
196896302bkib    &freevnodes, 0, "Number of \"free\" vnodes");
1975acc96fmjgstatic long freevnodes_old;
1985660354kib
1999412199mjgstatic counter_u64_t recycles_count;
2009412199mjgSYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count,
201896302bkib    "Number of vnodes recycled to meet vnode cache targets");
2024247c4fjhb
203037b781mjgstatic counter_u64_t recycles_free_count;
204037b781mjgSYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD, &recycles_free_count,
205037b781mjg    "Number of free vnodes recycled to meet vnode cache targets");
206037b781mjg
2071204b9cmjgstatic counter_u64_t deferred_inact;
2081204b9cmjgSYSCTL_COUNTER_U64(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD, &deferred_inact,
2091204b9cmjg    "Number of times inactive processing was deferred");
210becc575kib
2118e9b2cdeivind/* To keep more than one thread at a time from running vfs_getnewfsid */
2128d2ec1ejasonestatic struct mtx mntid_mtx;
2138dd2b8deivind
214d18378ejeff/*
215d18378ejeff * Lock for any access to the following:
2169b80414mjg *	vnode_list
217d18378ejeff *	numvnodes
218d18378ejeff *	freevnodes
219d18378ejeff */
2209b80414mjgstatic struct mtx __exclusive_cache_line vnode_list_mtx;
2218e9b2cdeivind
2228e9b2cdeivind/* Publicly exported FS */
2238e9b2cdeivindstruct nfs_public nfs_pub;
2248dd2b8deivind
2251cfa4a3jeffstatic uma_zone_t buf_trie_zone;
226b4901e2cemstatic smr_t buf_trie_smr;
2271cfa4a3jeff
2288e9b2cdeivind/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
229318cbeejeffstatic uma_zone_t vnode_zone;
230318cbeejeffstatic uma_zone_t vnodepoll_zone;
2318dd2b8deivind
2324074673mjg__read_frequently smr_t vfs_smr;
2334074673mjg
23410c5cccjulian/*
23510c5cccjulian * The workitem queue.
2368aef2acdes *
2378e9b2cdeivind * It is useful to delay writes of file data and filesystem metadata
2388e9b2cdeivind * for tens of seconds so that quickly created and deleted files need
2398e9b2cdeivind * not waste disk bandwidth being created and removed. To realize this,
2408e9b2cdeivind * we append vnodes to a "workitem" queue. When running with a soft
2418e9b2cdeivind * updates implementation, most pending metadata dependencies should
2428e9b2cdeivind * not wait for more than a few seconds. Thus, mounted on block devices
2438e9b2cdeivind * are delayed only about a half the time that file data is delayed.
2448e9b2cdeivind * Similarly, directory updates are more critical, so are only delayed
2458e9b2cdeivind * about a third the time that file data is delayed. Thus, there are
2468e9b2cdeivind * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
2478e9b2cdeivind * one each second (driven off the filesystem syncer process). The
2488e9b2cdeivind * syncer_delayno variable indicates the next queue that is to be processed.
2498e9b2cdeivind * Items that need to be processed soon are placed in this queue:
2508e9b2cdeivind *
2518e9b2cdeivind *	syncer_workitem_pending[syncer_delayno]
2528e9b2cdeivind *
2538e9b2cdeivind * A delay of fifteen seconds is done by placing the request fifteen
2548e9b2cdeivind * entries later in the queue:
2558e9b2cdeivind *
2568e9b2cdeivind *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
2578e9b2cdeivind *
25810c5cccjulian */
259d6d1e90peterstatic int syncer_delayno;
2608aef2acdesstatic long syncer_mask;
26156a7ee8phkLIST_HEAD(synclist, bufobj);
262560aa75kibstatic struct synclist *syncer_workitem_pending;
263ee7cd91jeff/*
264ee7cd91jeff * The sync_mtx protects:
26556a7ee8phk *	bo->bo_synclist
2669ed03e6truckman *	sync_vnode_count
267ee7cd91jeff *	syncer_delayno
268471ab74truckman *	syncer_state
269ee7cd91jeff *	syncer_workitem_pending
2709ed03e6truckman *	syncer_worklist_len
271ee7cd91jeff *	rushjob
272ee7cd91jeff */
273ee7cd91jeffstatic struct mtx sync_mtx;
274faa0cddedstatic struct cv sync_wakeup;
2758e9b2cdeivind
27610c5cccjulian#define SYNCER_MAXDELAY		32
277a031711eivindstatic int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
278b37309fdillonstatic int syncdelay = 30;		/* max time to delay syncing data */
279b37309fdillonstatic int filedelay = 30;		/* time to delay syncing files */
28034e35cbbrucecSYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
281d3c1b43brucec    "Time to delay syncing files (in seconds)");
282b37309fdillonstatic int dirdelay = 29;		/* time to delay syncing directories */
28334e35cbbrucecSYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
284d3c1b43brucec    "Time to delay syncing directories (in seconds)");
285b37309fdillonstatic int metadelay = 28;		/* time to delay syncing metadata */
28634e35cbbrucecSYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
287d3c1b43brucec    "Time to delay syncing metadata (in seconds)");
2888e9b2cdeivindstatic int rushjob;		/* number of slots to run ASAP */
28902e5fe8mckusickstatic int stat_rush_requests;	/* number of times I/O speeded up */
29034e35cbbrucecSYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
291d3c1b43brucec    "Number of times I/O speeded up (rush requests)");
29210c5cccjulian
/*
 * Per-CPU batching of deferred vnode processing (drained via
 * vdbatch_dequeue(); the enqueue side is not visible in this chunk).
 */
#define	VDBATCH_SIZE 8
struct vdbatch {
	u_int index;		/* next free slot in tab[] — TODO confirm against enqueue code */
	long freevnodes;	/* per-CPU freevnodes delta; presumably folded into the global count — verify */
	struct mtx lock;	/* protects this batch */
	struct vnode *tab[VDBATCH_SIZE];	/* batched vnodes */
};
DPCPU_DEFINE_STATIC(struct vdbatch, vd);

static void	vdbatch_dequeue(struct vnode *vp);
3039bce1ffmjg
3048e9b2cdeivind/*
305471ab74truckman * When shutting down the syncer, run it at four times normal speed.
3069ed03e6truckman */
307471ab74truckman#define SYNCER_SHUTDOWN_SPEEDUP		4
3089ed03e6truckmanstatic int sync_vnode_count;
3099ed03e6truckmanstatic int syncer_worklist_len;
310471ab74truckmanstatic enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
311471ab74truckman    syncer_state;
3129ed03e6truckman
313896302bkib/* Target for maximum number of vnodes. */
31485edb79mjgu_long desiredvnodes;
31585edb79mjgstatic u_long gapvnodes;		/* gap between wanted and desired */
31685edb79mjgstatic u_long vhiwat;		/* enough extras after expansion */
31785edb79mjgstatic u_long vlowat;		/* minimal extras before expansion */
31885edb79mjgstatic u_long vstir;		/* nonzero to stir non-free vnodes */
319896302bkibstatic volatile int vsmalltrigger = 8;	/* pref to keep if > this many pages */
3201735746mckusick
3215acc96fmjgstatic u_long vnlru_read_freevnodes(void);
3225acc96fmjg
323f378822mjg/*
324f378822mjg * Note that no attempt is made to sanitize these parameters.
325f378822mjg */
3261735746mckusickstatic int
327f378822mjgsysctl_maxvnodes(SYSCTL_HANDLER_ARGS)
3281735746mckusick{
329f378822mjg	u_long val;
33085edb79mjg	int error;
3311735746mckusick
332f378822mjg	val = desiredvnodes;
333f378822mjg	error = sysctl_handle_long(oidp, &val, 0, req);
334f378822mjg	if (error != 0 || req->newptr == NULL)
3351735746mckusick		return (error);
336f378822mjg
337f378822mjg	if (val == desiredvnodes)
338f378822mjg		return (0);
3399b80414mjg	mtx_lock(&vnode_list_mtx);
340f378822mjg	desiredvnodes = val;
341f378822mjg	wantfreevnodes = desiredvnodes / 4;
342f378822mjg	vnlru_recalc();
3439b80414mjg	mtx_unlock(&vnode_list_mtx);
344f378822mjg	/*
345f378822mjg	 * XXX There is no protection against multiple threads changing
346f378822mjg	 * desiredvnodes at the same time. Locking above only helps vnlru and
347f378822mjg	 * getnewvnode.
348f378822mjg	 */
349f378822mjg	vfs_hash_changesize(desiredvnodes);
350f378822mjg	cache_changesize(desiredvnodes);
3511735746mckusick	return (0);
3521735746mckusick}
3531735746mckusick
3541735746mckusickSYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
355f378822mjg    CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes,
356eb9cc37glebius    "LU", "Target for maximum number of vnodes");
357f378822mjg
358f378822mjgstatic int
359f378822mjgsysctl_wantfreevnodes(SYSCTL_HANDLER_ARGS)
360f378822mjg{
361f378822mjg	u_long val;
362f378822mjg	int error;
363f378822mjg
364f378822mjg	val = wantfreevnodes;
365f378822mjg	error = sysctl_handle_long(oidp, &val, 0, req);
366f378822mjg	if (error != 0 || req->newptr == NULL)
367f378822mjg		return (error);
368f378822mjg
369f378822mjg	if (val == wantfreevnodes)
370f378822mjg		return (0);
3719b80414mjg	mtx_lock(&vnode_list_mtx);
372f378822mjg	wantfreevnodes = val;
373f378822mjg	vnlru_recalc();
3749b80414mjg	mtx_unlock(&vnode_list_mtx);
375f378822mjg	return (0);
376f378822mjg}
377f378822mjg
378f378822mjgSYSCTL_PROC(_vfs, OID_AUTO, wantfreevnodes,
379f378822mjg    CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes,
380eb9cc37glebius    "LU", "Target for minimum number of \"free\" vnodes");
381f378822mjg
382f6a71a4mdfSYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
383896302bkib    &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)");
384d6d1e90peterstatic int vnlru_nowhere;
3852ffbc24kanSYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
3862ffbc24kan    &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
3878fb65cergrimes
/*
 * Sysctl handler: force reclamation (vgone) of the vnode named by the
 * pathname written to the sysctl.  Debug aid for exercising vnode
 * recycling on demand.
 */
static int
sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS)
{
	struct vnode *vp;
	struct nameidata nd;
	char *buf;
	unsigned long ndflags;
	int error;

	/* Write-only sysctl: a pathname must be supplied. */
	if (req->newptr == NULL)
		return (EINVAL);
	if (req->newlen >= PATH_MAX)
		return (E2BIG);

	buf = malloc(PATH_MAX, M_TEMP, M_WAITOK);
	error = SYSCTL_IN(req, buf, req->newlen);
	if (error != 0)
		goto out;

	/* In bounds: newlen < PATH_MAX was checked above. */
	buf[req->newlen] = '\0';

	ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1 | SAVENAME;
	NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf, curthread);
	if ((error = namei(&nd)) != 0)
		goto out;
	vp = nd.ni_vp;

	if (VN_IS_DOOMED(vp)) {
		/*
		 * This vnode is being recycled.  Return != 0 to let the caller
		 * know that the sysctl had no effect.  Return EAGAIN because a
		 * subsequent call will likely succeed (since namei will create
		 * a new vnode if necessary)
		 */
		error = EAGAIN;
		goto putvnode;
	}

	counter_u64_add(recycles_count, 1);
	vgone(vp);
putvnode:
	/* Releases the vnode reference/lock taken by namei(). */
	NDFREE(&nd, 0);
out:
	free(buf, M_TEMP);
	return (error);
}
434e0dae39asomers
435e0dae39asomersstatic int
436e0dae39asomerssysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS)
437e0dae39asomers{
438e0dae39asomers	struct thread *td = curthread;
439e0dae39asomers	struct vnode *vp;
440e0dae39asomers	struct file *fp;
441e0dae39asomers	int error;
442e0dae39asomers	int fd;
443e0dae39asomers
444e0dae39asomers	if (req->newptr == NULL)
445e0dae39asomers		return (EBADF);
446e0dae39asomers
447e0dae39asomers        error = sysctl_handle_int(oidp, &fd, 0, req);
448e0dae39asomers        if (error != 0)
449e0dae39asomers                return (error);
450e0dae39asomers	error = getvnode(curthread, fd, &cap_fcntl_rights, &fp);
451e0dae39asomers	if (error != 0)
452e0dae39asomers		return (error);
453e0dae39asomers	vp = fp->f_vnode;
454e0dae39asomers
455e0dae39asomers	error = vn_lock(vp, LK_EXCLUSIVE);
456e0dae39asomers	if (error != 0)
457e0dae39asomers		goto drop;
458e0dae39asomers
459e0dae39asomers	counter_u64_add(recycles_count, 1);
460e0dae39asomers	vgone(vp);
461f121d45mjg	VOP_UNLOCK(vp);
462e0dae39asomersdrop:
463e0dae39asomers	fdrop(fp, td);
464e0dae39asomers	return (error);
465e0dae39asomers}
466e0dae39asomers
467e0dae39asomersSYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode,
468e0dae39asomers    CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
469e0dae39asomers    sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname");
470e0dae39asomersSYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode,
471e0dae39asomers    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
472e0dae39asomers    sysctl_ftry_reclaim_vnode, "I",
473e0dae39asomers    "Try to reclaim a vnode by its file descriptor");
474e0dae39asomers
4755a71b32kib/* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
476cef8617kibstatic int vnsz2log;
477d289cc6jeff
/*
 * Support for the bufobj clean & dirty pctrie.
 *
 * Trie nodes come from an SMR-enabled UMA zone (buf_trie_zone /
 * buf_trie_smr), allowing deferred reclamation of freed nodes.
 */
static void *
buf_trie_alloc(struct pctrie *ptree)
{
	/* M_NOWAIT: callers of pctrie insertion must tolerate NULL. */
	return (uma_zalloc_smr(buf_trie_zone, M_NOWAIT));
}

static void
buf_trie_free(struct pctrie *ptree, void *node)
{
	uma_zfree_smr(buf_trie_zone, node);
}
/* Generate the BUF_PCTRIE_* operations keyed on b_lblkno. */
PCTRIE_DEFINE_SMR(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free,
    buf_trie_smr);
4941cfa4a3jeff
4951cfa4a3jeff/*
4968fb65cergrimes * Initialize the vnode management data structures.
497329f9f0alc *
498329f9f0alc * Reevaluate the following cap on the number of vnodes after the physical
499329f9f0alc * memory size exceeds 512GB.  In the limit, as the physical memory size
500421a929eadler * grows, the ratio of the memory size in KB to vnodes approaches 64:1.
5018fb65cergrimes */
5028f41b4eobrien#ifndef	MAXVNODES_MAX
50385edb79mjg#define	MAXVNODES_MAX	(512UL * 1024 * 1024 / 64)	/* 8M */
5048f41b4eobrien#endif
5056f0b4b3mckusick
506bb33b4emjgstatic MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
507bb33b4emjg
508bb33b4emjgstatic struct vnode *
509bb33b4emjgvn_alloc_marker(struct mount *mp)
510bb33b4emjg{
511bb33b4emjg	struct vnode *vp;
512bb33b4emjg
513bb33b4emjg	vp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
514bb33b4emjg	vp->v_type = VMARKER;
515bb33b4emjg	vp->v_mount = mp;
516bb33b4emjg
517bb33b4emjg	return (vp);
518bb33b4emjg}
519bb33b4emjg
/*
 * Release a marker vnode previously obtained from vn_alloc_marker().
 */
static void
vn_free_marker(struct vnode *vp)
{

	MPASS(vp->v_type == VMARKER);
	free(vp, M_VNODE_MARKER);
}
527bb33b4emjg
/*
 * Initialize a vnode as it first enters the zone.
 *
 * Signature matches a UMA zone init callback (presumably registered at
 * zone-creation time elsewhere in this file — not visible in this chunk);
 * paired with vnode_fini() for teardown.
 */
static int
vnode_init(void *mem, int size, int flags)
{
	struct vnode *vp;

	vp = mem;
	bzero(vp, size);
	/*
	 * Setup locks.
	 */
	vp->v_vnlock = &vp->v_lock;
	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
	/*
	 * By default, don't allow shared locks unless filesystems opt-in.
	 */
	lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT,
	    LK_NOSHARE | LK_IS_VNODE);
	/*
	 * Initialize bufobj.
	 */
	bufobj_init(&vp->v_bufobj, vp);
	/*
	 * Initialize namecache.
	 */
	cache_vnode_init(vp);
	/*
	 * Initialize rangelocks.
	 */
	rangelock_init(&vp->v_rl);

	/* Not queued for deferred per-CPU batch processing yet. */
	vp->v_dbatchcpu = NOCPU;

	/*
	 * Link onto the global vnode list, just before the free-list
	 * marker, under the list lock.
	 */
	mtx_lock(&vnode_list_mtx);
	TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist);
	mtx_unlock(&vnode_list_mtx);
	return (0);
}
5686f0b4b3mckusick
/*
 * Free a vnode when it is cleared from the zone.
 *
 * Mirror of vnode_init(): dequeue from any pending per-CPU batch,
 * unlink from the global vnode list, then destroy the locks created
 * at init time.
 */
static void
vnode_fini(void *mem, int size)
{
	struct vnode *vp;
	struct bufobj *bo;

	vp = mem;
	/* Must precede list removal: the batch may still reference vp. */
	vdbatch_dequeue(vp);
	mtx_lock(&vnode_list_mtx);
	TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
	mtx_unlock(&vnode_list_mtx);
	rangelock_destroy(&vp->v_rl);
	lockdestroy(vp->v_vnlock);
	mtx_destroy(&vp->v_interlock);
	bo = &vp->v_bufobj;
	rw_destroy(BO_LOCKPTR(bo));
}
5896f0b4b3mckusick
590392fea7kib/*
591392fea7kib * Provide the size of NFS nclnode and NFS fh for calculation of the
592392fea7kib * vnode memory consumption.  The size is specified directly to
593392fea7kib * eliminate dependency on NFS-private header.
594392fea7kib *
595392fea7kib * Other filesystems may use bigger or smaller (like UFS and ZFS)
596392fea7kib * private inode data, but the NFS-based estimation is ample enough.
597392fea7kib * Still, we care about differences in the size between 64- and 32-bit
598392fea7kib * platforms.
599392fea7kib *
600392fea7kib * Namecache structure size is heuristically
601392fea7kib * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1.
602392fea7kib */
603392fea7kib#ifdef _LP64
604392fea7kib#define	NFS_NCLNODE_SZ	(528 + 64)
605392fea7kib#define	NC_SZ		148
606392fea7kib#else
607392fea7kib#define	NFS_NCLNODE_SZ	(360 + 32)
608392fea7kib#define	NC_SZ		92
609392fea7kib#endif
610392fea7kib
611eb5dd3dpeterstatic void
612eb5dd3dpetervntblinit(void *dummy __unused)
6138fb65cergrimes{
6149bce1ffmjg	struct vdbatch *vd;
6159bce1ffmjg	int cpu, physvnodes, virtvnodes;
616cef8617kib	u_int i;
6178fb65cergrimes
61853638c7alc	/*
619329f9f0alc	 * Desiredvnodes is a function of the physical memory size and the
620329f9f0alc	 * kernel's heap size.  Generally speaking, it scales with the
621896302bkib	 * physical memory size.  The ratio of desiredvnodes to the physical
622896302bkib	 * memory size is 1:16 until desiredvnodes exceeds 98,304.
623896302bkib	 * Thereafter, the
624896302b