120280f1imp/*-
24736ccfpfg * SPDX-License-Identifier: BSD-3-Clause
34736ccfpfg *
48fb65cergrimes * Copyright (c) 1989, 1993
58fb65cergrimes *	The Regents of the University of California.  All rights reserved.
68fb65cergrimes * (c) UNIX System Laboratories, Inc.
78fb65cergrimes * All or some portions of this file are derived from material licensed
88fb65cergrimes * to the University of California by American Telephone and Telegraph
98fb65cergrimes * Co. or Unix System Laboratories, Inc. and are reproduced herein with
108fb65cergrimes * the permission of UNIX System Laboratories, Inc.
118fb65cergrimes *
128fb65cergrimes * Redistribution and use in source and binary forms, with or without
138fb65cergrimes * modification, are permitted provided that the following conditions
148fb65cergrimes * are met:
158fb65cergrimes * 1. Redistributions of source code must retain the above copyright
168fb65cergrimes *    notice, this list of conditions and the following disclaimer.
178fb65cergrimes * 2. Redistributions in binary form must reproduce the above copyright
188fb65cergrimes *    notice, this list of conditions and the following disclaimer in the
198fb65cergrimes *    documentation and/or other materials provided with the distribution.
2000b67b1emaste * 3. Neither the name of the University nor the names of its contributors
218fb65cergrimes *    may be used to endorse or promote products derived from this software
228fb65cergrimes *    without specific prior written permission.
238fb65cergrimes *
248fb65cergrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
258fb65cergrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
268fb65cergrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
278fb65cergrimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
288fb65cergrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
298fb65cergrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
308fb65cergrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
318fb65cergrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
328fb65cergrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
338fb65cergrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
348fb65cergrimes * SUCH DAMAGE.
358fb65cergrimes *
3610f666adyson *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
378fb65cergrimes */
388fb65cergrimes
398fb65cergrimes/*
408fb65cergrimes * External virtual filesystem routines
418fb65cergrimes */
423b8fff9obrien
433b8fff9obrien#include <sys/cdefs.h>
443b8fff9obrien__FBSDID("$FreeBSD$");
453b8fff9obrien
4639d3a9awollman#include "opt_ddb.h"
47d685681attilio#include "opt_watchdog.h"
488fb65cergrimes
498fb65cergrimes#include <sys/param.h>
508fb65cergrimes#include <sys/systm.h>
5136c3965phk#include <sys/bio.h>
52e100d44luoqi#include <sys/buf.h>
53e0dae39asomers#include <sys/capsicum.h>
54faa0cdded#include <sys/condvar.h>
55e100d44luoqi#include <sys/conf.h>
569412199mjg#include <sys/counter.h>
574e50b9ephk#include <sys/dirent.h>
58bbaa6c3alfred#include <sys/event.h>
59e100d44luoqi#include <sys/eventhandler.h>
6055be95dphk#include <sys/extattr.h>
610835f7bssouhlal#include <sys/file.h>
6285c5587bde#include <sys/fcntl.h>
63ad49fbepjd#include <sys/jail.h>
64c20ced5marcel#include <sys/kdb.h>
65a7a3961bde#include <sys/kernel.h>
666cb5fe6peter#include <sys/kthread.h>
673038f1acem#include <sys/ktr.h>
685224340kib#include <sys/lockf.h>
69a5dcc1frwatson#include <sys/malloc.h>
708fb65cergrimes#include <sys/mount.h>
71767bad2eivind#include <sys/namei.h>
721cfa4a3jeff#include <sys/pctrie.h>
7310d0d9crwatson#include <sys/priv.h>
7454d23a3truckman#include <sys/reboot.h>
7528fa5eemjg#include <sys/refcount.h>
76658534eattilio#include <sys/rwlock.h>
773a7fd5fjhb#include <sys/sched.h>
78d25301cjhb#include <sys/sleepqueue.h>
792818562kib#include <sys/smp.h>
808fb65cergrimes#include <sys/stat.h>
81e100d44luoqi#include <sys/sysctl.h>
821750942dillon#include <sys/syslog.h>
83cd67bb8dyson#include <sys/vmmeter.h>
84e100d44luoqi#include <sys/vnode.h>
85d685681attilio#include <sys/watchdog.h>
868fb65cergrimes
87dc1cfeaphk#include <machine/stdarg.h>
88dc1cfeaphk
897beaaf5rwatson#include <security/mac/mac_framework.h>
907beaaf5rwatson
918fb65cergrimes#include <vm/vm.h>
92c30f46cdg#include <vm/vm_object.h>
93c30f46cdg#include <vm/vm_extern.h>
946bd1f74dyson#include <vm/pmap.h>
956bd1f74dyson#include <vm/vm_map.h>
96df24433dillon#include <vm/vm_page.h>
9753638c7alc#include <vm/vm_kern.h>
98318cbeejeff#include <vm/uma.h>
998fb65cergrimes
10012baf6epjd#ifdef DDB
10112baf6epjd#include <ddb/ddb.h>
10212baf6epjd#endif
10312baf6epjd
104070a613phkstatic void	delmntque(struct vnode *vp);
10566dfd63phkstatic int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
106649a01ephk		    int slpflag, int slptimeo);
1079ed03e6truckmanstatic void	syncer_shutdown(void *arg, int howto);
1086b8324ejeffstatic int	vtryrecycle(struct vnode *vp);
10928fa5eemjgstatic void	v_init_counters(struct vnode *);
11074d7b1emjgstatic void	v_incr_devcount(struct vnode *);
11174d7b1emjgstatic void	v_decr_devcount(struct vnode *);
112ca07a9fjeffstatic void	vgonel(struct vnode *);
113efe31cdssouhlalstatic void	vfs_knllock(void *arg);
114efe31cdssouhlalstatic void	vfs_knlunlock(void *arg);
115e1cb294kibstatic void	vfs_knl_assert_locked(void *arg);
116e1cb294kibstatic void	vfs_knl_assert_unlocked(void *arg);
11786b5e61kibstatic void	destroy_vpollinfo(struct vpollinfo *vi);
1183c6bfc0asomersstatic int	v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo,
1193c6bfc0asomers		    daddr_t startlbn, daddr_t endlbn);
120f378822mjgstatic void	vnlru_recalc(void);
1218e9b2cdeivind
1228e9b2cdeivind/*
1234e68a99markj * These fences are intended for cases where some synchronization is
1244e68a99markj * needed between access of v_iflags and lockless vnode refcount (v_holdcnt
1254e68a99markj * and v_usecount) updates.  Access to v_iflags is generally synchronized
1264e68a99markj * by the interlock, but we have some internal assertions that check vnode
127d1a00acmarkj * flags without acquiring the lock.  Thus, these fences are INVARIANTS-only
1284e68a99markj * for now.
1294e68a99markj */
1304e68a99markj#ifdef INVARIANTS
1314e68a99markj#define	VNODE_REFCOUNT_FENCE_ACQ()	atomic_thread_fence_acq()
1324e68a99markj#define	VNODE_REFCOUNT_FENCE_REL()	atomic_thread_fence_rel()
1334e68a99markj#else
1344e68a99markj#define	VNODE_REFCOUNT_FENCE_ACQ()
1354e68a99markj#define	VNODE_REFCOUNT_FENCE_REL()
1364e68a99markj#endif
1374e68a99markj
1384e68a99markj/*
1398e9b2cdeivind * Number of vnodes in existence.  Increased whenever getnewvnode()
140bcfa67amjg * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode.
1418e9b2cdeivind */
1426facf4dmjgstatic u_long __exclusive_cache_line numvnodes;
14305b2183dillon
144f6a71a4mdfSYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
145d3c1b43brucec    "Number of vnodes in existence");
1463089ef0bde
1479412199mjgstatic counter_u64_t vnodes_created;
1489412199mjgSYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
1499412199mjg    "Number of vnodes created by getnewvnode");
1504247c4fjhb
1518e9b2cdeivind/*
1528e9b2cdeivind * Conversion tables for conversion from vnode types to inode formats
1538e9b2cdeivind * and back.
1548e9b2cdeivind */
/* Map inode format (S_IF*) type values to vnode types. */
enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
};
/* Map vnode types back to inode format (S_IF*) mode bits. */
int vttoif_tab[10] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
};
1638fb65cergrimes
/*
 * List of allocated vnodes in the system.
 */
1679b80414mjgstatic TAILQ_HEAD(freelst, vnode) vnode_list;
1689b80414mjgstatic struct vnode *vnode_list_free_marker;
1692fbceb1mjgstatic struct vnode *vnode_list_reclaim_marker;
170d9d8bf6dyson
1718e9b2cdeivind/*
172896302bkib * "Free" vnode target.  Free vnodes are rarely completely free, but are
173896302bkib * just ones that are cheap to recycle.  Usually they are for files which
174896302bkib * have been stat'd but not read; these usually have inode and namecache
175896302bkib * data attached to them.  This target is the preferred minimum size of a
176896302bkib * sub-cache consisting mostly of such files. The system balances the size
177896302bkib * of this sub-cache with its complement to try to prevent either from
178896302bkib * thrashing while the other is relatively inactive.  The targets express
179896302bkib * a preference for the best balance.
180896302bkib *
181896302bkib * "Above" this target there are 2 further targets (watermarks) related
 * to recycling of free vnodes.  In the best-operating case, the cache is
183896302bkib * exactly full, the free list has size between vlowat and vhiwat above the
184896302bkib * free target, and recycling from it and normal use maintains this state.
185896302bkib * Sometimes the free list is below vlowat or even empty, but this state
186896302bkib * is even better for immediate use provided the cache is not full.
187896302bkib * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
188896302bkib * ones) to reach one of these states.  The watermarks are currently hard-
189896302bkib * coded as 4% and 9% of the available space higher.  These and the default
190896302bkib * of 25% for wantfreevnodes are too large if the memory size is large.
191896302bkib * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
192896302bkib * whenever vnlru_proc() becomes active.
1938e9b2cdeivind */
1945acc96fmjgstatic long wantfreevnodes;
1955acc96fmjgstatic long __exclusive_cache_line freevnodes;
196896302bkibSYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD,
197896302bkib    &freevnodes, 0, "Number of \"free\" vnodes");
1985acc96fmjgstatic long freevnodes_old;
1995660354kib
2009412199mjgstatic counter_u64_t recycles_count;
2019412199mjgSYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count,
202896302bkib    "Number of vnodes recycled to meet vnode cache targets");
2034247c4fjhb
204037b781mjgstatic counter_u64_t recycles_free_count;
205037b781mjgSYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD, &recycles_free_count,
206037b781mjg    "Number of free vnodes recycled to meet vnode cache targets");
207037b781mjg
2088e9b2cdeivind/*
2098dd2b8deivind * Various variables used for debugging the new implementation of
2108dd2b8deivind * reassignbuf().
2118dd2b8deivind * XXX these are probably of (very) limited utility now.
2128e9b2cdeivind */
2139d4f0d7mckusickstatic int reassignbufcalls;
214580abacvangyzenSYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW | CTLFLAG_STATS,
215580abacvangyzen    &reassignbufcalls, 0, "Number of calls to reassignbuf");
2169d4f0d7mckusick
2171204b9cmjgstatic counter_u64_t deferred_inact;
2181204b9cmjgSYSCTL_COUNTER_U64(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD, &deferred_inact,
2191204b9cmjg    "Number of times inactive processing was deferred");
220becc575kib
2218e9b2cdeivind/* To keep more than one thread at a time from running vfs_getnewfsid */
2228d2ec1ejasonestatic struct mtx mntid_mtx;
2238dd2b8deivind
224d18378ejeff/*
225d18378ejeff * Lock for any access to the following:
2269b80414mjg *	vnode_list
227d18378ejeff *	numvnodes
228d18378ejeff *	freevnodes
229d18378ejeff */
2309b80414mjgstatic struct mtx __exclusive_cache_line vnode_list_mtx;
2318e9b2cdeivind
2328e9b2cdeivind/* Publicly exported FS */
2338e9b2cdeivindstruct nfs_public nfs_pub;
2348dd2b8deivind
2351cfa4a3jeffstatic uma_zone_t buf_trie_zone;
2361cfa4a3jeff
2378e9b2cdeivind/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
238318cbeejeffstatic uma_zone_t vnode_zone;
239318cbeejeffstatic uma_zone_t vnodepoll_zone;
2408dd2b8deivind
24110c5cccjulian/*
24210c5cccjulian * The workitem queue.
2438aef2acdes *
2448e9b2cdeivind * It is useful to delay writes of file data and filesystem metadata
2458e9b2cdeivind * for tens of seconds so that quickly created and deleted files need
2468e9b2cdeivind * not waste disk bandwidth being created and removed. To realize this,
2478e9b2cdeivind * we append vnodes to a "workitem" queue. When running with a soft
2488e9b2cdeivind * updates implementation, most pending metadata dependencies should
2498e9b2cdeivind * not wait for more than a few seconds. Thus, mounted on block devices
2508e9b2cdeivind * are delayed only about a half the time that file data is delayed.
2518e9b2cdeivind * Similarly, directory updates are more critical, so are only delayed
2528e9b2cdeivind * about a third the time that file data is delayed. Thus, there are
2538e9b2cdeivind * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
2548e9b2cdeivind * one each second (driven off the filesystem syncer process). The
2558e9b2cdeivind * syncer_delayno variable indicates the next queue that is to be processed.
2568e9b2cdeivind * Items that need to be processed soon are placed in this queue:
2578e9b2cdeivind *
2588e9b2cdeivind *	syncer_workitem_pending[syncer_delayno]
2598e9b2cdeivind *
2608e9b2cdeivind * A delay of fifteen seconds is done by placing the request fifteen
2618e9b2cdeivind * entries later in the queue:
2628e9b2cdeivind *
2638e9b2cdeivind *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
2648e9b2cdeivind *
26510c5cccjulian */
266d6d1e90peterstatic int syncer_delayno;
2678aef2acdesstatic long syncer_mask;
26856a7ee8phkLIST_HEAD(synclist, bufobj);
269560aa75kibstatic struct synclist *syncer_workitem_pending;
270ee7cd91jeff/*
271ee7cd91jeff * The sync_mtx protects:
27256a7ee8phk *	bo->bo_synclist
2739ed03e6truckman *	sync_vnode_count
274ee7cd91jeff *	syncer_delayno
275471ab74truckman *	syncer_state
276ee7cd91jeff *	syncer_workitem_pending
2779ed03e6truckman *	syncer_worklist_len
278ee7cd91jeff *	rushjob
279ee7cd91jeff */
280ee7cd91jeffstatic struct mtx sync_mtx;
281faa0cddedstatic struct cv sync_wakeup;
2828e9b2cdeivind
28310c5cccjulian#define SYNCER_MAXDELAY		32
284a031711eivindstatic int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
285b37309fdillonstatic int syncdelay = 30;		/* max time to delay syncing data */
286b37309fdillonstatic int filedelay = 30;		/* time to delay syncing files */
28734e35cbbrucecSYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
288d3c1b43brucec    "Time to delay syncing files (in seconds)");
289b37309fdillonstatic int dirdelay = 29;		/* time to delay syncing directories */
29034e35cbbrucecSYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
291d3c1b43brucec    "Time to delay syncing directories (in seconds)");
292b37309fdillonstatic int metadelay = 28;		/* time to delay syncing metadata */
29334e35cbbrucecSYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
294d3c1b43brucec    "Time to delay syncing metadata (in seconds)");
2958e9b2cdeivindstatic int rushjob;		/* number of slots to run ASAP */
29602e5fe8mckusickstatic int stat_rush_requests;	/* number of times I/O speeded up */
29734e35cbbrucecSYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
298d3c1b43brucec    "Number of times I/O speeded up (rush requests)");
29910c5cccjulian
/*
 * Per-CPU batch of vnodes for deferred processing (see vdbatch_dequeue()).
 * Each batch holds up to VDBATCH_SIZE vnodes.
 */
#define	VDBATCH_SIZE 8
struct vdbatch {
	u_int index;		/* number of vnodes currently in tab[] */
	long freevnodes;	/* NOTE(review): presumably a per-cpu
				 * free-vnode delta folded into the global
				 * freevnodes count -- confirm */
	struct mtx lock;	/* protects this batch */
	struct vnode *tab[VDBATCH_SIZE];
};
DPCPU_DEFINE_STATIC(struct vdbatch, vd);
3089bce1ffmjg
3099bce1ffmjgstatic void	vdbatch_dequeue(struct vnode *vp);
3109bce1ffmjg
3118e9b2cdeivind/*
312471ab74truckman * When shutting down the syncer, run it at four times normal speed.
3139ed03e6truckman */
314471ab74truckman#define SYNCER_SHUTDOWN_SPEEDUP		4
3159ed03e6truckmanstatic int sync_vnode_count;
3169ed03e6truckmanstatic int syncer_worklist_len;
317471ab74truckmanstatic enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
318471ab74truckman    syncer_state;
3199ed03e6truckman
320896302bkib/* Target for maximum number of vnodes. */
32185edb79mjgu_long desiredvnodes;
32285edb79mjgstatic u_long gapvnodes;		/* gap between wanted and desired */
32385edb79mjgstatic u_long vhiwat;		/* enough extras after expansion */
32485edb79mjgstatic u_long vlowat;		/* minimal extras before expansion */
32585edb79mjgstatic u_long vstir;		/* nonzero to stir non-free vnodes */
326896302bkibstatic volatile int vsmalltrigger = 8;	/* pref to keep if > this many pages */
3271735746mckusick
3285acc96fmjgstatic u_long vnlru_read_freevnodes(void);
3295acc96fmjg
330f378822mjg/*
331f378822mjg * Note that no attempt is made to sanitize these parameters.
332f378822mjg */
3331735746mckusickstatic int
334f378822mjgsysctl_maxvnodes(SYSCTL_HANDLER_ARGS)
3351735746mckusick{
336f378822mjg	u_long val;
33785edb79mjg	int error;
3381735746mckusick
339f378822mjg	val = desiredvnodes;
340f378822mjg	error = sysctl_handle_long(oidp, &val, 0, req);
341f378822mjg	if (error != 0 || req->newptr == NULL)
3421735746mckusick		return (error);
343f378822mjg
344f378822mjg	if (val == desiredvnodes)
345f378822mjg		return (0);
3469b80414mjg	mtx_lock(&vnode_list_mtx);
347f378822mjg	desiredvnodes = val;
348f378822mjg	wantfreevnodes = desiredvnodes / 4;
349f378822mjg	vnlru_recalc();
3509b80414mjg	mtx_unlock(&vnode_list_mtx);
351f378822mjg	/*
352f378822mjg	 * XXX There is no protection against multiple threads changing
353f378822mjg	 * desiredvnodes at the same time. Locking above only helps vnlru and
354f378822mjg	 * getnewvnode.
355f378822mjg	 */
356f378822mjg	vfs_hash_changesize(desiredvnodes);
357f378822mjg	cache_changesize(desiredvnodes);
3581735746mckusick	return (0);
3591735746mckusick}
3601735746mckusick
3611735746mckusickSYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
362f378822mjg    CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes,
363eb9cc37glebius    "LU", "Target for maximum number of vnodes");
364f378822mjg
365f378822mjgstatic int
366f378822mjgsysctl_wantfreevnodes(SYSCTL_HANDLER_ARGS)
367f378822mjg{
368f378822mjg	u_long val;
369f378822mjg	int error;
370f378822mjg
371f378822mjg	val = wantfreevnodes;
372f378822mjg	error = sysctl_handle_long(oidp, &val, 0, req);
373f378822mjg	if (error != 0 || req->newptr == NULL)
374f378822mjg		return (error);
375f378822mjg
376f378822mjg	if (val == wantfreevnodes)
377f378822mjg		return (0);
3789b80414mjg	mtx_lock(&vnode_list_mtx);
379f378822mjg	wantfreevnodes = val;
380f378822mjg	vnlru_recalc();
3819b80414mjg	mtx_unlock(&vnode_list_mtx);
382f378822mjg	return (0);
383f378822mjg}
384f378822mjg
385f378822mjgSYSCTL_PROC(_vfs, OID_AUTO, wantfreevnodes,
386f378822mjg    CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes,
387eb9cc37glebius    "LU", "Target for minimum number of \"free\" vnodes");
388f378822mjg
389f6a71a4mdfSYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
390896302bkib    &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)");
391d6d1e90peterstatic int vnlru_nowhere;
3922ffbc24kanSYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
3932ffbc24kan    &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
3948fb65cergrimes
/*
 * Sysctl handler: look up the pathname supplied as the new value and
 * attempt to reclaim (vgone) the resulting vnode.
 *
 * Returns EINVAL if no new value was supplied, E2BIG if the path does not
 * fit in PATH_MAX, EAGAIN if the vnode is already doomed, or any error
 * from SYSCTL_IN()/namei().
 */
static int
sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS)
{
	struct vnode *vp;
	struct nameidata nd;
	char *buf;
	unsigned long ndflags;
	int error;

	if (req->newptr == NULL)
		return (EINVAL);
	if (req->newlen >= PATH_MAX)
		return (E2BIG);

	buf = malloc(PATH_MAX, M_TEMP, M_WAITOK);
	error = SYSCTL_IN(req, buf, req->newlen);
	if (error != 0)
		goto out;

	/* newlen < PATH_MAX (checked above), so the terminator fits. */
	buf[req->newlen] = '\0';

	/* LOCKLEAF: namei() returns the vnode locked on success. */
	ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1 | NOCACHE | SAVENAME;
	NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf, curthread);
	if ((error = namei(&nd)) != 0)
		goto out;
	vp = nd.ni_vp;

	if (VN_IS_DOOMED(vp)) {
		/*
		 * This vnode is being recycled.  Return != 0 to let the caller
		 * know that the sysctl had no effect.  Return EAGAIN because a
		 * subsequent call will likely succeed (since namei will create
		 * a new vnode if necessary)
		 */
		error = EAGAIN;
		goto putvnode;
	}

	counter_u64_add(recycles_count, 1);
	vgone(vp);
putvnode:
	/* Releases the reference and lock obtained via namei(). */
	NDFREE(&nd, 0);
out:
	free(buf, M_TEMP);
	return (error);
}
441e0dae39asomers
442e0dae39asomersstatic int
443e0dae39asomerssysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS)
444e0dae39asomers{
445e0dae39asomers	struct thread *td = curthread;
446e0dae39asomers	struct vnode *vp;
447e0dae39asomers	struct file *fp;
448e0dae39asomers	int error;
449e0dae39asomers	int fd;
450e0dae39asomers
451e0dae39asomers	if (req->newptr == NULL)
452e0dae39asomers		return (EBADF);
453e0dae39asomers
454e0dae39asomers        error = sysctl_handle_int(oidp, &fd, 0, req);
455e0dae39asomers        if (error != 0)
456e0dae39asomers                return (error);
457e0dae39asomers	error = getvnode(curthread, fd, &cap_fcntl_rights, &fp);
458e0dae39asomers	if (error != 0)
459e0dae39asomers		return (error);
460e0dae39asomers	vp = fp->f_vnode;
461e0dae39asomers
462e0dae39asomers	error = vn_lock(vp, LK_EXCLUSIVE);
463e0dae39asomers	if (error != 0)
464e0dae39asomers		goto drop;
465e0dae39asomers
466e0dae39asomers	counter_u64_add(recycles_count, 1);
467e0dae39asomers	vgone(vp);
468f121d45mjg	VOP_UNLOCK(vp);
469e0dae39asomersdrop:
470e0dae39asomers	fdrop(fp, td);
471e0dae39asomers	return (error);
472e0dae39asomers}
473e0dae39asomers
474e0dae39asomersSYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode,
475e0dae39asomers    CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
476e0dae39asomers    sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname");
477e0dae39asomersSYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode,
478e0dae39asomers    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
479e0dae39asomers    sysctl_ftry_reclaim_vnode, "I",
480e0dae39asomers    "Try to reclaim a vnode by its file descriptor");
481e0dae39asomers
4825a71b32kib/* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
483cef8617kibstatic int vnsz2log;
484d289cc6jeff
485d289cc6jeff/*
4861cfa4a3jeff * Support for the bufobj clean & dirty pctrie.
4871cfa4a3jeff */
4881cfa4a3jeffstatic void *
4891cfa4a3jeffbuf_trie_alloc(struct pctrie *ptree)
4901cfa4a3jeff{
4911cfa4a3jeff
4921cfa4a3jeff	return uma_zalloc(buf_trie_zone, M_NOWAIT);
4931cfa4a3jeff}
4941cfa4a3jeff
4951cfa4a3jeffstatic void
4961cfa4a3jeffbuf_trie_free(struct pctrie *ptree, void *node)
4971cfa4a3jeff{
4981cfa4a3jeff
4991cfa4a3jeff	uma_zfree(buf_trie_zone, node);
5001cfa4a3jeff}
5011cfa4a3jeffPCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free);
5021cfa4a3jeff
5031cfa4a3jeff/*
5048fb65cergrimes * Initialize the vnode management data structures.
505329f9f0alc *
506329f9f0alc * Reevaluate the following cap on the number of vnodes after the physical
507329f9f0alc * memory size exceeds 512GB.  In the limit, as the physical memory size
508421a929eadler * grows, the ratio of the memory size in KB to vnodes approaches 64:1.
5098fb65cergrimes */
5108f41b4eobrien#ifndef	MAXVNODES_MAX
51185edb79mjg#define	MAXVNODES_MAX	(512UL * 1024 * 1024 / 64)	/* 8M */
5128f41b4eobrien#endif
5136f0b4b3mckusick
514bb33b4emjgstatic MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
515bb33b4emjg
516bb33b4emjgstatic struct vnode *
517bb33b4emjgvn_alloc_marker(struct mount *mp)
518bb33b4emjg{
519bb33b4emjg	struct vnode *vp;
520bb33b4emjg
521bb33b4emjg	vp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
522bb33b4emjg	vp->v_type = VMARKER;
523bb33b4emjg	vp->v_mount = mp;
524bb33b4emjg
525bb33b4emjg	return (vp);
526bb33b4emjg}
527bb33b4emjg
528bb33b4emjgstatic void
529bb33b4emjgvn_free_marker(struct vnode *vp)
530bb33b4emjg{
531bb33b4emjg
532bb33b4emjg	MPASS(vp->v_type == VMARKER);
533bb33b4emjg	free(vp, M_VNODE_MARKER);
534bb33b4emjg}
535bb33b4emjg
/*
 * Initialize a vnode as it first enters the zone.
 *
 * UMA init hook; counterpart of vnode_fini().  Sets up the vnode's
 * locks, bufobj, namecache lists and rangelocks, then links the vnode
 * into the global vnode_list (under vnode_list_mtx), where it stays
 * until vnode_fini() removes it.  Returns 0 (cannot fail).
 */
static int
vnode_init(void *mem, int size, int flags)
{
	struct vnode *vp;

	vp = mem;
	bzero(vp, size);
	/*
	 * Setup locks.
	 */
	vp->v_vnlock = &vp->v_lock;
	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
	/*
	 * By default, don't allow shared locks unless filesystems opt-in.
	 */
	lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT,
	    LK_NOSHARE | LK_IS_VNODE);
	/*
	 * Initialize bufobj.
	 */
	bufobj_init(&vp->v_bufobj, vp);
	/*
	 * Initialize namecache.
	 */
	LIST_INIT(&vp->v_cache_src);
	TAILQ_INIT(&vp->v_cache_dst);
	/*
	 * Initialize rangelocks.
	 */
	rangelock_init(&vp->v_rl);

	/* Not yet queued in any per-CPU vdbatch. */
	vp->v_dbatchcpu = NOCPU;

	/* Insert before the free marker on the global vnode list. */
	mtx_lock(&vnode_list_mtx);
	TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist);
	mtx_unlock(&vnode_list_mtx);
	return (0);
}
5776f0b4b3mckusick
/*
 * Free a vnode when it is cleared from the zone.
 *
 * UMA fini hook; undoes vnode_init() in reverse: drains any pending
 * per-CPU vdbatch entry, unlinks the vnode from the global vnode_list
 * (under vnode_list_mtx), then destroys the rangelocks and locks.
 */
static void
vnode_fini(void *mem, int size)
{
	struct vnode *vp;
	struct bufobj *bo;

	vp = mem;
	/* Must happen before the list removal below. */
	vdbatch_dequeue(vp);
	mtx_lock(&vnode_list_mtx);
	TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
	mtx_unlock(&vnode_list_mtx);
	rangelock_destroy(&vp->v_rl);
	lockdestroy(vp->v_vnlock);
	mtx_destroy(&vp->v_interlock);
	bo = &vp->v_bufobj;
	rw_destroy(BO_LOCKPTR(bo));
}
5986f0b4b3mckusick
599392fea7kib/*
600392fea7kib * Provide the size of NFS nclnode and NFS fh for calculation of the
601392fea7kib * vnode memory consumption.  The size is specified directly to
602392fea7kib * eliminate dependency on NFS-private header.
603392fea7kib *
604392fea7kib * Other filesystems may use bigger or smaller (like UFS and ZFS)
605392fea7kib * private inode data, but the NFS-based estimation is ample enough.
606392fea7kib * Still, we care about differences in the size between 64- and 32-bit
607392fea7kib * platforms.
608392fea7kib *
609392fea7kib * Namecache structure size is heuristically
610392fea7kib * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1.
611392fea7kib */
612392fea7kib#ifdef _LP64
613392fea7kib#define	NFS_NCLNODE_SZ	(528 + 64)
614392fea7kib#define	NC_SZ		148
615392fea7kib#else
616392fea7kib#define	NFS_NCLNODE_SZ	(360 + 32)
617392fea7kib#define	NC_SZ		92
618392fea7kib#endif
619392fea7kib
620eb5dd3dpeterstatic void
621