xref: /illumos-gate/usr/src/uts/common/fs/vnode.c (revision 41a4986b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2020 Joyent, Inc.
25  * Copyright 2022 Spencer Evans-Cole.
26  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
27  * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
28  */
29 
30 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
31 /*	  All Rights Reserved	*/
32 
33 /*
34  * University Copyright- Copyright (c) 1982, 1986, 1988
35  * The Regents of the University of California
36  * All Rights Reserved
37  *
38  * University Acknowledgment- Portions of this document are derived from
39  * software developed by the University of California, Berkeley, and its
40  * contributors.
41  */
42 
43 #include <sys/types.h>
44 #include <sys/param.h>
45 #include <sys/t_lock.h>
46 #include <sys/errno.h>
47 #include <sys/cred.h>
48 #include <sys/user.h>
49 #include <sys/uio.h>
50 #include <sys/file.h>
51 #include <sys/pathname.h>
52 #include <sys/vfs.h>
53 #include <sys/vfs_opreg.h>
54 #include <sys/vnode.h>
55 #include <sys/filio.h>
56 #include <sys/rwstlock.h>
57 #include <sys/fem.h>
58 #include <sys/stat.h>
59 #include <sys/mode.h>
60 #include <sys/conf.h>
61 #include <sys/sysmacros.h>
62 #include <sys/cmn_err.h>
63 #include <sys/systm.h>
64 #include <sys/kmem.h>
65 #include <sys/debug.h>
66 #include <c2/audit.h>
67 #include <sys/acl.h>
68 #include <sys/nbmlock.h>
69 #include <sys/fcntl.h>
70 #include <fs/fs_subr.h>
71 #include <sys/taskq.h>
72 #include <fs/fs_reparse.h>
73 #include <sys/time.h>
74 #include <sys/sdt.h>
75 
/*
 * Determine if this vnode is a file that is read-only.
 *
 * Evaluates to non-zero when vp is not a character device (VCHR), block
 * device (VBLK) or FIFO (VFIFO) and vn_is_readonly(vp) is true.  The
 * argument is evaluated more than once, so it must be free of side effects.
 */
#define	ISROFILE(vp)	\
	((vp)->v_type != VCHR && (vp)->v_type != VBLK && \
	    (vp)->v_type != VFIFO && vn_is_readonly(vp))
80 
/* Tunable via /etc/system; used only by admin/install */
int nfs_global_client_only;

/*
 * Array of vopstats_t for per-FS-type vopstats.  This array has the same
 * number of entries as and parallel to the vfssw table.  (Arguably, it could
 * be part of the vfssw table.)  Once it's initialized, it's accessed using
 * the same fstype index that is used to index into the vfssw table.
 *
 * The array itself (nfstype zeroed pointers) is allocated by
 * vopstats_startup(); it stays NULL if vopstats_enabled was 0 at that time.
 */
vopstats_t **vopstats_fstype;

/*
 * vopstats initialization template used for fast initialization via bcopy().
 * Built once by vopstats_startup(); remains NULL if vopstats were disabled
 * when vopstats_startup() ran.
 */
static vopstats_t *vs_templatep;

/* Kmem cache handle for vsk_anchor_t allocations; see vopstats_startup() */
kmem_cache_t *vsk_anchor_cache;

/* file events cleanup routine */
extern void free_fopdata(vnode_t *);

/*
 * Root of AVL tree for the kstats associated with vopstats.  Lock protects
 * updates to vskstat_tree.  The tree is ordered by vsk_fsid (vska_compar()).
 */
avl_tree_t	vskstat_tree;
kmutex_t	vskstat_tree_lock;

/* Global variable which enables/disables the vopstats collection */
int vopstats_enabled = 1;

/* Global used for empty/invalid v_path */
char *vn_vpath_empty = "";
113 
114 /*
115  * forward declarations for internal vnode specific data (vsd)
116  */
117 static void *vsd_realloc(void *, size_t, size_t);
118 
119 /*
120  * forward declarations for reparse point functions
121  */
122 static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr);
123 
124 /*
125  * VSD -- VNODE SPECIFIC DATA
126  * The v_data pointer is typically used by a file system to store a
127  * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
128  * However, there are times when additional project private data needs
129  * to be stored separately from the data (node) pointed to by v_data.
130  * This additional data could be stored by the file system itself or
131  * by a completely different kernel entity.  VSD provides a way for
132  * callers to obtain a key and store a pointer to private data associated
133  * with a vnode.
134  *
135  * Callers are responsible for protecting the vsd by holding v_vsd_lock
136  * for calls to vsd_set() and vsd_get().
137  */
138 
/*
 * Global VSD bookkeeping.  The code that manipulates these variables lies
 * outside this portion of the file.
 *
 * vsd_lock protects:
 *   vsd_nkeys - creation and deletion of vsd keys
 *   vsd_list - insertion and deletion of vsd_node in the vsd_list
 *   vsd_destructor - adding and removing destructors to the list
 */
static kmutex_t		vsd_lock;
static uint_t		vsd_nkeys;	 /* size of destructor array */
/* list of vsd_node's */
static list_t *vsd_list = NULL;
/* per-key destructor funcs; each takes the stored value as a void * */
static void		(**vsd_destructor)(void *);
151 
/*
 * The following is the common set of actions needed to update the
 * vopstats structure from a vnode op.  Both VOPSTATS_UPDATE() and
 * VOPSTATS_UPDATE_IO() do almost the same thing, except for the
 * recording of the bytes transferred.  Since the code is similar
 * but small, it is nearly a duplicate.  Consequently any changes
 * to one may need to be reflected in the other.
 *
 * Macro arguments:
 * vp - Pointer to the vnode
 * counter - Partial name of the vopstats member to bump; the macro pastes
 *	an "n" prefix onto it (e.g. "open" selects nopen) and also uses it
 *	to name the __dtrace_probe___fsinfo_<counter>() function it calls
 * bytecounter - (VOPSTATS_UPDATE_IO only) vopstats member accumulating bytes
 * bytesval - (VOPSTATS_UPDATE_IO only) Value to add to that member
 *
 * Locals declared inside the macro's own braces:
 * vfsp - The vnode's vfs (vp->v_vfsp)
 * vsp - Pointer to the vopstats structure being updated: first the per-vfs
 *	vfs_vopstats, then the per-fstype vfs_fstypevsp when it is non-NULL
 * stataddr - Address of the per-vfs counter; also passed to the probe
 *
 * Nothing is updated unless vfsp and vfsp->vfs_implp are non-NULL,
 * VFS_STATS is set in vfs_flag and the vnode's type is not VBAD.  The probe
 * function is called before the per-vfs counter is incremented.
 *
 * vp is evaluated more than once, so it must not have side effects.
 */

#define	VOPSTATS_UPDATE(vp, counter) {					\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp && vfsp->vfs_implp &&					\
	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
		    size_t, uint64_t *);				\
		__dtrace_probe___fsinfo_##counter(vp, 0, stataddr);	\
		(*stataddr)++;						\
		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
			vsp->n##counter.value.ui64++;			\
		}							\
	}								\
}
183 
/*
 * I/O flavor of the vopstats update: bumps the n<counter> operation count
 * and adds bytesval to <bytecounter>, first in the per-vfs vfs_vopstats and
 * then, if vfs_fstypevsp is non-NULL, in the per-fstype vopstats.  The
 * __dtrace_probe___fsinfo_<counter>() function is called with
 * (vp, bytesval, address of the per-vfs counter) before anything is
 * incremented.
 *
 * The same gating applies as for the non-I/O macro: vfsp and vfs_implp must
 * be non-NULL, VFS_STATS must be set, and the vnode must not be VBAD.
 *
 * vp and bytesval are each evaluated more than once and bytesval is used
 * unparenthesized, so pass simple, side-effect-free expressions.
 */
#define	VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) {	\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp && vfsp->vfs_implp &&					\
	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
		    size_t, uint64_t *);				\
		__dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
		(*stataddr)++;						\
		vsp->bytecounter.value.ui64 += bytesval;		\
		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
			vsp->n##counter.value.ui64++;			\
			vsp->bytecounter.value.ui64 += bytesval;	\
		}							\
	}								\
}
201 
/*
 * If the file system does not support XIDs (VFS_XID is clear in vfs_flag),
 * replace cr with the credential returned by crgetmapped(cr).
 * If the vfsp is NULL, perhaps we should also map?  (Currently cr is left
 * untouched in that case.)
 *
 * cr is assigned to, so it must be a modifiable lvalue in the caller.
 * The macro expands to a brace-enclosed block that declares a local named
 * vfsp.
 */
#define	VOPXID_MAP_CR(vp, cr)	{					\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0)		\
		cr = crgetmapped(cr);					\
	}
211 
/*
 * Convert stat(2) formats to vnode types and vice versa.  (Knows about
 * numerical order of S_IFMT and vnode types.)
 *
 * iftovt_tab has 16 entries mapping a file-format code to an enum vtype;
 * codes with no corresponding vnode type map to VNON.
 * NOTE(review): presumably indexed by the file-type bits of a mode
 * ((mode & S_IFMT) >> 12) through a macro defined outside this file --
 * confirm against the header that declares it.
 *
 * vttoif_tab has 12 entries mapping back to S_IF* values, with 0 where no
 * stat(2) format applies.
 * NOTE(review): presumably indexed by enum vtype value, so the entry order
 * must track that enum's declaration order -- confirm.
 */
enum vtype iftovt_tab[] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
};

ushort_t vttoif_tab[] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
	S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
};
225 
/*
 * The system vnode cache.
 *
 * Kmem cache handle; it is created and used outside this portion of the
 * file.
 */

kmem_cache_t *vn_cache;
231 
232 
/*
 * Vnode operations vector.
 *
 * Each entry of vn_ops_table lists, in order: the operation's name
 * (VOPNAME_*), the byte offset of the matching member within
 * struct vnodeops, and two fallback routines.  The table is terminated by
 * an entry whose name is NULL.
 *
 * NOTE(review): the exact meaning of the two routines is defined by
 * fs_operation_trans_def_t, which is not visible here.  Presumably the
 * first is the default installed when a file system does not supply the
 * operation and the second is the one used when building an "error" ops
 * vector -- confirm against the code that consumes this table.  Entries
 * marked "no errors allowed" use the same routine for both.
 *
 * Routines whose signature differs from the generic one are cast to
 * fs_generic_func_p (some via uintptr_t).
 */

static const fs_operation_trans_def_t vn_ops_table[] = {
	VOPNAME_OPEN, offsetof(struct vnodeops, vop_open),
	    fs_nosys, fs_nosys,

	VOPNAME_CLOSE, offsetof(struct vnodeops, vop_close),
	    fs_nosys, fs_nosys,

	VOPNAME_READ, offsetof(struct vnodeops, vop_read),
	    fs_nosys, fs_nosys,

	VOPNAME_WRITE, offsetof(struct vnodeops, vop_write),
	    fs_nosys, fs_nosys,

	VOPNAME_IOCTL, offsetof(struct vnodeops, vop_ioctl),
	    fs_nosys, fs_nosys,

	VOPNAME_SETFL, offsetof(struct vnodeops, vop_setfl),
	    fs_setfl, fs_nosys,

	VOPNAME_GETATTR, offsetof(struct vnodeops, vop_getattr),
	    fs_nosys, fs_nosys,

	VOPNAME_SETATTR, offsetof(struct vnodeops, vop_setattr),
	    fs_nosys, fs_nosys,

	VOPNAME_ACCESS, offsetof(struct vnodeops, vop_access),
	    fs_nosys, fs_nosys,

	VOPNAME_LOOKUP, offsetof(struct vnodeops, vop_lookup),
	    fs_nosys, fs_nosys,

	VOPNAME_CREATE, offsetof(struct vnodeops, vop_create),
	    fs_nosys, fs_nosys,

	VOPNAME_REMOVE, offsetof(struct vnodeops, vop_remove),
	    fs_nosys, fs_nosys,

	VOPNAME_LINK, offsetof(struct vnodeops, vop_link),
	    fs_nosys, fs_nosys,

	VOPNAME_RENAME, offsetof(struct vnodeops, vop_rename),
	    fs_nosys, fs_nosys,

	VOPNAME_MKDIR, offsetof(struct vnodeops, vop_mkdir),
	    fs_nosys, fs_nosys,

	VOPNAME_RMDIR, offsetof(struct vnodeops, vop_rmdir),
	    fs_nosys, fs_nosys,

	VOPNAME_READDIR, offsetof(struct vnodeops, vop_readdir),
	    fs_nosys, fs_nosys,

	VOPNAME_SYMLINK, offsetof(struct vnodeops, vop_symlink),
	    fs_nosys, fs_nosys,

	VOPNAME_READLINK, offsetof(struct vnodeops, vop_readlink),
	    fs_nosys, fs_nosys,

	VOPNAME_FSYNC, offsetof(struct vnodeops, vop_fsync),
	    fs_nosys, fs_nosys,

	VOPNAME_INACTIVE, offsetof(struct vnodeops, vop_inactive),
	    fs_nosys, fs_nosys,

	VOPNAME_FID, offsetof(struct vnodeops, vop_fid),
	    fs_nosys, fs_nosys,

	VOPNAME_RWLOCK, offsetof(struct vnodeops, vop_rwlock),
	    fs_rwlock, fs_rwlock,

	VOPNAME_RWUNLOCK, offsetof(struct vnodeops, vop_rwunlock),
	    (fs_generic_func_p)(uintptr_t)fs_rwunlock,
	    (fs_generic_func_p)(uintptr_t)fs_rwunlock,	/* no errors allowed */

	VOPNAME_SEEK, offsetof(struct vnodeops, vop_seek),
	    fs_nosys, fs_nosys,

	VOPNAME_CMP, offsetof(struct vnodeops, vop_cmp),
	    fs_cmp, fs_cmp,		/* no errors allowed */

	VOPNAME_FRLOCK, offsetof(struct vnodeops, vop_frlock),
	    fs_frlock, fs_nosys,

	VOPNAME_SPACE, offsetof(struct vnodeops, vop_space),
	    fs_nosys, fs_nosys,

	VOPNAME_REALVP, offsetof(struct vnodeops, vop_realvp),
	    fs_nosys, fs_nosys,

	VOPNAME_GETPAGE, offsetof(struct vnodeops, vop_getpage),
	    fs_nosys, fs_nosys,

	VOPNAME_PUTPAGE, offsetof(struct vnodeops, vop_putpage),
	    fs_nosys, fs_nosys,

	VOPNAME_MAP, offsetof(struct vnodeops, vop_map),
	    (fs_generic_func_p) fs_nosys_map,
	    (fs_generic_func_p) fs_nosys_map,

	VOPNAME_ADDMAP, offsetof(struct vnodeops, vop_addmap),
	    (fs_generic_func_p) fs_nosys_addmap,
	    (fs_generic_func_p) fs_nosys_addmap,

	VOPNAME_DELMAP, offsetof(struct vnodeops, vop_delmap),
	    fs_nosys, fs_nosys,

	VOPNAME_POLL, offsetof(struct vnodeops, vop_poll),
	    (fs_generic_func_p) fs_poll, (fs_generic_func_p) fs_nosys_poll,

	VOPNAME_DUMP, offsetof(struct vnodeops, vop_dump),
	    fs_nosys, fs_nosys,

	VOPNAME_PATHCONF, offsetof(struct vnodeops, vop_pathconf),
	    fs_pathconf, fs_nosys,

	VOPNAME_PAGEIO, offsetof(struct vnodeops, vop_pageio),
	    fs_nosys, fs_nosys,

	VOPNAME_DUMPCTL, offsetof(struct vnodeops, vop_dumpctl),
	    fs_nosys, fs_nosys,

	VOPNAME_DISPOSE, offsetof(struct vnodeops, vop_dispose),
	    (fs_generic_func_p)(uintptr_t)fs_dispose,
	    (fs_generic_func_p)(uintptr_t)fs_nodispose,

	VOPNAME_SETSECATTR, offsetof(struct vnodeops, vop_setsecattr),
	    fs_nosys, fs_nosys,

	VOPNAME_GETSECATTR, offsetof(struct vnodeops, vop_getsecattr),
	    fs_fab_acl, fs_nosys,

	VOPNAME_SHRLOCK, offsetof(struct vnodeops, vop_shrlock),
	    fs_shrlock, fs_nosys,

	VOPNAME_VNEVENT, offsetof(struct vnodeops, vop_vnevent),
	    (fs_generic_func_p) fs_vnevent_nosupport,
	    (fs_generic_func_p) fs_vnevent_nosupport,

	VOPNAME_REQZCBUF, offsetof(struct vnodeops, vop_reqzcbuf),
	    fs_nosys, fs_nosys,

	VOPNAME_RETZCBUF, offsetof(struct vnodeops, vop_retzcbuf),
	    fs_nosys, fs_nosys,

	/* Terminator: NULL name ends the table. */
	NULL, 0, NULL, NULL
};
383 
384 /* Extensible attribute (xva) routines. */
385 
386 /*
387  * Zero out the structure, set the size of the requested/returned bitmaps,
388  * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
389  * to the returned attributes array.
390  */
391 void
xva_init(xvattr_t * xvap)392 xva_init(xvattr_t *xvap)
393 {
394 	bzero(xvap, sizeof (xvattr_t));
395 	xvap->xva_mapsize = XVA_MAPSIZE;
396 	xvap->xva_magic = XVA_MAGIC;
397 	xvap->xva_vattr.va_mask = AT_XVATTR;
398 	xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
399 }
400 
401 /*
402  * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t
403  * structure.  Otherwise, returns NULL.
404  */
405 xoptattr_t *
xva_getxoptattr(xvattr_t * xvap)406 xva_getxoptattr(xvattr_t *xvap)
407 {
408 	xoptattr_t *xoap = NULL;
409 	if (xvap->xva_vattr.va_mask & AT_XVATTR)
410 		xoap = &xvap->xva_xoptattrs;
411 	return (xoap);
412 }
413 
414 /*
415  * Used by the AVL routines to compare two vsk_anchor_t structures in the tree.
416  * We use the f_fsid reported by VFS_STATVFS() since we use that for the
417  * kstat name.
418  */
419 static int
vska_compar(const void * n1,const void * n2)420 vska_compar(const void *n1, const void *n2)
421 {
422 	int ret;
423 	ulong_t p1 = ((vsk_anchor_t *)n1)->vsk_fsid;
424 	ulong_t p2 = ((vsk_anchor_t *)n2)->vsk_fsid;
425 
426 	if (p1 < p2) {
427 		ret = -1;
428 	} else if (p1 > p2) {
429 		ret = 1;
430 	} else {
431 		ret = 0;
432 	}
433 
434 	return (ret);
435 }
436 
437 /*
438  * Used to create a single template which will be bcopy()ed to a newly
439  * allocated vsanchor_combo_t structure in new_vsanchor(), below.
440  */
441 static vopstats_t *
create_vopstats_template()442 create_vopstats_template()
443 {
444 	vopstats_t		*vsp;
445 
446 	vsp = kmem_alloc(sizeof (vopstats_t), KM_SLEEP);
447 	bzero(vsp, sizeof (*vsp));	/* Start fresh */
448 
449 	/* VOP_OPEN */
450 	kstat_named_init(&vsp->nopen, "nopen", KSTAT_DATA_UINT64);
451 	/* VOP_CLOSE */
452 	kstat_named_init(&vsp->nclose, "nclose", KSTAT_DATA_UINT64);
453 	/* VOP_READ I/O */
454 	kstat_named_init(&vsp->nread, "nread", KSTAT_DATA_UINT64);
455 	kstat_named_init(&vsp->read_bytes, "read_bytes", KSTAT_DATA_UINT64);
456 	/* VOP_WRITE I/O */
457 	kstat_named_init(&vsp->nwrite, "nwrite", KSTAT_DATA_UINT64);
458 	kstat_named_init(&vsp->write_bytes, "write_bytes", KSTAT_DATA_UINT64);
459 	/* VOP_IOCTL */
460 	kstat_named_init(&vsp->nioctl, "nioctl", KSTAT_DATA_UINT64);
461 	/* VOP_SETFL */
462 	kstat_named_init(&vsp->nsetfl, "nsetfl", KSTAT_DATA_UINT64);
463 	/* VOP_GETATTR */
464 	kstat_named_init(&vsp->ngetattr, "ngetattr", KSTAT_DATA_UINT64);
465 	/* VOP_SETATTR */
466 	kstat_named_init(&vsp->nsetattr, "nsetattr", KSTAT_DATA_UINT64);
467 	/* VOP_ACCESS */
468 	kstat_named_init(&vsp->naccess, "naccess", KSTAT_DATA_UINT64);
469 	/* VOP_LOOKUP */
470 	kstat_named_init(&vsp->nlookup, "nlookup", KSTAT_DATA_UINT64);
471 	/* VOP_CREATE */
472 	kstat_named_init(&vsp->ncreate, "ncreate", KSTAT_DATA_UINT64);
473 	/* VOP_REMOVE */
474 	kstat_named_init(&vsp->nremove, "nremove", KSTAT_DATA_UINT64);
475 	/* VOP_LINK */
476 	kstat_named_init(&vsp->nlink, "nlink", KSTAT_DATA_UINT64);
477 	/* VOP_RENAME */
478 	kstat_named_init(&vsp->nrename, "nrename", KSTAT_DATA_UINT64);
479 	/* VOP_MKDIR */
480 	kstat_named_init(&vsp->nmkdir, "nmkdir", KSTAT_DATA_UINT64);
481 	/* VOP_RMDIR */
482 	kstat_named_init(&vsp->nrmdir, "nrmdir", KSTAT_DATA_UINT64);
483 	/* VOP_READDIR I/O */
484 	kstat_named_init(&vsp->nreaddir, "nreaddir", KSTAT_DATA_UINT64);
485 	kstat_named_init(&vsp->readdir_bytes, "readdir_bytes",
486 	    KSTAT_DATA_UINT64);
487 	/* VOP_SYMLINK */
488 	kstat_named_init(&vsp->nsymlink, "nsymlink", KSTAT_DATA_UINT64);
489 	/* VOP_READLINK */
490 	kstat_named_init(&vsp->nreadlink, "nreadlink", KSTAT_DATA_UINT64);
491 	/* VOP_FSYNC */
492 	kstat_named_init(&vsp->nfsync, "nfsync", KSTAT_DATA_UINT64);
493 	/* VOP_INACTIVE */
494 	kstat_named_init(&vsp->ninactive, "ninactive", KSTAT_DATA_UINT64);
495 	/* VOP_FID */
496 	kstat_named_init(&vsp->nfid, "nfid", KSTAT_DATA_UINT64);
497 	/* VOP_RWLOCK */
498 	kstat_named_init(&vsp->nrwlock, "nrwlock", KSTAT_DATA_UINT64);
499 	/* VOP_RWUNLOCK */
500 	kstat_named_init(&vsp->nrwunlock, "nrwunlock", KSTAT_DATA_UINT64);
501 	/* VOP_SEEK */
502 	kstat_named_init(&vsp->nseek, "nseek", KSTAT_DATA_UINT64);
503 	/* VOP_CMP */
504 	kstat_named_init(&vsp->ncmp, "ncmp", KSTAT_DATA_UINT64);
505 	/* VOP_FRLOCK */
506 	kstat_named_init(&vsp->nfrlock, "nfrlock", KSTAT_DATA_UINT64);
507 	/* VOP_SPACE */
508 	kstat_named_init(&vsp->nspace, "nspace", KSTAT_DATA_UINT64);
509 	/* VOP_REALVP */
510 	kstat_named_init(&vsp->nrealvp, "nrealvp", KSTAT_DATA_UINT64);
511 	/* VOP_GETPAGE */
512 	kstat_named_init(&vsp->ngetpage, "ngetpage", KSTAT_DATA_UINT64);
513 	/* VOP_PUTPAGE */
514 	kstat_named_init(&vsp->nputpage, "nputpage", KSTAT_DATA_UINT64);
515 	/* VOP_MAP */
516 	kstat_named_init(&vsp->nmap, "nmap", KSTAT_DATA_UINT64);
517 	/* VOP_ADDMAP */
518 	kstat_named_init(&vsp->naddmap, "naddmap", KSTAT_DATA_UINT64);
519 	/* VOP_DELMAP */
520 	kstat_named_init(&vsp->ndelmap, "ndelmap", KSTAT_DATA_UINT64);
521 	/* VOP_POLL */
522 	kstat_named_init(&vsp->npoll, "npoll", KSTAT_DATA_UINT64);
523 	/* VOP_DUMP */
524 	kstat_named_init(&vsp->ndump, "ndump", KSTAT_DATA_UINT64);
525 	/* VOP_PATHCONF */
526 	kstat_named_init(&vsp->npathconf, "npathconf", KSTAT_DATA_UINT64);
527 	/* VOP_PAGEIO */
528 	kstat_named_init(&vsp->npageio, "npageio", KSTAT_DATA_UINT64);
529 	/* VOP_DUMPCTL */
530 	kstat_named_init(&vsp->ndumpctl, "ndumpctl", KSTAT_DATA_UINT64);
531 	/* VOP_DISPOSE */
532 	kstat_named_init(&vsp->ndispose, "ndispose", KSTAT_DATA_UINT64);
533 	/* VOP_SETSECATTR */
534 	kstat_named_init(&vsp->nsetsecattr, "nsetsecattr", KSTAT_DATA_UINT64);
535 	/* VOP_GETSECATTR */
536 	kstat_named_init(&vsp->ngetsecattr, "ngetsecattr", KSTAT_DATA_UINT64);
537 	/* VOP_SHRLOCK */
538 	kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64);
539 	/* VOP_VNEVENT */
540 	kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64);
541 	/* VOP_REQZCBUF */
542 	kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64);
543 	/* VOP_RETZCBUF */
544 	kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64);
545 
546 	return (vsp);
547 }
548 
549 /*
550  * Creates a kstat structure associated with a vopstats structure.
551  */
552 kstat_t *
new_vskstat(char * ksname,vopstats_t * vsp)553 new_vskstat(char *ksname, vopstats_t *vsp)
554 {
555 	kstat_t		*ksp;
556 
557 	if (!vopstats_enabled) {
558 		return (NULL);
559 	}
560 
561 	ksp = kstat_create("unix", 0, ksname, "misc", KSTAT_TYPE_NAMED,
562 	    sizeof (vopstats_t)/sizeof (kstat_named_t),
563 	    KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);
564 	if (ksp) {
565 		ksp->ks_data = vsp;
566 		kstat_install(ksp);
567 	}
568 
569 	return (ksp);
570 }
571 
572 /*
573  * Called from vfsinit() to initialize the support mechanisms for vopstats
574  */
575 void
vopstats_startup()576 vopstats_startup()
577 {
578 	if (!vopstats_enabled)
579 		return;
580 
581 	/*
582 	 * Creates the AVL tree which holds per-vfs vopstat anchors.  This
583 	 * is necessary since we need to check if a kstat exists before we
584 	 * attempt to create it.  Also, initialize its lock.
585 	 */
586 	avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t),
587 	    offsetof(vsk_anchor_t, vsk_node));
588 	mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL);
589 
590 	vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache",
591 	    sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL,
592 	    NULL, NULL, 0);
593 
594 	/*
595 	 * Set up the array of pointers for the vopstats-by-FS-type.
596 	 * The entries will be allocated/initialized as each file system
597 	 * goes through modload/mod_installfs.
598 	 */
599 	vopstats_fstype = (vopstats_t **)kmem_zalloc(
600 	    (sizeof (vopstats_t *) * nfstype), KM_SLEEP);
601 
602 	/* Set up the global vopstats initialization template */
603 	vs_templatep = create_vopstats_template();
604 }
605 
606 /*
607  * We need to have the all of the counters zeroed.
608  * The initialization of the vopstats_t includes on the order of
609  * 50 calls to kstat_named_init().  Rather that do that on every call,
610  * we do it once in a template (vs_templatep) then bcopy it over.
611  */
612 void
initialize_vopstats(vopstats_t * vsp)613 initialize_vopstats(vopstats_t *vsp)
614 {
615 	if (vsp == NULL)
616 		return;
617 
618 	bcopy(vs_templatep, vsp, sizeof (vopstats_t));
619 }
620 
621 /*
622  * If possible, determine which vopstats by fstype to use and
623  * return a pointer to the caller.
624  */
625 vopstats_t *
get_fstype_vopstats(vfs_t * vfsp,struct vfssw * vswp)626 get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp)
627 {
628 	int		fstype = 0;	/* Index into vfssw[] */
629 	vopstats_t	*vsp = NULL;
630 
631 	if (vfsp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 ||
632 	    !vopstats_enabled)
633 		return (NULL);
634 	/*
635 	 * Set up the fstype.  We go to so much trouble because all versions
636 	 * of NFS use the same fstype in their vfs even though they have
637 	 * distinct entries in the vfssw[] table.
638 	 * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry.
639 	 */
640 	if (vswp) {
641 		fstype = vswp - vfssw;	/* Gets us the index */
642 	} else {
643 		fstype = vfsp->vfs_fstype;
644 	}
645 
646 	/*
647 	 * Point to the per-fstype vopstats. The only valid values are
648 	 * non-zero positive values less than the number of vfssw[] table
649 	 * entries.
650 	 */
651 	if (fstype > 0 && fstype < nfstype) {
652 		vsp = vopstats_fstype[fstype];
653 	}
654 
655 	return (vsp);
656 }
657 
658 /*
659  * Generate a kstat name, create the kstat structure, and allocate a
660  * vsk_anchor_t to hold it together.  Return the pointer to the vsk_anchor_t
661  * to the caller.  This must only be called from a mount.
662  */
663 vsk_anchor_t *
get_vskstat_anchor(vfs_t * vfsp)664 get_vskstat_anchor(vfs_t *vfsp)
665 {
666 	char		kstatstr[KSTAT_STRLEN]; /* kstat name for vopstats */
667 	statvfs64_t	statvfsbuf;		/* Needed to find f_fsid */
668 	vsk_anchor_t	*vskp = NULL;		/* vfs <--> kstat anchor */
669 	kstat_t		*ksp;			/* Ptr to new kstat */
670 	avl_index_t	where;			/* Location in the AVL tree */
671 
672 	if (vfsp == NULL || vfsp->vfs_implp == NULL ||
673 	    (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
674 		return (NULL);
675 
676 	/* Need to get the fsid to build a kstat name */
677 	if (VFS_STATVFS(vfsp, &statvfsbuf) == 0) {
678 		/* Create a name for our kstats based on fsid */
679 		(void) snprintf(kstatstr, KSTAT_STRLEN, "%s%lx",
680 		    VOPSTATS_STR, statvfsbuf.f_fsid);
681 
682 		/* Allocate and initialize the vsk_anchor_t */
683 		vskp = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP);
684 		bzero(vskp, sizeof (*vskp));
685 		vskp->vsk_fsid = statvfsbuf.f_fsid;
686 
687 		mutex_enter(&vskstat_tree_lock);
688 		if (avl_find(&vskstat_tree, vskp, &where) == NULL) {
689 			avl_insert(&vskstat_tree, vskp, where);
690 			mutex_exit(&vskstat_tree_lock);
691 
692 			/*
693 			 * Now that we've got the anchor in the AVL
694 			 * tree, we can create the kstat.
695 			 */
696 			ksp = new_vskstat(kstatstr, &vfsp->vfs_vopstats);
697 			if (ksp) {
698 				vskp->vsk_ksp = ksp;
699 			}
700 		} else {
701 			/* Oops, found one! Release memory and lock. */
702 			mutex_exit(&vskstat_tree_lock);
703 			kmem_cache_free(vsk_anchor_cache, vskp);
704 			vskp = NULL;
705 		}
706 	}
707 	return (vskp);
708 }
709 
710 /*
711  * We're in the process of tearing down the vfs and need to cleanup
712  * the data structures associated with the vopstats. Must only be called
713  * from dounmount().
714  */
715 void
teardown_vopstats(vfs_t * vfsp)716 teardown_vopstats(vfs_t *vfsp)
717 {
718 	vsk_anchor_t	*vskap;
719 	avl_index_t	where;
720 
721 	if (vfsp == NULL || vfsp->vfs_implp == NULL ||
722 	    (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
723 		return;
724 
725 	/* This is a safe check since VFS_STATS must be set (see above) */
726 	if ((vskap = vfsp->vfs_vskap) == NULL)
727 		return;
728 
729 	/* Whack the pointer right away */
730 	vfsp->vfs_vskap = NULL;
731 
732 	/* Lock the tree, remove the node, and delete the kstat */
733 	mutex_enter(&vskstat_tree_lock);
734 	if (avl_find(&vskstat_tree, vskap, &where)) {
735 		avl_remove(&vskstat_tree, vskap);
736 	}
737 
738 	if (vskap->vsk_ksp) {
739 		kstat_delete(vskap->vsk_ksp);
740 	}
741 	mutex_exit(&vskstat_tree_lock);
742 
743 	kmem_cache_free(vsk_anchor_cache, vskap);
744 }
745 
746 /*
747  * Read or write a vnode.  Called from kernel code.
748  */
749 int
vn_rdwr(enum uio_rw rw,struct vnode * vp,caddr_t base,ssize_t len,offset_t offset,enum uio_seg seg,int ioflag,rlim64_t ulimit,cred_t * cr,ssize_t * residp)750 vn_rdwr(
751 	enum uio_rw rw,
752 	struct vnode *vp,
753 	caddr_t base,
754 	ssize_t len,
755 	offset_t offset,
756 	enum uio_seg seg,
757 	int ioflag,
758 	rlim64_t ulimit,	/* meaningful only if rw is UIO_WRITE */
759 	cred_t *cr,
760 	ssize_t *residp)
761 {
762 	struct uio uio;
763 	struct iovec iov;
764 	int error;
765 	int in_crit = 0;
766 
767 	if (rw == UIO_WRITE && ISROFILE(vp))
768 		return (EROFS);
769 
770 	if (len < 0)
771 		return (EIO);
772 
773 	VOPXID_MAP_CR(vp, cr);
774 
775 	iov.iov_base = base;
776 	iov.iov_len = len;
777 	uio.uio_iov = &iov;
778 	uio.uio_iovcnt = 1;
779 	uio.uio_loffset = offset;
780 	uio.uio_segflg = (short)seg;
781 	uio.uio_resid = len;
782 	uio.uio_llimit = ulimit;
783 
784 	/*
785 	 * We have to enter the critical region before calling VOP_RWLOCK
786 	 * to avoid a deadlock with ufs.
787 	 */
788 	if (nbl_need_check(vp)) {
789 		int svmand;
790 
791 		nbl_start_crit(vp, RW_READER);
792 		in_crit = 1;
793 		error = nbl_svmand(vp, cr, &svmand);
794 		if (error != 0)
795 			goto done;
796 		if (nbl_conflict(vp, rw == UIO_WRITE ? NBL_WRITE : NBL_READ,
797 		    uio.uio_offset, uio.uio_resid, svmand, NULL)) {
798 			error = EACCES;
799 			goto done;
800 		}
801 	}
802 
803 	(void) VOP_RWLOCK(vp,
804 	    rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
805 	if (rw == UIO_WRITE) {
806 		uio.uio_fmode = FWRITE;
807 		uio.uio_extflg = UIO_COPY_DEFAULT;
808 		error = VOP_WRITE(vp, &uio, ioflag, cr, NULL);
809 	} else {
810 		uio.uio_fmode = FREAD;
811 		uio.uio_extflg = UIO_COPY_CACHED;
812 		error = VOP_READ(vp, &uio, ioflag, cr, NULL);
813 	}
814 	VOP_RWUNLOCK(vp,
815 	    rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
816 	if (residp)
817 		*residp = uio.uio_resid;
818 	else if (uio.uio_resid)
819 		error = EIO;
820 
821 done:
822 	if (in_crit)
823 		nbl_end_crit(vp);
824 	return (error);
825 }
826 
827 /*
828  * Release a vnode.  Call VOP_INACTIVE on last reference or
829  * decrement reference count.
830  *
831  * To avoid race conditions, the v_count is left at 1 for
832  * the call to VOP_INACTIVE. This prevents another thread
833  * from reclaiming and releasing the vnode *before* the
834  * VOP_INACTIVE routine has a chance to destroy the vnode.
835  * We can't have more than 1 thread calling VOP_INACTIVE
836  * on a vnode.
837  */
838 void
vn_rele(vnode_t * vp)839 vn_rele(vnode_t *vp)
840 {
841 	mutex_enter(&vp->v_lock);
842 	if (vp->v_count == 1) {
843 		mutex_exit(&vp->v_lock);
844 		VOP_INACTIVE(vp, CRED(), NULL);
845 		return;
846 	}
847 	VERIFY(vp->v_count > 0);
848 	VN_RELE_LOCKED(vp);
849 	mutex_exit(&vp->v_lock);
850 }
851 
852 /*
853  * Release a vnode referenced by the DNLC. Multiple DNLC references are treated
854  * as a single reference, so v_count is not decremented until the last DNLC hold
855  * is released. This makes it possible to distinguish vnodes that are referenced
856  * only by the DNLC.
857  */
858 void
vn_rele_dnlc(vnode_t * vp)859 vn_rele_dnlc(vnode_t *vp)
860 {
861 	mutex_enter(&vp->v_lock);
862 	VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0));
863 	if (--vp->v_count_dnlc == 0) {
864 		if (vp->v_count == 1) {
865 			mutex_exit(&vp->v_lock);
866 			VOP_INACTIVE(vp, CRED(), NULL);
867 			return;
868 		}
869 		VN_RELE_LOCKED(vp);
870 	}
871 	mutex_exit(&vp->v_lock);
872 }
873 
874 /*
875  * Like vn_rele() except that it clears v_stream under v_lock.
876  * This is used by sockfs when it dismantles the association between
877  * the sockfs node and the vnode in the underlying file system.
878  * v_lock has to be held to prevent a thread coming through the lookupname
879  * path from accessing a stream head that is going away.
880  */
881 void
vn_rele_stream(vnode_t * vp)882 vn_rele_stream(vnode_t *vp)
883 {
884 	mutex_enter(&vp->v_lock);
885 	vp->v_stream = NULL;
886 	if (vp->v_count == 1) {
887 		mutex_exit(&vp->v_lock);
888 		VOP_INACTIVE(vp, CRED(), NULL);
889 		return;
890 	}
891 	VERIFY(vp->v_count > 0);
892 	VN_RELE_LOCKED(vp);
893 	mutex_exit(&vp->v_lock);
894 }
895 
896 static void
vn_rele_inactive(vnode_t * vp)897 vn_rele_inactive(vnode_t *vp)
898 {
899 	VOP_INACTIVE(vp, CRED(), NULL);
900 }
901 
902 /*
903  * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it
904  * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
905  * the file system as a result of releasing the vnode. Note, file systems
906  * already have to handle the race where the vnode is incremented before the
907  * inactive routine is called and does its locking.
908  *
909  * Warning: Excessive use of this routine can lead to performance problems.
910  * This is because taskqs throttle back allocation if too many are created.
911  */
912 void
vn_rele_async(vnode_t * vp,taskq_t * taskq)913 vn_rele_async(vnode_t *vp, taskq_t *taskq)
914 {
915 	mutex_enter(&vp->v_lock);
916 	if (vp->v_count == 1) {
917 		mutex_exit(&vp->v_lock);
918 		VERIFY(taskq_dispatch(taskq, (task_func_t *)vn_rele_inactive,
919 		    vp, TQ_SLEEP) != TASKQID_INVALID);
920 		return;
921 	}
922 	VERIFY(vp->v_count > 0);
923 	VN_RELE_LOCKED(vp);
924 	mutex_exit(&vp->v_lock);
925 }
926 
927 int
vn_open(char * pnamep,enum uio_seg seg,int filemode,int createmode,struct vnode ** vpp,enum create crwhy,mode_t umask)928 vn_open(
929 	char *pnamep,
930 	enum uio_seg seg,
931 	int filemode,
932 	int createmode,
933 	struct vnode **vpp,
934 	enum create crwhy,
935 	mode_t umask)
936 {
937 	return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy,
938 	    umask, NULL, -1));
939 }
940 
941 
942 /*
943  * Open/create a vnode.
944  * This may be callable by the kernel, the only known use
945  * of user context being that the current user credentials
946  * are used for permissions.  crwhy is defined iff filemode & FCREAT.
947  */
948 int
vn_openat(char * pnamep,enum uio_seg seg,int filemode,int createmode,struct vnode ** vpp,enum create crwhy,mode_t umask,struct vnode * startvp,int fd)949 vn_openat(
950 	char *pnamep,
951 	enum uio_seg seg,
952 	int filemode,
953 	int createmode,
954 	struct vnode **vpp,
955 	enum create crwhy,
956 	mode_t umask,
957 	struct vnode *startvp,
958 	int fd)
959 {
960 	struct vnode *vp;
961 	int mode;
962 	int accessflags;
963 	int error;
964 	int in_crit = 0;
965 	int open_done = 0;
966 	int shrlock_done = 0;
967 	struct vattr vattr;
968 	enum symfollow follow;
969 	int estale_retry = 0;
970 	struct shrlock shr;
971 	struct shr_locowner shr_own;
972 	boolean_t create;
973 
974 	mode = 0;
975 	accessflags = 0;
976 	if (filemode & FREAD)
977 		mode |= VREAD;
978 	if (filemode & (FWRITE|FTRUNC))
979 		mode |= VWRITE;
980 	if (filemode & (FSEARCH|FEXEC|FXATTRDIROPEN))
981 		mode |= VEXEC;
982 
983 	/* symlink interpretation */
984 	if (filemode & FNOFOLLOW)
985 		follow = NO_FOLLOW;
986 	else
987 		follow = FOLLOW;
988 
989 	if (filemode & FAPPEND)
990 		accessflags |= V_APPEND;
991 
992 	/*
993 	 * We need to handle the case of FCREAT | FDIRECTORY and the case of
994 	 * FEXCL. If all three are specified, then we always fail because we
995 	 * cannot create a directory through this interface and FEXCL says we
996 	 * need to fail the request if we can't create it. If, however, only
997 	 * FCREAT | FDIRECTORY are specified, then we can treat this as the case
998 	 * of opening a file that already exists. If it exists, we can do
999 	 * something and if not, we fail. Effectively FCREAT | FDIRECTORY is
1000 	 * treated as FDIRECTORY.
1001 	 */
1002 	if ((filemode & (FCREAT | FDIRECTORY | FEXCL)) ==
1003 	    (FCREAT | FDIRECTORY | FEXCL)) {
1004 		return (EINVAL);
1005 	}
1006 
1007 	if ((filemode & (FCREAT | FDIRECTORY)) == (FCREAT | FDIRECTORY)) {
1008 		create = B_FALSE;
1009 	} else if ((filemode & FCREAT) != 0) {
1010 		create = B_TRUE;
1011 	} else {
1012 		create = B_FALSE;
1013 	}
1014 
1015 top:
1016 	if (create) {
1017 		enum vcexcl excl;
1018 
1019 		/*
1020 		 * Wish to create a file.
1021 		 */
1022 		vattr.va_type = VREG;
1023 		vattr.va_mode = createmode;
1024 		vattr.va_mask = AT_TYPE|AT_MODE;
1025 		if (filemode & FTRUNC) {
1026 			vattr.va_size = 0;
1027 			vattr.va_mask |= AT_SIZE;
1028 		}
1029 		if (filemode & FEXCL)
1030 			excl = EXCL;
1031 		else
1032 			excl = NONEXCL;
1033 
1034 		if (error =
1035 		    vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy,
1036 		    (filemode & ~(FTRUNC|FEXCL)), umask, startvp))
1037 			return (error);
1038 	} else {
1039 		/*
1040 		 * Wish to open a file.  Just look it up.
1041 		 */
1042 		if (error = lookupnameat(pnamep, seg, follow,
1043 		    NULLVPP, &vp, startvp)) {
1044 			if ((error == ESTALE) &&
1045 			    fs_need_estale_retry(estale_retry++))
1046 				goto top;
1047 			return (error);
1048 		}
1049 
1050 		/*
1051 		 * Get the attributes to check whether file is large.
1052 		 * We do this only if the FOFFMAX flag is not set and
1053 		 * only for regular files.
1054 		 */
1055 
1056 		if (!(filemode & FOFFMAX) && (vp->v_type == VREG)) {
1057 			vattr.va_mask = AT_SIZE;
1058 			if ((error = VOP_GETATTR(vp, &vattr, 0,
1059 			    CRED(), NULL))) {
1060 				goto out;
1061 			}
1062 			if (vattr.va_size > (u_offset_t)MAXOFF32_T) {
1063 				/*
1064 				 * Large File API - regular open fails
1065 				 * if FOFFMAX flag is set in file mode
1066 				 */
1067 				error = EOVERFLOW;
1068 				goto out;
1069 			}
1070 		}
1071 		/*
1072 		 * Can't write directories, active texts, or
1073 		 * read-only filesystems.  Can't truncate files
1074 		 * on which mandatory locking is in effect.
1075 		 */
1076 		if (filemode & (FWRITE|FTRUNC)) {
1077 			/*
1078 			 * Allow writable directory if VDIROPEN flag is set.
1079 			 */
1080 			if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) {
1081 				error = EISDIR;
1082 				goto out;
1083 			}
1084 			if (ISROFILE(vp)) {
1085 				error = EROFS;
1086 				goto out;
1087 			}
1088 			/*
1089 			 * Can't truncate files on which
1090 			 * sysv mandatory locking is in effect.
1091 			 */
1092 			if (filemode & FTRUNC) {
1093 				vnode_t *rvp;
1094 
1095 				if (VOP_REALVP(vp, &rvp, NULL) != 0)
1096 					rvp = vp;
1097 				if (rvp->v_filocks != NULL) {
1098 					vattr.va_mask = AT_MODE;
1099 					if ((error = VOP_GETATTR(vp,
1100 					    &vattr, 0, CRED(), NULL)) == 0 &&
1101 					    MANDLOCK(vp, vattr.va_mode))
1102 						error = EAGAIN;
1103 				}
1104 			}
1105 			if (error)
1106 				goto out;
1107 		}
1108 		/*
1109 		 * Check permissions.
1110 		 */
1111 		if (error = VOP_ACCESS(vp, mode, accessflags, CRED(), NULL))
1112 			goto out;
1113 
1114 		/*
1115 		 * Require FSEARCH and FDIRECTORY to return a directory. Require
1116 		 * FEXEC to return a regular file.
1117 		 */
1118 		if ((filemode & (FSEARCH|FDIRECTORY)) != 0 &&
1119 		    vp->v_type != VDIR) {
1120 			error = ENOTDIR;
1121 			goto out;
1122 		}
1123 		if ((filemode & FEXEC) && vp->v_type != VREG) {
1124 			error = ENOEXEC;	/* XXX: error code? */
1125 			goto out;
1126 		}
1127 	}
1128 
1129 	/*
1130 	 * Do remaining checks for FNOFOLLOW and FNOLINKS.
1131 	 */
1132 	if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) {
1133 		error = ELOOP;
1134 		goto out;
1135 	}
1136 	if (filemode & FNOLINKS) {
1137 		vattr.va_mask = AT_NLINK;
1138 		if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))) {
1139 			goto out;
1140 		}
1141 		if (vattr.va_nlink != 1) {
1142 			error = EMLINK;
1143 			goto out;
1144 		}
1145 	}
1146 
1147 	/*
1148 	 * Opening a socket corresponding to the AF_UNIX pathname
1149 	 * in the filesystem name space is not supported.
1150 	 * However, VSOCK nodes in namefs are supported in order
1151 	 * to make fattach work for sockets.
1152 	 *
1153 	 * XXX This uses VOP_REALVP to distinguish between
1154 	 * an unopened namefs node (where VOP_REALVP returns a
1155 	 * different VSOCK vnode) and a VSOCK created by vn_create
1156 	 * in some file system (where VOP_REALVP would never return
1157 	 * a different vnode).
1158 	 */
1159 	if (vp->v_type == VSOCK) {
1160 		struct vnode *nvp;
1161 
1162 		error = VOP_REALVP(vp, &nvp, NULL);
1163 		if (error != 0 || nvp == NULL || nvp == vp ||
1164 		    nvp->v_type != VSOCK) {
1165 			error = EOPNOTSUPP;
1166 			goto out;
1167 		}
1168 	}
1169 
1170 	if ((vp->v_type == VREG) && nbl_need_check(vp)) {
1171 		/* get share reservation */
1172 		shr.s_access = 0;
1173 		if (filemode & FWRITE)
1174 			shr.s_access |= F_WRACC;
1175 		if (filemode & FREAD)
1176 			shr.s_access |= F_RDACC;
1177 		shr.s_deny = 0;
1178 		shr.s_sysid = 0;
1179 		shr.s_pid = ttoproc(curthread)->p_pid;
1180 		shr_own.sl_pid = shr.s_pid;
1181 		shr_own.sl_id = fd;
1182 		shr.s_own_len = sizeof (shr_own);
1183 		shr.s_owner = (caddr_t)&shr_own;
1184 		error = VOP_SHRLOCK(vp, F_SHARE_NBMAND, &shr, filemode, CRED(),
1185 		    NULL);
1186 		if (error)
1187 			goto out;
1188 		shrlock_done = 1;
1189 
1190 		/* nbmand conflict check if truncating file */
1191 		if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1192 			nbl_start_crit(vp, RW_READER);
1193 			in_crit = 1;
1194 
1195 			vattr.va_mask = AT_SIZE;
1196 			if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
1197 				goto out;
1198 			if (nbl_conflict(vp, NBL_WRITE, 0, vattr.va_size, 0,
1199 			    NULL)) {
1200 				error = EACCES;
1201 				goto out;
1202 			}
1203 		}
1204 	}
1205 
1206 	/*
1207 	 * Do opening protocol.
1208 	 */
1209 	error = VOP_OPEN(&vp, filemode, CRED(), NULL);
1210 	if (error)
1211 		goto out;
1212 	open_done = 1;
1213 
1214 	/*
1215 	 * Truncate if required.
1216 	 */
1217 	if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1218 		vattr.va_size = 0;
1219 		vattr.va_mask = AT_SIZE;
1220 		if ((error = VOP_SETATTR(vp, &vattr, 0, CRED(), NULL)) != 0)
1221 			goto out;
1222 	}
1223 
1224 	/*
1225 	 * Turn on directio, if requested.
1226 	 */
1227 	if (filemode & FDIRECT) {
1228 		if ((error = VOP_IOCTL(vp, _FIODIRECTIO, DIRECTIO_ON, 0,
1229 		    CRED(), NULL, NULL)) != 0) {
1230 			/*
1231 			 * On Linux, O_DIRECT returns EINVAL when the file
1232 			 * system does not support directio, so we'll do the
1233 			 * same.
1234 			 */
1235 			error = EINVAL;
1236 			goto out;
1237 		}
1238 	}
1239 out:
1240 	ASSERT(vp->v_count > 0);
1241 
1242 	if (in_crit) {
1243 		nbl_end_crit(vp);
1244 		in_crit = 0;
1245 	}
1246 	if (error) {
1247 		if (open_done) {
1248 			(void) VOP_CLOSE(vp, filemode, 1, (offset_t)0, CRED(),
1249 			    NULL);
1250 			open_done = 0;
1251 			shrlock_done = 0;
1252 		}
1253 		if (shrlock_done) {
1254 			(void) VOP_SHRLOCK(vp, F_UNSHARE, &shr, 0, CRED(),
1255 			    NULL);
1256 			shrlock_done = 0;
1257 		}
1258 
1259 		/*
1260 		 * The following clause was added to handle a problem
1261 		 * with NFS consistency.  It is possible that a lookup
1262 		 * of the file to be opened succeeded, but the file
1263 		 * itself doesn't actually exist on the server.  This
1264 		 * is chiefly due to the DNLC containing an entry for
1265 		 * the file which has been removed on the server.  In
1266 		 * this case, we just start over.  If there was some
1267 		 * other cause for the ESTALE error, then the lookup
1268 		 * of the file will fail and the error will be returned
1269 		 * above instead of looping around from here.
1270 		 */
1271 		VN_RELE(vp);
1272 		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1273 			goto top;
1274 	} else
1275 		*vpp = vp;
1276 	return (error);
1277 }
1278 
1279 /*
1280  * The following two accessor functions are for the NFSv4 server.  Since there
1281  * is no VOP_OPEN_UP/DOWNGRADE we need a way for the NFS server to keep the
1282  * vnode open counts correct when a client "upgrades" an open or does an
1283  * open_downgrade.  In NFS, an upgrade or downgrade can not only change the
1284  * open mode (add or subtract read or write), but also change the share/deny
1285  * modes.  However, share reservations are not integrated with OPEN, yet, so
1286  * we need to handle each separately.  These functions are cleaner than having
1287  * the NFS server manipulate the counts directly, however, nobody else should
1288  * use these functions.
1289  */
1290 void
vn_open_upgrade(vnode_t * vp,int filemode)1291 vn_open_upgrade(
1292 	vnode_t *vp,
1293 	int filemode)
1294 {
1295 	ASSERT(vp->v_type == VREG);
1296 
1297 	if (filemode & FREAD)
1298 		atomic_inc_32(&vp->v_rdcnt);
1299 	if (filemode & FWRITE)
1300 		atomic_inc_32(&vp->v_wrcnt);
1301 
1302 }
1303 
1304 void
vn_open_downgrade(vnode_t * vp,int filemode)1305 vn_open_downgrade(
1306 	vnode_t *vp,
1307 	int filemode)
1308 {
1309 	ASSERT(vp->v_type == VREG);
1310 
1311 	if (filemode & FREAD) {
1312 		ASSERT(vp->v_rdcnt > 0);
1313 		atomic_dec_32(&vp->v_rdcnt);
1314 	}
1315 	if (filemode & FWRITE) {
1316 		ASSERT(vp->v_wrcnt > 0);
1317 		atomic_dec_32(&vp->v_wrcnt);
1318 	}
1319 
1320 }
1321 
1322 int
vn_create(char * pnamep,enum uio_seg seg,struct vattr * vap,enum vcexcl excl,int mode,struct vnode ** vpp,enum create why,int flag,mode_t umask)1323 vn_create(
1324 	char *pnamep,
1325 	enum uio_seg seg,
1326 	struct vattr *vap,
1327 	enum vcexcl excl,
1328 	int mode,
1329 	struct vnode **vpp,
1330 	enum create why,
1331 	int flag,
1332 	mode_t umask)
1333 {
1334 	return (vn_createat(pnamep, seg, vap, excl, mode, vpp, why, flag,
1335 	    umask, NULL));
1336 }
1337 
1338 /*
1339  * Create a vnode (makenode).
1340  */
1341 int
vn_createat(char * pnamep,enum uio_seg seg,struct vattr * vap,enum vcexcl excl,int mode,struct vnode ** vpp,enum create why,int flag,mode_t umask,struct vnode * startvp)1342 vn_createat(
1343 	char *pnamep,
1344 	enum uio_seg seg,
1345 	struct vattr *vap,
1346 	enum vcexcl excl,
1347 	int mode,
1348 	struct vnode **vpp,
1349 	enum create why,
1350 	int flag,
1351 	mode_t umask,
1352 	struct vnode *startvp)
1353 {
1354 	struct vnode *dvp;	/* ptr to parent dir vnode */
1355 	struct vnode *vp = NULL;
1356 	struct pathname pn;
1357 	int error;
1358 	int in_crit = 0;
1359 	struct vattr vattr;
1360 	enum symfollow follow;
1361 	int estale_retry = 0;
1362 	uint32_t auditing = AU_AUDITING();
1363 
1364 	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
1365 
1366 	/* symlink interpretation */
1367 	if ((flag & FNOFOLLOW) || excl == EXCL)
1368 		follow = NO_FOLLOW;
1369 	else
1370 		follow = FOLLOW;
1371 	flag &= ~(FNOFOLLOW|FNOLINKS);
1372 
1373 top:
1374 	/*
1375 	 * Lookup directory.
1376 	 * If new object is a file, call lower level to create it.
1377 	 * Note that it is up to the lower level to enforce exclusive
1378 	 * creation, if the file is already there.
1379 	 * This allows the lower level to do whatever
1380 	 * locking or protocol that is needed to prevent races.
1381 	 * If the new object is directory call lower level to make
1382 	 * the new directory, with "." and "..".
1383 	 */
1384 	if (error = pn_get(pnamep, seg, &pn))
1385 		return (error);
1386 	if (auditing)
1387 		audit_vncreate_start();
1388 	dvp = NULL;
1389 	*vpp = NULL;
1390 	/*
1391 	 * lookup will find the parent directory for the vnode.
1392 	 * When it is done the pn holds the name of the entry
1393 	 * in the directory.
1394 	 * If this is a non-exclusive create we also find the node itself.
1395 	 */
1396 	error = lookuppnat(&pn, NULL, follow, &dvp,
1397 	    (excl == EXCL) ? NULLVPP : vpp, startvp);
1398 	if (error) {
1399 		pn_free(&pn);
1400 		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1401 			goto top;
1402 		if (why == CRMKDIR && error == EINVAL)
1403 			error = EEXIST;		/* SVID */
1404 		return (error);
1405 	}
1406 
1407 	if (why != CRMKNOD)
1408 		vap->va_mode &= ~VSVTX;
1409 
1410 	/*
1411 	 * If default ACLs are defined for the directory don't apply the
1412 	 * umask if umask is passed.
1413 	 */
1414 
1415 	if (umask) {
1416 
1417 		vsecattr_t vsec;
1418 
1419 		vsec.vsa_aclcnt = 0;
1420 		vsec.vsa_aclentp = NULL;
1421 		vsec.vsa_dfaclcnt = 0;
1422 		vsec.vsa_dfaclentp = NULL;
1423 		vsec.vsa_mask = VSA_DFACLCNT;
1424 		error = VOP_GETSECATTR(dvp, &vsec, 0, CRED(), NULL);
1425 		/*
1426 		 * If error is ENOSYS then treat it as no error
1427 		 * Don't want to force all file systems to support
1428 		 * aclent_t style of ACL's.
1429 		 */
1430 		if (error == ENOSYS)
1431 			error = 0;
1432 		if (error) {
1433 			if (*vpp != NULL)
1434 				VN_RELE(*vpp);
1435 			goto out;
1436 		} else {
1437 			/*
1438 			 * Apply the umask if no default ACLs.
1439 			 */
1440 			if (vsec.vsa_dfaclcnt == 0)
1441 				vap->va_mode &= ~umask;
1442 
1443 			/*
1444 			 * VOP_GETSECATTR() may have allocated memory for
1445 			 * ACLs we didn't request, so double-check and
1446 			 * free it if necessary.
1447 			 */
1448 			if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
1449 				kmem_free((caddr_t)vsec.vsa_aclentp,
1450 				    vsec.vsa_aclcnt * sizeof (aclent_t));
1451 			if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
1452 				kmem_free((caddr_t)vsec.vsa_dfaclentp,
1453 				    vsec.vsa_dfaclcnt * sizeof (aclent_t));
1454 		}
1455 	}
1456 
1457 	/*
1458 	 * In general we want to generate EROFS if the file system is
1459 	 * readonly.  However, POSIX (IEEE Std. 1003.1) section 5.3.1
1460 	 * documents the open system call, and it says that O_CREAT has no
1461 	 * effect if the file already exists.  Bug 1119649 states
1462 	 * that open(path, O_CREAT, ...) fails when attempting to open an
1463 	 * existing file on a read only file system.  Thus, the first part
1464 	 * of the following if statement has 3 checks:
1465 	 *	if the file exists &&
1466 	 *		it is being open with write access &&
1467 	 *		the file system is read only
1468 	 *	then generate EROFS
1469 	 */
1470 	if ((*vpp != NULL && (mode & VWRITE) && ISROFILE(*vpp)) ||
1471 	    (*vpp == NULL && dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
1472 		if (*vpp)
1473 			VN_RELE(*vpp);
1474 		error = EROFS;
1475 	} else if (excl == NONEXCL && *vpp != NULL) {
1476 		vnode_t *rvp;
1477 
1478 		/*
1479 		 * File already exists.  If a mandatory lock has been
1480 		 * applied, return error.
1481 		 */
1482 		vp = *vpp;
1483 		if (VOP_REALVP(vp, &rvp, NULL) != 0)
1484 			rvp = vp;
1485 		if ((vap->va_mask & AT_SIZE) && nbl_need_check(vp)) {
1486 			nbl_start_crit(vp, RW_READER);
1487 			in_crit = 1;
1488 		}
1489 		if (rvp->v_filocks != NULL || rvp->v_shrlocks != NULL) {
1490 			vattr.va_mask = AT_MODE|AT_SIZE;
1491 			if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) {
1492 				goto out;
1493 			}
1494 			if (MANDLOCK(vp, vattr.va_mode)) {
1495 				error = EAGAIN;
1496 				goto out;
1497 			}
1498 			/*
1499 			 * File cannot be truncated if non-blocking mandatory
1500 			 * locks are currently on the file.
1501 			 */
1502 			if ((vap->va_mask & AT_SIZE) && in_crit) {
1503 				u_offset_t offset;
1504 				ssize_t length;
1505 
1506 				offset = vap->va_size > vattr.va_size ?
1507 				    vattr.va_size : vap->va_size;
1508 				length = vap->va_size > vattr.va_size ?
1509 				    vap->va_size - vattr.va_size :
1510 				    vattr.va_size - vap->va_size;
1511 				if (nbl_conflict(vp, NBL_WRITE, offset,
1512 				    length, 0, NULL)) {
1513 					error = EACCES;
1514 					goto out;
1515 				}
1516 			}
1517 		}
1518 
1519 		/*
1520 		 * If the file is the root of a VFS, we've crossed a
1521 		 * mount point and the "containing" directory that we
1522 		 * acquired above (dvp) is irrelevant because it's in
1523 		 * a different file system.  We apply VOP_CREATE to the
1524 		 * target itself instead of to the containing directory
1525 		 * and supply a null path name to indicate (conventionally)
1526 		 * the node itself as the "component" of interest.
1527 		 *
1528 		 * The call to VOP_CREATE() is necessary to ensure
1529 		 * that the appropriate permission checks are made,
1530 		 * i.e. EISDIR, EACCES, etc.  We already know that vpp
1531 		 * exists since we are in the else condition where this
1532 		 * was checked.
1533 		 */
1534 		if (vp->v_flag & VROOT) {
1535 			ASSERT(why != CRMKDIR);
1536 			error = VOP_CREATE(vp, "", vap, excl, mode, vpp,
1537 			    CRED(), flag, NULL, NULL);
1538 			/*
1539 			 * If the create succeeded, it will have created a
1540 			 * new reference on a new vnode (*vpp) in the child
1541 			 * file system, so we want to drop our reference on
1542 			 * the old (vp) upon exit.
1543 			 */
1544 			goto out;
1545 		}
1546 
1547 		/*
1548 		 * Large File API - non-large open (FOFFMAX flag not set)
1549 		 * of regular file fails if the file size exceeds MAXOFF32_T.
1550 		 */
1551 		if (why != CRMKDIR &&
1552 		    !(flag & FOFFMAX) &&
1553 		    (vp->v_type == VREG)) {
1554 			vattr.va_mask = AT_SIZE;
1555 			if ((error = VOP_GETATTR(vp, &vattr, 0,
1556 			    CRED(), NULL))) {
1557 				goto out;
1558 			}
1559 			if ((vattr.va_size > (u_offset_t)MAXOFF32_T)) {
1560 				error = EOVERFLOW;
1561 				goto out;
1562 			}
1563 		}
1564 	}
1565 
1566 	if (error == 0) {
1567 		/*
1568 		 * Call mkdir() if specified, otherwise create().
1569 		 */
1570 		int must_be_dir = pn_fixslash(&pn);	/* trailing '/'? */
1571 
1572 		if (why == CRMKDIR)
1573 			/*
1574 			 * N.B., if vn_createat() ever requests
1575 			 * case-insensitive behavior then it will need
1576 			 * to be passed to VOP_MKDIR().  VOP_CREATE()
1577 			 * will already get it via "flag"
1578 			 */
1579 			error = VOP_MKDIR(dvp, pn.pn_path, vap, vpp, CRED(),
1580 			    NULL, 0, NULL);
1581 		else if (!must_be_dir)
1582 			error = VOP_CREATE(dvp, pn.pn_path, vap,
1583 			    excl, mode, vpp, CRED(), flag, NULL, NULL);
1584 		else
1585 			error = ENOTDIR;
1586 	}
1587 
1588 out:
1589 
1590 	if (auditing)
1591 		audit_vncreate_finish(*vpp, error);
1592 	if (in_crit) {
1593 		nbl_end_crit(vp);
1594 		in_crit = 0;
1595 	}
1596 	if (vp != NULL) {
1597 		VN_RELE(vp);
1598 		vp = NULL;
1599 	}
1600 	pn_free(&pn);
1601 	VN_RELE(dvp);
1602 	/*
1603 	 * The following clause was added to handle a problem
1604 	 * with NFS consistency.  It is possible that a lookup
1605 	 * of the file to be created succeeded, but the file
1606 	 * itself doesn't actually exist on the server.  This
1607 	 * is chiefly due to the DNLC containing an entry for
1608 	 * the file which has been removed on the server.  In
1609 	 * this case, we just start over.  If there was some
1610 	 * other cause for the ESTALE error, then the lookup
1611 	 * of the file will fail and the error will be returned
1612 	 * above instead of looping around from here.
1613 	 */
1614 	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1615 		goto top;
1616 	return (error);
1617 }
1618 
1619 int
vn_link(char * from,char * to,enum uio_seg seg)1620 vn_link(char *from, char *to, enum uio_seg seg)
1621 {
1622 	return (vn_linkat(NULL, from, NO_FOLLOW, NULL, to, seg));
1623 }
1624 
1625 int
vn_linkat(vnode_t * fstartvp,char * from,enum symfollow follow,vnode_t * tstartvp,char * to,enum uio_seg seg)1626 vn_linkat(vnode_t *fstartvp, char *from, enum symfollow follow,
1627     vnode_t *tstartvp, char *to, enum uio_seg seg)
1628 {
1629 	struct vnode *fvp;		/* from vnode ptr */
1630 	struct vnode *tdvp;		/* to directory vnode ptr */
1631 	struct pathname pn;
1632 	int error;
1633 	struct vattr vattr;
1634 	dev_t fsid;
1635 	int estale_retry = 0;
1636 	uint32_t auditing = AU_AUDITING();
1637 
1638 top:
1639 	fvp = tdvp = NULL;
1640 	if (error = pn_get(to, seg, &pn))
1641 		return (error);
1642 	if (auditing && fstartvp != NULL)
1643 		audit_setfsat_path(1);
1644 	if (error = lookupnameat(from, seg, follow, NULLVPP, &fvp, fstartvp))
1645 		goto out;
1646 	if (auditing && tstartvp != NULL)
1647 		audit_setfsat_path(3);
1648 	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &tdvp, NULLVPP, tstartvp))
1649 		goto out;
1650 	/*
1651 	 * Make sure both source vnode and target directory vnode are
1652 	 * in the same vfs and that it is writeable.
1653 	 */
1654 	vattr.va_mask = AT_FSID;
1655 	if (error = VOP_GETATTR(fvp, &vattr, 0, CRED(), NULL))
1656 		goto out;
1657 	fsid = vattr.va_fsid;
1658 	vattr.va_mask = AT_FSID;
1659 	if (error = VOP_GETATTR(tdvp, &vattr, 0, CRED(), NULL))
1660 		goto out;
1661 	if (fsid != vattr.va_fsid) {
1662 		error = EXDEV;
1663 		goto out;
1664 	}
1665 	if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
1666 		error = EROFS;
1667 		goto out;
1668 	}
1669 	/*
1670 	 * Do the link.
1671 	 */
1672 	(void) pn_fixslash(&pn);
1673 	error = VOP_LINK(tdvp, fvp, pn.pn_path, CRED(), NULL, 0);
1674 out:
1675 	pn_free(&pn);
1676 	if (fvp)
1677 		VN_RELE(fvp);
1678 	if (tdvp)
1679 		VN_RELE(tdvp);
1680 	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1681 		goto top;
1682 	return (error);
1683 }
1684 
1685 int
vn_rename(char * from,char * to,enum uio_seg seg)1686 vn_rename(char *from, char *to, enum uio_seg seg)
1687 {
1688 	return (vn_renameat(NULL, from, NULL, to, seg));
1689 }
1690 
1691 int
vn_renameat(vnode_t * fdvp,char * fname,vnode_t * tdvp,char * tname,enum uio_seg seg)1692 vn_renameat(vnode_t *fdvp, char *fname, vnode_t *tdvp,
1693     char *tname, enum uio_seg seg)
1694 {
1695 	int error;
1696 	struct vattr vattr;
1697 	struct pathname fpn;		/* from pathname */
1698 	struct pathname tpn;		/* to pathname */
1699 	dev_t fsid;
1700 	int in_crit_src, in_crit_targ;
1701 	vnode_t *fromvp, *fvp;
1702 	vnode_t *tovp, *targvp;
1703 	int estale_retry = 0;
1704 	uint32_t auditing = AU_AUDITING();
1705 
1706 top:
1707 	fvp = fromvp = tovp = targvp = NULL;
1708 	in_crit_src = in_crit_targ = 0;
1709 	/*
1710 	 * Get to and from pathnames.
1711 	 */
1712 	if (error = pn_get(fname, seg, &fpn))
1713 		return (error);
1714 	if (error = pn_get(tname, seg, &tpn)) {
1715 		pn_free(&fpn);
1716 		return (error);
1717 	}
1718 
1719 	/*
1720 	 * First we need to resolve the correct directories
1721 	 * The passed in directories may only be a starting point,
1722 	 * but we need the real directories the file(s) live in.
1723 	 * For example the fname may be something like usr/lib/sparc
1724 	 * and we were passed in the / directory, but we need to
1725 	 * use the lib directory for the rename.
1726 	 */
1727 
1728 	if (auditing && fdvp != NULL)
1729 		audit_setfsat_path(1);
1730 	/*
1731 	 * Lookup to and from directories.
1732 	 */
1733 	if (error = lookuppnat(&fpn, NULL, NO_FOLLOW, &fromvp, &fvp, fdvp)) {
1734 		goto out;
1735 	}
1736 
1737 	/*
1738 	 * Make sure there is an entry.
1739 	 */
1740 	if (fvp == NULL) {
1741 		error = ENOENT;
1742 		goto out;
1743 	}
1744 
1745 	if (auditing && tdvp != NULL)
1746 		audit_setfsat_path(3);
1747 	if (error = lookuppnat(&tpn, NULL, NO_FOLLOW, &tovp, &targvp, tdvp)) {
1748 		goto out;
1749 	}
1750 
1751 	/*
1752 	 * Make sure both the from vnode directory and the to directory
1753 	 * are in the same vfs and the to directory is writable.
1754 	 * We check fsid's, not vfs pointers, so loopback fs works.
1755 	 */
1756 	if (fromvp != tovp) {
1757 		vattr.va_mask = AT_FSID;
1758 		if (error = VOP_GETATTR(fromvp, &vattr, 0, CRED(), NULL))
1759 			goto out;
1760 		fsid = vattr.va_fsid;
1761 		vattr.va_mask = AT_FSID;
1762 		if (error = VOP_GETATTR(tovp, &vattr, 0, CRED(), NULL))
1763 			goto out;
1764 		if (fsid != vattr.va_fsid) {
1765 			error = EXDEV;
1766 			goto out;
1767 		}
1768 	}
1769 
1770 	if (tovp->v_vfsp->vfs_flag & VFS_RDONLY) {
1771 		error = EROFS;
1772 		goto out;
1773 	}
1774 
1775 	/*
1776 	 * Make sure "from" vp is not a mount point.
1777 	 * Note, lookup did traverse() already, so
1778 	 * we'll be looking at the mounted FS root.
1779 	 * (but allow files like mnttab)
1780 	 */
1781 	if ((fvp->v_flag & VROOT) != 0 && fvp->v_type == VDIR) {
1782 		error = EBUSY;
1783 		goto out;
1784 	}
1785 
1786 	if (targvp && (fvp != targvp)) {
1787 		nbl_start_crit(targvp, RW_READER);
1788 		in_crit_targ = 1;
1789 		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
1790 			error = EACCES;
1791 			goto out;
1792 		}
1793 	}
1794 
1795 	if (nbl_need_check(fvp)) {
1796 		nbl_start_crit(fvp, RW_READER);
1797 		in_crit_src = 1;
1798 		if (nbl_conflict(fvp, NBL_RENAME, 0, 0, 0, NULL)) {
1799 			error = EACCES;
1800 			goto out;
1801 		}
1802 	}
1803 
1804 	/*
1805 	 * Do the rename.
1806 	 */
1807 	(void) pn_fixslash(&tpn);
1808 	error = VOP_RENAME(fromvp, fpn.pn_path, tovp, tpn.pn_path, CRED(),
1809 	    NULL, 0);
1810 
1811 out:
1812 	pn_free(&fpn);
1813 	pn_free(&tpn);
1814 	if (in_crit_src)
1815 		nbl_end_crit(fvp);
1816 	if (in_crit_targ)
1817 		nbl_end_crit(targvp);
1818 	if (fromvp)
1819 		VN_RELE(fromvp);
1820 	if (tovp)
1821 		VN_RELE(tovp);
1822 	if (targvp)
1823 		VN_RELE(targvp);
1824 	if (fvp)
1825 		VN_RELE(fvp);
1826 	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1827 		goto top;
1828 	return (error);
1829 }
1830 
1831 /*
1832  * Remove a file or directory.
1833  */
1834 int
vn_remove(char * fnamep,enum uio_seg seg,enum rm dirflag)1835 vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
1836 {
1837 	return (vn_removeat(NULL, fnamep, seg, dirflag));
1838 }
1839 
1840 int
vn_removeat(vnode_t * startvp,char * fnamep,enum uio_seg seg,enum rm dirflag)1841 vn_removeat(vnode_t *startvp, char *fnamep, enum uio_seg seg, enum rm dirflag)
1842 {
1843 	struct vnode *vp;		/* entry vnode */
1844 	struct vnode *dvp;		/* ptr to parent dir vnode */
1845 	struct vnode *coveredvp;
1846 	struct pathname pn;		/* name of entry */
1847 	enum vtype vtype;
1848 	int error;
1849 	struct vfs *vfsp;
1850 	struct vfs *dvfsp;	/* ptr to parent dir vfs */
1851 	int in_crit = 0;
1852 	int estale_retry = 0;
1853 
1854 top:
1855 	if (error = pn_get(fnamep, seg, &pn))
1856 		return (error);
1857 	dvp = vp = NULL;
1858 	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &dvp, &vp, startvp)) {
1859 		pn_free(&pn);
1860 		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1861 			goto top;
1862 		return (error);
1863 	}
1864 
1865 	/*
1866 	 * Make sure there is an entry.
1867 	 */
1868 	if (vp == NULL) {
1869 		error = ENOENT;
1870 		goto out;
1871 	}
1872 
1873 	vfsp = vp->v_vfsp;
1874 	dvfsp = dvp->v_vfsp;
1875 
1876 	/*
1877 	 * If the named file is the root of a mounted filesystem, fail,
1878 	 * unless it's marked unlinkable.  In that case, unmount the
1879 	 * filesystem and proceed to unlink the covered vnode.  (If the
1880 	 * covered vnode is a directory, use rmdir instead of unlink,
1881 	 * to avoid file system corruption.)
1882 	 */
1883 	if (vp->v_flag & VROOT) {
1884 		if ((vfsp->vfs_flag & VFS_UNLINKABLE) == 0) {
1885 			error = EBUSY;
1886 			goto out;
1887 		}
1888 
1889 		/*
1890 		 * Namefs specific code starts here.
1891 		 */
1892 
1893 		if (dirflag == RMDIRECTORY) {
1894 			/*
1895 			 * User called rmdir(2) on a file that has
1896 			 * been namefs mounted on top of.  Since
1897 			 * namefs doesn't allow directories to
1898 			 * be mounted on other files we know
1899 			 * vp is not of type VDIR so fail to operation.
1900 			 */
1901 			error = ENOTDIR;
1902 			goto out;
1903 		}
1904 
1905 		/*
1906 		 * If VROOT is still set after grabbing vp->v_lock,
1907 		 * noone has finished nm_unmount so far and coveredvp
1908 		 * is valid.
1909 		 * If we manage to grab vn_vfswlock(coveredvp) before releasing
1910 		 * vp->v_lock, any race window is eliminated.
1911 		 */
1912 
1913 		mutex_enter(&vp->v_lock);
1914 		if ((vp->v_flag & VROOT) == 0) {
1915 			/* Someone beat us to the unmount */
1916 			mutex_exit(&vp->v_lock);
1917 			error = EBUSY;
1918 			goto out;
1919 		}
1920 		vfsp = vp->v_vfsp;
1921 		coveredvp = vfsp->vfs_vnodecovered;
1922 		ASSERT(coveredvp);
1923 		/*
1924 		 * Note: Implementation of vn_vfswlock shows that ordering of
1925 		 * v_lock / vn_vfswlock is not an issue here.
1926 		 */
1927 		error = vn_vfswlock(coveredvp);
1928 		mutex_exit(&vp->v_lock);
1929 
1930 		if (error)
1931 			goto out;
1932 
1933 		VN_HOLD(coveredvp);
1934 		VN_RELE(vp);
1935 		error = dounmount(vfsp, 0, CRED());
1936 
1937 		/*
1938 		 * Unmounted the namefs file system; now get
1939 		 * the object it was mounted over.
1940 		 */
1941 		vp = coveredvp;
1942 		/*
1943 		 * If namefs was mounted over a directory, then
1944 		 * we want to use rmdir() instead of unlink().
1945 		 */
1946 		if (vp->v_type == VDIR)
1947 			dirflag = RMDIRECTORY;
1948 
1949 		if (error)
1950 			goto out;
1951 	}
1952 
1953 	/*
1954 	 * Make sure filesystem is writeable.
1955 	 * We check the parent directory's vfs in case this is an lofs vnode.
1956 	 */
1957 	if (dvfsp && dvfsp->vfs_flag & VFS_RDONLY) {
1958 		error = EROFS;
1959 		goto out;
1960 	}
1961 
1962 	vtype = vp->v_type;
1963 
1964 	/*
1965 	 * If there is the possibility of an nbmand share reservation, make
1966 	 * sure it's okay to remove the file.  Keep a reference to the
1967 	 * vnode, so that we can exit the nbl critical region after
1968 	 * calling VOP_REMOVE.
1969 	 * If there is no possibility of an nbmand share reservation,
1970 	 * release the vnode reference now.  Filesystems like NFS may
1971 	 * behave differently if there is an extra reference, so get rid of
1972 	 * this one.  Fortunately, we can't have nbmand mounts on NFS
1973 	 * filesystems.
1974 	 */
1975 	if (nbl_need_check(vp)) {
1976 		nbl_start_crit(vp, RW_READER);
1977 		in_crit = 1;
1978 		if (nbl_conflict(vp, NBL_REMOVE, 0, 0, 0, NULL)) {
1979 			error = EACCES;
1980 			goto out;
1981 		}
1982 	} else {
1983 		VN_RELE(vp);
1984 		vp = NULL;
1985 	}
1986 
1987 	if (dirflag == RMDIRECTORY) {
1988 		/*
1989 		 * Caller is using rmdir(2), which can only be applied to
1990 		 * directories.
1991 		 */
1992 		if (vtype != VDIR) {
1993 			error = ENOTDIR;
1994 		} else {
1995 			vnode_t *cwd;
1996 			proc_t *pp = curproc;
1997 
1998 			mutex_enter(&pp->p_lock);
1999 			cwd = PTOU(pp)->u_cdir;
2000 			VN_HOLD(cwd);
2001 			mutex_exit(&pp->p_lock);
2002 			error = VOP_RMDIR(dvp, pn.pn_path, cwd, CRED(),
2003 			    NULL, 0);
2004 			VN_RELE(cwd);
2005 		}
2006 	} else {
2007 		/*
2008 		 * Unlink(2) can be applied to anything.
2009 		 */
2010 		error = VOP_REMOVE(dvp, pn.pn_path, CRED(), NULL, 0);
2011 	}
2012 
2013 out:
2014 	pn_free(&pn);
2015 	if (in_crit) {
2016 		nbl_end_crit(vp);
2017 		in_crit = 0;
2018 	}
2019 	if (vp != NULL)
2020 		VN_RELE(vp);
2021 	if (dvp != NULL)
2022 		VN_RELE(dvp);
2023 	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
2024 		goto top;
2025 	return (error);
2026 }
2027 
2028 /*
2029  * Utility function to compare equality of vnodes.
2030  * Compare the underlying real vnodes, if there are underlying vnodes.
2031  * This is a more thorough comparison than the VN_CMP() macro provides.
2032  */
2033 int
vn_compare(vnode_t * vp1,vnode_t * vp2)2034 vn_compare(vnode_t *vp1, vnode_t *vp2)
2035 {
2036 	vnode_t *realvp;
2037 
2038 	if (vp1 != NULL && VOP_REALVP(vp1, &realvp, NULL) == 0)
2039 		vp1 = realvp;
2040 	if (vp2 != NULL && VOP_REALVP(vp2, &realvp, NULL) == 0)
2041 		vp2 = realvp;
2042 	return (VN_CMP(vp1, vp2));
2043 }
2044 
/*
 * The number of locks to hash into.  This value must be a power
 * of 2 minus 1 and should probably also be prime.
 *
 * NOTE(review): VN_VFSLOCKS_HASH() below uses this value as a bit mask
 * ("& NUM_BUCKETS"), which is what makes the power-of-2-minus-1 form
 * necessary.  The current value, 1023 (3 * 11 * 31), is not prime.
 */
#define	NUM_BUCKETS	1023

/*
 * One hash bucket: vb_lock is the mutex taken by vn_vfslocks_getlock() and
 * vn_vfslocks_rele() around every walk or update of the vb_list chain.
 * The pad member is sized as 64 bytes minus the two preceding members;
 * presumably this rounds each bucket up to 64 bytes -- confirm against the
 * platform's structure layout.
 */
struct  vn_vfslocks_bucket {
	kmutex_t vb_lock;
	vn_vfslocks_entry_t *vb_list;
	char pad[64 - sizeof (kmutex_t) - sizeof (void *)];
};

/*
 * Total number of buckets will be NUM_BUCKETS + 1, so that every value
 * produced by masking with NUM_BUCKETS is a valid index.
 */

#pragma	align	64(vn_vfslocks_buckets)
static	struct vn_vfslocks_bucket	vn_vfslocks_buckets[NUM_BUCKETS + 1];

/* Number of low-order pointer bits discarded before masking. */
#define	VN_VFSLOCKS_SHIFT	9

/*
 * Map a vfs/vnode pointer to a bucket index: shift the pointer value right
 * by VN_VFSLOCKS_SHIFT bits and mask the result with NUM_BUCKETS.
 */
#define	VN_VFSLOCKS_HASH(vfsvpptr)	\
	((((intptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS)
2068 
2069 /*
2070  * vn_vfslocks_getlock() uses an HASH scheme to generate
2071  * rwstlock using vfs/vnode pointer passed to it.
2072  *
2073  * vn_vfslocks_rele() releases a reference in the
2074  * HASH table which allows the entry allocated by
2075  * vn_vfslocks_getlock() to be freed at a later
2076  * stage when the refcount drops to zero.
2077  */
2078 
2079 vn_vfslocks_entry_t *
vn_vfslocks_getlock(void * vfsvpptr)2080 vn_vfslocks_getlock(void *vfsvpptr)
2081 {
2082 	struct vn_vfslocks_bucket *bp;
2083 	vn_vfslocks_entry_t *vep;
2084 	vn_vfslocks_entry_t *tvep;
2085 
2086 	ASSERT(vfsvpptr != NULL);
2087 	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vfsvpptr)];
2088 
2089 	mutex_enter(&bp->vb_lock);
2090 	for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
2091 		if (vep->ve_vpvfs == vfsvpptr) {
2092 			vep->ve_refcnt++;
2093 			mutex_exit(&bp->vb_lock);
2094 			return (vep);
2095 		}
2096 	}
2097 	mutex_exit(&bp->vb_lock);
2098 	vep = kmem_alloc(sizeof (*vep), KM_SLEEP);
2099 	rwst_init(&vep->ve_lock, NULL, RW_DEFAULT, NULL);
2100 	vep->ve_vpvfs = (char *)vfsvpptr;
2101 	vep->ve_refcnt = 1;
2102 	mutex_enter(&bp->vb_lock);
2103 	for (tvep = bp->vb_list; tvep != NULL; tvep = tvep->ve_next) {
2104 		if (tvep->ve_vpvfs == vfsvpptr) {
2105 			tvep->ve_refcnt++;
2106 			mutex_exit(&bp->vb_lock);
2107 
2108 			/*
2109 			 * There is already an entry in the hash
2110 			 * destroy what we just allocated.
2111 			 */
2112 			rwst_destroy(&vep->ve_lock);
2113 			kmem_free(vep, sizeof (*vep));
2114 			return (tvep);
2115 		}
2116 	}
2117 	vep->ve_next = bp->vb_list;
2118 	bp->vb_list = vep;
2119 	mutex_exit(&bp->vb_lock);
2120 	return (vep);
2121 }
2122 
2123 void
vn_vfslocks_rele(vn_vfslocks_entry_t * vepent)2124 vn_vfslocks_rele(vn_vfslocks_entry_t *vepent)
2125 {
2126 	struct vn_vfslocks_bucket *bp;
2127 	vn_vfslocks_entry_t *vep;
2128 	vn_vfslocks_entry_t *pvep;
2129 
2130 	ASSERT(vepent != NULL);
2131 	ASSERT(vepent->ve_vpvfs != NULL);
2132 
2133 	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vepent->ve_vpvfs)];
2134 
2135 	mutex_enter(&bp->vb_lock);
2136 	vepent->ve_refcnt--;
2137 
2138 	if ((int32_t)vepent->ve_refcnt < 0)
2139 		cmn_err(CE_PANIC, "vn_vfslocks_rele: refcount negative");
2140 
2141 	pvep = NULL;
2142 	if (vepent->ve_refcnt == 0) {
2143 		for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
2144 			if (vep->ve_vpvfs == vepent->ve_vpvfs) {
2145 				if (pvep == NULL)
2146 					bp->vb_list = vep->ve_next;
2147 				else {
2148 					pvep->ve_next = vep->ve_next;
2149 				}
2150 				mutex_exit(&bp->vb_lock);
2151 				rwst_destroy(&vep->ve_lock);
2152 				kmem_free(vep, sizeof (*vep));
2153 				return;
2154 			}
2155 			pvep = vep;
2156 		}
2157 		cmn_err(CE_PANIC, "vn_vfslocks_rele: vp/vfs not found");
2158 	}
2159 	mutex_exit(&bp->vb_lock);
2160 }
2161 
2162 /*
2163  * vn_vfswlock_wait is used to implement a lock which is logically a writers
2164  * lock protecting the v_vfsmountedhere field.
2165  * vn_vfswlock_wait has been modified to be similar to vn_vfswlock,
2166  * except that it blocks to acquire the lock VVFSLOCK.
2167  *
2168  * traverse() and routines re-implementing part of traverse (e.g. autofs)
2169  * need to hold this lock. mount(), vn_rename(), vn_remove() and so on
2170  * need the non-blocking version of the writers lock i.e. vn_vfswlock
2171  */
2172 int
vn_vfswlock_wait(vnode_t * vp)2173 vn_vfswlock_wait(vnode_t *vp)
2174 {
2175 	int retval;
2176 	vn_vfslocks_entry_t *vpvfsentry;
2177 	ASSERT(vp != NULL);
2178 
2179 	vpvfsentry = vn_vfslocks_getlock(vp);
2180 	retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_WRITER);
2181 
2182 	if (retval == EINTR) {
2183 		vn_vfslocks_rele(vpvfsentry);
2184 		return (EINTR);
2185 	}
2186 	return (retval);
2187 }
2188 
2189 int
vn_vfsrlock_wait(vnode_t * vp)2190 vn_vfsrlock_wait(vnode_t *vp)
2191 {
2192 	int retval;
2193 	vn_vfslocks_entry_t *vpvfsentry;
2194 	ASSERT(vp != NULL);
2195 
2196 	vpvfsentry = vn_vfslocks_getlock(vp);
2197 	retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_READER);
2198 
2199 	if (retval == EINTR) {
2200 		vn_vfslocks_rele(vpvfsentry);
2201 		return (EINTR);
2202 	}
2203 
2204 	return (retval);
2205 }
2206 
2207 
2208 /*
2209  * vn_vfswlock is used to implement a lock which is logically a writers lock
2210  * protecting the v_vfsmountedhere field.
2211  */
2212 int
vn_vfswlock(vnode_t * vp)2213 vn_vfswlock(vnode_t *vp)
2214 {
2215 	vn_vfslocks_entry_t *vpvfsentry;
2216 
2217 	/*
2218 	 * If vp is NULL then somebody is trying to lock the covered vnode
2219 	 * of /.  (vfs_vnodecovered is NULL for /).  This situation will
2220 	 * only happen when unmounting /.  Since that operation will fail
2221 	 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2222 	 */
2223 	if (vp == NULL)
2224 		return (EBUSY);
2225 
2226 	vpvfsentry = vn_vfslocks_getlock(vp);
2227 
2228 	if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
2229 		return (0);
2230 
2231 	vn_vfslocks_rele(vpvfsentry);
2232 	return (EBUSY);
2233 }
2234 
2235 int
vn_vfsrlock(vnode_t * vp)2236 vn_vfsrlock(vnode_t *vp)
2237 {
2238 	vn_vfslocks_entry_t *vpvfsentry;
2239 
2240 	/*
2241 	 * If vp is NULL then somebody is trying to lock the covered vnode
2242 	 * of /.  (vfs_vnodecovered is NULL for /).  This situation will
2243 	 * only happen when unmounting /.  Since that operation will fail
2244 	 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2245 	 */
2246 	if (vp == NULL)
2247 		return (EBUSY);
2248 
2249 	vpvfsentry = vn_vfslocks_getlock(vp);
2250 
2251 	if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
2252 		return (0);
2253 
2254 	vn_vfslocks_rele(vpvfsentry);
2255 	return (EBUSY);
2256 }
2257 
2258 void
vn_vfsunlock(vnode_t * vp)2259 vn_vfsunlock(vnode_t *vp)
2260 {
2261 	vn_vfslocks_entry_t *vpvfsentry;
2262 
2263 	/*
2264 	 * ve_refcnt needs to be decremented twice.
2265 	 * 1. To release refernce after a call to vn_vfslocks_getlock()
2266 	 * 2. To release the reference from the locking routines like
2267 	 *    vn_vfsrlock/vn_vfswlock etc,.
2268 	 */
2269 	vpvfsentry = vn_vfslocks_getlock(vp);
2270 	vn_vfslocks_rele(vpvfsentry);
2271 
2272 	rwst_exit(&vpvfsentry->ve_lock);
2273 	vn_vfslocks_rele(vpvfsentry);
2274 }
2275 
2276 int
vn_vfswlock_held(vnode_t * vp)2277 vn_vfswlock_held(vnode_t *vp)
2278 {
2279 	int held;
2280 	vn_vfslocks_entry_t *vpvfsentry;
2281 
2282 	ASSERT(vp != NULL);
2283 
2284 	vpvfsentry = vn_vfslocks_getlock(vp);
2285 	held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
2286 
2287 	vn_vfslocks_rele(vpvfsentry);
2288 	return (held);
2289 }
2290 
2291 
2292 int
vn_make_ops(const char * name,const fs_operation_def_t * templ,vnodeops_t ** actual)2293 vn_make_ops(
2294 	const char *name,			/* Name of file system */
2295 	const fs_operation_def_t *templ,	/* Operation specification */
2296 	vnodeops_t **actual)			/* Return the vnodeops */
2297 {
2298 	int unused_ops;
2299 	int error;
2300 
2301 	*actual = (vnodeops_t *)kmem_alloc(sizeof (vnodeops_t), KM_SLEEP);
2302 
2303 	(*actual)->vnop_name = name;
2304 
2305 	error = fs_build_vector(*actual, &unused_ops, vn_ops_table, templ);
2306 	if (error) {
2307 		kmem_free(*actual, sizeof (vnodeops_t));
2308 	}
2309 
2310 #if DEBUG
2311 	if (unused_ops != 0)
2312 		cmn_err(CE_WARN, "vn_make_ops: %s: %d operations supplied "
2313 		    "but not used", name, unused_ops);
2314 #endif
2315 
2316 	return (error);
2317 }
2318 
2319 /*
2320  * Free the vnodeops created as a result of vn_make_ops()
2321  */
2322 void
vn_freevnodeops(vnodeops_t * vnops)2323 vn_freevnodeops(vnodeops_t *vnops)
2324 {
2325 	kmem_free(vnops, sizeof (vnodeops_t));
2326 }
2327 
2328 /*
2329  * Vnode cache.
2330  */
2331 
2332 /* ARGSUSED */
2333 static int
vn_cache_constructor(void * buf,void * cdrarg,int kmflags)2334 vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
2335 {
2336 	struct vnode *vp;
2337 
2338 	vp = buf;
2339 
2340 	mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
2341 	mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL);
2342 	cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
2343 	rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
2344 	vp->v_femhead = NULL;	/* Must be done before vn_reinit() */
2345 	vp->v_path = vn_vpath_empty;
2346 	vp->v_path_stamp = 0;
2347 	vp->v_mpssdata = NULL;
2348 	vp->v_vsd = NULL;
2349 	vp->v_fopdata = NULL;
2350 
2351 	return (0);
2352 }
2353 
2354 /* ARGSUSED */
2355 static void
vn_cache_destructor(void * buf,void * cdrarg)2356 vn_cache_destructor(void *buf, void *cdrarg)
2357 {
2358 	struct vnode *vp;
2359 
2360 	vp = buf;
2361 
2362 	rw_destroy(&vp->v_nbllock);
2363 	cv_destroy(&vp->v_cv);
2364 	mutex_destroy(&vp->v_vsd_lock);
2365 	mutex_destroy(&vp->v_lock);
2366 }
2367 
2368 void
vn_create_cache(void)2369 vn_create_cache(void)
2370 {
2371 	/* LINTED */
2372 	ASSERT((1 << VNODE_ALIGN_LOG2) ==
2373 	    P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN));
2374 	vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode),
2375 	    VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL,
2376 	    NULL, 0);
2377 }
2378 
2379 void
vn_destroy_cache(void)2380 vn_destroy_cache(void)
2381 {
2382 	kmem_cache_destroy(vn_cache);
2383 }
2384 
2385 /*
2386  * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
2387  * cached by the file system and vnodes remain associated.
2388  */
2389 void
vn_recycle(vnode_t * vp)2390 vn_recycle(vnode_t *vp)
2391 {
2392 	ASSERT(vp->v_pages == NULL);
2393 	VERIFY(vp->v_path != NULL);
2394 
2395 	/*
2396 	 * XXX - This really belongs in vn_reinit(), but we have some issues
2397 	 * with the counts.  Best to have it here for clean initialization.
2398 	 */
2399 	vp->v_rdcnt = 0;
2400 	vp->v_wrcnt = 0;
2401 	vp->v_mmap_read = 0;
2402 	vp->v_mmap_write = 0;
2403 
2404 	/*
2405 	 * If FEM was in use, make sure everything gets cleaned up
2406 	 * NOTE: vp->v_femhead is initialized to NULL in the vnode
2407 	 * constructor.
2408 	 */
2409 	if (vp->v_femhead) {
2410 		/* XXX - There should be a free_femhead() that does all this */
2411 		ASSERT(vp->v_femhead->femh_list == NULL);
2412 		mutex_destroy(&vp->v_femhead->femh_lock);
2413 		kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2414 		vp->v_femhead = NULL;
2415 	}
2416 	if (vp->v_path != vn_vpath_empty) {
2417 		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2418 		vp->v_path = vn_vpath_empty;
2419 	}
2420 	vp->v_path_stamp = 0;
2421 
2422 	if (vp->v_fopdata != NULL) {
2423 		free_fopdata(vp);
2424 	}
2425 	vp->v_mpssdata = NULL;
2426 	vsd_free(vp);
2427 }
2428 
2429 /*
2430  * Used to reset the vnode fields including those that are directly accessible
2431  * as well as those which require an accessor function.
2432  *
2433  * Does not initialize:
2434  *	synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
2435  *	v_data (since FS-nodes and vnodes point to each other and should
2436  *		be updated simultaneously)
2437  *	v_op (in case someone needs to make a VOP call on this object)
2438  */
2439 void
vn_reinit(vnode_t * vp)2440 vn_reinit(vnode_t *vp)
2441 {
2442 	vp->v_count = 1;
2443 	vp->v_count_dnlc = 0;
2444 	vp->v_vfsp = NULL;
2445 	vp->v_stream = NULL;
2446 	vp->v_vfsmountedhere = NULL;
2447 	vp->v_flag = 0;
2448 	vp->v_type = VNON;
2449 	vp->v_rdev = NODEV;
2450 
2451 	vp->v_filocks = NULL;
2452 	vp->v_shrlocks = NULL;
2453 	vp->v_pages = NULL;
2454 
2455 	vp->v_locality = NULL;
2456 	vp->v_xattrdir = NULL;
2457 
2458 	/*
2459 	 * In a few specific instances, vn_reinit() is used to initialize
2460 	 * locally defined vnode_t instances.  Lacking the construction offered
2461 	 * by vn_alloc(), these vnodes require v_path initialization.
2462 	 */
2463 	if (vp->v_path == NULL) {
2464 		vp->v_path = vn_vpath_empty;
2465 	}
2466 
2467 	/* Handles v_femhead, v_path, and the r/w/map counts */
2468 	vn_recycle(vp);
2469 }
2470 
2471 vnode_t *
vn_alloc(int kmflag)2472 vn_alloc(int kmflag)
2473 {
2474 	vnode_t *vp;
2475 
2476 	vp = kmem_cache_alloc(vn_cache, kmflag);
2477 
2478 	if (vp != NULL) {
2479 		vp->v_femhead = NULL;	/* Must be done before vn_reinit() */
2480 		vp->v_fopdata = NULL;
2481 		vn_reinit(vp);
2482 	}
2483 
2484 	return (vp);
2485 }
2486 
2487 void
vn_free(vnode_t * vp)2488 vn_free(vnode_t *vp)
2489 {
2490 	ASSERT(vp->v_shrlocks == NULL);
2491 	ASSERT(vp->v_filocks == NULL);
2492 
2493 	/*
2494 	 * Some file systems call vn_free() with v_count of zero,
2495 	 * some with v_count of 1.  In any case, the value should
2496 	 * never be anything else.
2497 	 */
2498 	ASSERT((vp->v_count == 0) || (vp->v_count == 1));
2499 	ASSERT(vp->v_count_dnlc == 0);
2500 	VERIFY(vp->v_path != NULL);
2501 	if (vp->v_path != vn_vpath_empty) {
2502 		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2503 		vp->v_path = vn_vpath_empty;
2504 	}
2505 
2506 	/* If FEM was in use, make sure everything gets cleaned up */
2507 	if (vp->v_femhead) {
2508 		/* XXX - There should be a free_femhead() that does all this */
2509 		ASSERT(vp->v_femhead->femh_list == NULL);
2510 		mutex_destroy(&vp->v_femhead->femh_lock);
2511 		kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2512 		vp->v_femhead = NULL;
2513 	}
2514 
2515 	if (vp->v_fopdata != NULL) {
2516 		free_fopdata(vp);
2517 	}
2518 	vp->v_mpssdata = NULL;
2519 	vsd_free(vp);
2520 	kmem_cache_free(vn_cache, vp);
2521 }
2522 
2523 /*
2524  * vnode status changes, should define better states than 1, 0.
2525  */
2526 void
vn_reclaim(vnode_t * vp)2527 vn_reclaim(vnode_t *vp)
2528 {
2529 	vfs_t   *vfsp = vp->v_vfsp;
2530 
2531 	if (vfsp == NULL ||
2532 	    vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2533 		return;
2534 	}
2535 	(void) VFS_VNSTATE(vfsp, vp, VNTRANS_RECLAIMED);
2536 }
2537 
2538 void
vn_idle(vnode_t * vp)2539 vn_idle(vnode_t *vp)
2540 {
2541 	vfs_t   *vfsp = vp->v_vfsp;
2542 
2543 	if (vfsp == NULL ||
2544 	    vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2545 		return;
2546 	}
2547 	(void) VFS_VNSTATE(vfsp, vp, VNTRANS_IDLED);
2548 }
2549 void
vn_exists(vnode_t * vp)2550 vn_exists(vnode_t *vp)
2551 {
2552 	vfs_t   *vfsp = vp->v_vfsp;
2553 
2554 	if (vfsp == NULL ||
2555 	    vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2556 		return;
2557 	}
2558 	(void) VFS_VNSTATE(vfsp, vp, VNTRANS_EXISTS);
2559 }
2560 
2561 void
vn_invalid(vnode_t * vp)2562 vn_invalid(vnode_t *vp)
2563 {
2564 	vfs_t   *vfsp = vp->v_vfsp;
2565 
2566 	if (vfsp == NULL ||
2567 	    vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2568 		return;
2569 	}
2570 	(void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED);
2571 }
2572 
2573 /* Vnode event notification */
2574 
2575 int
vnevent_support(vnode_t * vp,caller_context_t * ct)2576 vnevent_support(vnode_t *vp, caller_context_t *ct)
2577 {
2578 	if (vp == NULL)
2579 		return (EINVAL);
2580 
2581 	return (VOP_VNEVENT(vp, VE_SUPPORT, NULL, NULL, ct));
2582 }
2583 
2584 void
vnevent_rename_src(vnode_t * vp,vnode_t * dvp,char * name,caller_context_t * ct)2585 vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2586 {
2587 	if (vp == NULL || vp->v_femhead == NULL) {
2588 		return;
2589 	}
2590 	(void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct);
2591 }
2592 
2593 void
vnevent_rename_dest(vnode_t * vp,vnode_t * dvp,char * name,caller_context_t * ct)2594 vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2595     caller_context_t *ct)
2596 {
2597 	if (vp == NULL || vp->v_femhead == NULL) {
2598 		return;
2599 	}
2600 	(void) VOP_VNEVENT(vp, VE_RENAME_DEST, dvp, name, ct);
2601 }
2602 
2603 void
vnevent_rename_dest_dir(vnode_t * vp,caller_context_t * ct)2604 vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct)
2605 {
2606 	if (vp == NULL || vp->v_femhead == NULL) {
2607 		return;
2608 	}
2609 	(void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct);
2610 }
2611 
2612 void
vnevent_remove(vnode_t * vp,vnode_t * dvp,char * name,caller_context_t * ct)2613 vnevent_remove(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2614 {
2615 	if (vp == NULL || vp->v_femhead == NULL) {
2616 		return;
2617 	}
2618 	(void) VOP_VNEVENT(vp, VE_REMOVE, dvp, name, ct);
2619 }
2620 
2621 void
vnevent_rmdir(vnode_t * vp,vnode_t * dvp,char * name,caller_context_t * ct)2622 vnevent_rmdir(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2623 {
2624 	if (vp == NULL || vp->v_femhead == NULL) {
2625 		return;
2626 	}
2627 	(void) VOP_VNEVENT(vp, VE_RMDIR, dvp, name, ct);
2628 }
2629 
2630 void
vnevent_pre_rename_src(vnode_t * vp,vnode_t * dvp,char * name,caller_context_t * ct)2631 vnevent_pre_rename_src(vnode_t *vp, vnode_t *dvp, char *name,
2632     caller_context_t *ct)
2633 {
2634 	if (vp == NULL || vp->v_femhead == NULL) {
2635 		return;
2636 	}
2637 	(void) VOP_VNEVENT(vp, VE_PRE_RENAME_SRC, dvp, name, ct);
2638 }
2639 
2640 void
vnevent_pre_rename_dest(vnode_t * vp,vnode_t * dvp,char * name,caller_context_t * ct)2641 vnevent_pre_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2642     caller_context_t *ct)
2643 {
2644 	if (vp == NULL || vp->v_femhead == NULL) {
2645 		return;
2646 	}
2647 	(void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST, dvp, name, ct);
2648 }
2649 
2650 void
vnevent_pre_rename_dest_dir(vnode_t * vp,vnode_t * nvp,char * name,caller_context_t * ct)2651 vnevent_pre_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
2652     caller_context_t *ct)
2653 {
2654 	if (vp == NULL || vp->v_femhead == NULL) {
2655 		return;
2656 	}
2657 	(void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST_DIR, nvp, name, ct);
2658 }
2659 
2660 void
vnevent_create(vnode_t * vp,caller_context_t * ct)2661 vnevent_create(vnode_t *vp, caller_context_t *ct)
2662 {
2663 	if (vp == NULL || vp->v_femhead == NULL) {
2664 		return;
2665 	}
2666 	(void) VOP_VNEVENT(vp, VE_CREATE, NULL, NULL, ct);
2667 }
2668 
2669 void
vnevent_link(vnode_t * vp,caller_context_t * ct)2670 vnevent_link(vnode_t *vp, caller_context_t *ct)
2671 {
2672 	if (vp == NULL || vp->v_femhead == NULL) {
2673 		return;
2674 	}
2675 	(void) VOP_VNEVENT(vp, VE_LINK, NULL, NULL, ct);
2676 }
2677 
2678 void
vnevent_mountedover(vnode_t * vp,caller_context_t * ct)2679 vnevent_mountedover(vnode_t *vp, caller_context_t *ct)
2680 {
2681 	if (vp == NULL || vp->v_femhead == NULL) {
2682 		return;
2683 	}
2684 	(void) VOP_VNEVENT(vp, VE_MOUNTEDOVER, NULL, NULL, ct);
2685 }
2686 
2687 void
vnevent_truncate(vnode_t * vp,caller_context_t * ct)2688 vnevent_truncate(vnode_t *vp, caller_context_t *ct)
2689 {
2690 	if (vp == NULL || vp->v_femhead == NULL) {
2691 		return;
2692 	}
2693 	(void) VOP_VNEVENT(vp, VE_TRUNCATE, NULL, NULL, ct);
2694 }
2695 
2696 /*
2697  * Vnode accessors.
2698  */
2699 
2700 int
vn_is_readonly(vnode_t * vp)2701 vn_is_readonly(vnode_t *vp)
2702 {
2703 	return (vp->v_vfsp->vfs_flag & VFS_RDONLY);
2704 }
2705 
2706 int
vn_has_flocks(vnode_t * vp)2707 vn_has_flocks(vnode_t *vp)
2708 {
2709 	return (vp->v_filocks != NULL);
2710 }
2711 
2712 int
vn_has_mandatory_locks(vnode_t * vp,int mode)2713 vn_has_mandatory_locks(vnode_t *vp, int mode)
2714 {
2715 	return ((vp->v_filocks != NULL) && (MANDLOCK(vp, mode)));
2716 }
2717 
2718 int
vn_has_cached_data(vnode_t * vp)2719 vn_has_cached_data(vnode_t *vp)
2720 {
2721 	return (vp->v_pages != NULL);
2722 }
2723 
2724 /*
2725  * Return 0 if the vnode in question shouldn't be permitted into a zone via
2726  * zone_enter(2).
2727  */
2728 int
vn_can_change_zones(vnode_t * vp)2729 vn_can_change_zones(vnode_t *vp)
2730 {
2731 	struct vfssw *vswp;
2732 	int allow = 1;
2733 	vnode_t *rvp;
2734 
2735 	if (nfs_global_client_only != 0)
2736 		return (1);
2737 
2738 	/*
2739 	 * We always want to look at the underlying vnode if there is one.
2740 	 */
2741 	if (VOP_REALVP(vp, &rvp, NULL) != 0)
2742 		rvp = vp;
2743 	/*
2744 	 * Some pseudo filesystems (including doorfs) don't actually register
2745 	 * their vfsops_t, so the following may return NULL; we happily let
2746 	 * such vnodes switch zones.
2747 	 */
2748 	vswp = vfs_getvfsswbyvfsops(vfs_getops(rvp->v_vfsp));
2749 	if (vswp != NULL) {
2750 		if (vswp->vsw_flag & VSW_NOTZONESAFE)
2751 			allow = 0;
2752 		vfs_unrefvfssw(vswp);
2753 	}
2754 	return (allow);
2755 }
2756 
2757 /*
2758  * Return nonzero if the vnode is a mount point, zero if not.
2759  */
2760 int
vn_ismntpt(vnode_t * vp)2761 vn_ismntpt(vnode_t *vp)
2762 {
2763 	return (vp->v_vfsmountedhere != NULL);
2764 }
2765 
2766 /* Retrieve the vfs (if any) mounted on this vnode */
2767 vfs_t *
vn_mountedvfs(vnode_t * vp)2768 vn_mountedvfs(vnode_t *vp)
2769 {
2770 	return (vp->v_vfsmountedhere);
2771 }
2772 
2773 /*
2774  * Return nonzero if the vnode is referenced by the dnlc, zero if not.
2775  */
2776 int
vn_in_dnlc(vnode_t * vp)2777 vn_in_dnlc(vnode_t *vp)
2778 {
2779 	return (vp->v_count_dnlc > 0);
2780 }
2781 
2782 /*
2783  * vn_has_other_opens() checks whether a particular file is opened by more than
2784  * just the caller and whether the open is for read and/or write.
2785  * This routine is for calling after the caller has already called VOP_OPEN()
2786  * and the caller wishes to know if they are the only one with it open for
2787  * the mode(s) specified.
2788  *
2789  * Vnode counts are only kept on regular files (v_type=VREG).
2790  */
2791 int
vn_has_other_opens(vnode_t * vp,v_mode_t mode)2792 vn_has_other_opens(
2793 	vnode_t *vp,
2794 	v_mode_t mode)
2795 {
2796 
2797 	ASSERT(vp != NULL);
2798 
2799 	switch (mode) {
2800 	case V_WRITE:
2801 		if (vp->v_wrcnt > 1)
2802 			return (V_TRUE);
2803 		break;
2804 	case V_RDORWR:
2805 		if ((vp->v_rdcnt > 1) || (vp->v_wrcnt > 1))
2806 			return (V_TRUE);
2807 		break;
2808 	case V_RDANDWR:
2809 		if ((vp->v_rdcnt > 1) && (vp->v_wrcnt > 1))
2810 			return (V_TRUE);
2811 		break;
2812 	case V_READ:
2813 		if (vp->v_rdcnt > 1)
2814 			return (V_TRUE);
2815 		break;
2816 	}
2817 
2818 	return (V_FALSE);
2819 }
2820 
2821 /*
2822  * vn_is_opened() checks whether a particular file is opened and
2823  * whether the open is for read and/or write.
2824  *
2825  * Vnode counts are only kept on regular files (v_type=VREG).
2826  */
2827 int
vn_is_opened(vnode_t * vp,v_mode_t mode)2828 vn_is_opened(
2829 	vnode_t *vp,
2830 	v_mode_t mode)
2831 {
2832 
2833 	ASSERT(vp != NULL);
2834 
2835 	switch (mode) {
2836 	case V_WRITE:
2837 		if (vp->v_wrcnt)
2838 			return (V_TRUE);
2839 		break;
2840 	case V_RDANDWR:
2841 		if (vp->v_rdcnt && vp->v_wrcnt)
2842 			return (V_TRUE);
2843 		break;
2844 	case V_RDORWR:
2845 		if (vp->v_rdcnt || vp->v_wrcnt)
2846 			return (V_TRUE);
2847 		break;
2848 	case V_READ:
2849 		if (vp->v_rdcnt)
2850 			return (V_TRUE);
2851 		break;
2852 	}
2853 
2854 	return (V_FALSE);
2855 }
2856 
2857 /*
2858  * vn_is_mapped() checks whether a particular file is mapped and whether
2859  * the file is mapped read and/or write.
2860  */
2861 int
vn_is_mapped(vnode_t * vp,v_mode_t mode)2862 vn_is_mapped(
2863 	vnode_t *vp,
2864 	v_mode_t mode)
2865 {
2866 
2867 	ASSERT(vp != NULL);
2868 
2869 #if !defined(_LP64)
2870 	switch (mode) {
2871 	/*
2872 	 * The atomic_add_64_nv functions force atomicity in the
2873 	 * case of 32 bit architectures. Otherwise the 64 bit values
2874 	 * require two fetches. The value of the fields may be
2875 	 * (potentially) changed between the first fetch and the
2876 	 * second
2877 	 */
2878 	case V_WRITE:
2879 		if (atomic_add_64_nv((&(vp->v_mmap_write)), 0))
2880 			return (V_TRUE);
2881 		break;
2882 	case V_RDANDWR:
2883 		if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) &&
2884 		    (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2885 			return (V_TRUE);
2886 		break;
2887 	case V_RDORWR:
2888 		if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) ||
2889 		    (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2890 			return (V_TRUE);
2891 		break;
2892 	case V_READ:
2893 		if (atomic_add_64_nv((&(vp->v_mmap_read)), 0))
2894 			return (V_TRUE);
2895 		break;
2896 	}
2897 #else
2898 	switch (mode) {
2899 	case V_WRITE:
2900 		if (vp->v_mmap_write)
2901 			return (V_TRUE);
2902 		break;
2903 	case V_RDANDWR:
2904 		if (vp->v_mmap_read && vp->v_mmap_write)
2905 			return (V_TRUE);
2906 		break;
2907 	case V_RDORWR:
2908 		if (vp->v_mmap_read || vp->v_mmap_write)
2909 			return (V_TRUE);
2910 		break;
2911 	case V_READ:
2912 		if (vp->v_mmap_read)
2913 			return (V_TRUE);
2914 		break;
2915 	}
2916 #endif
2917 
2918 	return (V_FALSE);
2919 }
2920 
2921 /*
2922  * Set the operations vector for a vnode.
2923  *
2924  * FEM ensures that the v_femhead pointer is filled in before the
2925  * v_op pointer is changed.  This means that if the v_femhead pointer
2926  * is NULL, and the v_op field hasn't changed since before which checked
2927  * the v_femhead pointer; then our update is ok - we are not racing with
2928  * FEM.
2929  */
2930 void
vn_setops(vnode_t * vp,vnodeops_t * vnodeops)2931 vn_setops(vnode_t *vp, vnodeops_t *vnodeops)
2932 {
2933 	vnodeops_t	*op;
2934 
2935 	ASSERT(vp != NULL);
2936 	ASSERT(vnodeops != NULL);
2937 
2938 	op = vp->v_op;
2939 	membar_consumer();
2940 	/*
2941 	 * If vp->v_femhead == NULL, then we'll call atomic_cas_ptr() to do
2942 	 * the compare-and-swap on vp->v_op.  If either fails, then FEM is
2943 	 * in effect on the vnode and we need to have FEM deal with it.
2944 	 */
2945 	if (vp->v_femhead != NULL || atomic_cas_ptr(&vp->v_op, op, vnodeops) !=
2946 	    op) {
2947 		fem_setvnops(vp, vnodeops);
2948 	}
2949 }
2950 
2951 /*
2952  * Retrieve the operations vector for a vnode
2953  * As with vn_setops(above); make sure we aren't racing with FEM.
2954  * FEM sets the v_op to a special, internal, vnodeops that wouldn't
2955  * make sense to the callers of this routine.
2956  */
2957 vnodeops_t *
vn_getops(vnode_t * vp)2958 vn_getops(vnode_t *vp)
2959 {
2960 	vnodeops_t	*op;
2961 
2962 	ASSERT(vp != NULL);
2963 
2964 	op = vp->v_op;
2965 	membar_consumer();
2966 	if (vp->v_femhead == NULL && op == vp->v_op) {
2967 		return (op);
2968 	} else {
2969 		return (fem_getvnops(vp));
2970 	}
2971 }
2972 
2973 /*
2974  * Returns non-zero (1) if the vnodeops matches that of the vnode.
2975  * Returns zero (0) if not.
2976  */
2977 int
vn_matchops(vnode_t * vp,vnodeops_t * vnodeops)2978 vn_matchops(vnode_t *vp, vnodeops_t *vnodeops)
2979 {
2980 	return (vn_getops(vp) == vnodeops);
2981 }
2982 
2983 /*
2984  * Returns non-zero (1) if the specified operation matches the
2985  * corresponding operation for that the vnode.
2986  * Returns zero (0) if not.
2987  */
2988 
2989 #define	MATCHNAME(n1, n2) (((n1)[0] == (n2)[0]) && (strcmp((n1), (n2)) == 0))
2990 
2991 int
vn_matchopval(vnode_t * vp,char * vopname,fs_generic_func_p funcp)2992 vn_matchopval(vnode_t *vp, char *vopname, fs_generic_func_p funcp)
2993 {
2994 	const fs_operation_trans_def_t *otdp;
2995 	fs_generic_func_p *loc = NULL;
2996 	vnodeops_t	*vop = vn_getops(vp);
2997 
2998 	ASSERT(vopname != NULL);
2999 
3000 	for (otdp = vn_ops_table; otdp->name != NULL; otdp++) {
3001 		if (MATCHNAME(otdp->name, vopname)) {
3002 			loc = (fs_generic_func_p *)
3003 			    ((char *)(vop) + otdp->offset);
3004 			break;
3005 		}
3006 	}
3007 
3008 	return ((loc != NULL) && (*loc == funcp));
3009 }
3010 
3011 /*
3012  * fs_new_caller_id() needs to return a unique ID on a given local system.
3013  * The IDs do not need to survive across reboots.  These are primarily
3014  * used so that (FEM) monitors can detect particular callers (such as
3015  * the NFS server) to a given vnode/vfs operation.
3016  */
3017 u_longlong_t
fs_new_caller_id()3018 fs_new_caller_id()
3019 {
3020 	static uint64_t next_caller_id = 0LL; /* First call returns 1 */
3021 
3022 	return ((u_longlong_t)atomic_inc_64_nv(&next_caller_id));
3023 }
3024 
/*
 * The value stored in v_path is relative to rootdir, located in the global
 * zone.  Zones or chroot environments which reside deeper inside the VFS
 * hierarchy will have a relative view of MAXPATHLEN since they are unaware of
 * what lies below their perceived root.  In order to keep v_path usable for
 * these child environments, its allocations are allowed to exceed MAXPATHLEN.
 *
 * An upper bound of max_vnode_path is placed upon v_path allocations to
 * prevent the system from going too wild at the behest of pathological
 * behavior from the operator.
 *
 * vn_setpath_common() compares the full allocation size, trailing NUL
 * included, against this limit and returns without updating v_path when
 * the limit would be exceeded.
 */
size_t max_vnode_path = 4 * MAXPATHLEN;
3037 
3038 
3039 void
vn_clearpath(vnode_t * vp,hrtime_t compare_stamp)3040 vn_clearpath(vnode_t *vp, hrtime_t compare_stamp)
3041 {
3042 	char *buf;
3043 
3044 	mutex_enter(&vp->v_lock);
3045 	/*
3046 	 * If the snapshot of v_path_stamp passed in via compare_stamp does not
3047 	 * match the present value on the vnode, it indicates that subsequent
3048 	 * changes have occurred.  The v_path value is not cleared in this case
3049 	 * since the new value may be valid.
3050 	 */
3051 	if (compare_stamp != 0 && vp->v_path_stamp != compare_stamp) {
3052 		mutex_exit(&vp->v_lock);
3053 		return;
3054 	}
3055 	buf = vp->v_path;
3056 	vp->v_path = vn_vpath_empty;
3057 	vp->v_path_stamp = 0;
3058 	mutex_exit(&vp->v_lock);
3059 	if (buf != vn_vpath_empty) {
3060 		kmem_free(buf, strlen(buf) + 1);
3061 	}
3062 }
3063 
3064 static void
vn_setpath_common(vnode_t * pvp,vnode_t * vp,const char * name,size_t len,boolean_t is_rename)3065 vn_setpath_common(vnode_t *pvp, vnode_t *vp, const char *name, size_t len,
3066     boolean_t is_rename)
3067 {
3068 	char *buf, *oldbuf;
3069 	hrtime_t pstamp;
3070 	size_t baselen, buflen = 0;
3071 
3072 	/* Handle the vn_setpath_str case. */
3073 	if (pvp == NULL) {
3074 		if (len + 1 > max_vnode_path) {
3075 			DTRACE_PROBE4(vn__setpath__too__long, vnode_t *, pvp,
3076 			    vnode_t *, vp, char *, name, size_t, len + 1);
3077 			return;
3078 		}
3079 		buf = kmem_alloc(len + 1, KM_SLEEP);
3080 		bcopy(name, buf, len);
3081 		buf[len] = '\0';
3082 
3083 		mutex_enter(&vp->v_lock);
3084 		oldbuf = vp->v_path;
3085 		vp->v_path = buf;
3086 		vp->v_path_stamp = gethrtime();
3087 		mutex_exit(&vp->v_lock);
3088 		if (oldbuf != vn_vpath_empty) {
3089 			kmem_free(oldbuf, strlen(oldbuf) + 1);
3090 		}
3091 		return;
3092 	}
3093 
3094 	/* Take snapshot of parent dir */
3095 	mutex_enter(&pvp->v_lock);
3096 
3097 	if ((pvp->v_flag & VTRAVERSE) != 0) {
3098 		/*
3099 		 * When the parent vnode has VTRAVERSE set in its flags, normal
3100 		 * assumptions about v_path calculation no longer apply.  The
3101 		 * primary situation where this occurs is via the VFS tricks
3102 		 * which procfs plays in order to allow /proc/PID/(root|cwd) to
3103 		 * yield meaningful results.
3104 		 *
3105 		 * When this flag is set, v_path on the child must not be
3106 		 * updated since the calculated value is likely to be
3107 		 * incorrect, given the current context.
3108 		 */
3109 		mutex_exit(&pvp->v_lock);
3110 		return;
3111 	}
3112 
3113 retrybuf:
3114 	if (pvp->v_path == vn_vpath_empty) {
3115 		/*
3116 		 * Without v_path from the parent directory, generating a child
3117 		 * path from the name is impossible.
3118 		 */
3119 		if (len > 0) {
3120 			pstamp = pvp->v_path_stamp;
3121 			mutex_exit(&pvp->v_lock);
3122 			vn_clearpath(vp, pstamp);
3123 			return;
3124 		}
3125 
3126 		/*
3127 		 * The only feasible case here is where a NUL lookup is being
3128 		 * performed on rootdir prior to its v_path being populated.
3129 		 */
3130 		ASSERT(pvp->v_path_stamp == 0);
3131 		baselen = 0;
3132 		pstamp = 0;
3133 	} else {
3134 		pstamp = pvp->v_path_stamp;
3135 		baselen = strlen(pvp->v_path);
3136 		/* ignore a trailing slash if present */
3137 		if (pvp->v_path[baselen - 1] == '/') {
3138 			/* This should only the be case for rootdir */
3139 			ASSERT(baselen == 1 && pvp == rootdir);
3140 			baselen--;
3141 		}
3142 	}
3143 	mutex_exit(&pvp->v_lock);
3144 
3145 	if (buflen != 0) {
3146 		/* Free the existing (mis-sized) buffer in case of retry */
3147 		kmem_free(buf, buflen);
3148 	}
3149 	/* base, '/', name and trailing NUL */
3150 	buflen = baselen + len + 2;
3151 	if (buflen > max_vnode_path) {
3152 		DTRACE_PROBE4(vn__setpath_too__long, vnode_t *, pvp,
3153 		    vnode_t *, vp, char *, name, size_t, buflen);
3154 		return;
3155 	}
3156 	buf = kmem_alloc(buflen, KM_SLEEP);
3157 
3158 	mutex_enter(&pvp->v_lock);
3159 	if (pvp->v_path_stamp != pstamp) {
3160 		size_t vlen;
3161 
3162 		/*
3163 		 * Since v_path_stamp changed on the parent, it is likely that
3164 		 * v_path has been altered as well.  If the length does not
3165 		 * exactly match what was previously measured, the buffer
3166 		 * allocation must be repeated for proper sizing.
3167 		 */
3168 		if (pvp->v_path == vn_vpath_empty) {
3169 			/* Give up if parent lack v_path */
3170 			mutex_exit(&pvp->v_lock);
3171 			kmem_free(buf, buflen);
3172 			return;
3173 		}
3174 		vlen = strlen(pvp->v_path);
3175 		if (pvp->v_path[vlen - 1] == '/') {
3176 			vlen--;
3177 		}
3178 		if (vlen != baselen) {
3179 			goto retrybuf;
3180 		}
3181 	}
3182 	bcopy(pvp->v_path, buf, baselen);
3183 	mutex_exit(&pvp->v_lock);
3184 
3185 	buf[baselen] = '/';
3186 	baselen++;
3187 	bcopy(name, &buf[baselen], len + 1);
3188 
3189 	mutex_enter(&vp->v_lock);
3190 	if (vp->v_path_stamp == 0) {
3191 		/* never-visited vnode can inherit stamp from parent */
3192 		ASSERT(vp->v_path == vn_vpath_empty);
3193 		vp->v_path_stamp = pstamp;
3194 		vp->v_path = buf;
3195 		mutex_exit(&vp->v_lock);
3196 	} else if (vp->v_path_stamp < pstamp || is_rename) {
3197 		/*
3198 		 * Install the updated path and stamp, ensuring that the v_path
3199 		 * pointer is valid at all times for dtrace.
3200 		 */
3201 		oldbuf = vp->v_path;
3202 		vp->v_path = buf;
3203 		vp->v_path_stamp = gethrtime();
3204 		mutex_exit(&vp->v_lock);
3205 		kmem_free(oldbuf, strlen(oldbuf) + 1);
3206 	} else {
3207 		/*
3208 		 * If the timestamp matches or is greater, it means another
3209 		 * thread performed the update first while locks were dropped
3210 		 * here to make the allocation.  We defer to the newer value.
3211 		 */
3212 		mutex_exit(&vp->v_lock);
3213 		kmem_free(buf, buflen);
3214 	}
3215 	ASSERT(MUTEX_NOT_HELD(&vp->v_lock));
3216 }
3217 
3218 void
vn_updatepath(vnode_t * pvp,vnode_t * vp,const char * name)3219 vn_updatepath(vnode_t *pvp, vnode_t *vp, const char *name)
3220 {
3221 	size_t len;
3222 
3223 	/*
3224 	 * If the parent is older or empty, there's nothing further to do.
3225 	 */
3226 	if (pvp->v_path == vn_vpath_empty ||
3227 	    pvp->v_path_stamp <= vp->v_path_stamp) {
3228 		return;
3229 	}
3230 
3231 	/*
3232 	 * Given the lack of appropriate context, meaningful updates to v_path
3233 	 * cannot be made for during lookups for the '.' or '..' entries.
3234 	 */
3235 	len = strlen(name);
3236 	if (len == 0 || (len == 1 && name[0] == '.') ||
3237 	    (len == 2 && name[0] == '.' && name[1] == '.')) {
3238 		return;
3239 	}
3240 
3241 	vn_setpath_common(pvp, vp, name, len, B_FALSE);
3242 }
3243 
3244 /*
3245  * Given a starting vnode and a path, updates the path in the target vnode in
3246  * a safe manner.  If the vnode already has path information embedded, then the
3247  * cached path is left untouched.
3248  */
3249 /* ARGSUSED */
3250 void
vn_setpath(vnode_t * rootvp,vnode_t * pvp,vnode_t * vp,const char * name,size_t len)3251 vn_setpath(vnode_t *rootvp, vnode_t *pvp, vnode_t *vp, const char *name,
3252     size_t len)
3253 {
3254 	vn_setpath_common(pvp, vp, name, len, B_FALSE);
3255 }
3256 
3257 /*
3258  * Sets the path to the vnode to be the given string, regardless of current
3259  * context.  The string must be a complete path from rootdir.  This is only used
3260  * by fsop_root() for setting the path based on the mountpoint.
3261  */
3262 void
vn_setpath_str(vnode_t * vp,const char * str,size_t len)3263 vn_setpath_str(vnode_t *vp, const char *str, size_t len)
3264 {
3265 	vn_setpath_common(NULL, vp, str, len, B_FALSE);
3266 }
3267 
3268 /*
3269  * Called from within filesystem's vop_rename() to handle renames once the
3270  * target vnode is available.
3271  */
3272 void
vn_renamepath(vnode_t * pvp,vnode_t * vp,const char * name,size_t len)3273 vn_renamepath(vnode_t *pvp, vnode_t *vp, const char *name, size_t len)
3274 {
3275 	vn_setpath_common(pvp, vp, name, len, B_TRUE);
3276 }
3277 
3278 /*
3279  * Similar to vn_setpath_str(), this function sets the path of the destination
3280  * vnode to the be the same as the source vnode.
3281  */
3282 void
vn_copypath(struct vnode * src,struct vnode * dst)3283 vn_copypath(struct vnode *src, struct vnode *dst)
3284 {
3285 	char *buf;
3286 	hrtime_t stamp;
3287 	size_t buflen;
3288 
3289 	mutex_enter(&src->v_lock);
3290 	if (src->v_path == vn_vpath_empty) {
3291 		mutex_exit(&src->v_lock);
3292 		return;
3293 	}
3294 	buflen = strlen(src->v_path) + 1;
3295 	mutex_exit(&src->v_lock);
3296 
3297 	buf = kmem_alloc(buflen, KM_SLEEP);
3298 
3299 	mutex_enter(&src->v_lock);
3300 	if (src->v_path == vn_vpath_empty ||
3301 	    strlen(src->v_path) + 1 != buflen) {
3302 		mutex_exit(&src->v_lock);
3303 		kmem_free(buf, buflen);
3304 		return;
3305 	}
3306 	bcopy(src->v_path, buf, buflen);
3307 	stamp = src->v_path_stamp;
3308 	mutex_exit(&src->v_lock);
3309 
3310 	mutex_enter(&dst->v_lock);
3311 	if (dst->v_path != vn_vpath_empty) {
3312 		mutex_exit(&dst->v_lock);
3313 		kmem_free(buf, buflen);
3314 		return;
3315 	}
3316 	dst->v_path = buf;
3317 	dst->v_path_stamp = stamp;
3318 	mutex_exit(&dst->v_lock);
3319 }
3320 
3321 
3322 /*
3323  * XXX Private interface for segvn routines that handle vnode
3324  * large page segments.
3325  *
3326  * return 1 if vp's file system VOP_PAGEIO() implementation
3327  * can be safely used instead of VOP_GETPAGE() for handling
3328  * pagefaults against regular non swap files. VOP_PAGEIO()
3329  * interface is considered safe here if its implementation
3330  * is very close to VOP_GETPAGE() implementation.
3331  * e.g. It zero's out the part of the page beyond EOF. Doesn't
3332  * panic if there're file holes but instead returns an error.
3333  * Doesn't assume file won't be changed by user writes, etc.
3334  *
3335  * return 0 otherwise.
3336  *
3337  * For now allow segvn to only use VOP_PAGEIO() with ufs and nfs.
3338  */
3339 int
vn_vmpss_usepageio(vnode_t * vp)3340 vn_vmpss_usepageio(vnode_t *vp)
3341 {
3342 	vfs_t   *vfsp = vp->v_vfsp;
3343 	char *fsname = vfssw[vfsp->vfs_fstype].vsw_name;
3344 	char *pageio_ok_fss[] = {"ufs", "nfs", NULL};
3345 	char **fsok = pageio_ok_fss;
3346 
3347 	if (fsname == NULL) {
3348 		return (0);
3349 	}
3350 
3351 	for (; *fsok; fsok++) {
3352 		if (strcmp(*fsok, fsname) == 0) {
3353 			return (1);
3354 		}
3355 	}
3356 	return (0);
3357 }
3358 
3359 /* VOP_XXX() macros call the corresponding fop_xxx() function */
3360 
3361 int
fop_open(vnode_t ** vpp,int mode,cred_t * cr,caller_context_t * ct)3362 fop_open(
3363 	vnode_t **vpp,
3364 	int mode,
3365 	cred_t *cr,
3366 	caller_context_t *ct)
3367 {
3368 	int ret;
3369 	vnode_t *vp = *vpp;
3370 
3371 	VN_HOLD(vp);
3372 	/*
3373 	 * Adding to the vnode counts before calling open
3374 	 * avoids the need for a mutex. It circumvents a race
3375 	 * condition where a query made on the vnode counts results in a
3376 	 * false negative. The inquirer goes away believing the file is
3377 	 * not open when there is an open on the file already under way.
3378 	 *
3379 	 * The counts are meant to prevent NFS from granting a delegation
3380 	 * when it would be dangerous to do so.
3381 	 *
3382 	 * The vnode counts are only kept on regular files
3383 	 */
3384 	if ((*vpp)->v_type == VREG) {
3385 		if (mode & FREAD)
3386 			atomic_inc_32(&(*vpp)->v_rdcnt);
3387 		if (mode & FWRITE)
3388 			atomic_inc_32(&(*vpp)->v_wrcnt);
3389 	}
3390 
3391 	VOPXID_MAP_CR(vp, cr);
3392 
3393 	ret = (*(*(vpp))->v_op->vop_open)(vpp, mode, cr, ct);
3394 
3395 	if (ret) {
3396 		/*
3397 		 * Use the saved vp just in case the vnode ptr got trashed
3398 		 * by the error.
3399 		 */
3400 		VOPSTATS_UPDATE(vp, open);
3401 		if ((vp->v_type == VREG) && (mode & FREAD))
3402 			atomic_dec_32(&vp->v_rdcnt);
3403 		if ((vp->v_type == VREG) && (mode & FWRITE))
3404 			atomic_dec_32(&vp->v_wrcnt);
3405 	} else {
3406 		/*
3407 		 * Some filesystems will return a different vnode,
3408 		 * but the same path was still used to open it.
3409 		 * So if we do change the vnode and need to
3410 		 * copy over the path, do so here, rather than special
3411 		 * casing each filesystem. Adjust the vnode counts to
3412 		 * reflect the vnode switch.
3413 		 */
3414 		VOPSTATS_UPDATE(*vpp, open);
3415 		if (*vpp != vp) {
3416 			vn_copypath(vp, *vpp);
3417 			if (((*vpp)->v_type == VREG) && (mode & FREAD))
3418 				atomic_inc_32(&(*vpp)->v_rdcnt);
3419 			if ((vp->v_type == VREG) && (mode & FREAD))
3420 				atomic_dec_32(&vp->v_rdcnt);
3421 			if (((*vpp)->v_type == VREG) && (mode & FWRITE))
3422 				atomic_inc_32(&(*vpp)->v_wrcnt);
3423 			if ((vp->v_type == VREG) && (mode & FWRITE))
3424 				atomic_dec_32(&vp->v_wrcnt);
3425 		}
3426 	}
3427 	VN_RELE(vp);
3428 	return (ret);
3429 }
3430 
3431 int
fop_close(vnode_t * vp,int flag,int count,offset_t offset,cred_t * cr,caller_context_t * ct)3432 fop_close(
3433 	vnode_t *vp,
3434 	int flag,
3435 	int count,
3436 	offset_t offset,
3437 	cred_t *cr,
3438 	caller_context_t *ct)
3439 {
3440 	int err;
3441 
3442 	VOPXID_MAP_CR(vp, cr);
3443 
3444 	err = (*(vp)->v_op->vop_close)(vp, flag, count, offset, cr, ct);
3445 	VOPSTATS_UPDATE(vp, close);
3446 	/*
3447 	 * Check passed in count to handle possible dups. Vnode counts are only
3448 	 * kept on regular files
3449 	 */
3450 	if ((vp->v_type == VREG) && (count == 1))  {
3451 		if (flag & FREAD) {
3452 			ASSERT(vp->v_rdcnt > 0);
3453 			atomic_dec_32(&vp->v_rdcnt);
3454 		}
3455 		if (flag & FWRITE) {
3456 			ASSERT(vp->v_wrcnt > 0);
3457 			atomic_dec_32(&vp->v_wrcnt);
3458 		}
3459 	}
3460 	return (err);
3461 }
3462 
3463 int
fop_read(vnode_t * vp,uio_t * uiop,int ioflag,cred_t * cr,caller_context_t * ct)3464 fop_read(
3465 	vnode_t *vp,
3466 	uio_t *uiop,
3467 	int ioflag,
3468 	cred_t *cr,
3469 	caller_context_t *ct)
3470 {
3471 	int	err;
3472 	ssize_t	resid_start = uiop->uio_resid;
3473 
3474 	VOPXID_MAP_CR(vp, cr);
3475 
3476 	err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
3477 	VOPSTATS_UPDATE_IO(vp, read,
3478 	    read_bytes, (resid_start - uiop->uio_resid));
3479 	return (err);
3480 }
3481 
3482 int
fop_write(vnode_t * vp,uio_t * uiop,int ioflag,cred_t * cr,caller_context_t * ct)3483 fop_write(
3484 	vnode_t *vp,
3485 	uio_t *uiop,
3486 	int ioflag,
3487 	cred_t *cr,
3488 	caller_context_t *ct)
3489 {
3490 	int	err;
3491 	ssize_t	resid_start = uiop->uio_resid;
3492 
3493 	VOPXID_MAP_CR(vp, cr);
3494 
3495 	err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct);
3496 	VOPSTATS_UPDATE_IO(vp, write,
3497 	    write_bytes, (resid_start -