/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
 */

/*
 * ZFS control directory (a.k.a. ".zfs")
 *
 * This directory provides a common location for all ZFS meta-objects.
 * Currently, this consists of the 'snapshot' and 'shares' directories, but
 * this may expand in the future.  The elements are built using the GFS
 * primitives, as the hierarchy does not actually exist on disk.
 *
 * For 'snapshot', we don't want to have all snapshots always mounted, because
 * this would take up a huge amount of space in /etc/mnttab.  We have three
 * types of objects:
 *
 * 	ctldir ------> snapshotdir -------> snapshot
 *                                             |
 *                                             |
 *                                             V
 *                                         mounted fs
 *
 * The 'snapshot' node contains just enough information to look up '..' and
 * act as a mountpoint for the snapshot.  Whenever we look up a specific
 * snapshot, we perform an automount of the underlying filesystem and return
 * the corresponding vnode.
 *
 * All mounts are handled automatically by the kernel, but unmounts are
 * (currently) handled from userland.  The main reason is that there is no
 * reliable way to auto-unmount the filesystem when it's "no longer in use".
 * When the user unmounts a filesystem, we call zfsctl_umount_snapshots(),
 * which unmounts any snapshots within the snapshot directory.
 *
 * The '.zfs', '.zfs/snapshot', and all directories created under
 * '.zfs/snapshot' (i.e., '.zfs/snapshot/<snapname>') are all GFS nodes and
 * share the same vfs_t as the head filesystem (what '.zfs' lives under).
 *
 * File systems mounted on top of the GFS nodes '.zfs/snapshot/<snapname>'
 * (i.e., snapshots) are ZFS nodes and have their own unique vfs_t.
 * However, vnodes within these mounted filesystems have their v_vfsp
 * fields set to the head filesystem to make NFS happy (see
 * zfsctl_snapdir_lookup()).  We VFS_HOLD the head filesystem's vfs_t
 * so that it cannot be freed until all snapshots have been unmounted.
 */

#include <fs/fs_subr.h>
#include <sys/zfs_ctldir.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_vfsops.h>
#include <sys/vfs_opreg.h>
#include <sys/gfs.h>
#include <sys/stat.h>
#include <sys/dmu.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_deleg.h>
#include <sys/mount.h>
#include <sys/sunddi.h>

#include "zfs_namecheck.h"

typedef struct zfsctl_node {
	gfs_dir_t	zc_gfs_private;
	uint64_t	zc_id;
	timestruc_t	zc_cmtime;	/* ctime and mtime, always the same */
} zfsctl_node_t;

typedef struct zfsctl_snapdir {
	zfsctl_node_t	sd_node;
	kmutex_t	sd_lock;
	avl_tree_t	sd_snaps;
} zfsctl_snapdir_t;

typedef struct {
	char		*se_name;
	vnode_t		*se_root;
	avl_node_t	se_node;
} zfs_snapentry_t;

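/*
 * AVL comparison callback for snapshot entries, ordering them by name.
 * strcmp() results are collapsed to -1/0/1 as the AVL code expects.
 */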
static int
snapentry_compare(const void *a, const void *b)
{
	const zfs_snapentry_t *sa = a;
	const zfs_snapentry_t *sb = b;
	int ret = strcmp(sa->se_name, sb->se_name);

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

vnodeops_t *zfsctl_ops_root;
vnodeops_t *zfsctl_ops_snapdir;
vnodeops_t *zfsctl_ops_snapshot;
vnodeops_t *zfsctl_ops_shares;

static const fs_operation_def_t zfsctl_tops_root[];
static const fs_operation_def_t zfsctl_tops_snapdir[];
static const fs_operation_def_t zfsctl_tops_snapshot[];
static const fs_operation_def_t zfsctl_tops_shares[];

static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
static vnode_t *zfsctl_mknode_shares(vnode_t *);
static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);
static int zfsctl_unmount_snap(zfs_snapentry_t *, int, cred_t *);

static gfs_opsvec_t zfsctl_opsvec[] = {
	{ ".zfs", zfsctl_tops_root, &zfsctl_ops_root },
	{ ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir },
	{ ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot },
	{ ".zfs/shares", zfsctl_tops_shares, &zfsctl_ops_shares },
	{ NULL }
};

/*
 * Root directory elements.  We have only two entries:
 * snapshot and shares.
 */
static gfs_dirent_t zfsctl_root_entries[] = {
	{ "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
	{ "shares", zfsctl_mknode_shares, GFS_CACHE_VNODE },
	{ NULL }
};

/* include . and .. in the calculation */
#define	NROOT_ENTRIES	((sizeof (zfsctl_root_entries) / \
    sizeof (gfs_dirent_t)) + 1)

/*
 * Initialize the various GFS pieces we'll need to create and manipulate .zfs
 * directories.  This is called from the ZFS init routine, and initializes the
 * vnode ops vectors that we'll be using.
 */
void
zfsctl_init(void)
{
	VERIFY(gfs_make_opsvec(zfsctl_opsvec) == 0);
}

void
zfsctl_fini(void)
{
	/*
	 * Remove zfsctl vnode ops
	 */
	if (zfsctl_ops_root)
		vn_freevnodeops(zfsctl_ops_root);
	if (zfsctl_ops_snapdir)
		vn_freevnodeops(zfsctl_ops_snapdir);
	if (zfsctl_ops_snapshot)
		vn_freevnodeops(zfsctl_ops_snapshot);
	if (zfsctl_ops_shares)
		vn_freevnodeops(zfsctl_ops_shares);

	zfsctl_ops_root = NULL;
	zfsctl_ops_snapdir = NULL;
	zfsctl_ops_snapshot = NULL;
	zfsctl_ops_shares = NULL;
}

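/*
 * Return B_TRUE if the given vnode is one of the .zfs control nodes.
 */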
boolean_t
zfsctl_is_node(vnode_t *vp)
{
	return (vn_matchops(vp, zfsctl_ops_root) ||
	    vn_matchops(vp, zfsctl_ops_snapdir) ||
	    vn_matchops(vp, zfsctl_ops_snapshot) ||
	    vn_matchops(vp, zfsctl_ops_shares));
}

/*
 * Return the inode number associated with the 'snapshot' or
 * 'shares' directory.
 */
/* ARGSUSED */
static ino64_t
zfsctl_root_inode_cb(vnode_t *vp, int index)
{
	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;

	ASSERT(index < 2);

	if (index == 0)
		return (ZFSCTL_INO_SNAPDIR);

	return (zfsvfs->z_shares_dir);
}

/*
 * Create the '.zfs' directory.  This directory is cached as part of the VFS
 * structure.  This results in a hold on the vfs_t.  The code in zfs_umount()
 * therefore checks against a vfs_count of 2 instead of 1.  This reference
 * is removed when the ctldir is destroyed in the unmount.
 */
void
zfsctl_create(zfsvfs_t *zfsvfs)
{
	vnode_t *vp, *rvp;
	zfsctl_node_t *zcp;
	uint64_t crtime[2];

	ASSERT(zfsvfs->z_ctldir == NULL);

	vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs,
	    zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries,
	    zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL);
	zcp = vp->v_data;
	zcp->zc_id = ZFSCTL_INO_ROOT;

	VERIFY(VFS_ROOT(zfsvfs->z_vfs, &rvp) == 0);
	VERIFY(0 == sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
	    &crtime, sizeof (crtime)));
	ZFS_TIME_DECODE(&zcp->zc_cmtime, crtime);
	VN_RELE(rvp);

	/*
	 * We're only faking the fact that we have a root of a filesystem for
	 * the sake of the GFS interfaces.  Undo the flag manipulation it did
	 * for us.
	 */
	vp->v_flag &= ~(VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT);

	zfsvfs->z_ctldir = vp;
}
250
251/*
252 * Destroy the '.zfs' directory.  Only called when the filesystem is unmounted.
253 * There might still be more references if we were force unmounted, but only
254 * new zfs_inactive() calls can occur and they don't reference .zfs
255 */
256void
257zfsctl_destroy(zfsvfs_t *zfsvfs)
258{
259	VN_RELE(zfsvfs->z_ctldir);
260	zfsvfs->z_ctldir = NULL;
261}
262
/*
 * Given a root znode, retrieve the associated .zfs directory.
 * Add a hold to the vnode and return it.
 */
vnode_t *
zfsctl_root(znode_t *zp)
{
	ASSERT(zfs_has_ctldir(zp));
	VN_HOLD(zp->z_zfsvfs->z_ctldir);
	return (zp->z_zfsvfs->z_ctldir);
}

/*
 * Common open routine.  Disallow any write access.
 */
/* ARGSUSED */
static int
zfsctl_common_open(vnode_t **vpp, int flags, cred_t *cr, caller_context_t *ct)
{
	if (flags & FWRITE)
		return (SET_ERROR(EACCES));

	return (0);
}

/*
 * Common close routine.  Nothing to do here.
 */
/* ARGSUSED */
static int
zfsctl_common_close(vnode_t *vpp, int flags, int count, offset_t off,
    cred_t *cr, caller_context_t *ct)
{
	return (0);
}

/*
 * Common access routine.  Disallow writes.
 */
/* ARGSUSED */
static int
zfsctl_common_access(vnode_t *vp, int mode, int flags, cred_t *cr,
    caller_context_t *ct)
{
	if (flags & V_ACE_MASK) {
		if (mode & ACE_ALL_WRITE_PERMS)
			return (SET_ERROR(EACCES));
	} else {
		if (mode & VWRITE)
			return (SET_ERROR(EACCES));
	}

	return (0);
}

/*
 * Common getattr function.  Fill in basic information.
 */
static void
zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
{
	timestruc_t	now;

	vap->va_uid = 0;
	vap->va_gid = 0;
	vap->va_rdev = 0;
	/*
	 * We are a purely virtual object, so we have no
	 * blocksize or allocated blocks.
	 */
	vap->va_blksize = 0;
	vap->va_nblocks = 0;
	vap->va_seq = 0;
	vap->va_fsid = vp->v_vfsp->vfs_dev;
	vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
	    S_IROTH | S_IXOTH;
	vap->va_type = VDIR;
	/*
	 * We live in the now (for atime).
	 */
	gethrestime(&now);
	vap->va_atime = now;
}

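/*
 * Common fid routine, shared by the .zfs directories.  Encodes the node's
 * object number into a short-form fid; the generation number is always
 * zero since these objects are virtual.
 */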
/*ARGSUSED*/
static int
zfsctl_common_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
{
	zfsvfs_t	*zfsvfs = vp->v_vfsp->vfs_data;
	zfsctl_node_t	*zcp = vp->v_data;
	uint64_t	object = zcp->zc_id;
	zfid_short_t	*zfid;
	int		i;

	ZFS_ENTER(zfsvfs);

	if (fidp->fid_len < SHORT_FID_LEN) {
		fidp->fid_len = SHORT_FID_LEN;
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENOSPC));
	}

	zfid = (zfid_short_t *)fidp;

	zfid->zf_len = SHORT_FID_LEN;

	for (i = 0; i < sizeof (zfid->zf_object); i++)
		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));

	/* .zfs znodes always have a generation number of 0 */
	for (i = 0; i < sizeof (zfid->zf_gen); i++)
		zfid->zf_gen[i] = 0;

	ZFS_EXIT(zfsvfs);
	return (0);
}

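/*
 * fid routine for '.zfs/shares'; delegate to the real shares directory
 * object in the head filesystem.
 */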
/*ARGSUSED*/
static int
zfsctl_shares_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
{
	zfsvfs_t	*zfsvfs = vp->v_vfsp->vfs_data;
	znode_t		*dzp;
	int		error;

	ZFS_ENTER(zfsvfs);

	if (zfsvfs->z_shares_dir == 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENOTSUP));
	}

	if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
		error = VOP_FID(ZTOV(dzp), fidp, ct);
		VN_RELE(ZTOV(dzp));
	}

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * .zfs inode namespace
 *
 * We need to generate unique inode numbers for all files and directories
 * within the .zfs pseudo-filesystem.  We use the following scheme:
 *
 * 	ENTRY			ZFSCTL_INODE
 * 	.zfs			1
 * 	.zfs/snapshot		2
 * 	.zfs/snapshot/<snap>	objectid(snap)
 */

#define	ZFSCTL_INO_SNAP(id)	(id)

/*
 * Get root directory attributes.
 */
/* ARGSUSED */
static int
zfsctl_root_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
	zfsctl_node_t *zcp = vp->v_data;

	ZFS_ENTER(zfsvfs);
	vap->va_nodeid = ZFSCTL_INO_ROOT;
	vap->va_nlink = vap->va_size = NROOT_ENTRIES;
	vap->va_mtime = vap->va_ctime = zcp->zc_cmtime;

	zfsctl_common_getattr(vp, vap);
	ZFS_EXIT(zfsvfs);

	return (0);
}

/*
 * Special case the handling of "..".
 */
/* ARGSUSED */
int
zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
    int *direntflags, pathname_t *realpnp)
{
	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
	int err;

	/*
	 * No extended attributes allowed under .zfs
	 */
	if (flags & LOOKUP_XATTR)
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);

	if (strcmp(nm, "..") == 0) {
		err = VFS_ROOT(dvp->v_vfsp, vpp);
	} else {
		err = gfs_vop_lookup(dvp, nm, vpp, pnp, flags, rdir,
		    cr, ct, direntflags, realpnp);
	}

	ZFS_EXIT(zfsvfs);

	return (err);
}

static int
zfsctl_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
    caller_context_t *ct)
{
	/*
	 * We only care about ACL_ENABLED so that libsec can
	 * display ACLs correctly and not default to POSIX draft.
	 */
	if (cmd == _PC_ACL_ENABLED) {
		*valp = _ACL_ACE_ENABLED;
		return (0);
	}

	return (fs_pathconf(vp, cmd, valp, cr, ct));
}

static const fs_operation_def_t zfsctl_tops_root[] = {
	{ VOPNAME_OPEN,		{ .vop_open = zfsctl_common_open }	},
	{ VOPNAME_CLOSE,	{ .vop_close = zfsctl_common_close }	},
	{ VOPNAME_IOCTL,	{ .error = fs_inval }			},
	{ VOPNAME_GETATTR,	{ .vop_getattr = zfsctl_root_getattr }	},
	{ VOPNAME_ACCESS,	{ .vop_access = zfsctl_common_access }	},
	{ VOPNAME_READDIR,	{ .vop_readdir = gfs_vop_readdir }	},
	{ VOPNAME_LOOKUP,	{ .vop_lookup = zfsctl_root_lookup }	},
	{ VOPNAME_SEEK,		{ .vop_seek = fs_seek }			},
	{ VOPNAME_INACTIVE,	{ .vop_inactive = gfs_vop_inactive }	},
	{ VOPNAME_PATHCONF,	{ .vop_pathconf = zfsctl_pathconf }	},
	{ VOPNAME_FID,		{ .vop_fid = zfsctl_common_fid }	},
	{ NULL }
};

/*
 * Gets the full dataset name that corresponds to the given snapshot name.
 * Example:
 * 	zfsctl_snapshot_zname("snap1") -> "mypool/myfs@snap1"
 */
static int
zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
{
	objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;

	if (zfs_component_namecheck(name, NULL, NULL) != 0)
		return (SET_ERROR(EILSEQ));
	dmu_objset_name(os, zname);
	if (strlen(zname) + 1 + strlen(name) >= len)
		return (SET_ERROR(ENAMETOOLONG));
	(void) strcat(zname, "@");
	(void) strcat(zname, name);
	return (0);
}

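/*
 * Unmount the snapshot mounted on sep->se_root and free the entry.  The
 * caller holds sd_lock and has already removed the entry from the
 * sd_snaps tree; on failure the entry is left intact so the caller can
 * re-insert it.
 */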
static int
zfsctl_unmount_snap(zfs_snapentry_t *sep, int fflags, cred_t *cr)
{
	vnode_t *svp = sep->se_root;
	int error;

	ASSERT(vn_ismntpt(svp));

	/* this will be dropped by dounmount() */
	if ((error = vn_vfswlock(svp)) != 0)
		return (error);

	VN_HOLD(svp);
	error = dounmount(vn_mountedvfs(svp), fflags, cr);
	if (error) {
		VN_RELE(svp);
		return (error);
	}

	/*
	 * We can't use VN_RELE(), as that will try to invoke
	 * zfsctl_snapdir_inactive(), which would cause us to destroy
	 * the sd_lock mutex held by our caller.
	 */
	ASSERT(svp->v_count == 1);
	gfs_vop_inactive(svp, cr, NULL);

	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
	kmem_free(sep, sizeof (zfs_snapentry_t));

	return (0);
}

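/*
 * Rename a cached snapshot entry in place: move it to its new name in the
 * sd_snaps AVL tree and rewrite the tails of the snapshot vfs_t's
 * mountpoint and resource strings.  The caller must hold sd_lock.
 */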
static void
zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
{
	avl_index_t where;
	vfs_t *vfsp;
	refstr_t *pathref;
	char newpath[MAXNAMELEN];
	char *tail;

	ASSERT(MUTEX_HELD(&sdp->sd_lock));
	ASSERT(sep != NULL);

	vfsp = vn_mountedvfs(sep->se_root);
	ASSERT(vfsp != NULL);

	vfs_lock_wait(vfsp);

	/*
	 * Change the name in the AVL tree.
	 */
	avl_remove(&sdp->sd_snaps, sep);
	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
	(void) strcpy(sep->se_name, nm);
	VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL);
	avl_insert(&sdp->sd_snaps, sep, where);

	/*
	 * Change the current mountpoint info:
	 * 	- update the tail of the mntpoint path
	 *	- update the tail of the resource path
	 */
	pathref = vfs_getmntpoint(vfsp);
	(void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
	VERIFY((tail = strrchr(newpath, '/')) != NULL);
	*(tail+1) = '\0';
	ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
	(void) strcat(newpath, nm);
	refstr_rele(pathref);
	vfs_setmntpoint(vfsp, newpath, 0);

	pathref = vfs_getresource(vfsp);
	(void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
	VERIFY((tail = strrchr(newpath, '@')) != NULL);
	*(tail+1) = '\0';
	ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
	(void) strcat(newpath, nm);
	refstr_rele(pathref);
	vfs_setresource(vfsp, newpath, 0);

	vfs_unlock(vfsp);
}

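/*
 * Rename entry point for '.zfs/snapshot' (e.g. "mv <snap1> <snap2>").
 * Renames the underlying snapshot dataset, then updates the cached
 * snapentry to match.  Snapshots cannot be moved out of the snapdir.
 */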
/*ARGSUSED*/
static int
zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
    cred_t *cr, caller_context_t *ct, int flags)
{
	zfsctl_snapdir_t *sdp = sdvp->v_data;
	zfs_snapentry_t search, *sep;
	zfsvfs_t *zfsvfs;
	avl_index_t where;
	char from[ZFS_MAX_DATASET_NAME_LEN], to[ZFS_MAX_DATASET_NAME_LEN];
	char real[ZFS_MAX_DATASET_NAME_LEN], fsname[ZFS_MAX_DATASET_NAME_LEN];
	int err;

	zfsvfs = sdvp->v_vfsp->vfs_data;
	ZFS_ENTER(zfsvfs);

	if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
		err = dmu_snapshot_realname(zfsvfs->z_os, snm, real,
		    sizeof (real), NULL);
		if (err == 0) {
			snm = real;
		} else if (err != ENOTSUP) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}
	}

	ZFS_EXIT(zfsvfs);

	dmu_objset_name(zfsvfs->z_os, fsname);

	err = zfsctl_snapshot_zname(sdvp, snm, sizeof (from), from);
	if (err == 0)
		err = zfsctl_snapshot_zname(tdvp, tnm, sizeof (to), to);
	if (err == 0)
		err = zfs_secpolicy_rename_perms(from, to, cr);
	if (err != 0)
		return (err);

	/*
	 * Cannot move snapshots out of the snapdir.
	 */
	if (sdvp != tdvp)
		return (SET_ERROR(EINVAL));

	if (strcmp(snm, tnm) == 0)
		return (0);

	mutex_enter(&sdp->sd_lock);

	search.se_name = (char *)snm;
	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) {
		mutex_exit(&sdp->sd_lock);
		return (SET_ERROR(ENOENT));
	}

	err = dsl_dataset_rename_snapshot(fsname, snm, tnm, B_FALSE);
	if (err == 0)
		zfsctl_rename_snap(sdp, sep, tnm);

	mutex_exit(&sdp->sd_lock);

	return (err);
}

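/*
 * Remove (destroy) a snapshot via '.zfs/snapshot' (e.g. "rmdir <snap>").
 * The snapshot is forcibly unmounted before it is destroyed.
 */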
/* ARGSUSED */
static int
zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
    caller_context_t *ct, int flags)
{
	zfsctl_snapdir_t *sdp = dvp->v_data;
	zfs_snapentry_t *sep;
	zfs_snapentry_t search;
	zfsvfs_t *zfsvfs;
	char snapname[ZFS_MAX_DATASET_NAME_LEN];
	char real[ZFS_MAX_DATASET_NAME_LEN];
	int err;

	zfsvfs = dvp->v_vfsp->vfs_data;
	ZFS_ENTER(zfsvfs);

	if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
		err = dmu_snapshot_realname(zfsvfs->z_os, name, real,
		    sizeof (real), NULL);
		if (err == 0) {
			name = real;
		} else if (err != ENOTSUP) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}
	}

	ZFS_EXIT(zfsvfs);

	err = zfsctl_snapshot_zname(dvp, name, sizeof (snapname), snapname);
	if (err == 0)
		err = zfs_secpolicy_destroy_perms(snapname, cr);
	if (err != 0)
		return (err);

	mutex_enter(&sdp->sd_lock);

	search.se_name = name;
	sep = avl_find(&sdp->sd_snaps, &search, NULL);
	if (sep) {
		avl_remove(&sdp->sd_snaps, sep);
		err = zfsctl_unmount_snap(sep, MS_FORCE, cr);
		if (err != 0)
			avl_add(&sdp->sd_snaps, sep);
		else
			err = dsl_destroy_snapshot(snapname, B_FALSE);
	} else {
		err = SET_ERROR(ENOENT);
	}

	mutex_exit(&sdp->sd_lock);

	return (err);
}

/*
 * This creates a snapshot under '.zfs/snapshot'.
 */
/* ARGSUSED */
static int
zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp,
    cred_t *cr, caller_context_t *cc, int flags, vsecattr_t *vsecp)
{
	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
	char name[ZFS_MAX_DATASET_NAME_LEN];
	int err;
	static enum symfollow follow = NO_FOLLOW;
	static enum uio_seg seg = UIO_SYSSPACE;

	if (zfs_component_namecheck(dirname, NULL, NULL) != 0)
		return (SET_ERROR(EILSEQ));

	dmu_objset_name(zfsvfs->z_os, name);

	*vpp = NULL;

	err = zfs_secpolicy_snapshot_perms(name, cr);
	if (err != 0)
		return (err);

	err = dmu_objset_snapshot_one(name, dirname);
	if (err != 0)
		return (err);

	err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp);

	return (err);
}

/*
 * Lookup entry point for the 'snapshot' directory.  Try to open the
 * snapshot if it exists, creating the pseudo filesystem vnode as necessary.
 * Perform a mount of the associated dataset on top of the vnode.
 */
/* ARGSUSED */
static int
zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
    int *direntflags, pathname_t *realpnp)
{
	zfsctl_snapdir_t *sdp = dvp->v_data;
	objset_t *snap;
	char snapname[ZFS_MAX_DATASET_NAME_LEN];
	char real[ZFS_MAX_DATASET_NAME_LEN];
	char *mountpoint;
	zfs_snapentry_t *sep, search;
	struct mounta margs;
	vfs_t *vfsp;
	size_t mountpoint_len;
	avl_index_t where;
	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
	int err;

	/*
	 * No extended attributes allowed under .zfs
	 */
	if (flags & LOOKUP_XATTR)
		return (SET_ERROR(EINVAL));

	ASSERT(dvp->v_type == VDIR);

	/*
	 * If we get a recursive call, that means we got called
	 * from the domount() code while it was trying to look up the
	 * spec (which looks like a local path for zfs).  We need to
	 * add some flag to domount() to tell it not to do this lookup.
	 */
	if (MUTEX_HELD(&sdp->sd_lock))
		return (SET_ERROR(ENOENT));

	ZFS_ENTER(zfsvfs);

	if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	if (flags & FIGNORECASE) {
		boolean_t conflict = B_FALSE;

		err = dmu_snapshot_realname(zfsvfs->z_os, nm, real,
		    sizeof (real), &conflict);
		if (err == 0) {
			nm = real;
		} else if (err != ENOTSUP) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}
		if (realpnp)
			(void) strlcpy(realpnp->pn_buf, nm,
			    realpnp->pn_bufsize);
		if (conflict && direntflags)
			*direntflags = ED_CASE_CONFLICT;
	}

	mutex_enter(&sdp->sd_lock);
	search.se_name = (char *)nm;
	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) {
		*vpp = sep->se_root;
		VN_HOLD(*vpp);
		err = traverse(vpp);
		if (err != 0) {
			VN_RELE(*vpp);
			*vpp = NULL;
		} else if (*vpp == sep->se_root) {
			/*
			 * The snapshot was unmounted behind our backs,
			 * try to remount it.
			 */
			goto domount;
		} else {
			/*
			 * VROOT was set during the traverse call.  We need
			 * to clear it since we're pretending to be part
			 * of our parent's vfs.
			 */
			(*vpp)->v_flag &= ~VROOT;
		}
		mutex_exit(&sdp->sd_lock);
		ZFS_EXIT(zfsvfs);
		return (err);
	}

	/*
	 * The requested snapshot is not currently mounted, look it up.
	 */
	err = zfsctl_snapshot_zname(dvp, nm, sizeof (snapname), snapname);
	if (err != 0) {
		mutex_exit(&sdp->sd_lock);
		ZFS_EXIT(zfsvfs);
		/*
		 * Handle "ls *" or "?" gracefully by forcing EILSEQ to
		 * ENOENT, since the shell ultimately passes "*" or "?"
		 * through as the name to look up.
		 */
		return (err == EILSEQ ? ENOENT : err);
	}
	if (dmu_objset_hold(snapname, FTAG, &snap) != 0) {
		mutex_exit(&sdp->sd_lock);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENOENT));
	}

	sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP);
	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
	(void) strcpy(sep->se_name, nm);
	*vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap));
	avl_insert(&sdp->sd_snaps, sep, where);

	dmu_objset_rele(snap, FTAG);
domount:
	mountpoint_len = strlen(refstr_value(dvp->v_vfsp->vfs_mntpt)) +
	    strlen("/.zfs/snapshot/") + strlen(nm) + 1;
	mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
	(void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s",
	    refstr_value(dvp->v_vfsp->vfs_mntpt), nm);

	margs.spec = snapname;
	margs.dir = mountpoint;
	margs.flags = MS_SYSSPACE | MS_NOMNTTAB;
	margs.fstype = "zfs";
	margs.dataptr = NULL;
	margs.datalen = 0;
	margs.optptr = NULL;
	margs.optlen = 0;

	err = domount("zfs", &margs, *vpp, kcred, &vfsp);
	kmem_free(mountpoint, mountpoint_len);

	if (err == 0) {
		/*
		 * Return the mounted root rather than the covered mount point.
		 * Takes the GFS vnode at .zfs/snapshot/<snapname> and returns
		 * the ZFS vnode mounted on top of the GFS node.  This ZFS
		 * vnode is the root of the newly created vfsp.
		 */
		VFS_RELE(vfsp);
		err = traverse(vpp);
	}

	if (err == 0) {
		/*
		 * Fix up the root vnode mounted on .zfs/snapshot/<snapname>.
		 *
		 * This is where we lie about our v_vfsp in order to
		 * make .zfs/snapshot/<snapname> accessible over NFS
		 * without requiring manual mounts of <snapname>.
		 */
		ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs);
		VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs;
		(*vpp)->v_vfsp = zfsvfs->z_vfs;
		(*vpp)->v_flag &= ~VROOT;
	}
	mutex_exit(&sdp->sd_lock);
	ZFS_EXIT(zfsvfs);

	/*
	 * If we had an error, drop our hold on the vnode and
	 * zfsctl_snapshot_inactive() will clean up.
	 */
	if (err != 0) {
		VN_RELE(*vpp);
		*vpp = NULL;
	}
	return (err);
}

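/*
 * Lookup entry point for the 'shares' directory.  After handling '.' and
 * '..', forward the lookup to the real shares directory object in the
 * head filesystem.
 */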
/* ARGSUSED */
static int
zfsctl_shares_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
    int *direntflags, pathname_t *realpnp)
{
	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
	znode_t *dzp;
	int error;

	ZFS_ENTER(zfsvfs);

	if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	if (zfsvfs->z_shares_dir == 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENOTSUP));
	}
	if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
		error = VOP_LOOKUP(ZTOV(dzp), nm, vpp, pnp,
		    flags, rdir, cr, ct, direntflags, realpnp);
		VN_RELE(ZTOV(dzp));
	}

	ZFS_EXIT(zfsvfs);

	return (error);
}

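/*
 * GFS readdir callback for '.zfs/snapshot'.  Returns the next snapshot
 * name and inode number, using the directory offset as the snapshot
 * iteration cookie.
 */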
/* ARGSUSED */
static int
zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp,
    offset_t *offp, offset_t *nextp, void *data, int flags)
{
	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
	char snapname[ZFS_MAX_DATASET_NAME_LEN];
	uint64_t id, cookie;
	boolean_t case_conflict;
	int error;

	ZFS_ENTER(zfsvfs);

	cookie = *offp;
	dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
	error = dmu_snapshot_list_next(zfsvfs->z_os,
	    sizeof (snapname), snapname, &id, &cookie, &case_conflict);
	dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
	if (error) {
		ZFS_EXIT(zfsvfs);
		if (error == ENOENT) {
			*eofp = 1;
			return (0);
		}
		return (error);
	}

	if (flags & V_RDDIR_ENTFLAGS) {
		edirent_t *eodp = dp;

		(void) strcpy(eodp->ed_name, snapname);
		eodp->ed_ino = ZFSCTL_INO_SNAP(id);
		eodp->ed_eflags = case_conflict ? ED_CASE_CONFLICT : 0;
	} else {
		struct dirent64 *odp = dp;

		(void) strcpy(odp->d_name, snapname);
		odp->d_ino = ZFSCTL_INO_SNAP(id);
	}
	*nextp = cookie;

	ZFS_EXIT(zfsvfs);

	return (0);
}

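/*
 * Readdir for the 'shares' directory; delegate to the real shares
 * directory object in the head filesystem.
 */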
/* ARGSUSED */
static int
zfsctl_shares_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp,
    caller_context_t *ct, int flags)
{
	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
	znode_t *dzp;
	int error;

	ZFS_ENTER(zfsvfs);

	if (zfsvfs->z_shares_dir == 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENOTSUP));
	}
	if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
		error = VOP_READDIR(ZTOV(dzp), uiop, cr, eofp, ct, flags);
		VN_RELE(ZTOV(dzp));
	} else {
		*eofp = 1;
		error = SET_ERROR(ENOENT);
	}

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * pvp is the '.zfs' directory (zfsctl_node_t).
 *
 * Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t).
 *
 * This function is the callback to create a GFS vnode for '.zfs/snapshot'
 * when a lookup is performed on .zfs for "snapshot".
 */
vnode_t *
zfsctl_mknode_snapdir(vnode_t *pvp)
{
	vnode_t *vp;
	zfsctl_snapdir_t *sdp;

	vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp,
	    zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN,
	    zfsctl_snapdir_readdir_cb, NULL);
	sdp = vp->v_data;
	sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR;
	sdp->sd_node.zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
	mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&sdp->sd_snaps, snapentry_compare,
	    sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
	return (vp);
}

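/*
 * pvp is the '.zfs' directory (zfsctl_node_t).
 *
 * Creates vp, which is '.zfs/shares' (zfsctl_node_t), inheriting its
 * create/modify time from the parent.
 */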
vnode_t *
zfsctl_mknode_shares(vnode_t *pvp)
{
	vnode_t *vp;
	zfsctl_node_t *sdp;

	vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp,
	    zfsctl_ops_shares, NULL, NULL, MAXNAMELEN,
	    NULL, NULL);
	sdp = vp->v_data;
	sdp->zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
	return (vp);
}

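/*
 * Getattr for the 'shares' directory; delegate to the real shares
 * directory object in the head filesystem.
 */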
/* ARGSUSED */
static int
zfsctl_shares_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
	znode_t *dzp;
	int error;

	ZFS_ENTER(zfsvfs);
	if (zfsvfs->z_shares_dir == 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENOTSUP));
	}
	if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
		error = VOP_GETATTR(ZTOV(dzp), vap, flags, cr, ct);
		VN_RELE(ZTOV(dzp));
	}
	ZFS_EXIT(zfsvfs);
	return (error);
}

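/*
 * Get snapshot directory attributes.  The link and size counts are the
 * number of cached snapshot entries plus the '.' and '..' entries.
 */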
/* ARGSUSED */
static int
zfsctl_snapdir_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
	zfsctl_snapdir_t *sdp = vp->v_data;

	ZFS_ENTER(zfsvfs);
	zfsctl_common_getattr(vp, vap);
	vap->va_nodeid = gfs_file_inode(vp);
	vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2;
	vap->va_ctime = vap->va_mtime = dmu_objset_snap_cmtime(zfsvfs->z_os);
	ZFS_EXIT(zfsvfs);

	return (0);
}

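/*
 * Inactive callback for '.zfs/snapshot'.  Once the last hold is released,
 * destroy the snapdir's lock and (by then empty) AVL tree and free it.
 */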
/* ARGSUSED */
static void
zfsctl_snapdir_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
	zfsctl_snapdir_t *sdp = vp->v_data;
	void *private;

	private = gfs_dir_inactive(vp);
	if (private != NULL) {
		ASSERT(avl_numnodes(&sdp->sd_snaps) == 0);
		mutex_destroy(&sdp->sd_lock);
		avl_destroy(&sdp->sd_snaps);
		kmem_free(private, sizeof (zfsctl_snapdir_t));
	}
}

static const fs_operation_def_t zfsctl_tops_snapdir[] = {
	{ VOPNAME_OPEN,		{ .vop_open = zfsctl_common_open }	},
	{ VOPNAME_CLOSE,	{ .vop_close = zfsctl_common_close }	},
	{ VOPNAME_IOCTL,	{ .error = fs_inval }			},
	{ VOPNAME_GETATTR,	{ .vop_getattr = zfsctl_snapdir_getattr } },
	{ VOPNAME_ACCESS,	{ .vop_access = zfsctl_common_access }	},
	{ VOPNAME_RENAME,	{ .vop_rename = zfsctl_snapdir_rename }	},
	{ VOPNAME_RMDIR,	{ .vop_rmdir = zfsctl_snapdir_remove }	},
	{ VOPNAME_MKDIR,	{ .vop_mkdir = zfsctl_snapdir_mkdir }	},
	{ VOPNAME_READDIR,	{ .vop_readdir = gfs_vop_readdir }	},
	{ VOPNAME_LOOKUP,	{ .vop_lookup = zfsctl_snapdir_lookup }	},
	{ VOPNAME_SEEK,		{ .vop_seek = fs_seek }			},
	{ VOPNAME_INACTIVE,	{ .vop_inactive = zfsctl_snapdir_inactive } },
	{ VOPNAME_FID,		{ .vop_fid = zfsctl_common_fid }	},
	{ NULL }
};

static const fs_operation_def_t zfsctl_tops_shares[] = {
	{ VOPNAME_OPEN,		{ .vop_open = zfsctl_common_open }	},
	{ VOPNAME_CLOSE,	{ .vop_close = zfsctl_common_close }	},
	{ VOPNAME_IOCTL,	{ .error = fs_inval }			},
	{ VOPNAME_GETATTR,	{ .vop_getattr = zfsctl_shares_getattr } },
	{ VOPNAME_ACCESS,	{ .vop_access = zfsctl_common_access }	},
	{ VOPNAME_READDIR,	{ .vop_readdir = zfsctl_shares_readdir } },
	{ VOPNAME_LOOKUP,	{ .vop_lookup = zfsctl_shares_lookup }	},
	{ VOPNAME_SEEK,		{ .vop_seek = fs_seek }			},
	{ VOPNAME_INACTIVE,	{ .vop_inactive = gfs_vop_inactive } },
	{ VOPNAME_FID,		{ .vop_fid = zfsctl_shares_fid } },
	{ NULL }
};

/*
 * pvp is the GFS vnode '.zfs/snapshot'.
 *
 * This creates a GFS node under '.zfs/snapshot' representing each
 * snapshot.  This newly created GFS node is what we mount snapshot
 * vfs_t's on top of.
 */
static vnode_t *
zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
{
	vnode_t *vp;
	zfsctl_node_t *zcp;

	vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp,
	    zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL);
	zcp = vp->v_data;
	zcp->zc_id = objset;

	return (vp);
}

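/*
 * Inactive callback for a snapshot mountpoint vnode.  On the last hold,
 * remove the corresponding snapentry from the parent snapdir (the
 * snapshot itself must already be unmounted) and dispose of the vnode.
 */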
static void
zfsctl_snapshot_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
	zfsctl_snapdir_t *sdp;
	zfs_snapentry_t *sep, *next;
	vnode_t *dvp;

	VERIFY(gfs_dir_lookup(vp, "..", &dvp, cr, 0, NULL, NULL) == 0);
	sdp = dvp->v_data;

	mutex_enter(&sdp->sd_lock);

	mutex_enter(&vp->v_lock);
	if (vp->v_count > 1) {
		VN_RELE_LOCKED(vp);
		mutex_exit(&vp->v_lock);
		mutex_exit(&sdp->sd_lock);
		VN_RELE(dvp);
		return;
	}
	mutex_exit(&vp->v_lock);
	ASSERT(!vn_ismntpt(vp));

	sep = avl_first(&sdp->sd_snaps);
	while (sep != NULL) {
		next = AVL_NEXT(&sdp->sd_snaps, sep);

		if (sep->se_root == vp) {
			avl_remove(&sdp->sd_snaps, sep);
			kmem_free(sep->se_name, strlen(sep->se_name) + 1);
			kmem_free(sep, sizeof (zfs_snapentry_t));
			break;
		}
		sep = next;
	}
	ASSERT(sep != NULL);

	mutex_exit(&sdp->sd_lock);
	VN_RELE(dvp);

	/*
	 * Dispose of the vnode for the snapshot mount point.
	 * This is safe to do because once this entry has been removed
	 * from the AVL tree, it can't be found again, so cannot become
	 * "active".  If we look up the same name again we will end up
	 * creating a new vnode.
	 */
	gfs_vop_inactive(vp, cr, ct);
}

/*
 * These VP's should never see the light of day.  They should always
 * be covered.
 */
static const fs_operation_def_t zfsctl_tops_snapshot[] = {
	{ VOPNAME_INACTIVE, { .vop_inactive = zfsctl_snapshot_inactive } },
	{ NULL }
};

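/*
 * Given an objset id, find the snapshot mounted under '.zfs/snapshot'
 * whose objset matches and return its zfsvfs_t.  This lets a file handle
 * that references an object in a snapshot be mapped back to the correct
 * filesystem.
 */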
int
zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	vnode_t *dvp, *vp;
	zfsctl_snapdir_t *sdp;
	zfsctl_node_t *zcp;
	zfs_snapentry_t *sep;
	int error;

	ASSERT(zfsvfs->z_ctldir != NULL);
	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
	    NULL, 0, NULL, kcred, NULL, NULL, NULL);
	if (error != 0)
		return (error);
	sdp = dvp->v_data;

	mutex_enter(&sdp->sd_lock);
	sep = avl_first(&sdp->sd_snaps);
	while (sep != NULL) {
		vp = sep->se_root;
		zcp = vp->v_data;
		if (zcp->zc_id == objsetid)
			break;

		sep = AVL_NEXT(&sdp->sd_snaps, sep);
	}

	if (sep != NULL) {
		VN_HOLD(vp);
		/*
		 * Return the mounted root rather than the covered mount point.
		 * Takes the GFS vnode at .zfs/snapshot/<snapshot objsetid>
		 * and returns the ZFS vnode mounted on top of the GFS node.
		 * This ZFS vnode is the root of the vfs for objset 'objsetid'.
		 */
		error = traverse(&vp);
		if (error == 0) {
			if (vp == sep->se_root)
				error = SET_ERROR(EINVAL);
			else
				*zfsvfsp = VTOZ(vp)->z_zfsvfs;
		}
		mutex_exit(&sdp->sd_lock);
		VN_RELE(vp);
	} else {
		error = SET_ERROR(EINVAL);
		mutex_exit(&sdp->sd_lock);
	}

	VN_RELE(dvp);

	return (error);
}

/*
 * Unmount any snapshots for the given filesystem.  This is called from
 * zfs_umount() - if we have a ctldir, then go through and unmount all the
 * snapshots.
 */
int
zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	vnode_t *dvp;
	zfsctl_snapdir_t *sdp;
	zfs_snapentry_t *sep, *next;
	int error;

	ASSERT(zfsvfs->z_ctldir != NULL);
	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
	    NULL, 0, NULL, cr, NULL, NULL, NULL);
	if (error != 0)
		return (error);
	sdp = dvp->v_data;

	mutex_enter(&sdp->sd_lock);

	sep = avl_first(&sdp->sd_snaps);
	while (sep != NULL) {
		next = AVL_NEXT(&sdp->sd_snaps, sep);

		/*
		 * If this snapshot is not mounted, then it must
		 * have just been unmounted by somebody else, and
		 * will be cleaned up by zfsctl_snapdir_inactive().
		 */
		if (vn_ismntpt(sep->se_root)) {
			avl_remove(&sdp->sd_snaps, sep);
			error = zfsctl_unmount_snap(sep, fflags, cr);
			if (error) {
				avl_add(&sdp->sd_snaps, sep);
				break;
			}
		}
		sep = next;
	}

	mutex_exit(&sdp->sd_lock);
	VN_RELE(dvp);

	return (error);
}