xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs4_stub_vnops.c (revision b9238976491622ad75a67ab0c12edf99e36212b9)
1*b9238976Sth /*
2*b9238976Sth  * CDDL HEADER START
3*b9238976Sth  *
4*b9238976Sth  * The contents of this file are subject to the terms of the
5*b9238976Sth  * Common Development and Distribution License (the "License").
6*b9238976Sth  * You may not use this file except in compliance with the License.
7*b9238976Sth  *
8*b9238976Sth  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9*b9238976Sth  * or http://www.opensolaris.org/os/licensing.
10*b9238976Sth  * See the License for the specific language governing permissions
11*b9238976Sth  * and limitations under the License.
12*b9238976Sth  *
13*b9238976Sth  * When distributing Covered Code, include this CDDL HEADER in each
14*b9238976Sth  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15*b9238976Sth  * If applicable, add the following below this CDDL HEADER, with the
16*b9238976Sth  * fields enclosed by brackets "[]" replaced with your own identifying
17*b9238976Sth  * information: Portions Copyright [yyyy] [name of copyright owner]
18*b9238976Sth  *
19*b9238976Sth  * CDDL HEADER END
20*b9238976Sth  */
21*b9238976Sth 
22*b9238976Sth /*
23*b9238976Sth  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24*b9238976Sth  * Use is subject to license terms.
25*b9238976Sth  */
26*b9238976Sth 
27*b9238976Sth #pragma ident	"%Z%%M%	%I%	%E% SMI"
28*b9238976Sth 
29*b9238976Sth /*
30*b9238976Sth  * Support for ephemeral mounts, e.g. mirror-mounts. These mounts are
31*b9238976Sth  * triggered from a "stub" rnode via a special set of vnodeops.
32*b9238976Sth  */
33*b9238976Sth 
34*b9238976Sth #include <sys/param.h>
35*b9238976Sth #include <sys/types.h>
36*b9238976Sth #include <sys/systm.h>
37*b9238976Sth #include <sys/cred.h>
38*b9238976Sth #include <sys/time.h>
39*b9238976Sth #include <sys/vnode.h>
40*b9238976Sth #include <sys/vfs.h>
41*b9238976Sth #include <sys/vfs_opreg.h>
42*b9238976Sth #include <sys/file.h>
43*b9238976Sth #include <sys/filio.h>
44*b9238976Sth #include <sys/uio.h>
45*b9238976Sth #include <sys/buf.h>
46*b9238976Sth #include <sys/mman.h>
47*b9238976Sth #include <sys/pathname.h>
48*b9238976Sth #include <sys/dirent.h>
49*b9238976Sth #include <sys/debug.h>
50*b9238976Sth #include <sys/vmsystm.h>
51*b9238976Sth #include <sys/fcntl.h>
52*b9238976Sth #include <sys/flock.h>
53*b9238976Sth #include <sys/swap.h>
54*b9238976Sth #include <sys/errno.h>
55*b9238976Sth #include <sys/strsubr.h>
56*b9238976Sth #include <sys/sysmacros.h>
57*b9238976Sth #include <sys/kmem.h>
58*b9238976Sth #include <sys/mount.h>
59*b9238976Sth #include <sys/cmn_err.h>
60*b9238976Sth #include <sys/pathconf.h>
61*b9238976Sth #include <sys/utsname.h>
62*b9238976Sth #include <sys/dnlc.h>
63*b9238976Sth #include <sys/acl.h>
64*b9238976Sth #include <sys/systeminfo.h>
65*b9238976Sth #include <sys/policy.h>
66*b9238976Sth #include <sys/sdt.h>
67*b9238976Sth #include <sys/list.h>
68*b9238976Sth #include <sys/stat.h>
69*b9238976Sth #include <sys/mntent.h>
70*b9238976Sth 
71*b9238976Sth #include <rpc/types.h>
72*b9238976Sth #include <rpc/auth.h>
73*b9238976Sth #include <rpc/clnt.h>
74*b9238976Sth 
75*b9238976Sth #include <nfs/nfs.h>
76*b9238976Sth #include <nfs/nfs_clnt.h>
77*b9238976Sth #include <nfs/nfs_acl.h>
78*b9238976Sth #include <nfs/lm.h>
79*b9238976Sth #include <nfs/nfs4.h>
80*b9238976Sth #include <nfs/nfs4_kprot.h>
81*b9238976Sth #include <nfs/rnode4.h>
82*b9238976Sth #include <nfs/nfs4_clnt.h>
83*b9238976Sth 
84*b9238976Sth #include <vm/hat.h>
85*b9238976Sth #include <vm/as.h>
86*b9238976Sth #include <vm/page.h>
87*b9238976Sth #include <vm/pvn.h>
88*b9238976Sth #include <vm/seg.h>
89*b9238976Sth #include <vm/seg_map.h>
90*b9238976Sth #include <vm/seg_kpm.h>
91*b9238976Sth #include <vm/seg_vn.h>
92*b9238976Sth 
93*b9238976Sth #include <fs/fs_subr.h>
94*b9238976Sth 
95*b9238976Sth #include <sys/ddi.h>
96*b9238976Sth #include <sys/int_fmtio.h>
97*b9238976Sth 
98*b9238976Sth #include <util/string.h>
99*b9238976Sth 
100*b9238976Sth /*
101*b9238976Sth  * The automatic unmounter thread stuff!
102*b9238976Sth  */
103*b9238976Sth static int nfs4_trigger_thread_timer = 20;	/* in seconds */
104*b9238976Sth 
105*b9238976Sth /*
106*b9238976Sth  * Just a default....
107*b9238976Sth  */
108*b9238976Sth static uint_t nfs4_trigger_mount_to = 240;
109*b9238976Sth 
110*b9238976Sth typedef struct nfs4_trigger_globals {
111*b9238976Sth 	kmutex_t		ntg_forest_lock;
112*b9238976Sth 	uint_t			ntg_mount_to;
113*b9238976Sth 	int			ntg_thread_started;
114*b9238976Sth 	nfs4_ephemeral_tree_t	*ntg_forest;
115*b9238976Sth } nfs4_trigger_globals_t;
116*b9238976Sth 
117*b9238976Sth kmutex_t	nfs4_ephemeral_thread_lock;
118*b9238976Sth 
119*b9238976Sth zone_key_t	nfs4_ephemeral_key = ZONE_KEY_UNINITIALIZED;
120*b9238976Sth 
121*b9238976Sth static void	nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *);
122*b9238976Sth 
123*b9238976Sth /*
124*b9238976Sth  * Used for ephemeral mounts; contains data either duplicated from
125*b9238976Sth  * servinfo4_t, or hand-crafted, depending on type of ephemeral mount.
126*b9238976Sth  *
127*b9238976Sth  * It's intended that this structure is used solely for ephemeral
128*b9238976Sth  * mount-type specific data, for passing this data to
129*b9238976Sth  * nfs4_trigger_nargs_create().
130*b9238976Sth  */
131*b9238976Sth typedef struct ephemeral_servinfo {
132*b9238976Sth 	char			*esi_hostname;
133*b9238976Sth 	char			*esi_netname;
134*b9238976Sth 	char			*esi_path;
135*b9238976Sth 	int			esi_path_len;
136*b9238976Sth 	int			esi_mount_flags;
137*b9238976Sth 	struct netbuf		*esi_addr;
138*b9238976Sth 	struct netbuf		*esi_syncaddr;
139*b9238976Sth 	struct knetconfig	*esi_knconf;
140*b9238976Sth } ephemeral_servinfo_t;
141*b9238976Sth 
142*b9238976Sth /*
143*b9238976Sth  * Collect together the mount-type specific and generic data args.
144*b9238976Sth  */
145*b9238976Sth typedef struct domount_args {
146*b9238976Sth 	ephemeral_servinfo_t	*dma_esi;
147*b9238976Sth 	char			*dma_hostlist; /* comma-sep. for RO failover */
148*b9238976Sth 	struct nfs_args		*dma_nargs;
149*b9238976Sth } domount_args_t;
150*b9238976Sth 
151*b9238976Sth 
152*b9238976Sth /*
153*b9238976Sth  * The vnode ops functions for a trigger stub vnode
154*b9238976Sth  */
155*b9238976Sth static int	nfs4_trigger_open(vnode_t **, int, cred_t *);
156*b9238976Sth static int	nfs4_trigger_getattr(vnode_t *, struct vattr *, int, cred_t *);
157*b9238976Sth static int	nfs4_trigger_setattr(vnode_t *, struct vattr *, int, cred_t *,
158*b9238976Sth 			caller_context_t *);
159*b9238976Sth static int	nfs4_trigger_access(vnode_t *, int, int, cred_t *);
160*b9238976Sth static int	nfs4_trigger_readlink(vnode_t *, struct uio *, cred_t *);
161*b9238976Sth static int	nfs4_trigger_lookup(vnode_t *, char *, vnode_t **,
162*b9238976Sth 			struct pathname *, int, vnode_t *, cred_t *);
163*b9238976Sth static int	nfs4_trigger_create(vnode_t *, char *, struct vattr *,
164*b9238976Sth 			enum vcexcl, int, vnode_t **, cred_t *, int);
165*b9238976Sth static int	nfs4_trigger_remove(vnode_t *, char *, cred_t *);
166*b9238976Sth static int	nfs4_trigger_link(vnode_t *, vnode_t *, char *, cred_t *);
167*b9238976Sth static int	nfs4_trigger_rename(vnode_t *, char *, vnode_t *, char *,
168*b9238976Sth 			cred_t *);
169*b9238976Sth static int	nfs4_trigger_mkdir(vnode_t *, char *, struct vattr *,
170*b9238976Sth 			vnode_t **, cred_t *);
171*b9238976Sth static int	nfs4_trigger_rmdir(vnode_t *, char *, vnode_t *, cred_t *);
172*b9238976Sth static int	nfs4_trigger_symlink(vnode_t *, char *, struct vattr *, char *,
173*b9238976Sth 			cred_t *);
174*b9238976Sth static int	nfs4_trigger_cmp(vnode_t *, vnode_t *);
175*b9238976Sth 
176*b9238976Sth /*
177*b9238976Sth  * Regular NFSv4 vnodeops that we need to reference directly
178*b9238976Sth  */
179*b9238976Sth extern int	nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *);
180*b9238976Sth extern void	nfs4_inactive(vnode_t *, cred_t *);
181*b9238976Sth extern int	nfs4_rwlock(vnode_t *, int, caller_context_t *);
182*b9238976Sth extern void	nfs4_rwunlock(vnode_t *, int, caller_context_t *);
183*b9238976Sth extern int	nfs4_lookup(vnode_t *, char *, vnode_t **,
184*b9238976Sth     struct pathname *, int, vnode_t *, cred_t *);
185*b9238976Sth extern int	nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *);
186*b9238976Sth extern int	nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *);
187*b9238976Sth extern int	nfs4_fid(vnode_t *, fid_t *);
188*b9238976Sth extern int	nfs4_realvp(vnode_t *, vnode_t **);
189*b9238976Sth 
190*b9238976Sth static int	nfs4_trigger_mount(vnode_t *, vnode_t **);
191*b9238976Sth static int	nfs4_trigger_domount(vnode_t *, domount_args_t *, vfs_t **,
192*b9238976Sth     cred_t *);
193*b9238976Sth static domount_args_t  *nfs4_trigger_domount_args_create(vnode_t *);
194*b9238976Sth static void	nfs4_trigger_domount_args_destroy(domount_args_t *dma,
195*b9238976Sth     vnode_t *vp);
196*b9238976Sth static ephemeral_servinfo_t *nfs4_trigger_esi_create(vnode_t *, servinfo4_t *);
197*b9238976Sth static void	nfs4_trigger_esi_destroy(ephemeral_servinfo_t *, vnode_t *);
198*b9238976Sth static ephemeral_servinfo_t *nfs4_trigger_esi_create_mirrormount(vnode_t *,
199*b9238976Sth     servinfo4_t *);
200*b9238976Sth static struct nfs_args 	*nfs4_trigger_nargs_create(mntinfo4_t *, servinfo4_t *,
201*b9238976Sth     ephemeral_servinfo_t *);
202*b9238976Sth static void	nfs4_trigger_nargs_destroy(struct nfs_args *);
203*b9238976Sth static char	*nfs4_trigger_create_mntopts(vfs_t *);
204*b9238976Sth static void	nfs4_trigger_destroy_mntopts(char *);
205*b9238976Sth static int 	nfs4_trigger_add_mntopt(char *, char *, vfs_t *);
206*b9238976Sth static enum clnt_stat nfs4_trigger_ping_server(servinfo4_t *, int);
207*b9238976Sth 
208*b9238976Sth extern int	umount2_engine(vfs_t *, int, cred_t *, int);
209*b9238976Sth 
210*b9238976Sth 
211*b9238976Sth vnodeops_t *nfs4_trigger_vnodeops;
212*b9238976Sth 
213*b9238976Sth /*
214*b9238976Sth  * These are the vnodeops that we must define for stub vnodes.
215*b9238976Sth  *
216*b9238976Sth  *
217*b9238976Sth  * Many of the VOPs defined for NFSv4 do not need to be defined here,
218*b9238976Sth  * for various reasons. This will result in the VFS default function being
219*b9238976Sth  * used:
220*b9238976Sth  *
221*b9238976Sth  * - These VOPs require a previous VOP_OPEN to have occurred. That will have
222*b9238976Sth  *   lost the reference to the stub vnode, meaning these should not be called:
223*b9238976Sth  *       close, read, write, ioctl, readdir, seek.
224*b9238976Sth  *
225*b9238976Sth  * - These VOPs are meaningless for vnodes without data pages. Since the
226*b9238976Sth  *   stub vnode is of type VDIR, these should not be called:
227*b9238976Sth  *       space, getpage, putpage, map, addmap, delmap, pageio, fsync.
228*b9238976Sth  *
229*b9238976Sth  * - These VOPs are otherwise not applicable, and should not be called:
230*b9238976Sth  *       dump, setsecattr.
231*b9238976Sth  *
232*b9238976Sth  *
233*b9238976Sth  * These VOPs we do not want to define, but nor do we want the VFS default
234*b9238976Sth  * action. Instead, we specify the VFS error function, with fs_error(), but
235*b9238976Sth  * note that fs_error() is not actually called. Instead it results in the
236*b9238976Sth  * use of the error function defined for the particular VOP, in vn_ops_table[]:
237*b9238976Sth  *
238*b9238976Sth  * -   frlock, dispose, shrlock.
239*b9238976Sth  *
240*b9238976Sth  *
241*b9238976Sth  * These VOPs we define to use the corresponding regular NFSv4 vnodeop.
242*b9238976Sth  * NOTE: if any of these ops involve an OTW call with the stub FH, then
243*b9238976Sth  * that call must be wrapped with save_mnt_secinfo()/check_mnt_secinfo()
244*b9238976Sth  * to protect the security data in the servinfo4_t for the "parent"
245*b9238976Sth  * filesystem that contains the stub.
246*b9238976Sth  *
247*b9238976Sth  * - These VOPs should not trigger a mount, so that "ls -l" does not:
248*b9238976Sth  *       pathconf, getsecattr.
249*b9238976Sth  *
250*b9238976Sth  * - These VOPs would not make sense to trigger:
251*b9238976Sth  *       inactive, rwlock, rwunlock, fid, realvp.
252*b9238976Sth  */
253*b9238976Sth const fs_operation_def_t nfs4_trigger_vnodeops_template[] = {
254*b9238976Sth 	VOPNAME_OPEN,		{ .vop_open = nfs4_trigger_open },
255*b9238976Sth 	VOPNAME_GETATTR,	{ .vop_getattr = nfs4_trigger_getattr },
256*b9238976Sth 	VOPNAME_SETATTR,	{ .vop_setattr = nfs4_trigger_setattr },
257*b9238976Sth 	VOPNAME_ACCESS,		{ .vop_access = nfs4_trigger_access },
258*b9238976Sth 	VOPNAME_LOOKUP,		{ .vop_lookup = nfs4_trigger_lookup },
259*b9238976Sth 	VOPNAME_CREATE,		{ .vop_create = nfs4_trigger_create },
260*b9238976Sth 	VOPNAME_REMOVE,		{ .vop_remove = nfs4_trigger_remove },
261*b9238976Sth 	VOPNAME_LINK,		{ .vop_link = nfs4_trigger_link },
262*b9238976Sth 	VOPNAME_RENAME,		{ .vop_rename = nfs4_trigger_rename },
263*b9238976Sth 	VOPNAME_MKDIR,		{ .vop_mkdir = nfs4_trigger_mkdir },
264*b9238976Sth 	VOPNAME_RMDIR,		{ .vop_rmdir = nfs4_trigger_rmdir },
265*b9238976Sth 	VOPNAME_SYMLINK,	{ .vop_symlink = nfs4_trigger_symlink },
266*b9238976Sth 	VOPNAME_READLINK,	{ .vop_readlink = nfs4_trigger_readlink },
267*b9238976Sth 	VOPNAME_INACTIVE, 	{ .vop_inactive = nfs4_inactive },
268*b9238976Sth 	VOPNAME_FID,		{ .vop_fid = nfs4_fid },
269*b9238976Sth 	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs4_rwlock },
270*b9238976Sth 	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs4_rwunlock },
271*b9238976Sth 	VOPNAME_REALVP,		{ .vop_realvp = nfs4_realvp },
272*b9238976Sth 	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs4_getsecattr },
273*b9238976Sth 	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs4_pathconf },
274*b9238976Sth 	VOPNAME_FRLOCK,		{ .error = fs_error },
275*b9238976Sth 	VOPNAME_DISPOSE,	{ .error = fs_error },
276*b9238976Sth 	VOPNAME_SHRLOCK,	{ .error = fs_error },
277*b9238976Sth 	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
278*b9238976Sth 	NULL, NULL
279*b9238976Sth };
280*b9238976Sth 
281*b9238976Sth /*
282*b9238976Sth  * Trigger ops for stub vnodes; for mirror mounts, etc.
283*b9238976Sth  *
284*b9238976Sth  * The general idea is that a "triggering" op will first call
285*b9238976Sth  * nfs4_trigger_mount(), which will find out whether a mount has already
286*b9238976Sth  * been triggered.
287*b9238976Sth  *
288*b9238976Sth  * If it has, then nfs4_trigger_mount() sets newvp to the root vnode
289*b9238976Sth  * of the covering vfs.
290*b9238976Sth  *
291*b9238976Sth  * If a mount has not yet been triggered, nfs4_trigger_mount() will do so,
292*b9238976Sth  * and again set newvp, as above.
293*b9238976Sth  *
294*b9238976Sth  * The triggering op may then re-issue the VOP by calling it on newvp.
295*b9238976Sth  *
296*b9238976Sth  * Note that some ops may perform custom action, and may or may not need
297*b9238976Sth  * to trigger a mount.
298*b9238976Sth  *
299*b9238976Sth  * Some ops need to call the regular NFSv4 vnodeop for a stub vnode. We
300*b9238976Sth  * obviously can't do this with VOP_<whatever>, since it's a stub vnode
301*b9238976Sth  * and that would just recurse. Instead, we call the v4 op directly,
302*b9238976Sth  * by name.  This is OK, since we know that the vnode is for NFSv4,
303*b9238976Sth  * otherwise it couldn't be a stub.
304*b9238976Sth  *
305*b9238976Sth  */
306*b9238976Sth 
307*b9238976Sth static int
308*b9238976Sth nfs4_trigger_open(vnode_t **vpp, int flag, cred_t *cr)
309*b9238976Sth {
310*b9238976Sth 	int error;
311*b9238976Sth 	vnode_t *newvp;
312*b9238976Sth 
313*b9238976Sth 	error = nfs4_trigger_mount(*vpp, &newvp);
314*b9238976Sth 	if (error)
315*b9238976Sth 		return (error);
316*b9238976Sth 
317*b9238976Sth 	/* Release the stub vnode, as we're losing the reference to it */
318*b9238976Sth 	VN_RELE(*vpp);
319*b9238976Sth 
320*b9238976Sth 	/* Give the caller the root vnode of the newly-mounted fs */
321*b9238976Sth 	*vpp = newvp;
322*b9238976Sth 
323*b9238976Sth 	/* return with VN_HELD(newvp) */
324*b9238976Sth 	return (VOP_OPEN(vpp, flag, cr));
325*b9238976Sth }
326*b9238976Sth 
327*b9238976Sth /*
328*b9238976Sth  * For the majority of cases, nfs4_trigger_getattr() will not trigger
329*b9238976Sth  * a mount. However, if ATTR_TRIGGER is set, we are being informed
330*b9238976Sth  * that we need to force the mount before we attempt to determine
331*b9238976Sth  * the attributes. The intent is an atomic operation for security
332*b9238976Sth  * testing.
333*b9238976Sth  */
334*b9238976Sth static int
335*b9238976Sth nfs4_trigger_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
336*b9238976Sth {
337*b9238976Sth 	int error;
338*b9238976Sth 
339*b9238976Sth 	if (flags & ATTR_TRIGGER) {
340*b9238976Sth 		vnode_t	*newvp;
341*b9238976Sth 
342*b9238976Sth 		error = nfs4_trigger_mount(vp, &newvp);
343*b9238976Sth 		if (error)
344*b9238976Sth 			return (error);
345*b9238976Sth 
346*b9238976Sth 		error = VOP_GETATTR(newvp, vap, flags, cr);
347*b9238976Sth 		VN_RELE(newvp);
348*b9238976Sth 	} else {
349*b9238976Sth 		error = nfs4_getattr(vp, vap, flags, cr);
350*b9238976Sth 	}
351*b9238976Sth 
352*b9238976Sth 	return (error);
353*b9238976Sth }
354*b9238976Sth 
355*b9238976Sth static int
356*b9238976Sth nfs4_trigger_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
357*b9238976Sth 		caller_context_t *ct)
358*b9238976Sth {
359*b9238976Sth 	int error;
360*b9238976Sth 	vnode_t *newvp;
361*b9238976Sth 
362*b9238976Sth 	error = nfs4_trigger_mount(vp, &newvp);
363*b9238976Sth 	if (error)
364*b9238976Sth 		return (error);
365*b9238976Sth 
366*b9238976Sth 	error = VOP_SETATTR(newvp, vap, flags, cr, ct);
367*b9238976Sth 	VN_RELE(newvp);
368*b9238976Sth 
369*b9238976Sth 	return (error);
370*b9238976Sth }
371*b9238976Sth 
372*b9238976Sth static int
373*b9238976Sth nfs4_trigger_access(vnode_t *vp, int mode, int flags, cred_t *cr)
374*b9238976Sth {
375*b9238976Sth 	int error;
376*b9238976Sth 	vnode_t *newvp;
377*b9238976Sth 
378*b9238976Sth 	error = nfs4_trigger_mount(vp, &newvp);
379*b9238976Sth 	if (error)
380*b9238976Sth 		return (error);
381*b9238976Sth 
382*b9238976Sth 	error = VOP_ACCESS(newvp, mode, flags, cr);
383*b9238976Sth 	VN_RELE(newvp);
384*b9238976Sth 
385*b9238976Sth 	return (error);
386*b9238976Sth }
387*b9238976Sth 
388*b9238976Sth static int
389*b9238976Sth nfs4_trigger_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
390*b9238976Sth 	int flags, vnode_t *rdir, cred_t *cr)
391*b9238976Sth {
392*b9238976Sth 	int error;
393*b9238976Sth 	vnode_t *newdvp;
394*b9238976Sth 	rnode4_t *drp = VTOR4(dvp);
395*b9238976Sth 
396*b9238976Sth 	ASSERT(RP_ISSTUB(drp));
397*b9238976Sth 
398*b9238976Sth 	/* for now, we only support mirror-mounts */
399*b9238976Sth 	ASSERT(RP_ISSTUB_MIRRORMOUNT(drp));
400*b9238976Sth 
401*b9238976Sth 	/*
402*b9238976Sth 	 * It's not legal to lookup ".." for an fs root, so we mustn't pass
403*b9238976Sth 	 * that up. Instead, pass onto the regular op, regardless of whether
404*b9238976Sth 	 * we've triggered a mount.
405*b9238976Sth 	 */
406*b9238976Sth 	if (strcmp(nm, "..") == 0)
407*b9238976Sth 		return (nfs4_lookup(dvp, nm, vpp, pnp, flags, rdir, cr));
408*b9238976Sth 
409*b9238976Sth 	error = nfs4_trigger_mount(dvp, &newdvp);
410*b9238976Sth 	if (error)
411*b9238976Sth 		return (error);
412*b9238976Sth 
413*b9238976Sth 	error = VOP_LOOKUP(newdvp, nm, vpp, pnp, flags, rdir, cr);
414*b9238976Sth 	VN_RELE(newdvp);
415*b9238976Sth 
416*b9238976Sth 	return (error);
417*b9238976Sth }
418*b9238976Sth 
419*b9238976Sth static int
420*b9238976Sth nfs4_trigger_create(vnode_t *dvp, char *nm, struct vattr *va,
421*b9238976Sth 		enum vcexcl exclusive, int mode, vnode_t **vpp, cred_t *cr,
422*b9238976Sth 		int flags)
423*b9238976Sth {
424*b9238976Sth 	int error;
425*b9238976Sth 	vnode_t *newdvp;
426*b9238976Sth 
427*b9238976Sth 	error = nfs4_trigger_mount(dvp, &newdvp);
428*b9238976Sth 	if (error)
429*b9238976Sth 		return (error);
430*b9238976Sth 
431*b9238976Sth 	error = VOP_CREATE(newdvp, nm, va, exclusive, mode, vpp, cr, flags);
432*b9238976Sth 	VN_RELE(newdvp);
433*b9238976Sth 
434*b9238976Sth 	return (error);
435*b9238976Sth }
436*b9238976Sth 
437*b9238976Sth static int
438*b9238976Sth nfs4_trigger_remove(vnode_t *dvp, char *nm, cred_t *cr)
439*b9238976Sth {
440*b9238976Sth 	int error;
441*b9238976Sth 	vnode_t *newdvp;
442*b9238976Sth 
443*b9238976Sth 	error = nfs4_trigger_mount(dvp, &newdvp);
444*b9238976Sth 	if (error)
445*b9238976Sth 		return (error);
446*b9238976Sth 
447*b9238976Sth 	error = VOP_REMOVE(newdvp, nm, cr);
448*b9238976Sth 	VN_RELE(newdvp);
449*b9238976Sth 
450*b9238976Sth 	return (error);
451*b9238976Sth }
452*b9238976Sth 
453*b9238976Sth static int
454*b9238976Sth nfs4_trigger_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr)
455*b9238976Sth {
456*b9238976Sth 	int error;
457*b9238976Sth 	vnode_t *newtdvp;
458*b9238976Sth 
459*b9238976Sth 	error = nfs4_trigger_mount(tdvp, &newtdvp);
460*b9238976Sth 	if (error)
461*b9238976Sth 		return (error);
462*b9238976Sth 
463*b9238976Sth 	/*
464*b9238976Sth 	 * We don't check whether svp is a stub. Let the NFSv4 code
465*b9238976Sth 	 * detect that error, and return accordingly.
466*b9238976Sth 	 */
467*b9238976Sth 	error = VOP_LINK(newtdvp, svp, tnm, cr);
468*b9238976Sth 	VN_RELE(newtdvp);
469*b9238976Sth 
470*b9238976Sth 	return (error);
471*b9238976Sth }
472*b9238976Sth 
473*b9238976Sth static int
474*b9238976Sth nfs4_trigger_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
475*b9238976Sth 		cred_t *cr)
476*b9238976Sth {
477*b9238976Sth 	int error;
478*b9238976Sth 	vnode_t *newsdvp;
479*b9238976Sth 	rnode4_t *tdrp = VTOR4(tdvp);
480*b9238976Sth 
481*b9238976Sth 	/*
482*b9238976Sth 	 * We know that sdvp is a stub, otherwise we would not be here.
483*b9238976Sth 	 *
484*b9238976Sth 	 * If tdvp is also be a stub, there are two possibilities: it
485*b9238976Sth 	 * is either the same stub as sdvp [i.e. VN_CMP(sdvp, tdvp)]
486*b9238976Sth 	 * or it is a different stub [!VN_CMP(sdvp, tdvp)].
487*b9238976Sth 	 *
488*b9238976Sth 	 * In the former case, just trigger sdvp, and treat tdvp as
489*b9238976Sth 	 * though it were not a stub.
490*b9238976Sth 	 *
491*b9238976Sth 	 * In the latter case, it might be a different stub for the
492*b9238976Sth 	 * same server fs as sdvp, or for a different server fs.
493*b9238976Sth 	 * Regardless, from the client perspective this would still
494*b9238976Sth 	 * be a cross-filesystem rename, and should not be allowed,
495*b9238976Sth 	 * so return EXDEV, without triggering either mount.
496*b9238976Sth 	 */
497*b9238976Sth 	if (RP_ISSTUB(tdrp) && !VN_CMP(sdvp, tdvp))
498*b9238976Sth 		return (EXDEV);
499*b9238976Sth 
500*b9238976Sth 	error = nfs4_trigger_mount(sdvp, &newsdvp);
501*b9238976Sth 	if (error)
502*b9238976Sth 		return (error);
503*b9238976Sth 
504*b9238976Sth 	error = VOP_RENAME(newsdvp, snm, tdvp, tnm, cr);
505*b9238976Sth 
506*b9238976Sth 	VN_RELE(newsdvp);
507*b9238976Sth 
508*b9238976Sth 	return (error);
509*b9238976Sth }
510*b9238976Sth 
511*b9238976Sth static int
512*b9238976Sth nfs4_trigger_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp,
513*b9238976Sth 		cred_t *cr)
514*b9238976Sth {
515*b9238976Sth 	int error;
516*b9238976Sth 	vnode_t *newdvp;
517*b9238976Sth 
518*b9238976Sth 	error = nfs4_trigger_mount(dvp, &newdvp);
519*b9238976Sth 	if (error)
520*b9238976Sth 		return (error);
521*b9238976Sth 
522*b9238976Sth 	error = VOP_MKDIR(newdvp, nm, va, vpp, cr);
523*b9238976Sth 	VN_RELE(newdvp);
524*b9238976Sth 
525*b9238976Sth 	return (error);
526*b9238976Sth }
527*b9238976Sth 
528*b9238976Sth static int
529*b9238976Sth nfs4_trigger_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr)
530*b9238976Sth {
531*b9238976Sth 	int error;
532*b9238976Sth 	vnode_t *newdvp;
533*b9238976Sth 
534*b9238976Sth 	error = nfs4_trigger_mount(dvp, &newdvp);
535*b9238976Sth 	if (error)
536*b9238976Sth 		return (error);
537*b9238976Sth 
538*b9238976Sth 	error = VOP_RMDIR(newdvp, nm, cdir, cr);
539*b9238976Sth 	VN_RELE(newdvp);
540*b9238976Sth 
541*b9238976Sth 	return (error);
542*b9238976Sth }
543*b9238976Sth 
544*b9238976Sth static int
545*b9238976Sth nfs4_trigger_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm,
546*b9238976Sth 	cred_t *cr)
547*b9238976Sth {
548*b9238976Sth 	int error;
549*b9238976Sth 	vnode_t *newdvp;
550*b9238976Sth 
551*b9238976Sth 	error = nfs4_trigger_mount(dvp, &newdvp);
552*b9238976Sth 	if (error)
553*b9238976Sth 		return (error);
554*b9238976Sth 
555*b9238976Sth 	error = VOP_SYMLINK(newdvp, lnm, tva, tnm, cr);
556*b9238976Sth 	VN_RELE(newdvp);
557*b9238976Sth 
558*b9238976Sth 	return (error);
559*b9238976Sth }
560*b9238976Sth 
561*b9238976Sth static int
562*b9238976Sth nfs4_trigger_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr)
563*b9238976Sth {
564*b9238976Sth 	int error;
565*b9238976Sth 	vnode_t *newvp;
566*b9238976Sth 
567*b9238976Sth 	error = nfs4_trigger_mount(vp, &newvp);
568*b9238976Sth 	if (error)
569*b9238976Sth 		return (error);
570*b9238976Sth 
571*b9238976Sth 	error = VOP_READLINK(newvp, uiop, cr);
572*b9238976Sth 	VN_RELE(newvp);
573*b9238976Sth 
574*b9238976Sth 	return (error);
575*b9238976Sth }
576*b9238976Sth 
577*b9238976Sth /* end of trigger vnode ops */
578*b9238976Sth 
579*b9238976Sth 
580*b9238976Sth /*
581*b9238976Sth  * Mount upon a trigger vnode; for mirror-mounts, etc.
582*b9238976Sth  *
583*b9238976Sth  * The mount may have already occurred, via another thread. If not,
584*b9238976Sth  * assemble the location information - which may require fetching - and
585*b9238976Sth  * perform the mount.
586*b9238976Sth  *
587*b9238976Sth  * Sets newvp to be the root of the fs that is now covering vp. Note
588*b9238976Sth  * that we return with VN_HELD(*newvp).
589*b9238976Sth  *
590*b9238976Sth  * The caller is responsible for passing the VOP onto the covering fs.
591*b9238976Sth  */
592*b9238976Sth static int
593*b9238976Sth nfs4_trigger_mount(vnode_t *vp, vnode_t **newvpp)
594*b9238976Sth {
595*b9238976Sth 	int			 error;
596*b9238976Sth 	vfs_t			*vfsp;
597*b9238976Sth 	rnode4_t		*rp = VTOR4(vp);
598*b9238976Sth 	mntinfo4_t		*mi = VTOMI4(vp);
599*b9238976Sth 	domount_args_t		*dma;
600*b9238976Sth 
601*b9238976Sth 	nfs4_ephemeral_tree_t	*net;
602*b9238976Sth 
603*b9238976Sth 	bool_t			must_unlock = FALSE;
604*b9238976Sth 	bool_t			is_building = FALSE;
605*b9238976Sth 
606*b9238976Sth 	cred_t			*zcred;
607*b9238976Sth 
608*b9238976Sth 	nfs4_trigger_globals_t	*ntg;
609*b9238976Sth 
610*b9238976Sth 	zone_t			*zone = curproc->p_zone;
611*b9238976Sth 
612*b9238976Sth 	ASSERT(RP_ISSTUB(rp));
613*b9238976Sth 
614*b9238976Sth 	/* for now, we only support mirror-mounts */
615*b9238976Sth 	ASSERT(RP_ISSTUB_MIRRORMOUNT(rp));
616*b9238976Sth 
617*b9238976Sth 	*newvpp = NULL;
618*b9238976Sth 
619*b9238976Sth 	/*
620*b9238976Sth 	 * Has the mount already occurred?
621*b9238976Sth 	 */
622*b9238976Sth 	error = vn_vfsrlock_wait(vp);
623*b9238976Sth 	if (error)
624*b9238976Sth 		goto done;
625*b9238976Sth 	vfsp = vn_mountedvfs(vp);
626*b9238976Sth 	if (vfsp != NULL) {
627*b9238976Sth 		/* the mount has already occurred */
628*b9238976Sth 		error = VFS_ROOT(vfsp, newvpp);
629*b9238976Sth 		if (!error) {
630*b9238976Sth 			/* need to update the reference time  */
631*b9238976Sth 			mutex_enter(&mi->mi_lock);
632*b9238976Sth 			if (mi->mi_ephemeral)
633*b9238976Sth 				mi->mi_ephemeral->ne_ref_time =
634*b9238976Sth 				    gethrestime_sec();
635*b9238976Sth 			mutex_exit(&mi->mi_lock);
636*b9238976Sth 		}
637*b9238976Sth 
638*b9238976Sth 		vn_vfsunlock(vp);
639*b9238976Sth 		goto done;
640*b9238976Sth 	}
641*b9238976Sth 	vn_vfsunlock(vp);
642*b9238976Sth 
643*b9238976Sth 	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
644*b9238976Sth 	ASSERT(ntg != NULL);
645*b9238976Sth 
646*b9238976Sth 	mutex_enter(&mi->mi_lock);
647*b9238976Sth 
648*b9238976Sth 	/*
649*b9238976Sth 	 * We need to lock down the ephemeral tree.
650*b9238976Sth 	 */
651*b9238976Sth 	if (mi->mi_ephemeral_tree == NULL) {
652*b9238976Sth 		net = kmem_zalloc(sizeof (*net), KM_SLEEP);
653*b9238976Sth 		mutex_init(&net->net_tree_lock, NULL, MUTEX_DEFAULT, NULL);
654*b9238976Sth 		mutex_init(&net->net_cnt_lock, NULL, MUTEX_DEFAULT, NULL);
655*b9238976Sth 		net->net_refcnt = 1;
656*b9238976Sth 		net->net_status = NFS4_EPHEMERAL_TREE_BUILDING;
657*b9238976Sth 		is_building = TRUE;
658*b9238976Sth 
659*b9238976Sth 		/*
660*b9238976Sth 		 * We need to add it to the zone specific list for
661*b9238976Sth 		 * automatic unmounting and harvesting of deadwood.
662*b9238976Sth 		 */
663*b9238976Sth 		mutex_enter(&ntg->ntg_forest_lock);
664*b9238976Sth 		if (ntg->ntg_forest != NULL)
665*b9238976Sth 			net->net_next = ntg->ntg_forest;
666*b9238976Sth 		ntg->ntg_forest = net;
667*b9238976Sth 		mutex_exit(&ntg->ntg_forest_lock);
668*b9238976Sth 
669*b9238976Sth 		/*
670*b9238976Sth 		 * No lock order confusion with mi_lock because no
671*b9238976Sth 		 * other node could have grabbed net_tree_lock.
672*b9238976Sth 		 */
673*b9238976Sth 		mutex_enter(&net->net_tree_lock);
674*b9238976Sth 		mi->mi_ephemeral_tree = net;
675*b9238976Sth 		net->net_mount = mi;
676*b9238976Sth 		mutex_exit(&mi->mi_lock);
677*b9238976Sth 	} else {
678*b9238976Sth 		net = mi->mi_ephemeral_tree;
679*b9238976Sth 		mutex_exit(&mi->mi_lock);
680*b9238976Sth 
681*b9238976Sth 		mutex_enter(&net->net_cnt_lock);
682*b9238976Sth 		net->net_refcnt++;
683*b9238976Sth 		mutex_exit(&net->net_cnt_lock);
684*b9238976Sth 
685*b9238976Sth 		/*
686*b9238976Sth 		 * Note that we do not do any checks to
687*b9238976Sth 		 * see if the parent has been nuked.
688*b9238976Sth 		 * We count on the vfs layer having protected
689*b9238976Sth 		 * us from feet shooters.
690*b9238976Sth 		 */
691*b9238976Sth 		mutex_enter(&net->net_tree_lock);
692*b9238976Sth 	}
693*b9238976Sth 
694*b9238976Sth 	mutex_enter(&net->net_cnt_lock);
695*b9238976Sth 	net->net_status |= NFS4_EPHEMERAL_TREE_MOUNTING;
696*b9238976Sth 	mutex_exit(&net->net_cnt_lock);
697*b9238976Sth 
698*b9238976Sth 	must_unlock = TRUE;
699*b9238976Sth 
700*b9238976Sth 	dma = nfs4_trigger_domount_args_create(vp);
701*b9238976Sth 	if (dma == NULL) {
702*b9238976Sth 		error = EINVAL;
703*b9238976Sth 		goto done;
704*b9238976Sth 	}
705*b9238976Sth 
706*b9238976Sth 	/*
707*b9238976Sth 	 * Need to be root for this call to make mount work.
708*b9238976Sth 	 * Note that since we define mirror mounts to work
709*b9238976Sth 	 * for any user, we allow the mount to proceed. And
710*b9238976Sth 	 * we realize that the server will perform security
711*b9238976Sth 	 * checks to make sure that the client is allowed
712*b9238976Sth 	 * access. Finally, once the mount takes place,
713*b9238976Sth 	 * directory permissions will ensure that the
714*b9238976Sth 	 * content is secure.
715*b9238976Sth 	 */
716*b9238976Sth 	zcred = zone_get_kcred(getzoneid());
717*b9238976Sth 	ASSERT(zcred != NULL);
718*b9238976Sth 
719*b9238976Sth 	error = nfs4_trigger_domount(vp, dma, &vfsp, zcred);
720*b9238976Sth 	nfs4_trigger_domount_args_destroy(dma, vp);
721*b9238976Sth 
722*b9238976Sth 	crfree(zcred);
723*b9238976Sth 
724*b9238976Sth 	if (!error)
725*b9238976Sth 		error = VFS_ROOT(vfsp, newvpp);
726*b9238976Sth done:
727*b9238976Sth 	if (must_unlock) {
728*b9238976Sth 		mutex_enter(&net->net_cnt_lock);
729*b9238976Sth 		net->net_status &= ~NFS4_EPHEMERAL_TREE_MOUNTING;
730*b9238976Sth 		if (is_building)
731*b9238976Sth 			net->net_status &= ~NFS4_EPHEMERAL_TREE_BUILDING;
732*b9238976Sth 		net->net_refcnt--;
733*b9238976Sth 		mutex_exit(&net->net_cnt_lock);
734*b9238976Sth 
735*b9238976Sth 		mutex_exit(&net->net_tree_lock);
736*b9238976Sth 	}
737*b9238976Sth 
738*b9238976Sth 	if (!error && (newvpp == NULL || *newvpp == NULL))
739*b9238976Sth 		error = ENOSYS;
740*b9238976Sth 
741*b9238976Sth 	return (error);
742*b9238976Sth }
743*b9238976Sth 
744*b9238976Sth /*
745*b9238976Sth  * Collect together both the generic & mount-type specific args.
746*b9238976Sth  */
747*b9238976Sth static domount_args_t *
748*b9238976Sth nfs4_trigger_domount_args_create(vnode_t *vp)
749*b9238976Sth {
750*b9238976Sth 	int nointr;
751*b9238976Sth 	char *hostlist;
752*b9238976Sth 	servinfo4_t *svp;
753*b9238976Sth 	struct nfs_args *nargs, *nargs_head;
754*b9238976Sth 	enum clnt_stat status;
755*b9238976Sth 	ephemeral_servinfo_t *esi, *esi_first;
756*b9238976Sth 	domount_args_t *dma;
757*b9238976Sth 	mntinfo4_t *mi = VTOMI4(vp);
758*b9238976Sth 
759*b9238976Sth 	nointr = !(mi->mi_flags & MI4_INT);
760*b9238976Sth 	hostlist = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
761*b9238976Sth 
762*b9238976Sth 	svp = mi->mi_curr_serv;
763*b9238976Sth 	/* check if the current server is responding */
764*b9238976Sth 	status = nfs4_trigger_ping_server(svp, nointr);
765*b9238976Sth 	if (status == RPC_SUCCESS) {
766*b9238976Sth 		esi_first = nfs4_trigger_esi_create(vp, svp);
767*b9238976Sth 		if (esi_first == NULL) {
768*b9238976Sth 			kmem_free(hostlist, MAXPATHLEN);
769*b9238976Sth 			return (NULL);
770*b9238976Sth 		}
771*b9238976Sth 
772*b9238976Sth 		(void) strlcpy(hostlist, esi_first->esi_hostname, MAXPATHLEN);
773*b9238976Sth 
774*b9238976Sth 		nargs_head = nfs4_trigger_nargs_create(mi, svp, esi_first);
775*b9238976Sth 	} else {
776*b9238976Sth 		/* current server did not respond */
777*b9238976Sth 		esi_first = NULL;
778*b9238976Sth 		nargs_head = NULL;
779*b9238976Sth 	}
780*b9238976Sth 	nargs = nargs_head;
781*b9238976Sth 
782*b9238976Sth 	/*
783*b9238976Sth 	 * NFS RO failover.
784*b9238976Sth 	 *
785*b9238976Sth 	 * If we have multiple servinfo4 structures, linked via sv_next,
786*b9238976Sth 	 * we must create one nfs_args for each, linking the nfs_args via
787*b9238976Sth 	 * nfs_ext_u.nfs_extB.next.
788*b9238976Sth 	 *
789*b9238976Sth 	 * We need to build a corresponding esi for each, too, but that is
790*b9238976Sth 	 * used solely for building nfs_args, and may be immediately
791*b9238976Sth 	 * discarded, as domount() requires the info from just one esi,
792*b9238976Sth 	 * but all the nfs_args.
793*b9238976Sth 	 *
794*b9238976Sth 	 * Currently, the NFS mount code will hang if not all servers
795*b9238976Sth 	 * requested are available. To avoid that, we need to ping each
796*b9238976Sth 	 * server, here, and remove it from the list if it is not
797*b9238976Sth 	 * responding. This has the side-effect of that server then
798*b9238976Sth 	 * being permanently unavailable for this failover mount, even if
799*b9238976Sth 	 * it recovers. That's unfortunate, but the best we can do until
800*b9238976Sth 	 * the mount code path is fixed.
801*b9238976Sth 	 */
802*b9238976Sth 
803*b9238976Sth 	/*
804*b9238976Sth 	 * If the current server was down, loop indefinitely until we find
805*b9238976Sth 	 * at least one responsive server.
806*b9238976Sth 	 */
807*b9238976Sth 	do {
808*b9238976Sth 		/* no locking needed for sv_next; it is only set at fs mount */
809*b9238976Sth 		for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
810*b9238976Sth 			struct nfs_args *next;
811*b9238976Sth 
812*b9238976Sth 			/*
813*b9238976Sth 			 * nargs_head: the head of the nfs_args list
814*b9238976Sth 			 * nargs: the current tail of the list
815*b9238976Sth 			 * next: the newly-created element to be added
816*b9238976Sth 			 */
817*b9238976Sth 
818*b9238976Sth 			/*
819*b9238976Sth 			 * We've already tried the current server, above;
820*b9238976Sth 			 * if it was responding, we have already included it
821*b9238976Sth 			 * and it may now be ignored.
822*b9238976Sth 			 *
823*b9238976Sth 			 * Otherwise, try it again, since it may now have
824*b9238976Sth 			 * recovered.
825*b9238976Sth 			 */
826*b9238976Sth 			if (svp == mi->mi_curr_serv && esi_first != NULL)
827*b9238976Sth 				continue;
828*b9238976Sth 
829*b9238976Sth 			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
830*b9238976Sth 			if (svp->sv_flags & SV4_NOTINUSE) {
831*b9238976Sth 				nfs_rw_exit(&svp->sv_lock);
832*b9238976Sth 				continue;
833*b9238976Sth 			}
834*b9238976Sth 			nfs_rw_exit(&svp->sv_lock);
835*b9238976Sth 
836*b9238976Sth 			/* check if the server is responding */
837*b9238976Sth 			status = nfs4_trigger_ping_server(svp, nointr);
838*b9238976Sth 			/* if the server did not respond, ignore it */
839*b9238976Sth 			if (status != RPC_SUCCESS)
840*b9238976Sth 				continue;
841*b9238976Sth 
842*b9238976Sth 			esi = nfs4_trigger_esi_create(vp, svp);
843*b9238976Sth 			if (esi == NULL)
844*b9238976Sth 				continue;
845*b9238976Sth 
846*b9238976Sth 			/*
847*b9238976Sth 			 * If the original current server (mi_curr_serv)
848*b9238976Sth 			 * was down when when we first tried it,
849*b9238976Sth 			 * (i.e. esi_first == NULL),
850*b9238976Sth 			 * we select this new server (svp) to be the server
851*b9238976Sth 			 * that we will actually contact (esi_first).
852*b9238976Sth 			 *
853*b9238976Sth 			 * Note that it's possible that mi_curr_serv == svp,
854*b9238976Sth 			 * if that mi_curr_serv was down but has now recovered.
855*b9238976Sth 			 */
856*b9238976Sth 			next = nfs4_trigger_nargs_create(mi, svp, esi);
857*b9238976Sth 			if (esi_first == NULL) {
858*b9238976Sth 				ASSERT(nargs == NULL);
859*b9238976Sth 				ASSERT(nargs_head == NULL);
860*b9238976Sth 				nargs_head = next;
861*b9238976Sth 				esi_first = esi;
862*b9238976Sth 				(void) strlcpy(hostlist,
863*b9238976Sth 				    esi_first->esi_hostname, MAXPATHLEN);
864*b9238976Sth 			} else {
865*b9238976Sth 				ASSERT(nargs_head != NULL);
866*b9238976Sth 				nargs->nfs_ext_u.nfs_extB.next = next;
867*b9238976Sth 				(void) strlcat(hostlist, ",", MAXPATHLEN);
868*b9238976Sth 				(void) strlcat(hostlist, esi->esi_hostname,
869*b9238976Sth 				    MAXPATHLEN);
870*b9238976Sth 				/* esi was only needed for hostname & nargs */
871*b9238976Sth 				nfs4_trigger_esi_destroy(esi, vp);
872*b9238976Sth 			}
873*b9238976Sth 
874*b9238976Sth 			nargs = next;
875*b9238976Sth 		}
876*b9238976Sth 
877*b9238976Sth 		/* if we've had no response at all, wait a second */
878*b9238976Sth 		if (esi_first == NULL)
879*b9238976Sth 			delay(drv_usectohz(1000000));
880*b9238976Sth 
881*b9238976Sth 	} while (esi_first == NULL);
882*b9238976Sth 	ASSERT(nargs_head != NULL);
883*b9238976Sth 
884*b9238976Sth 	dma = kmem_zalloc(sizeof (domount_args_t), KM_SLEEP);
885*b9238976Sth 	dma->dma_esi = esi_first;
886*b9238976Sth 	dma->dma_hostlist = hostlist;
887*b9238976Sth 	dma->dma_nargs = nargs_head;
888*b9238976Sth 
889*b9238976Sth 	return (dma);
890*b9238976Sth }
891*b9238976Sth 
892*b9238976Sth static void
893*b9238976Sth nfs4_trigger_domount_args_destroy(domount_args_t *dma, vnode_t *vp)
894*b9238976Sth {
895*b9238976Sth 	if (dma != NULL) {
896*b9238976Sth 		if (dma->dma_esi != NULL && vp != NULL)
897*b9238976Sth 			nfs4_trigger_esi_destroy(dma->dma_esi, vp);
898*b9238976Sth 
899*b9238976Sth 		if (dma->dma_hostlist != NULL)
900*b9238976Sth 			kmem_free(dma->dma_hostlist, MAXPATHLEN);
901*b9238976Sth 
902*b9238976Sth 		if (dma->dma_nargs != NULL) {
903*b9238976Sth 			struct nfs_args *nargs = dma->dma_nargs;
904*b9238976Sth 
905*b9238976Sth 			do {
906*b9238976Sth 				struct nfs_args *next =
907*b9238976Sth 				    nargs->nfs_ext_u.nfs_extB.next;
908*b9238976Sth 
909*b9238976Sth 				nfs4_trigger_nargs_destroy(nargs);
910*b9238976Sth 				nargs = next;
911*b9238976Sth 			} while (nargs != NULL);
912*b9238976Sth 		}
913*b9238976Sth 
914*b9238976Sth 		kmem_free(dma, sizeof (domount_args_t));
915*b9238976Sth 	}
916*b9238976Sth }
917*b9238976Sth 
918*b9238976Sth /*
919*b9238976Sth  * The ephemeral_servinfo_t struct contains basic information we will need to
920*b9238976Sth  * perform the mount. Whilst the structure is generic across different
921*b9238976Sth  * types of ephemeral mount, the way we gather its contents differs.
922*b9238976Sth  */
923*b9238976Sth static ephemeral_servinfo_t *
924*b9238976Sth nfs4_trigger_esi_create(vnode_t *vp, servinfo4_t *svp)
925*b9238976Sth {
926*b9238976Sth 	ephemeral_servinfo_t *esi;
927*b9238976Sth 	rnode4_t *rp = VTOR4(vp);
928*b9238976Sth 
929*b9238976Sth 	ASSERT(RP_ISSTUB(rp));
930*b9238976Sth 
931*b9238976Sth 	/* Call the ephemeral type-specific routine */
932*b9238976Sth 	if (RP_ISSTUB_MIRRORMOUNT(rp))
933*b9238976Sth 		esi = nfs4_trigger_esi_create_mirrormount(vp, svp);
934*b9238976Sth 	else
935*b9238976Sth 		esi = NULL;
936*b9238976Sth 
937*b9238976Sth 	/* for now, we only support mirror-mounts */
938*b9238976Sth 	ASSERT(esi != NULL);
939*b9238976Sth 
940*b9238976Sth 	return (esi);
941*b9238976Sth }
942*b9238976Sth 
943*b9238976Sth static void
944*b9238976Sth nfs4_trigger_esi_destroy(ephemeral_servinfo_t *esi, vnode_t *vp)
945*b9238976Sth {
946*b9238976Sth 	rnode4_t *rp = VTOR4(vp);
947*b9238976Sth 
948*b9238976Sth 	ASSERT(RP_ISSTUB(rp));
949*b9238976Sth 
950*b9238976Sth 	/* for now, we only support mirror-mounts */
951*b9238976Sth 	ASSERT(RP_ISSTUB_MIRRORMOUNT(rp));
952*b9238976Sth 
953*b9238976Sth 	/* Currently, no need for an ephemeral type-specific routine */
954*b9238976Sth 
955*b9238976Sth 	/*
956*b9238976Sth 	 * The contents of ephemeral_servinfo_t goes into nfs_args,
957*b9238976Sth 	 * and will be handled by nfs4_trigger_nargs_destroy().
958*b9238976Sth 	 * We need only free the structure itself.
959*b9238976Sth 	 */
960*b9238976Sth 	if (esi != NULL)
961*b9238976Sth 		kmem_free(esi, sizeof (ephemeral_servinfo_t));
962*b9238976Sth }
963*b9238976Sth 
964*b9238976Sth /*
965*b9238976Sth  * Some of this may turn out to be common with other ephemeral types,
966*b9238976Sth  * in which case it should be moved to nfs4_trigger_esi_create(), or a
967*b9238976Sth  * common function called.
968*b9238976Sth  */
969*b9238976Sth static ephemeral_servinfo_t *
970*b9238976Sth nfs4_trigger_esi_create_mirrormount(vnode_t *vp, servinfo4_t *svp)
971*b9238976Sth {
972*b9238976Sth 	char			*stubpath;
973*b9238976Sth 	struct knetconfig	*sikncp, *svkncp;
974*b9238976Sth 	struct netbuf		*bufp;
975*b9238976Sth 	ephemeral_servinfo_t	*esi;
976*b9238976Sth 
977*b9238976Sth 	esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP);
978*b9238976Sth 
979*b9238976Sth 	/* initially set to be our type of ephemeral mount; may be added to */
980*b9238976Sth 	esi->esi_mount_flags = NFSMNT_MIRRORMOUNT;
981*b9238976Sth 
982*b9238976Sth 	/*
983*b9238976Sth 	 * We're copying info from the stub rnode's servinfo4, but
984*b9238976Sth 	 * we must create new copies, not pointers, since this information
985*b9238976Sth 	 * is to be associated with the new mount, which will be
986*b9238976Sth 	 * unmounted (and its structures freed) separately
987*b9238976Sth 	 */
988*b9238976Sth 
989*b9238976Sth 	/*
990*b9238976Sth 	 * Sizes passed to kmem_[z]alloc here must match those freed
991*b9238976Sth 	 * in nfs4_free_args()
992*b9238976Sth 	 */
993*b9238976Sth 
994*b9238976Sth 	/*
995*b9238976Sth 	 * We hold sv_lock across kmem_zalloc() calls that may sleep, but this
996*b9238976Sth 	 * is difficult to avoid: as we need to read svp to calculate the
997*b9238976Sth 	 * sizes to be allocated.
998*b9238976Sth 	 */
999*b9238976Sth 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1000*b9238976Sth 
1001*b9238976Sth 	esi->esi_hostname = kmem_zalloc(strlen(svp->sv_hostname) + 1, KM_SLEEP);
1002*b9238976Sth 	(void) strcat(esi->esi_hostname, svp->sv_hostname);
1003*b9238976Sth 
1004*b9238976Sth 	esi->esi_addr = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
1005*b9238976Sth 	bufp = esi->esi_addr;
1006*b9238976Sth 	bufp->len = svp->sv_addr.len;
1007*b9238976Sth 	bufp->maxlen = svp->sv_addr.maxlen;
1008*b9238976Sth 	bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
1009*b9238976Sth 	bcopy(svp->sv_addr.buf, bufp->buf, bufp->len);
1010*b9238976Sth 
1011*b9238976Sth 	esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP);
1012*b9238976Sth 	sikncp = esi->esi_knconf;
1013*b9238976Sth 	svkncp = svp->sv_knconf;
1014*b9238976Sth 	sikncp->knc_semantics = svkncp->knc_semantics;
1015*b9238976Sth 	sikncp->knc_protofmly = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1016*b9238976Sth 	(void) strcat((char *)sikncp->knc_protofmly,
1017*b9238976Sth 	    (char *)svkncp->knc_protofmly);
1018*b9238976Sth 	sikncp->knc_proto = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1019*b9238976Sth 	(void) strcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto);
1020*b9238976Sth 	sikncp->knc_rdev = svkncp->knc_rdev;
1021*b9238976Sth 
1022*b9238976Sth 	/*
1023*b9238976Sth 	 * Used when AUTH_DH is negotiated.
1024*b9238976Sth 	 *
1025*b9238976Sth 	 * This is ephemeral mount-type specific, since it contains the
1026*b9238976Sth 	 * server's time-sync syncaddr.
1027*b9238976Sth 	 */
1028*b9238976Sth 	if (svp->sv_dhsec) {
1029*b9238976Sth 		struct netbuf *bufp;
1030*b9238976Sth 		sec_data_t *sdata;
1031*b9238976Sth 		dh_k4_clntdata_t *data;
1032*b9238976Sth 
1033*b9238976Sth 		sdata = svp->sv_dhsec;
1034*b9238976Sth 		data = (dh_k4_clntdata_t *)sdata->data;
1035*b9238976Sth 		ASSERT(sdata->rpcflavor == AUTH_DH);
1036*b9238976Sth 
1037*b9238976Sth 		bufp = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
1038*b9238976Sth 		bufp->len = data->syncaddr.len;
1039*b9238976Sth 		bufp->maxlen = data->syncaddr.maxlen;
1040*b9238976Sth 		bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
1041*b9238976Sth 		bcopy(data->syncaddr.buf, bufp->buf, bufp->len);
1042*b9238976Sth 		esi->esi_syncaddr = bufp;
1043*b9238976Sth 
1044*b9238976Sth 		if (data->netname != NULL) {
1045*b9238976Sth 			int nmlen = data->netnamelen;
1046*b9238976Sth 
1047*b9238976Sth 			/*
1048*b9238976Sth 			 * We need to copy from a dh_k4_clntdata_t
1049*b9238976Sth 			 * netname/netnamelen pair to a NUL-terminated
1050*b9238976Sth 			 * netname string suitable for putting in nfs_args,
1051*b9238976Sth 			 * where the latter has no netnamelen field.
1052*b9238976Sth 			 */
1053*b9238976Sth 			esi->esi_netname = kmem_zalloc(nmlen + 1, KM_SLEEP);
1054*b9238976Sth 			bcopy(data->netname, esi->esi_netname, nmlen);
1055*b9238976Sth 		}
1056*b9238976Sth 	} else {
1057*b9238976Sth 		esi->esi_syncaddr = NULL;
1058*b9238976Sth 		esi->esi_netname = NULL;
1059*b9238976Sth 	}
1060*b9238976Sth 
1061*b9238976Sth 	stubpath = fn_path(VTOSV(vp)->sv_name);
1062*b9238976Sth 	/* step over initial '.', to avoid e.g. sv_path: "/tank./ws" */
1063*b9238976Sth 	ASSERT(*stubpath == '.');
1064*b9238976Sth 	stubpath += 1;
1065*b9238976Sth 
1066*b9238976Sth 	/* for nfs_args->fh */
1067*b9238976Sth 	esi->esi_path_len = strlen(svp->sv_path) + strlen(stubpath) + 1;
1068*b9238976Sth 	esi->esi_path = kmem_zalloc(esi->esi_path_len, KM_SLEEP);
1069*b9238976Sth 	(void) strcat(esi->esi_path, svp->sv_path);
1070*b9238976Sth 	(void) strcat(esi->esi_path, stubpath);
1071*b9238976Sth 
1072*b9238976Sth 	stubpath -= 1;
1073*b9238976Sth 	/* stubpath allocated by fn_path() */
1074*b9238976Sth 	kmem_free(stubpath, strlen(stubpath) + 1);
1075*b9238976Sth 
1076*b9238976Sth 	nfs_rw_exit(&svp->sv_lock);
1077*b9238976Sth 
1078*b9238976Sth 	return (esi);
1079*b9238976Sth }
1080*b9238976Sth 
1081*b9238976Sth /*
1082*b9238976Sth  * Assemble the args, and call the generic VFS mount function to
1083*b9238976Sth  * finally perform the ephemeral mount.
1084*b9238976Sth  */
1085*b9238976Sth static int
1086*b9238976Sth nfs4_trigger_domount(vnode_t *stubvp, domount_args_t *dma, vfs_t **vfsp,
1087*b9238976Sth     cred_t *cr)
1088*b9238976Sth {
1089*b9238976Sth 	struct mounta	*uap;
1090*b9238976Sth 	char		*mntpt, *orig_path, *path;
1091*b9238976Sth 	const char	*orig_mntpt;
1092*b9238976Sth 	int		retval;
1093*b9238976Sth 	int		mntpt_len;
1094*b9238976Sth 	int		spec_len;
1095*b9238976Sth 	zone_t		*zone = curproc->p_zone;
1096*b9238976Sth 	bool_t		has_leading_slash;
1097*b9238976Sth 
1098*b9238976Sth 	vfs_t			*stubvfsp = stubvp->v_vfsp;
1099*b9238976Sth 	ephemeral_servinfo_t	*esi = dma->dma_esi;
1100*b9238976Sth 	struct nfs_args		*nargs = dma->dma_nargs;
1101*b9238976Sth 
1102*b9238976Sth 	/* first, construct the mount point for the ephemeral mount */
1103*b9238976Sth 	orig_path = path = fn_path(VTOSV(stubvp)->sv_name);
1104*b9238976Sth 	orig_mntpt = (char *)refstr_value(stubvfsp->vfs_mntpt);
1105*b9238976Sth 
1106*b9238976Sth 	if (*orig_path == '.')
1107*b9238976Sth 		orig_path++;
1108*b9238976Sth 
1109*b9238976Sth 	/*
1110*b9238976Sth 	 * Get rid of zone's root path
1111*b9238976Sth 	 */
1112*b9238976Sth 	if (zone != global_zone) {
1113*b9238976Sth 		/*
1114*b9238976Sth 		 * -1 for trailing '/' and -1 for EOS.
1115*b9238976Sth 		 */
1116*b9238976Sth 		if (strncmp(zone->zone_rootpath, orig_mntpt,
1117*b9238976Sth 		    zone->zone_rootpathlen - 1) == 0) {
1118*b9238976Sth 			orig_mntpt += (zone->zone_rootpathlen - 2);
1119*b9238976Sth 		}
1120*b9238976Sth 	}
1121*b9238976Sth 
1122*b9238976Sth 	mntpt_len = strlen(orig_mntpt) + strlen(orig_path);
1123*b9238976Sth 	mntpt = kmem_zalloc(mntpt_len + 1, KM_SLEEP);
1124*b9238976Sth 	(void) strcat(mntpt, orig_mntpt);
1125*b9238976Sth 	(void) strcat(mntpt, orig_path);
1126*b9238976Sth 
1127*b9238976Sth 	kmem_free(path, strlen(path) + 1);
1128*b9238976Sth 	path = esi->esi_path;
1129*b9238976Sth 	if (*path == '.')
1130*b9238976Sth 		path++;
1131*b9238976Sth 	if (path[0] == '/' && path[1] == '/')
1132*b9238976Sth 		path++;
1133*b9238976Sth 	has_leading_slash = (*path == '/');
1134*b9238976Sth 
1135*b9238976Sth 	spec_len = strlen(dma->dma_hostlist);
1136*b9238976Sth 	spec_len += strlen(path);
1137*b9238976Sth 
1138*b9238976Sth 	/* We are going to have to add this in */
1139*b9238976Sth 	if (!has_leading_slash)
1140*b9238976Sth 		spec_len++;
1141*b9238976Sth 
1142*b9238976Sth 	/* We need to get the ':' for dma_hostlist:esi_path */
1143*b9238976Sth 	spec_len++;
1144*b9238976Sth 
1145*b9238976Sth 	uap = kmem_zalloc(sizeof (struct mounta), KM_SLEEP);
1146*b9238976Sth 	uap->spec = kmem_zalloc(spec_len + 1, KM_SLEEP);
1147*b9238976Sth 	(void) snprintf(uap->spec, spec_len + 1, "%s:%s%s", dma->dma_hostlist,
1148*b9238976Sth 	    has_leading_slash ? "" : "/", path);
1149*b9238976Sth 
1150*b9238976Sth 	uap->dir = mntpt;
1151*b9238976Sth 
1152*b9238976Sth 	uap->flags = MS_SYSSPACE | MS_DATA;
1153*b9238976Sth 	/* fstype-independent mount options not covered elsewhere */
1154*b9238976Sth 	/* copy parent's mount(1M) "-m" flag */
1155*b9238976Sth 	if (stubvfsp->vfs_flag & VFS_NOMNTTAB)
1156*b9238976Sth 		uap->flags |= MS_NOMNTTAB;
1157*b9238976Sth 
1158*b9238976Sth 	uap->fstype = MNTTYPE_NFS4;
1159*b9238976Sth 	uap->dataptr = (char *)nargs;
1160*b9238976Sth 	/* not needed for MS_SYSSPACE */
1161*b9238976Sth 	uap->datalen = 0;
1162*b9238976Sth 
1163*b9238976Sth 	/* use optptr to pass in extra mount options */
1164*b9238976Sth 	uap->flags |= MS_OPTIONSTR;
1165*b9238976Sth 	uap->optptr = nfs4_trigger_create_mntopts(stubvfsp);
1166*b9238976Sth 	if (uap->optptr == NULL) {
1167*b9238976Sth 		retval = EINVAL;
1168*b9238976Sth 		goto done;
1169*b9238976Sth 	}
1170*b9238976Sth 	/* domount() expects us to count the trailing NUL */
1171*b9238976Sth 	uap->optlen = strlen(uap->optptr) + 1;
1172*b9238976Sth 
1173*b9238976Sth 	retval = domount(NULL, uap, stubvp, cr, vfsp);
1174*b9238976Sth 	if (retval == 0)
1175*b9238976Sth 		VFS_RELE(*vfsp);
1176*b9238976Sth done:
1177*b9238976Sth 	if (uap->optptr)
1178*b9238976Sth 		nfs4_trigger_destroy_mntopts(uap->optptr);
1179*b9238976Sth 
1180*b9238976Sth 	kmem_free(uap->spec, spec_len + 1);
1181*b9238976Sth 	kmem_free(uap, sizeof (struct mounta));
1182*b9238976Sth 	kmem_free(mntpt, mntpt_len + 1);
1183*b9238976Sth 
1184*b9238976Sth 	return (retval);
1185*b9238976Sth }
1186*b9238976Sth 
1187*b9238976Sth /*
1188*b9238976Sth  * Build an nfs_args structure for passing to domount().
1189*b9238976Sth  *
1190*b9238976Sth  * Ephemeral mount-type specific data comes from the ephemeral_servinfo_t;
1191*b9238976Sth  * generic data - common to all ephemeral mount types - is read directly
1192*b9238976Sth  * from the parent mount's servinfo4_t and mntinfo4_t, via the stub vnode.
1193*b9238976Sth  */
1194*b9238976Sth static struct nfs_args *
1195*b9238976Sth nfs4_trigger_nargs_create(mntinfo4_t *mi, servinfo4_t *svp,
1196*b9238976Sth     ephemeral_servinfo_t *esi)
1197*b9238976Sth {
1198*b9238976Sth 	sec_data_t *secdata;
1199*b9238976Sth 	struct nfs_args *nargs;
1200*b9238976Sth 
1201*b9238976Sth 	/* setup the nfs args */
1202*b9238976Sth 	nargs = kmem_zalloc(sizeof (struct nfs_args), KM_SLEEP);
1203*b9238976Sth 
1204*b9238976Sth 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1205*b9238976Sth 
1206*b9238976Sth 	nargs->addr = esi->esi_addr;
1207*b9238976Sth 
1208*b9238976Sth 	/* for AUTH_DH by negotiation */
1209*b9238976Sth 	if (esi->esi_syncaddr || esi->esi_netname) {
1210*b9238976Sth 		nargs->flags |= NFSMNT_SECURE;
1211*b9238976Sth 		nargs->syncaddr = esi->esi_syncaddr;
1212*b9238976Sth 		nargs->netname = esi->esi_netname;
1213*b9238976Sth 	}
1214*b9238976Sth 
1215*b9238976Sth 	nargs->flags |= NFSMNT_KNCONF;
1216*b9238976Sth 	nargs->knconf = esi->esi_knconf;
1217*b9238976Sth 	nargs->flags |= NFSMNT_HOSTNAME;
1218*b9238976Sth 	nargs->hostname = esi->esi_hostname;
1219*b9238976Sth 	nargs->fh = esi->esi_path;
1220*b9238976Sth 
1221*b9238976Sth 	/* general mount settings, all copied from parent mount */
1222*b9238976Sth 	mutex_enter(&mi->mi_lock);
1223*b9238976Sth 
1224*b9238976Sth 	if (!(mi->mi_flags & MI4_HARD))
1225*b9238976Sth 		nargs->flags |= NFSMNT_SOFT;
1226*b9238976Sth 
1227*b9238976Sth 	nargs->flags |= NFSMNT_WSIZE | NFSMNT_RSIZE | NFSMNT_TIMEO |
1228*b9238976Sth 	    NFSMNT_RETRANS;
1229*b9238976Sth 	nargs->wsize = mi->mi_stsize;
1230*b9238976Sth 	nargs->rsize = mi->mi_tsize;
1231*b9238976Sth 	nargs->timeo = mi->mi_timeo;
1232*b9238976Sth 	nargs->retrans = mi->mi_retrans;
1233*b9238976Sth 
1234*b9238976Sth 	if (mi->mi_flags & MI4_INT)
1235*b9238976Sth 		nargs->flags |= NFSMNT_INT;
1236*b9238976Sth 	if (mi->mi_flags & MI4_NOAC)
1237*b9238976Sth 		nargs->flags |= NFSMNT_NOAC;
1238*b9238976Sth 
1239*b9238976Sth 	nargs->flags |= NFSMNT_ACREGMIN | NFSMNT_ACREGMAX | NFSMNT_ACDIRMIN |
1240*b9238976Sth 	    NFSMNT_ACDIRMAX;
1241*b9238976Sth 	nargs->acregmin = HR2SEC(mi->mi_acregmin);
1242*b9238976Sth 	nargs->acregmax = HR2SEC(mi->mi_acregmax);
1243*b9238976Sth 	nargs->acdirmin = HR2SEC(mi->mi_acdirmin);
1244*b9238976Sth 	nargs->acdirmax = HR2SEC(mi->mi_acdirmax);
1245*b9238976Sth 
1246*b9238976Sth 	if (mi->mi_flags & MI4_NOCTO)
1247*b9238976Sth 		nargs->flags |= NFSMNT_NOCTO;
1248*b9238976Sth 	if (mi->mi_flags & MI4_GRPID)
1249*b9238976Sth 		nargs->flags |= NFSMNT_GRPID;
1250*b9238976Sth 	if (mi->mi_flags & MI4_LLOCK)
1251*b9238976Sth 		nargs->flags |= NFSMNT_LLOCK;
1252*b9238976Sth 	if (mi->mi_flags & MI4_NOPRINT)
1253*b9238976Sth 		nargs->flags |= NFSMNT_NOPRINT;
1254*b9238976Sth 	if (mi->mi_flags & MI4_DIRECTIO)
1255*b9238976Sth 		nargs->flags |= NFSMNT_DIRECTIO;
1256*b9238976Sth 	if (mi->mi_flags & MI4_PUBLIC)
1257*b9238976Sth 		nargs->flags |= NFSMNT_PUBLIC;
1258*b9238976Sth 
1259*b9238976Sth 	mutex_exit(&mi->mi_lock);
1260*b9238976Sth 
1261*b9238976Sth 	/* add any specific flags for this type of ephemeral mount */
1262*b9238976Sth 	nargs->flags |= esi->esi_mount_flags;
1263*b9238976Sth 
1264*b9238976Sth 	/*
1265*b9238976Sth 	 * Security data & negotiation policy.
1266*b9238976Sth 	 *
1267*b9238976Sth 	 * We need to preserve the parent mount's preference for security
1268*b9238976Sth 	 * negotiation, translating SV4_TRYSECDEFAULT -> NFSMNT_SECDEFAULT.
1269*b9238976Sth 	 *
1270*b9238976Sth 	 * If SV4_TRYSECDEFAULT is not set, that indicates that a specific
1271*b9238976Sth 	 * security flavour was requested, with data in sv_secdata, and that
1272*b9238976Sth 	 * no negotiation should occur. If this specified flavour fails, that's
1273*b9238976Sth 	 * it. We will copy sv_secdata, and not set NFSMNT_SECDEFAULT.
1274*b9238976Sth 	 *
1275*b9238976Sth 	 * If SV4_TRYSECDEFAULT is set, then we start with a passed-in
1276*b9238976Sth 	 * default flavour, in sv_secdata, but then negotiate a new flavour.
1277*b9238976Sth 	 * Possible flavours are recorded in an array in sv_secinfo, with
1278*b9238976Sth 	 * currently in-use flavour pointed to by sv_currsec.
1279*b9238976Sth 	 *
1280*b9238976Sth 	 * If sv_currsec is set, i.e. if negotiation has already occurred,
1281*b9238976Sth 	 * we will copy sv_currsec. Otherwise, copy sv_secdata. Regardless,
1282*b9238976Sth 	 * we will set NFSMNT_SECDEFAULT, to enable negotiation.
1283*b9238976Sth 	 */
1284*b9238976Sth 	if (svp->sv_flags & SV4_TRYSECDEFAULT) {
1285*b9238976Sth 		/* enable negotiation for ephemeral mount */
1286*b9238976Sth 		nargs->flags |= NFSMNT_SECDEFAULT;
1287*b9238976Sth 
1288*b9238976Sth 		/*
1289*b9238976Sth 		 * As a starting point for negotiation, copy parent
1290*b9238976Sth 		 * mount's negotiated flavour (sv_currsec) if available,
1291*b9238976Sth 		 * or its passed-in flavour (sv_secdata) if not.
1292*b9238976Sth 		 */
1293*b9238976Sth 		if (svp->sv_currsec != NULL)
1294*b9238976Sth 			secdata = copy_sec_data(svp->sv_currsec);
1295*b9238976Sth 		else if (svp->sv_secdata != NULL)
1296*b9238976Sth 			secdata = copy_sec_data(svp->sv_secdata);
1297*b9238976Sth 		else
1298*b9238976Sth 			secdata = NULL;
1299*b9238976Sth 	} else {
1300*b9238976Sth 		/* do not enable negotiation; copy parent's passed-in flavour */
1301*b9238976Sth 		if (svp->sv_secdata != NULL)
1302*b9238976Sth 			secdata = copy_sec_data(svp->sv_secdata);
1303*b9238976Sth 		else
1304*b9238976Sth 			secdata = NULL;
1305*b9238976Sth 	}
1306*b9238976Sth 
1307*b9238976Sth 	nfs_rw_exit(&svp->sv_lock);
1308*b9238976Sth 
1309*b9238976Sth 	nargs->flags |= NFSMNT_NEWARGS;
1310*b9238976Sth 	nargs->nfs_args_ext = NFS_ARGS_EXTB;
1311*b9238976Sth 	nargs->nfs_ext_u.nfs_extB.secdata = secdata;
1312*b9238976Sth 
1313*b9238976Sth 	/* for NFS RO failover; caller will set if necessary */
1314*b9238976Sth 	nargs->nfs_ext_u.nfs_extB.next = NULL;
1315*b9238976Sth 
1316*b9238976Sth 	return (nargs);
1317*b9238976Sth }
1318*b9238976Sth 
1319*b9238976Sth static void
1320*b9238976Sth nfs4_trigger_nargs_destroy(struct nfs_args *nargs)
1321*b9238976Sth {
1322*b9238976Sth 	/*
1323*b9238976Sth 	 * Either the mount failed, in which case the data is not needed, or
1324*b9238976Sth 	 * nfs4_mount() has either taken copies of what it needs or,
1325*b9238976Sth 	 * where it has merely copied the ptr, it has set *our* ptr to NULL,
1326*b9238976Sth 	 * whereby nfs4_free_args() will ignore it.
1327*b9238976Sth 	 */
1328*b9238976Sth 	nfs4_free_args(nargs);
1329*b9238976Sth 	kmem_free(nargs, sizeof (struct nfs_args));
1330*b9238976Sth }
1331*b9238976Sth 
1332*b9238976Sth /*
1333*b9238976Sth  * When we finally get into the mounting, we need to add this
1334*b9238976Sth  * node to the ephemeral tree.
1335*b9238976Sth  *
1336*b9238976Sth  * This is called from nfs4_mount().
1337*b9238976Sth  */
1338*b9238976Sth void
1339*b9238976Sth nfs4_record_ephemeral_mount(mntinfo4_t *mi, vnode_t *mvp)
1340*b9238976Sth {
1341*b9238976Sth 	mntinfo4_t		*mi_parent;
1342*b9238976Sth 	nfs4_ephemeral_t	*eph;
1343*b9238976Sth 	nfs4_ephemeral_tree_t	*net;
1344*b9238976Sth 
1345*b9238976Sth 	nfs4_ephemeral_t	*prior;
1346*b9238976Sth 	nfs4_ephemeral_t	*child;
1347*b9238976Sth 
1348*b9238976Sth 	nfs4_ephemeral_t	*peer;
1349*b9238976Sth 
1350*b9238976Sth 	nfs4_trigger_globals_t	*ntg;
1351*b9238976Sth 	zone_t			*zone = curproc->p_zone;
1352*b9238976Sth 
1353*b9238976Sth 	mi_parent = VTOMI4(mvp);
1354*b9238976Sth 
1355*b9238976Sth 	/*
1356*b9238976Sth 	 * Get this before grabbing anything else!
1357*b9238976Sth 	 */
1358*b9238976Sth 	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
1359*b9238976Sth 	if (!ntg->ntg_thread_started) {
1360*b9238976Sth 		nfs4_ephemeral_start_harvester(ntg);
1361*b9238976Sth 	}
1362*b9238976Sth 
1363*b9238976Sth 	mutex_enter(&mi_parent->mi_lock);
1364*b9238976Sth 	mutex_enter(&mi->mi_lock);
1365*b9238976Sth 
1366*b9238976Sth 	/*
1367*b9238976Sth 	 * We need to tack together the ephemeral mount
1368*b9238976Sth 	 * with this new mntinfo.
1369*b9238976Sth 	 */
1370*b9238976Sth 	eph = kmem_zalloc(sizeof (*eph), KM_SLEEP);
1371*b9238976Sth 	eph->ne_mount = mi;
1372*b9238976Sth 	eph->ne_ref_time = gethrestime_sec();
1373*b9238976Sth 
1374*b9238976Sth 	/*
1375*b9238976Sth 	 * We need to tell the ephemeral mount when
1376*b9238976Sth 	 * to time out.
1377*b9238976Sth 	 */
1378*b9238976Sth 	eph->ne_mount_to = ntg->ntg_mount_to;
1379*b9238976Sth 
1380*b9238976Sth 	mi->mi_flags |= MI4_EPHEMERAL;
1381*b9238976Sth 	mi->mi_ephemeral = eph;
1382*b9238976Sth 
1383*b9238976Sth 	net = mi->mi_ephemeral_tree =
1384*b9238976Sth 	    mi_parent->mi_ephemeral_tree;
1385*b9238976Sth 	ASSERT(net != NULL);
1386*b9238976Sth 
1387*b9238976Sth 	/*
1388*b9238976Sth 	 * If the enclosing mntinfo4 is also ephemeral,
1389*b9238976Sth 	 * then we need to point to its enclosing parent.
1390*b9238976Sth 	 * Else the enclosing mntinfo4 is the enclosing parent.
1391*b9238976Sth 	 *
1392*b9238976Sth 	 * We also need to weave this ephemeral node
1393*b9238976Sth 	 * into the tree.
1394*b9238976Sth 	 */
1395*b9238976Sth 	if (mi_parent->mi_flags & MI4_EPHEMERAL) {
1396*b9238976Sth 		/*
1397*b9238976Sth 		 * We need to decide if we are
1398*b9238976Sth 		 * the root node of this branch
1399*b9238976Sth 		 * or if we are a sibling of this
1400*b9238976Sth 		 * branch.
1401*b9238976Sth 		 */
1402*b9238976Sth 		prior = mi_parent->mi_ephemeral;
1403*b9238976Sth 		ASSERT(prior != NULL);
1404*b9238976Sth 		if (prior->ne_child == NULL) {
1405*b9238976Sth 			prior->ne_child = eph;
1406*b9238976Sth 		} else {
1407*b9238976Sth 			child = prior->ne_child;
1408*b9238976Sth 
1409*b9238976Sth 			prior->ne_child = eph;
1410*b9238976Sth 			eph->ne_peer = child;
1411*b9238976Sth 
1412*b9238976Sth 			child->ne_prior = eph;
1413*b9238976Sth 		}
1414*b9238976Sth 
1415*b9238976Sth 		eph->ne_prior = prior;
1416*b9238976Sth 	} else {
1417*b9238976Sth 		/*
1418*b9238976Sth 		 * The parent mntinfo4 is the non-ephemeral
1419*b9238976Sth 		 * root of the ephemeral tree. We
1420*b9238976Sth 		 * need to decide if we are the root
1421*b9238976Sth 		 * node of that tree or if we are a
1422*b9238976Sth 		 * sibling of the root node.
1423*b9238976Sth 		 *
1424*b9238976Sth 		 * We are the root if there is no
1425*b9238976Sth 		 * other node.
1426*b9238976Sth 		 */
1427*b9238976Sth 		if (net->net_root == NULL) {
1428*b9238976Sth 			net->net_root = eph;
1429*b9238976Sth 		} else {
1430*b9238976Sth 			eph->ne_peer = peer = net->net_root;
1431*b9238976Sth 			ASSERT(peer != NULL);
1432*b9238976Sth 			net->net_root = eph;
1433*b9238976Sth 
1434*b9238976Sth 			peer->ne_prior = eph;
1435*b9238976Sth 		}
1436*b9238976Sth 
1437*b9238976Sth 		eph->ne_prior = NULL;
1438*b9238976Sth 	}
1439*b9238976Sth 
1440*b9238976Sth 	mutex_exit(&mi->mi_lock);
1441*b9238976Sth 	mutex_exit(&mi_parent->mi_lock);
1442*b9238976Sth }
1443*b9238976Sth 
1444*b9238976Sth /*
1445*b9238976Sth  * Commit the changes to the ephemeral tree for removing this node.
1446*b9238976Sth  */
1447*b9238976Sth static void
1448*b9238976Sth nfs4_ephemeral_umount_cleanup(nfs4_ephemeral_t *eph)
1449*b9238976Sth {
1450*b9238976Sth 	nfs4_ephemeral_t	*e = eph;
1451*b9238976Sth 	nfs4_ephemeral_t	*peer;
1452*b9238976Sth 	nfs4_ephemeral_t	*prior;
1453*b9238976Sth 
1454*b9238976Sth 	peer = eph->ne_peer;
1455*b9238976Sth 	prior = e->ne_prior;
1456*b9238976Sth 
1457*b9238976Sth 	/*
1458*b9238976Sth 	 * If this branch root was not the
1459*b9238976Sth 	 * tree root, then we need to fix back pointers.
1460*b9238976Sth 	 */
1461*b9238976Sth 	if (prior) {
1462*b9238976Sth 		if (prior->ne_child == e) {
1463*b9238976Sth 			prior->ne_child = peer;
1464*b9238976Sth 		} else {
1465*b9238976Sth 			prior->ne_peer = peer;
1466*b9238976Sth 		}
1467*b9238976Sth 
1468*b9238976Sth 		if (peer)
1469*b9238976Sth 			peer->ne_prior = prior;
1470*b9238976Sth 	} else if (peer) {
1471*b9238976Sth 		peer->ne_mount->mi_ephemeral_tree->net_root = peer;
1472*b9238976Sth 		peer->ne_prior = NULL;
1473*b9238976Sth 	} else {
1474*b9238976Sth 		e->ne_mount->mi_ephemeral_tree->net_root = NULL;
1475*b9238976Sth 	}
1476*b9238976Sth }
1477*b9238976Sth 
1478*b9238976Sth /*
1479*b9238976Sth  * We want to avoid recursion at all costs. So we need to
1480*b9238976Sth  * unroll the tree. We do this by a depth first traversal to
1481*b9238976Sth  * leaf nodes. We blast away the leaf and work our way back
1482*b9238976Sth  * up and down the tree.
1483*b9238976Sth  */
1484*b9238976Sth static int
1485*b9238976Sth nfs4_ephemeral_unmount_engine(nfs4_ephemeral_t *eph,
1486*b9238976Sth     int isTreeRoot, int flag, cred_t *cr)
1487*b9238976Sth {
1488*b9238976Sth 	nfs4_ephemeral_t	*e = eph;
1489*b9238976Sth 	nfs4_ephemeral_t	*prior;
1490*b9238976Sth 	mntinfo4_t		*mi;
1491*b9238976Sth 	vfs_t			*vfsp;
1492*b9238976Sth 	int			error;
1493*b9238976Sth 
1494*b9238976Sth 	/*
1495*b9238976Sth 	 * We use the loop while unrolling the ephemeral tree.
1496*b9238976Sth 	 */
1497*b9238976Sth 	for (;;) {
1498*b9238976Sth 		/*
1499*b9238976Sth 		 * First we walk down the child.
1500*b9238976Sth 		 */
1501*b9238976Sth 		if (e->ne_child) {
1502*b9238976Sth 			prior = e;
1503*b9238976Sth 			e = e->ne_child;
1504*b9238976Sth 			continue;
1505*b9238976Sth 		}
1506*b9238976Sth 
1507*b9238976Sth 		/*
1508*b9238976Sth 		 * If we are the root of the branch we are removing,
1509*b9238976Sth 		 * we end it here. But if the branch is the root of
1510*b9238976Sth 		 * the tree, we have to forge on. We do not consider
1511*b9238976Sth 		 * the peer list for the root because while it may
1512*b9238976Sth 		 * be okay to remove, it is both extra work and a
1513*b9238976Sth 		 * potential for a false-positive error to stall the
1514*b9238976Sth 		 * unmount attempt.
1515*b9238976Sth 		 */
1516*b9238976Sth 		if (e == eph && isTreeRoot == FALSE)
1517*b9238976Sth 			return (0);
1518*b9238976Sth 
1519*b9238976Sth 		/*
1520*b9238976Sth 		 * Next we walk down the peer list.
1521*b9238976Sth 		 */
1522*b9238976Sth 		if (e->ne_peer) {
1523*b9238976Sth 			prior = e;
1524*b9238976Sth 			e = e->ne_peer;
1525*b9238976Sth 			continue;
1526*b9238976Sth 		}
1527*b9238976Sth 
1528*b9238976Sth 		/*
1529*b9238976Sth 		 * We can only remove the node passed in by the
1530*b9238976Sth 		 * caller if it is the root of the ephemeral tree.
1531*b9238976Sth 		 * Otherwise, the caller will remove it.
1532*b9238976Sth 		 */
1533*b9238976Sth 		if (e == eph && isTreeRoot == FALSE)
1534*b9238976Sth 			return (0);
1535*b9238976Sth 
1536*b9238976Sth 		/*
1537*b9238976Sth 		 * Okay, we have a leaf node, time
1538*b9238976Sth 		 * to prune it!
1539*b9238976Sth 		 *
1540*b9238976Sth 		 * Note that prior can only be NULL if
1541*b9238976Sth 		 * and only if it is the root of the
1542*b9238976Sth 		 * ephemeral tree.
1543*b9238976Sth 		 */
1544*b9238976Sth 		prior = e->ne_prior;
1545*b9238976Sth 
1546*b9238976Sth 		mi = e->ne_mount;
1547*b9238976Sth 		mutex_enter(&mi->mi_lock);
1548*b9238976Sth 		vfsp = mi->mi_vfsp;
1549*b9238976Sth 
1550*b9238976Sth 		/*
1551*b9238976Sth 		 * Cleared by umount2_engine.
1552*b9238976Sth 		 */
1553*b9238976Sth 		VFS_HOLD(vfsp);
1554*b9238976Sth 
1555*b9238976Sth 		/*
1556*b9238976Sth 		 * Inform nfs4_unmount to not recursively
1557*b9238976Sth 		 * descend into this node's children when it
1558*b9238976Sth 		 * gets processed.
1559*b9238976Sth 		 */
1560*b9238976Sth 		mi->mi_flags |= MI4_EPHEMERAL_RECURSED;
1561*b9238976Sth 		mutex_exit(&mi->mi_lock);
1562*b9238976Sth 
1563*b9238976Sth 		error = umount2_engine(vfsp, flag, cr, FALSE);
1564*b9238976Sth 		if (error) {
1565*b9238976Sth 			/*
1566*b9238976Sth 			 * We need to reenable nfs4_unmount's ability
1567*b9238976Sth 			 * to recursively descend on this node.
1568*b9238976Sth 			 */
1569*b9238976Sth 			mutex_enter(&mi->mi_lock);
1570*b9238976Sth 			mi->mi_flags &= ~MI4_EPHEMERAL_RECURSED;
1571*b9238976Sth 			mutex_exit(&mi->mi_lock);
1572*b9238976Sth 
1573*b9238976Sth 			return (error);
1574*b9238976Sth 		}
1575*b9238976Sth 
1576*b9238976Sth 		/*
1577*b9238976Sth 		 * If we are the current node, we do not want to
1578*b9238976Sth 		 * touch anything else. At this point, the only
1579*b9238976Sth 		 * way the current node can have survived to here
1580*b9238976Sth 		 * is if it is the root of the ephemeral tree and
1581*b9238976Sth 		 * we are unmounting the enclosing mntinfo4.
1582*b9238976Sth 		 */
1583*b9238976Sth 		if (e == eph) {
1584*b9238976Sth 			ASSERT(prior == NULL);
1585*b9238976Sth 			return (0);
1586*b9238976Sth 		}
1587*b9238976Sth 
1588*b9238976Sth 		/*
1589*b9238976Sth 		 * Stitch up the prior node. Note that since
1590*b9238976Sth 		 * we have handled the root of the tree, prior
1591*b9238976Sth 		 * must be non-NULL.
1592*b9238976Sth 		 */
1593*b9238976Sth 		ASSERT(prior != NULL);
1594*b9238976Sth 		if (prior->ne_child == e) {
1595*b9238976Sth 			prior->ne_child = NULL;
1596*b9238976Sth 		} else {
1597*b9238976Sth 			ASSERT(prior->ne_peer == e);
1598*b9238976Sth 
1599*b9238976Sth 			prior->ne_peer = NULL;
1600*b9238976Sth 		}
1601*b9238976Sth 
1602*b9238976Sth 		e = prior;
1603*b9238976Sth 	}
1604*b9238976Sth 
1605*b9238976Sth 	/* NOTREACHED */
1606*b9238976Sth }
1607*b9238976Sth 
1608*b9238976Sth /*
1609*b9238976Sth  * Common code to safely release net_cnt_lock and net_tree_lock
1610*b9238976Sth  */
1611*b9238976Sth void
1612*b9238976Sth nfs4_ephemeral_umount_unlock(bool_t *pmust_unlock,
1613*b9238976Sth     nfs4_ephemeral_tree_t **pnet)
1614*b9238976Sth {
1615*b9238976Sth 	nfs4_ephemeral_tree_t	*net = *pnet;
1616*b9238976Sth 
1617*b9238976Sth 	if (*pmust_unlock) {
1618*b9238976Sth 		mutex_enter(&net->net_cnt_lock);
1619*b9238976Sth 		net->net_refcnt--;
1620*b9238976Sth 		net->net_status &= ~NFS4_EPHEMERAL_TREE_UMOUNTING;
1621*b9238976Sth 		mutex_exit(&net->net_cnt_lock);
1622*b9238976Sth 
1623*b9238976Sth 		mutex_exit(&net->net_tree_lock);
1624*b9238976Sth 
1625*b9238976Sth 		*pmust_unlock = FALSE;
1626*b9238976Sth 	}
1627*b9238976Sth }
1628*b9238976Sth 
1629*b9238976Sth /*
1630*b9238976Sth  * While we may have removed any child or sibling nodes of this
1631*b9238976Sth  * ephemeral node, we can not nuke it until we know that there
1632*b9238976Sth  * were no actived vnodes on it. This will do that final
1633*b9238976Sth  * work once we know it is not busy.
1634*b9238976Sth  */
1635*b9238976Sth void
1636*b9238976Sth nfs4_ephemeral_umount_activate(mntinfo4_t *mi, bool_t *pmust_unlock,
1637*b9238976Sth     nfs4_ephemeral_tree_t **pnet)
1638*b9238976Sth {
1639*b9238976Sth 	/*
1640*b9238976Sth 	 * Now we need to get rid of the ephemeral data if it exists.
1641*b9238976Sth 	 */
1642*b9238976Sth 	mutex_enter(&mi->mi_lock);
1643*b9238976Sth 	if (mi->mi_ephemeral) {
1644*b9238976Sth 		/*
1645*b9238976Sth 		 * If we are the root node of an ephemeral branch
1646*b9238976Sth 		 * which is being removed, then we need to fixup
1647*b9238976Sth 		 * pointers into and out of the node.
1648*b9238976Sth 		 */
1649*b9238976Sth 		if (!(mi->mi_flags & MI4_EPHEMERAL_RECURSED))
1650*b9238976Sth 			nfs4_ephemeral_umount_cleanup(mi->mi_ephemeral);
1651*b9238976Sth 
1652*b9238976Sth 		ASSERT(mi->mi_ephemeral != NULL);
1653*b9238976Sth 
1654*b9238976Sth 		kmem_free(mi->mi_ephemeral, sizeof (*mi->mi_ephemeral));
1655*b9238976Sth 		mi->mi_ephemeral = NULL;
1656*b9238976Sth 	}
1657*b9238976Sth 	mutex_exit(&mi->mi_lock);
1658*b9238976Sth 
1659*b9238976Sth 	nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);
1660*b9238976Sth }
1661*b9238976Sth 
1662*b9238976Sth /*
1663*b9238976Sth  * Unmount an ephemeral node.
1664*b9238976Sth  */
1665*b9238976Sth int
1666*b9238976Sth nfs4_ephemeral_umount(mntinfo4_t *mi, int flag, cred_t *cr,
1667*b9238976Sth     bool_t *pmust_unlock, nfs4_ephemeral_tree_t **pnet)
1668*b9238976Sth {
1669*b9238976Sth 	int			error = 0;
1670*b9238976Sth 	nfs4_ephemeral_t	*eph;
1671*b9238976Sth 	nfs4_ephemeral_tree_t	*net;
1672*b9238976Sth 	int			is_derooting = FALSE;
1673*b9238976Sth 	int			is_recursed = FALSE;
1674*b9238976Sth 	int			was_locked = FALSE;
1675*b9238976Sth 
1676*b9238976Sth 	/*
1677*b9238976Sth 	 * The active vnodes on this file system may be ephemeral
1678*b9238976Sth 	 * children. We need to check for and try to unmount them
1679*b9238976Sth 	 * here. If any can not be unmounted, we are going
1680*b9238976Sth 	 * to return EBUSY.
1681*b9238976Sth 	 */
1682*b9238976Sth 	mutex_enter(&mi->mi_lock);
1683*b9238976Sth 
1684*b9238976Sth 	/*
1685*b9238976Sth 	 * If an ephemeral tree, we need to check to see if
1686*b9238976Sth 	 * the lock is already held. If it is, then we need
1687*b9238976Sth 	 * to see if we are being called as a result of
1688*b9238976Sth 	 * the recursive removal of some node of the tree or
1689*b9238976Sth 	 * if we are another attempt to remove the tree.
1690*b9238976Sth 	 *
1691*b9238976Sth 	 * mi_flags & MI4_EPHEMERAL indicates an ephemeral
1692*b9238976Sth 	 * node. mi_ephemeral being non-NULL also does this.
1693*b9238976Sth 	 *
1694*b9238976Sth 	 * mi_ephemeral_tree being non-NULL is sufficient
1695*b9238976Sth 	 * to also indicate either it is an ephemeral node
1696*b9238976Sth 	 * or the enclosing mntinfo4.
1697*b9238976Sth 	 *
1698*b9238976Sth 	 * Do we need MI4_EPHEMERAL? Yes, it is useful for
1699*b9238976Sth 	 * when we delete the ephemeral node and need to
1700*b9238976Sth 	 * differentiate from an ephemeral node and the
1701*b9238976Sth 	 * enclosing root node.
1702*b9238976Sth 	 */
1703*b9238976Sth 	*pnet = net = mi->mi_ephemeral_tree;
1704*b9238976Sth 	eph = mi->mi_ephemeral;
1705*b9238976Sth 	if (net) {
1706*b9238976Sth 		is_recursed = mi->mi_flags & MI4_EPHEMERAL_RECURSED;
1707*b9238976Sth 		is_derooting = (eph == NULL);
1708*b9238976Sth 		mutex_exit(&mi->mi_lock);
1709*b9238976Sth 
1710*b9238976Sth 		/*
1711*b9238976Sth 		 * If this is not recursion, then we need to
1712*b9238976Sth 		 * grab a ref count.
1713*b9238976Sth 		 *
1714*b9238976Sth 		 * But wait, we also do not want to do that
1715*b9238976Sth 		 * if a harvester thread has already grabbed
1716*b9238976Sth 		 * the lock.
1717*b9238976Sth 		 */
1718*b9238976Sth 		if (!is_recursed) {
1719*b9238976Sth 			mutex_enter(&net->net_cnt_lock);
1720*b9238976Sth 			if (net->net_status &
1721*b9238976Sth 			    NFS4_EPHEMERAL_TREE_LOCKED)
1722*b9238976Sth 				was_locked = TRUE;
1723*b9238976Sth 			else
1724*b9238976Sth 				net->net_refcnt++;
1725*b9238976Sth 			mutex_exit(&net->net_cnt_lock);
1726*b9238976Sth 		}
1727*b9238976Sth 
1728*b9238976Sth 		/*
1729*b9238976Sth 		 * If we grab the lock, it means that no other
1730*b9238976Sth 		 * operation is working on the tree. If we don't
1731*b9238976Sth 		 * grab it, we need to decide if this is because
1732*b9238976Sth 		 * we are a recursive call or a new operation.
1733*b9238976Sth 		 *
1734*b9238976Sth 		 * If we are a recursive call, we proceed without
1735*b9238976Sth 		 * the lock.
1736*b9238976Sth 		 *
1737*b9238976Sth 		 * Else we have to wait until the lock becomes free.
1738*b9238976Sth 		 */
1739*b9238976Sth 		if (was_locked == FALSE &&
1740*b9238976Sth 		    !mutex_tryenter(&net->net_tree_lock)) {
1741*b9238976Sth 			if (!is_recursed) {
1742*b9238976Sth 				mutex_enter(&net->net_cnt_lock);
1743*b9238976Sth 				if (net->net_status &
1744*b9238976Sth 				    (NFS4_EPHEMERAL_TREE_DEROOTING
1745*b9238976Sth 				    | NFS4_EPHEMERAL_TREE_INVALID)) {
1746*b9238976Sth 					net->net_refcnt--;
1747*b9238976Sth 					mutex_exit(&net->net_cnt_lock);
1748*b9238976Sth 					goto is_busy;
1749*b9238976Sth 				}
1750*b9238976Sth 				mutex_exit(&net->net_cnt_lock);
1751*b9238976Sth 
1752*b9238976Sth 				/*
1753*b9238976Sth 				 * We can't hold any other locks whilst
1754*b9238976Sth 				 * we wait on this to free up.
1755*b9238976Sth 				 */
1756*b9238976Sth 				mutex_enter(&net->net_tree_lock);
1757*b9238976Sth 
1758*b9238976Sth 				/*
1759*b9238976Sth 				 * Note that while mi->mi_ephemeral
1760*b9238976Sth 				 * may change and thus we have to
1761*b9238976Sth 				 * update eph, it is the case that
1762*b9238976Sth 				 * we have tied down net and
1763*b9238976Sth 				 * do not care if mi->mi_ephemeral_tree
1764*b9238976Sth 				 * has changed.
1765*b9238976Sth 				 */
1766*b9238976Sth 				mutex_enter(&mi->mi_lock);
1767*b9238976Sth 				eph = mi->mi_ephemeral;
1768*b9238976Sth 				mutex_exit(&mi->mi_lock);
1769*b9238976Sth 
1770*b9238976Sth 				/*
1771*b9238976Sth 				 * Okay, we need to see if either the
1772*b9238976Sth 				 * tree got nuked or the current node
1773*b9238976Sth 				 * got nuked. Both of which will cause
1774*b9238976Sth 				 * an error.
1775*b9238976Sth 				 *
1776*b9238976Sth 				 * Note that a subsequent retry of the
1777*b9238976Sth 				 * umount shall work.
1778*b9238976Sth 				 */
1779*b9238976Sth 				mutex_enter(&net->net_cnt_lock);
1780*b9238976Sth 				if (net->net_status &
1781*b9238976Sth 				    NFS4_EPHEMERAL_TREE_INVALID ||
1782*b9238976Sth 				    (!is_derooting && eph == NULL)) {
1783*b9238976Sth 					net->net_refcnt--;
1784*b9238976Sth 					mutex_exit(&net->net_cnt_lock);
1785*b9238976Sth 					mutex_exit(&net->net_tree_lock);
1786*b9238976Sth 					goto is_busy;
1787*b9238976Sth 				}
1788*b9238976Sth 				mutex_exit(&net->net_cnt_lock);
1789*b9238976Sth 				*pmust_unlock = TRUE;
1790*b9238976Sth 			}
1791*b9238976Sth 		} else if (was_locked == FALSE) {
1792*b9238976Sth 			/*
1793*b9238976Sth 			 * If we grab it right away, everything must
1794*b9238976Sth 			 * be great!
1795*b9238976Sth 			 */
1796*b9238976Sth 			*pmust_unlock = TRUE;
1797*b9238976Sth 		}
1798*b9238976Sth 
1799*b9238976Sth 		/*
1800*b9238976Sth 		 * Only once we have grabbed the lock can we mark what we
1801*b9238976Sth 		 * are planning on doing to the ephemeral tree.
1802*b9238976Sth 		 */
1803*b9238976Sth 		if (*pmust_unlock) {
1804*b9238976Sth 			mutex_enter(&net->net_cnt_lock);
1805*b9238976Sth 			net->net_status |= NFS4_EPHEMERAL_TREE_UMOUNTING;
1806*b9238976Sth 
1807*b9238976Sth 			/*
1808*b9238976Sth 			 * Check to see if we are nuking the root.
1809*b9238976Sth 			 */
1810*b9238976Sth 			if (is_derooting)
1811*b9238976Sth 				net->net_status |=
1812*b9238976Sth 				    NFS4_EPHEMERAL_TREE_DEROOTING;
1813*b9238976Sth 			mutex_exit(&net->net_cnt_lock);
1814*b9238976Sth 		}
1815*b9238976Sth 
1816*b9238976Sth 		if (!is_derooting) {
1817*b9238976Sth 			/*
1818*b9238976Sth 			 * Only work on children if the caller has not already
1819*b9238976Sth 			 * done so.
1820*b9238976Sth 			 */
1821*b9238976Sth 			if (!is_recursed) {
1822*b9238976Sth 				ASSERT(eph != NULL);
1823*b9238976Sth 
1824*b9238976Sth 				error = nfs4_ephemeral_unmount_engine(eph,
1825*b9238976Sth 				    FALSE, flag, cr);
1826*b9238976Sth 				if (error)
1827*b9238976Sth 					goto is_busy;
1828*b9238976Sth 			}
1829*b9238976Sth 		} else {
1830*b9238976Sth 			eph = net->net_root;
1831*b9238976Sth 
1832*b9238976Sth 			/*
1833*b9238976Sth 			 * Only work if there is something there.
1834*b9238976Sth 			 */
1835*b9238976Sth 			if (eph) {
1836*b9238976Sth 				error = nfs4_ephemeral_unmount_engine(eph, TRUE,
1837*b9238976Sth 				    flag, cr);
1838*b9238976Sth 				if (error) {
1839*b9238976Sth 					mutex_enter(&net->net_cnt_lock);
1840*b9238976Sth 					net->net_status &=
1841*b9238976Sth 					    ~NFS4_EPHEMERAL_TREE_DEROOTING;
1842*b9238976Sth 					mutex_exit(&net->net_cnt_lock);
1843*b9238976Sth 					goto is_busy;
1844*b9238976Sth 				}
1845*b9238976Sth 
1846*b9238976Sth 				/*
1847*b9238976Sth 				 * Nothing else which goes wrong will
1848*b9238976Sth 				 * invalidate the blowing away of the
1849*b9238976Sth 				 * ephmeral tree.
1850*b9238976Sth 				 */
1851*b9238976Sth 				net->net_root = NULL;
1852*b9238976Sth 			}
1853*b9238976Sth 
1854*b9238976Sth 			/*
1855*b9238976Sth 			 * We have derooted and we have caused the tree to be
1856*b9238976Sth 			 * invalid.
1857*b9238976Sth 			 */
1858*b9238976Sth 			mutex_enter(&net->net_cnt_lock);
1859*b9238976Sth 			net->net_status &= ~NFS4_EPHEMERAL_TREE_DEROOTING;
1860*b9238976Sth 			net->net_status |= NFS4_EPHEMERAL_TREE_INVALID;
1861*b9238976Sth 			net->net_refcnt--;
1862*b9238976Sth 			mutex_exit(&net->net_cnt_lock);
1863*b9238976Sth 
1864*b9238976Sth 			/*
1865*b9238976Sth 			 * At this point, the tree should no
1866*b9238976Sth 			 * longer be associated with the
1867*b9238976Sth 			 * mntinfo4. We need to pull it off
1868*b9238976Sth 			 * there and let the harvester take
1869*b9238976Sth 			 * care of it once the refcnt drops.
1870*b9238976Sth 			 */
1871*b9238976Sth 			mutex_enter(&mi->mi_lock);
1872*b9238976Sth 			mi->mi_ephemeral_tree = NULL;
1873*b9238976Sth 			mutex_exit(&mi->mi_lock);
1874*b9238976Sth 		}
1875*b9238976Sth 	} else {
1876*b9238976Sth 		mutex_exit(&mi->mi_lock);
1877*b9238976Sth 	}
1878*b9238976Sth 
1879*b9238976Sth 	return (0);
1880*b9238976Sth 
1881*b9238976Sth is_busy:
1882*b9238976Sth 
1883*b9238976Sth 	nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);
1884*b9238976Sth 
1885*b9238976Sth 	return (error);
1886*b9238976Sth }
1887*b9238976Sth 
1888*b9238976Sth /*
1889*b9238976Sth  * Do the umount and record any error in the parent.
1890*b9238976Sth  */
1891*b9238976Sth static void
1892*b9238976Sth nfs4_ephemeral_record_umount(vfs_t *vfsp, int flag,
1893*b9238976Sth     nfs4_ephemeral_t *e, nfs4_ephemeral_t *prior)
1894*b9238976Sth {
1895*b9238976Sth 	int	error;
1896*b9238976Sth 
1897*b9238976Sth 	error = umount2_engine(vfsp, flag, kcred, FALSE);
1898*b9238976Sth 	if (error) {
1899*b9238976Sth 		if (prior) {
1900*b9238976Sth 			if (prior->ne_child == e)
1901*b9238976Sth 				prior->ne_state |=
1902*b9238976Sth 				    NFS4_EPHEMERAL_CHILD_ERROR;
1903*b9238976Sth 			else
1904*b9238976Sth 				prior->ne_state |=
1905*b9238976Sth 				    NFS4_EPHEMERAL_PEER_ERROR;
1906*b9238976Sth 		}
1907*b9238976Sth 	}
1908*b9238976Sth }
1909*b9238976Sth 
1910*b9238976Sth /*
1911*b9238976Sth  * For each tree in the forest (where the forest is in
1912*b9238976Sth  * effect all of the ephemeral trees for this zone),
1913*b9238976Sth  * scan to see if a node can be unmounted. Note that
1914*b9238976Sth  * unlike nfs4_ephemeral_unmount_engine(), we do
1915*b9238976Sth  * not process the current node before children or
1916*b9238976Sth  * siblings. I.e., if a node can be unmounted, we
1917*b9238976Sth  * do not recursively check to see if the nodes
1918*b9238976Sth  * hanging off of it can also be unmounted.
1919*b9238976Sth  *
1920*b9238976Sth  * Instead, we delve down deep to try and remove the
1921*b9238976Sth  * children first. Then, because we share code with
1922*b9238976Sth  * nfs4_ephemeral_unmount_engine(), we will try
1923*b9238976Sth  * them again. This could be a performance issue in
1924*b9238976Sth  * the future.
1925*b9238976Sth  *
1926*b9238976Sth  * Also note that unlike nfs4_ephemeral_unmount_engine(),
1927*b9238976Sth  * we do not halt on an error. We will not remove the
1928*b9238976Sth  * current node, but we will keep on trying to remove
1929*b9238976Sth  * the others.
1930*b9238976Sth  *
1931*b9238976Sth  * force indicates that we want the unmount to occur
1932*b9238976Sth  * even if there is something blocking it.
1933*b9238976Sth  *
1934*b9238976Sth  * time_check indicates that we want to see if the
1935*b9238976Sth  * mount has expired past mount_to or not. Typically
1936*b9238976Sth  * we want to do this and only on a shutdown of the
1937*b9238976Sth  * zone would we want to ignore the check.
1938*b9238976Sth  */
1939*b9238976Sth static void
1940*b9238976Sth nfs4_ephemeral_harvest_forest(nfs4_trigger_globals_t *ntg,
1941*b9238976Sth     bool_t force, bool_t time_check)
1942*b9238976Sth {
1943*b9238976Sth 	nfs4_ephemeral_tree_t	*net;
1944*b9238976Sth 	nfs4_ephemeral_tree_t	*prev = NULL;
1945*b9238976Sth 	nfs4_ephemeral_tree_t	*next;
1946*b9238976Sth 	nfs4_ephemeral_t	*e;
1947*b9238976Sth 	nfs4_ephemeral_t	*prior;
1948*b9238976Sth 	time_t			now = gethrestime_sec();
1949*b9238976Sth 
1950*b9238976Sth 	nfs4_ephemeral_tree_t	*harvest = NULL;
1951*b9238976Sth 
1952*b9238976Sth 	int			flag;
1953*b9238976Sth 
1954*b9238976Sth 	mntinfo4_t		*mi;
1955*b9238976Sth 	vfs_t			*vfsp;
1956*b9238976Sth 
1957*b9238976Sth 	if (force)
1958*b9238976Sth 		flag = MS_FORCE;
1959*b9238976Sth 	else
1960*b9238976Sth 		flag = 0;
1961*b9238976Sth 
1962*b9238976Sth 	mutex_enter(&ntg->ntg_forest_lock);
1963*b9238976Sth 	for (net = ntg->ntg_forest; net != NULL; net = next) {
1964*b9238976Sth 		next = net->net_next;
1965*b9238976Sth 
1966*b9238976Sth 		mutex_enter(&net->net_cnt_lock);
1967*b9238976Sth 		net->net_refcnt++;
1968*b9238976Sth 		mutex_exit(&net->net_cnt_lock);
1969*b9238976Sth 
1970*b9238976Sth 		mutex_enter(&net->net_tree_lock);
1971*b9238976Sth 
1972*b9238976Sth 		/*
1973*b9238976Sth 		 * Let the unmount code know that the
1974*b9238976Sth 		 * tree is already locked!
1975*b9238976Sth 		 */
1976*b9238976Sth 		mutex_enter(&net->net_cnt_lock);
1977*b9238976Sth 		net->net_status |= NFS4_EPHEMERAL_TREE_LOCKED;
1978*b9238976Sth 		mutex_exit(&net->net_cnt_lock);
1979*b9238976Sth 
1980*b9238976Sth 		/*
1981*b9238976Sth 		 * If the intent is force all ephemeral nodes to
1982*b9238976Sth 		 * be unmounted in this zone, we can short circuit a
1983*b9238976Sth 		 * lot of tree traversal and simply zap the root node.
1984*b9238976Sth 		 */
1985*b9238976Sth 		if (force) {
1986*b9238976Sth 			if (net->net_root) {
1987*b9238976Sth 				mi = net->net_root->ne_mount;
1988*b9238976Sth 				vfsp = mi->mi_vfsp;
1989*b9238976Sth 
1990*b9238976Sth 				/*
1991*b9238976Sth 				 * Cleared by umount2_engine.
1992*b9238976Sth 				 */
1993*b9238976Sth 				VFS_HOLD(vfsp);
1994*b9238976Sth 
1995*b9238976Sth 				(void) umount2_engine(vfsp, flag,
1996*b9238976Sth 				    kcred, FALSE);
1997*b9238976Sth 
1998*b9238976Sth 				goto check_done;
1999*b9238976Sth 			}
2000*b9238976Sth 		}
2001*b9238976Sth 
2002*b9238976Sth 		e = net->net_root;
2003*b9238976Sth 		if (e)
2004*b9238976Sth 			e->ne_state = NFS4_EPHEMERAL_VISIT_CHILD;
2005*b9238976Sth 
2006*b9238976Sth 		while (e) {
2007*b9238976Sth 			if (e->ne_state == NFS4_EPHEMERAL_VISIT_CHILD) {
2008*b9238976Sth 				e->ne_state = NFS4_EPHEMERAL_VISIT_SIBLING;
2009*b9238976Sth 				if (e->ne_child) {
2010*b9238976Sth 					e = e->ne_child;
2011*b9238976Sth 					e->ne_state =
2012*b9238976Sth 					    NFS4_EPHEMERAL_VISIT_CHILD;
2013*b9238976Sth 				}
2014*b9238976Sth 
2015*b9238976Sth 				continue;
2016*b9238976Sth 			} else if (e->ne_state ==
2017*b9238976Sth 			    NFS4_EPHEMERAL_VISIT_SIBLING) {
2018*b9238976Sth 				e->ne_state = NFS4_EPHEMERAL_PROCESS_ME;
2019*b9238976Sth 				if (e->ne_peer) {
2020*b9238976Sth 					e = e->ne_peer;
2021*b9238976Sth 					e->ne_state =
2022*b9238976Sth 					    NFS4_EPHEMERAL_VISIT_CHILD;
2023*b9238976Sth 				}
2024*b9238976Sth 
2025*b9238976Sth 				continue;
2026*b9238976Sth 			} else if (e->ne_state ==
2027*b9238976Sth 			    NFS4_EPHEMERAL_CHILD_ERROR) {
2028*b9238976Sth 				prior = e->ne_prior;
2029*b9238976Sth 
2030*b9238976Sth 				/*
2031*b9238976Sth 				 * If a child reported an error, do
2032*b9238976Sth 				 * not bother trying to unmount.
2033*b9238976Sth 				 *
2034*b9238976Sth 				 * If your prior node is a parent,
2035*b9238976Sth 				 * pass the error up such that they
2036*b9238976Sth 				 * also do not try to unmount.
2037*b9238976Sth 				 *
2038*b9238976Sth 				 * However, if your prior is a sibling,
2039*b9238976Sth 				 * let them try to unmount if they can.
2040*b9238976Sth 				 */
2041*b9238976Sth 				if (prior) {
2042*b9238976Sth 					if (prior->ne_child == e)
2043*b9238976Sth 						prior->ne_state |=
2044*b9238976Sth 						    NFS4_EPHEMERAL_CHILD_ERROR;
2045*b9238976Sth 					else
2046*b9238976Sth 						prior->ne_state |=
2047*b9238976Sth 						    NFS4_EPHEMERAL_PEER_ERROR;
2048*b9238976Sth 				}
2049*b9238976Sth 
2050*b9238976Sth 				/*
2051*b9238976Sth 				 * Clear the error and if needed, process peers.
2052*b9238976Sth 				 *
2053*b9238976Sth 				 * Once we mask out the error, we know whether
2054*b9238976Sth 				 * or we have to process another node.
2055*b9238976Sth 				 */
2056*b9238976Sth 				e->ne_state &= ~NFS4_EPHEMERAL_CHILD_ERROR;
2057*b9238976Sth 				if (e->ne_state == NFS4_EPHEMERAL_PROCESS_ME)
2058*b9238976Sth 					e = prior;
2059*b9238976Sth 
2060*b9238976Sth 				continue;
2061*b9238976Sth 			} else if (e->ne_state ==
2062*b9238976Sth 			    NFS4_EPHEMERAL_PEER_ERROR) {
2063*b9238976Sth 				prior = e->ne_prior;
2064*b9238976Sth 
2065*b9238976Sth 				if (prior) {
2066*b9238976Sth 					if (prior->ne_child == e)
2067*b9238976Sth 						prior->ne_state =
2068*b9238976Sth 						    NFS4_EPHEMERAL_CHILD_ERROR;
2069*b9238976Sth 					else
2070*b9238976Sth 						prior->ne_state =
2071*b9238976Sth 						    NFS4_EPHEMERAL_PEER_ERROR;
2072*b9238976Sth 				}
2073*b9238976Sth 
2074*b9238976Sth 				/*
2075*b9238976Sth 				 * Clear the error from this node and do the
2076*b9238976Sth 				 * correct processing.
2077*b9238976Sth 				 */
2078*b9238976Sth 				e->ne_state &= ~NFS4_EPHEMERAL_PEER_ERROR;
2079*b9238976Sth 				continue;
2080*b9238976Sth 			}
2081*b9238976Sth 
2082*b9238976Sth 			prior = e->ne_prior;
2083*b9238976Sth 			e->ne_state = NFS4_EPHEMERAL_OK;
2084*b9238976Sth 
2085*b9238976Sth 			/*
2086*b9238976Sth 			 * It must be the case that we need to process
2087*b9238976Sth 			 * this node.
2088*b9238976Sth 			 */
2089*b9238976Sth 			if (!time_check ||
2090*b9238976Sth 			    now - e->ne_ref_time > e->ne_mount_to) {
2091*b9238976Sth 				mi = e->ne_mount;
2092*b9238976Sth 				vfsp = mi->mi_vfsp;
2093*b9238976Sth 
2094*b9238976Sth 				/*
2095*b9238976Sth 				 * Cleared by umount2_engine.
2096*b9238976Sth 				 */
2097*b9238976Sth 				VFS_HOLD(vfsp);
2098*b9238976Sth 
2099*b9238976Sth 				/*
2100*b9238976Sth 				 * Note that we effectively work down to the
2101*b9238976Sth 				 * leaf nodes first, try to unmount them,
2102*b9238976Sth 				 * then work our way back up into the leaf
2103*b9238976Sth 				 * nodes.
2104*b9238976Sth 				 *
2105*b9238976Sth 				 * Also note that we deal with a lot of
2106*b9238976Sth 				 * complexity by sharing the work with
2107*b9238976Sth 				 * the manual unmount code.
2108*b9238976Sth 				 */
2109*b9238976Sth 				nfs4_ephemeral_record_umount(vfsp, flag,
2110*b9238976Sth 				    e, prior);
2111*b9238976Sth 			}
2112*b9238976Sth 
2113*b9238976Sth 			e = prior;
2114*b9238976Sth 		}
2115*b9238976Sth 
2116*b9238976Sth check_done:
2117*b9238976Sth 
2118*b9238976Sth 		/*
2119*b9238976Sth 		 * Are we done with this tree?
2120*b9238976Sth 		 */
2121*b9238976Sth 		mutex_enter(&net->net_cnt_lock);
2122*b9238976Sth 		if (net->net_refcnt == 1 &&
2123*b9238976Sth 		    net->net_status & NFS4_EPHEMERAL_TREE_INVALID) {
2124*b9238976Sth 			net->net_refcnt--;
2125*b9238976Sth 			net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
2126*b9238976Sth 			mutex_exit(&net->net_cnt_lock);
2127*b9238976Sth 			mutex_exit(&net->net_tree_lock);
2128*b9238976Sth 
2129*b9238976Sth 			if (prev)
2130*b9238976Sth 				prev->net_next = net->net_next;
2131*b9238976Sth 			else
2132*b9238976Sth 				ntg->ntg_forest = net->net_next;
2133*b9238976Sth 
2134*b9238976Sth 			net->net_next = harvest;
2135*b9238976Sth 			harvest = net;
2136*b9238976Sth 			continue;
2137*b9238976Sth 		}
2138*b9238976Sth 
2139*b9238976Sth 		net->net_refcnt--;
2140*b9238976Sth 		net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
2141*b9238976Sth 		mutex_exit(&net->net_cnt_lock);
2142*b9238976Sth 		mutex_exit(&net->net_tree_lock);
2143*b9238976Sth 
2144*b9238976Sth 		prev = net;
2145*b9238976Sth 	}
2146*b9238976Sth 	mutex_exit(&ntg->ntg_forest_lock);
2147*b9238976Sth 
2148*b9238976Sth 	for (net = harvest; net != NULL; net = next) {
2149*b9238976Sth 		next = net->net_next;
2150*b9238976Sth 
2151*b9238976Sth 		mutex_destroy(&net->net_tree_lock);
2152*b9238976Sth 		mutex_destroy(&net->net_cnt_lock);
2153*b9238976Sth 		kmem_free(net, sizeof (*net));
2154*b9238976Sth 	}
2155*b9238976Sth }
2156*b9238976Sth 
2157*b9238976Sth /*
2158*b9238976Sth  * This is the thread which decides when the harvesting
2159*b9238976Sth  * can proceed and when to kill it off for this zone.
2160*b9238976Sth  */
2161*b9238976Sth static void
2162*b9238976Sth nfs4_ephemeral_harvester(nfs4_trigger_globals_t *ntg)
2163*b9238976Sth {
2164*b9238976Sth 	clock_t		timeleft;
2165*b9238976Sth 	zone_t		*zone = curproc->p_zone;
2166*b9238976Sth 
2167*b9238976Sth 	for (;;) {
2168*b9238976Sth 		timeleft = zone_status_timedwait(zone, lbolt +
2169*b9238976Sth 		    nfs4_trigger_thread_timer * hz, ZONE_IS_SHUTTING_DOWN);
2170*b9238976Sth 
2171*b9238976Sth 		/*
2172*b9238976Sth 		 * zone is exiting...
2173*b9238976Sth 		 */
2174*b9238976Sth 		if (timeleft != -1) {
2175*b9238976Sth 			ASSERT(zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN);
2176*b9238976Sth 			zthread_exit();
2177*b9238976Sth 			/* NOTREACHED */
2178*b9238976Sth 		}
2179*b9238976Sth 
2180*b9238976Sth 		/*
2181*b9238976Sth 		 * Only bother scanning if there is potential
2182*b9238976Sth 		 * work to be done.
2183*b9238976Sth 		 */
2184*b9238976Sth 		if (ntg->ntg_forest == NULL)
2185*b9238976Sth 			continue;
2186*b9238976Sth 
2187*b9238976Sth 		/*
2188*b9238976Sth 		 * Now scan the list and get rid of everything which
2189*b9238976Sth 		 * is old.
2190*b9238976Sth 		 */
2191*b9238976Sth 		nfs4_ephemeral_harvest_forest(ntg, FALSE, TRUE);
2192*b9238976Sth 	}
2193*b9238976Sth 
2194*b9238976Sth 	/* NOTREACHED */
2195*b9238976Sth }
2196*b9238976Sth 
2197*b9238976Sth /*
2198*b9238976Sth  * The zone specific glue needed to start the unmount harvester.
2199*b9238976Sth  *
2200*b9238976Sth  * Note that we want to avoid holding the mutex as long as possible,
2201*b9238976Sth  * hence the multiple checks.
2202*b9238976Sth  *
2203*b9238976Sth  * The caller should avoid us getting down here in the first
2204*b9238976Sth  * place.
2205*b9238976Sth  */
2206*b9238976Sth static void
2207*b9238976Sth nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *ntg)
2208*b9238976Sth {
2209*b9238976Sth 	/*
2210*b9238976Sth 	 * It got started before we got here...
2211*b9238976Sth 	 */
2212*b9238976Sth 	if (ntg->ntg_thread_started)
2213*b9238976Sth 		return;
2214*b9238976Sth 
2215*b9238976Sth 	mutex_enter(&nfs4_ephemeral_thread_lock);
2216*b9238976Sth 
2217*b9238976Sth 	if (ntg->ntg_thread_started) {
2218*b9238976Sth 		mutex_exit(&nfs4_ephemeral_thread_lock);
2219*b9238976Sth 		return;
2220*b9238976Sth 	}
2221*b9238976Sth 
2222*b9238976Sth 	/*
2223*b9238976Sth 	 * Start the unmounter harvester thread for this zone.
2224*b9238976Sth 	 */
2225*b9238976Sth 	(void) zthread_create(NULL, 0, nfs4_ephemeral_harvester,
2226*b9238976Sth 	    ntg, 0, minclsyspri);
2227*b9238976Sth 
2228*b9238976Sth 	ntg->ntg_thread_started = TRUE;
2229*b9238976Sth 	mutex_exit(&nfs4_ephemeral_thread_lock);
2230*b9238976Sth }
2231*b9238976Sth 
2232*b9238976Sth /*ARGSUSED*/
2233*b9238976Sth static void *
2234*b9238976Sth nfs4_ephemeral_zsd_create(zoneid_t zoneid)
2235*b9238976Sth {
2236*b9238976Sth 	nfs4_trigger_globals_t	*ntg;
2237*b9238976Sth 
2238*b9238976Sth 	ntg = kmem_zalloc(sizeof (*ntg), KM_SLEEP);
2239*b9238976Sth 	ntg->ntg_thread_started = FALSE;
2240*b9238976Sth 
2241*b9238976Sth 	/*
2242*b9238976Sth 	 * This is the default....
2243*b9238976Sth 	 */
2244*b9238976Sth 	ntg->ntg_mount_to = nfs4_trigger_thread_timer;
2245*b9238976Sth 
2246*b9238976Sth 	mutex_init(&ntg->ntg_forest_lock, NULL,
2247*b9238976Sth 	    MUTEX_DEFAULT, NULL);
2248*b9238976Sth 
2249*b9238976Sth 	return (ntg);
2250*b9238976Sth }
2251*b9238976Sth 
2252*b9238976Sth /*
2253*b9238976Sth  * Try a nice gentle walk down the forest and convince
2254*b9238976Sth  * all of the trees to gracefully give it up.
2255*b9238976Sth  */
2256*b9238976Sth /*ARGSUSED*/
2257*b9238976Sth static void
2258*b9238976Sth nfs4_ephemeral_zsd_shutdown(zoneid_t zoneid, void *arg)
2259*b9238976Sth {
2260*b9238976Sth 	nfs4_trigger_globals_t	*ntg = arg;
2261*b9238976Sth 
2262*b9238976Sth 	if (!ntg)
2263*b9238976Sth 		return;
2264*b9238976Sth 
2265*b9238976Sth 	nfs4_ephemeral_harvest_forest(ntg, FALSE, FALSE);
2266*b9238976Sth }
2267*b9238976Sth 
2268*b9238976Sth /*
2269*b9238976Sth  * Race along the forest and rip all of the trees out by
2270*b9238976Sth  * their rootballs!
2271*b9238976Sth  */
2272*b9238976Sth /*ARGSUSED*/
2273*b9238976Sth static void
2274*b9238976Sth nfs4_ephemeral_zsd_destroy(zoneid_t zoneid, void *arg)
2275*b9238976Sth {
2276*b9238976Sth 	nfs4_trigger_globals_t	*ntg = arg;
2277*b9238976Sth 
2278*b9238976Sth 	if (!ntg)
2279*b9238976Sth 		return;
2280*b9238976Sth 
2281*b9238976Sth 	nfs4_ephemeral_harvest_forest(ntg, TRUE, FALSE);
2282*b9238976Sth 
2283*b9238976Sth 	mutex_destroy(&ntg->ntg_forest_lock);
2284*b9238976Sth 	kmem_free(ntg, sizeof (*ntg));
2285*b9238976Sth }
2286*b9238976Sth 
2287*b9238976Sth /*
2288*b9238976Sth  * This is the zone independent cleanup needed for
2289*b9238976Sth  * emphemeral mount processing.
2290*b9238976Sth  */
2291*b9238976Sth void
2292*b9238976Sth nfs4_ephemeral_fini(void)
2293*b9238976Sth {
2294*b9238976Sth 	(void) zone_key_delete(nfs4_ephemeral_key);
2295*b9238976Sth 	mutex_destroy(&nfs4_ephemeral_thread_lock);
2296*b9238976Sth }
2297*b9238976Sth 
2298*b9238976Sth /*
2299*b9238976Sth  * This is the zone independent initialization needed for
2300*b9238976Sth  * emphemeral mount processing.
2301*b9238976Sth  */
2302*b9238976Sth void
2303*b9238976Sth nfs4_ephemeral_init(void)
2304*b9238976Sth {
2305*b9238976Sth 	mutex_init(&nfs4_ephemeral_thread_lock, NULL, MUTEX_DEFAULT,
2306*b9238976Sth 	    NULL);
2307*b9238976Sth 
2308*b9238976Sth 	zone_key_create(&nfs4_ephemeral_key, nfs4_ephemeral_zsd_create,
2309*b9238976Sth 	    nfs4_ephemeral_zsd_shutdown, nfs4_ephemeral_zsd_destroy);
2310*b9238976Sth }
2311*b9238976Sth 
2312*b9238976Sth /*
2313*b9238976Sth  * nfssys() calls this function to set the per-zone
2314*b9238976Sth  * value of mount_to to drive when an ephemeral mount is
2315*b9238976Sth  * timed out. Each mount will grab a copy of this value
2316*b9238976Sth  * when mounted.
2317*b9238976Sth  */
2318*b9238976Sth void
2319*b9238976Sth nfs4_ephemeral_set_mount_to(uint_t mount_to)
2320*b9238976Sth {
2321*b9238976Sth 	nfs4_trigger_globals_t	*ntg;
2322*b9238976Sth 	zone_t			*zone = curproc->p_zone;
2323*b9238976Sth 
2324*b9238976Sth 	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
2325*b9238976Sth 
2326*b9238976Sth 	ntg->ntg_mount_to = mount_to;
2327*b9238976Sth }
2328*b9238976Sth 
2329*b9238976Sth /*
2330*b9238976Sth  * Walk the list of v4 mount options; if they are currently set in vfsp,
2331*b9238976Sth  * append them to a new comma-separated mount option string, and return it.
2332*b9238976Sth  *
2333*b9238976Sth  * Caller should free by calling nfs4_trigger_destroy_mntopts().
2334*b9238976Sth  */
2335*b9238976Sth static char *
2336*b9238976Sth nfs4_trigger_create_mntopts(vfs_t *vfsp)
2337*b9238976Sth {
2338*b9238976Sth 	uint_t i;
2339*b9238976Sth 	char *mntopts;
2340*b9238976Sth 	struct vfssw *vswp;
2341*b9238976Sth 	mntopts_t *optproto;
2342*b9238976Sth 
2343*b9238976Sth 	mntopts = kmem_zalloc(MAX_MNTOPT_STR, KM_SLEEP);
2344*b9238976Sth 
2345*b9238976Sth 	/* get the list of applicable mount options for v4; locks *vswp */
2346*b9238976Sth 	vswp = vfs_getvfssw(MNTTYPE_NFS4);
2347*b9238976Sth 	optproto = &vswp->vsw_optproto;
2348*b9238976Sth 
2349*b9238976Sth 	for (i = 0; i < optproto->mo_count; i++) {
2350*b9238976Sth 		struct mntopt *mop = &optproto->mo_list[i];
2351*b9238976Sth 
2352*b9238976Sth 		if (mop->mo_flags & MO_EMPTY)
2353*b9238976Sth 			continue;
2354*b9238976Sth 
2355*b9238976Sth 		if (nfs4_trigger_add_mntopt(mntopts, mop->mo_name, vfsp)) {
2356*b9238976Sth 			kmem_free(mntopts, MAX_MNTOPT_STR);
2357*b9238976Sth 			vfs_unrefvfssw(vswp);
2358*b9238976Sth 			return (NULL);
2359*b9238976Sth 		}
2360*b9238976Sth 	}
2361*b9238976Sth 
2362*b9238976Sth 	vfs_unrefvfssw(vswp);
2363*b9238976Sth 
2364*b9238976Sth 	/*
2365*b9238976Sth 	 * MNTOPT_XATTR is not in the v4 mount opt proto list,
2366*b9238976Sth 	 * and it may only be passed via MS_OPTIONSTR, so we
2367*b9238976Sth 	 * must handle it here.
2368*b9238976Sth 	 *
2369*b9238976Sth 	 * Ideally, it would be in the list, but NFS does not specify its
2370*b9238976Sth 	 * own opt proto list, it uses instead the default one. Since
2371*b9238976Sth 	 * not all filesystems support extended attrs, it would not be
2372*b9238976Sth 	 * appropriate to add it there.
2373*b9238976Sth 	 */
2374*b9238976Sth 	if (nfs4_trigger_add_mntopt(mntopts, MNTOPT_XATTR, vfsp) ||
2375*b9238976Sth 	    nfs4_trigger_add_mntopt(mntopts, MNTOPT_NOXATTR, vfsp)) {
2376*b9238976Sth 		kmem_free(mntopts, MAX_MNTOPT_STR);
2377*b9238976Sth 		return (NULL);
2378*b9238976Sth 	}
2379*b9238976Sth 
2380*b9238976Sth 	return (mntopts);
2381*b9238976Sth }
2382*b9238976Sth 
2383*b9238976Sth static void
2384*b9238976Sth nfs4_trigger_destroy_mntopts(char *mntopts)
2385*b9238976Sth {
2386*b9238976Sth 	if (mntopts)
2387*b9238976Sth 		kmem_free(mntopts, MAX_MNTOPT_STR);
2388*b9238976Sth }
2389*b9238976Sth 
2390*b9238976Sth /*
2391*b9238976Sth  * Check a single mount option (optname). Add to mntopts if it is set in VFS.
2392*b9238976Sth  */
2393*b9238976Sth static int
2394*b9238976Sth nfs4_trigger_add_mntopt(char *mntopts, char *optname, vfs_t *vfsp)
2395*b9238976Sth {
2396*b9238976Sth 	if (mntopts == NULL || optname == NULL || vfsp == NULL)
2397*b9238976Sth 		return (EINVAL);
2398*b9238976Sth 
2399*b9238976Sth 	if (vfs_optionisset(vfsp, optname, NULL)) {
2400*b9238976Sth 		size_t mntoptslen = strlen(mntopts);
2401*b9238976Sth 		size_t optnamelen = strlen(optname);
2402*b9238976Sth 
2403*b9238976Sth 		/* +1 for ',', +1 for NUL */
2404*b9238976Sth 		if (mntoptslen + optnamelen + 2 > MAX_MNTOPT_STR)
2405*b9238976Sth 			return (EOVERFLOW);
2406*b9238976Sth 
2407*b9238976Sth 		/* first or subsequent mount option? */
2408*b9238976Sth 		if (*mntopts != '\0')
2409*b9238976Sth 			(void) strcat(mntopts, ",");
2410*b9238976Sth 
2411*b9238976Sth 		(void) strcat(mntopts, optname);
2412*b9238976Sth 	}
2413*b9238976Sth 
2414*b9238976Sth 	return (0);
2415*b9238976Sth }
2416*b9238976Sth 
2417*b9238976Sth static enum clnt_stat
2418*b9238976Sth nfs4_trigger_ping_server(servinfo4_t *svp, int nointr)
2419*b9238976Sth {
2420*b9238976Sth 	int retries, error;
2421*b9238976Sth 	uint_t max_msgsize;
2422*b9238976Sth 	enum clnt_stat status;
2423*b9238976Sth 	CLIENT *cl;
2424*b9238976Sth 	struct timeval timeout;
2425*b9238976Sth 
2426*b9238976Sth 	/* as per recov_newserver() */
2427*b9238976Sth 	max_msgsize = 0;
2428*b9238976Sth 	retries = 1;
2429*b9238976Sth 	timeout.tv_sec = 2;
2430*b9238976Sth 	timeout.tv_usec = 0;
2431*b9238976Sth 
2432*b9238976Sth 	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, NFS_PROGRAM,
2433*b9238976Sth 	    NFS_V4, max_msgsize, retries, CRED(), &cl);
2434*b9238976Sth 	if (error)
2435*b9238976Sth 		return (RPC_FAILED);
2436*b9238976Sth 
2437*b9238976Sth 	if (nointr)
2438*b9238976Sth 		cl->cl_nosignal = TRUE;
2439*b9238976Sth 	status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, xdr_void, NULL,
2440*b9238976Sth 	    timeout);
2441*b9238976Sth 	if (nointr)
2442*b9238976Sth 		cl->cl_nosignal = FALSE;
2443*b9238976Sth 
2444*b9238976Sth 	AUTH_DESTROY(cl->cl_auth);
2445*b9238976Sth 	CLNT_DESTROY(cl);
2446*b9238976Sth 
2447*b9238976Sth 	return (status);
2448*b9238976Sth }
2449