xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs4_stub_vnops.c (revision f39b87890f92b92a38d84654c635ca63e14ffb1d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Support for ephemeral mounts, e.g. mirror-mounts. These mounts are
31  * triggered from a "stub" rnode via a special set of vnodeops.
32  */
33 
34 #include <sys/param.h>
35 #include <sys/types.h>
36 #include <sys/systm.h>
37 #include <sys/cred.h>
38 #include <sys/time.h>
39 #include <sys/vnode.h>
40 #include <sys/vfs.h>
41 #include <sys/vfs_opreg.h>
42 #include <sys/file.h>
43 #include <sys/filio.h>
44 #include <sys/uio.h>
45 #include <sys/buf.h>
46 #include <sys/mman.h>
47 #include <sys/pathname.h>
48 #include <sys/dirent.h>
49 #include <sys/debug.h>
50 #include <sys/vmsystm.h>
51 #include <sys/fcntl.h>
52 #include <sys/flock.h>
53 #include <sys/swap.h>
54 #include <sys/errno.h>
55 #include <sys/strsubr.h>
56 #include <sys/sysmacros.h>
57 #include <sys/kmem.h>
58 #include <sys/mount.h>
59 #include <sys/cmn_err.h>
60 #include <sys/pathconf.h>
61 #include <sys/utsname.h>
62 #include <sys/dnlc.h>
63 #include <sys/acl.h>
64 #include <sys/systeminfo.h>
65 #include <sys/policy.h>
66 #include <sys/sdt.h>
67 #include <sys/list.h>
68 #include <sys/stat.h>
69 #include <sys/mntent.h>
70 
71 #include <rpc/types.h>
72 #include <rpc/auth.h>
73 #include <rpc/clnt.h>
74 
75 #include <nfs/nfs.h>
76 #include <nfs/nfs_clnt.h>
77 #include <nfs/nfs_acl.h>
78 #include <nfs/lm.h>
79 #include <nfs/nfs4.h>
80 #include <nfs/nfs4_kprot.h>
81 #include <nfs/rnode4.h>
82 #include <nfs/nfs4_clnt.h>
83 
84 #include <vm/hat.h>
85 #include <vm/as.h>
86 #include <vm/page.h>
87 #include <vm/pvn.h>
88 #include <vm/seg.h>
89 #include <vm/seg_map.h>
90 #include <vm/seg_kpm.h>
91 #include <vm/seg_vn.h>
92 
93 #include <fs/fs_subr.h>
94 
95 #include <sys/ddi.h>
96 #include <sys/int_fmtio.h>
97 
98 #include <sys/sunddi.h>
99 
/*
 * The automatic unmounter thread stuff!
 */
static int nfs4_trigger_thread_timer = 20;	/* in seconds */

/*
 * Just a default....
 * Copied into ntg_mount_to for each zone's trigger globals.
 */
static uint_t nfs4_trigger_mount_to = 240;

/*
 * Per-zone state for ephemeral (e.g. mirror-mount) handling; fetched
 * via zone_getspecific(nfs4_ephemeral_key, zone). Links together all
 * ephemeral trees in the zone for the automatic unmounter.
 */
typedef struct nfs4_trigger_globals {
	kmutex_t		ntg_forest_lock;	/* protects ntg_forest */
	uint_t			ntg_mount_to;		/* mount timeout */
	int			ntg_thread_started;	/* harvester started? */
	nfs4_ephemeral_tree_t	*ntg_forest;		/* list of trees */
} nfs4_trigger_globals_t;

kmutex_t	nfs4_ephemeral_thread_lock;

/* Zone key under which each zone's nfs4_trigger_globals_t is stored. */
zone_key_t	nfs4_ephemeral_key = ZONE_KEY_UNINITIALIZED;

static void	nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *);
122 
/*
 * Used for ephemeral mounts; contains data either duplicated from
 * servinfo4_t, or hand-crafted, depending on type of ephemeral mount.
 *
 * It's intended that this structure is used solely for ephemeral
 * mount-type specific data, for passing this data to
 * nfs4_trigger_nargs_create().
 *
 * All pointer fields are fresh copies (not aliases into the stub's
 * servinfo4_t), since they become associated with the new mount and
 * are freed separately, via nfs_args teardown in nfs4_free_args().
 */
typedef struct ephemeral_servinfo {
	char			*esi_hostname;	/* copy of sv_hostname */
	char			*esi_netname;	/* AUTH_DH netname, or NULL */
	char			*esi_path;	/* server path for nfs_args fh */
	int			esi_path_len;	/* strlen(esi_path) + 1 */
	int			esi_mount_flags; /* e.g. NFSMNT_MIRRORMOUNT */
	struct netbuf		*esi_addr;	/* copy of server address */
	struct netbuf		*esi_syncaddr;	/* AUTH_DH syncaddr, or NULL */
	struct knetconfig	*esi_knconf;	/* copy of sv_knconf */
} ephemeral_servinfo_t;
141 
/*
 * Collect together the mount-type specific and generic data args,
 * for handing to nfs4_trigger_domount().
 */
typedef struct domount_args {
	ephemeral_servinfo_t	*dma_esi;	/* server actually contacted */
	char			*dma_hostlist; /* comma-sep. for RO failover */
	struct nfs_args		*dma_nargs;	/* one per server, linked */
} domount_args_t;
150 
151 
152 /*
153  * The vnode ops functions for a trigger stub vnode
154  */
155 static int	nfs4_trigger_open(vnode_t **, int, cred_t *);
156 static int	nfs4_trigger_getattr(vnode_t *, struct vattr *, int, cred_t *);
157 static int	nfs4_trigger_setattr(vnode_t *, struct vattr *, int, cred_t *,
158 			caller_context_t *);
159 static int	nfs4_trigger_access(vnode_t *, int, int, cred_t *);
160 static int	nfs4_trigger_readlink(vnode_t *, struct uio *, cred_t *);
161 static int	nfs4_trigger_lookup(vnode_t *, char *, vnode_t **,
162 			struct pathname *, int, vnode_t *, cred_t *);
163 static int	nfs4_trigger_create(vnode_t *, char *, struct vattr *,
164 			enum vcexcl, int, vnode_t **, cred_t *, int);
165 static int	nfs4_trigger_remove(vnode_t *, char *, cred_t *);
166 static int	nfs4_trigger_link(vnode_t *, vnode_t *, char *, cred_t *);
167 static int	nfs4_trigger_rename(vnode_t *, char *, vnode_t *, char *,
168 			cred_t *);
169 static int	nfs4_trigger_mkdir(vnode_t *, char *, struct vattr *,
170 			vnode_t **, cred_t *);
171 static int	nfs4_trigger_rmdir(vnode_t *, char *, vnode_t *, cred_t *);
172 static int	nfs4_trigger_symlink(vnode_t *, char *, struct vattr *, char *,
173 			cred_t *);
174 static int	nfs4_trigger_cmp(vnode_t *, vnode_t *);
175 
176 /*
177  * Regular NFSv4 vnodeops that we need to reference directly
178  */
179 extern int	nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *);
180 extern void	nfs4_inactive(vnode_t *, cred_t *);
181 extern int	nfs4_rwlock(vnode_t *, int, caller_context_t *);
182 extern void	nfs4_rwunlock(vnode_t *, int, caller_context_t *);
183 extern int	nfs4_lookup(vnode_t *, char *, vnode_t **,
184     struct pathname *, int, vnode_t *, cred_t *);
185 extern int	nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *);
186 extern int	nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *);
187 extern int	nfs4_fid(vnode_t *, fid_t *);
188 extern int	nfs4_realvp(vnode_t *, vnode_t **);
189 
190 static int	nfs4_trigger_mount(vnode_t *, vnode_t **);
191 static int	nfs4_trigger_domount(vnode_t *, domount_args_t *, vfs_t **,
192     cred_t *);
193 static domount_args_t  *nfs4_trigger_domount_args_create(vnode_t *);
194 static void	nfs4_trigger_domount_args_destroy(domount_args_t *dma,
195     vnode_t *vp);
196 static ephemeral_servinfo_t *nfs4_trigger_esi_create(vnode_t *, servinfo4_t *);
197 static void	nfs4_trigger_esi_destroy(ephemeral_servinfo_t *, vnode_t *);
198 static ephemeral_servinfo_t *nfs4_trigger_esi_create_mirrormount(vnode_t *,
199     servinfo4_t *);
200 static struct nfs_args 	*nfs4_trigger_nargs_create(mntinfo4_t *, servinfo4_t *,
201     ephemeral_servinfo_t *);
202 static void	nfs4_trigger_nargs_destroy(struct nfs_args *);
203 static char	*nfs4_trigger_create_mntopts(vfs_t *);
204 static void	nfs4_trigger_destroy_mntopts(char *);
205 static int 	nfs4_trigger_add_mntopt(char *, char *, vfs_t *);
206 static enum clnt_stat nfs4_trigger_ping_server(servinfo4_t *, int);
207 
208 extern int	umount2_engine(vfs_t *, int, cred_t *, int);
209 
210 
/* The vnodeops used for trigger stub vnodes; see the template below. */
vnodeops_t *nfs4_trigger_vnodeops;
212 
213 /*
214  * These are the vnodeops that we must define for stub vnodes.
215  *
216  *
217  * Many of the VOPs defined for NFSv4 do not need to be defined here,
218  * for various reasons. This will result in the VFS default function being
219  * used:
220  *
221  * - These VOPs require a previous VOP_OPEN to have occurred. That will have
222  *   lost the reference to the stub vnode, meaning these should not be called:
223  *       close, read, write, ioctl, readdir, seek.
224  *
225  * - These VOPs are meaningless for vnodes without data pages. Since the
226  *   stub vnode is of type VDIR, these should not be called:
227  *       space, getpage, putpage, map, addmap, delmap, pageio, fsync.
228  *
229  * - These VOPs are otherwise not applicable, and should not be called:
230  *       dump, setsecattr.
231  *
232  *
233  * These VOPs we do not want to define, but nor do we want the VFS default
234  * action. Instead, we specify the VFS error function, with fs_error(), but
235  * note that fs_error() is not actually called. Instead it results in the
236  * use of the error function defined for the particular VOP, in vn_ops_table[]:
237  *
238  * -   frlock, dispose, shrlock.
239  *
240  *
241  * These VOPs we define to use the corresponding regular NFSv4 vnodeop.
242  * NOTE: if any of these ops involve an OTW call with the stub FH, then
243  * that call must be wrapped with save_mnt_secinfo()/check_mnt_secinfo()
244  * to protect the security data in the servinfo4_t for the "parent"
245  * filesystem that contains the stub.
246  *
247  * - These VOPs should not trigger a mount, so that "ls -l" does not:
248  *       pathconf, getsecattr.
249  *
250  * - These VOPs would not make sense to trigger:
251  *       inactive, rwlock, rwunlock, fid, realvp.
252  */
/* See the block comment above for why each VOP is (or is not) listed. */
const fs_operation_def_t nfs4_trigger_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = nfs4_trigger_open },
	VOPNAME_GETATTR,	{ .vop_getattr = nfs4_trigger_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = nfs4_trigger_setattr },
	VOPNAME_ACCESS,		{ .vop_access = nfs4_trigger_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = nfs4_trigger_lookup },
	VOPNAME_CREATE,		{ .vop_create = nfs4_trigger_create },
	VOPNAME_REMOVE,		{ .vop_remove = nfs4_trigger_remove },
	VOPNAME_LINK,		{ .vop_link = nfs4_trigger_link },
	VOPNAME_RENAME,		{ .vop_rename = nfs4_trigger_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = nfs4_trigger_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = nfs4_trigger_rmdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = nfs4_trigger_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = nfs4_trigger_readlink },
	VOPNAME_INACTIVE, 	{ .vop_inactive = nfs4_inactive },
	VOPNAME_FID,		{ .vop_fid = nfs4_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs4_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs4_rwunlock },
	VOPNAME_REALVP,		{ .vop_realvp = nfs4_realvp },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs4_getsecattr },
	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs4_pathconf },
	VOPNAME_FRLOCK,		{ .error = fs_error },
	VOPNAME_DISPOSE,	{ .error = fs_error },
	VOPNAME_SHRLOCK,	{ .error = fs_error },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	NULL, NULL	/* terminator */
};
280 
281 /*
282  * Trigger ops for stub vnodes; for mirror mounts, etc.
283  *
284  * The general idea is that a "triggering" op will first call
285  * nfs4_trigger_mount(), which will find out whether a mount has already
286  * been triggered.
287  *
288  * If it has, then nfs4_trigger_mount() sets newvp to the root vnode
289  * of the covering vfs.
290  *
291  * If a mount has not yet been triggered, nfs4_trigger_mount() will do so,
292  * and again set newvp, as above.
293  *
294  * The triggering op may then re-issue the VOP by calling it on newvp.
295  *
296  * Note that some ops may perform custom action, and may or may not need
297  * to trigger a mount.
298  *
299  * Some ops need to call the regular NFSv4 vnodeop for a stub vnode. We
300  * obviously can't do this with VOP_<whatever>, since it's a stub vnode
301  * and that would just recurse. Instead, we call the v4 op directly,
302  * by name.  This is OK, since we know that the vnode is for NFSv4,
303  * otherwise it couldn't be a stub.
304  *
305  */
306 
307 static int
308 nfs4_trigger_open(vnode_t **vpp, int flag, cred_t *cr)
309 {
310 	int error;
311 	vnode_t *newvp;
312 
313 	error = nfs4_trigger_mount(*vpp, &newvp);
314 	if (error)
315 		return (error);
316 
317 	/* Release the stub vnode, as we're losing the reference to it */
318 	VN_RELE(*vpp);
319 
320 	/* Give the caller the root vnode of the newly-mounted fs */
321 	*vpp = newvp;
322 
323 	/* return with VN_HELD(newvp) */
324 	return (VOP_OPEN(vpp, flag, cr));
325 }
326 
327 /*
328  * For the majority of cases, nfs4_trigger_getattr() will not trigger
329  * a mount. However, if ATTR_TRIGGER is set, we are being informed
330  * that we need to force the mount before we attempt to determine
331  * the attributes. The intent is an atomic operation for security
332  * testing.
333  */
334 static int
335 nfs4_trigger_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
336 {
337 	int error;
338 
339 	if (flags & ATTR_TRIGGER) {
340 		vnode_t	*newvp;
341 
342 		error = nfs4_trigger_mount(vp, &newvp);
343 		if (error)
344 			return (error);
345 
346 		error = VOP_GETATTR(newvp, vap, flags, cr);
347 		VN_RELE(newvp);
348 	} else {
349 		error = nfs4_getattr(vp, vap, flags, cr);
350 	}
351 
352 	return (error);
353 }
354 
355 static int
356 nfs4_trigger_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
357 		caller_context_t *ct)
358 {
359 	int error;
360 	vnode_t *newvp;
361 
362 	error = nfs4_trigger_mount(vp, &newvp);
363 	if (error)
364 		return (error);
365 
366 	error = VOP_SETATTR(newvp, vap, flags, cr, ct);
367 	VN_RELE(newvp);
368 
369 	return (error);
370 }
371 
372 static int
373 nfs4_trigger_access(vnode_t *vp, int mode, int flags, cred_t *cr)
374 {
375 	int error;
376 	vnode_t *newvp;
377 
378 	error = nfs4_trigger_mount(vp, &newvp);
379 	if (error)
380 		return (error);
381 
382 	error = VOP_ACCESS(newvp, mode, flags, cr);
383 	VN_RELE(newvp);
384 
385 	return (error);
386 }
387 
388 static int
389 nfs4_trigger_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
390 	int flags, vnode_t *rdir, cred_t *cr)
391 {
392 	int error;
393 	vnode_t *newdvp;
394 	rnode4_t *drp = VTOR4(dvp);
395 
396 	ASSERT(RP_ISSTUB(drp));
397 
398 	/* for now, we only support mirror-mounts */
399 	ASSERT(RP_ISSTUB_MIRRORMOUNT(drp));
400 
401 	/*
402 	 * It's not legal to lookup ".." for an fs root, so we mustn't pass
403 	 * that up. Instead, pass onto the regular op, regardless of whether
404 	 * we've triggered a mount.
405 	 */
406 	if (strcmp(nm, "..") == 0)
407 		return (nfs4_lookup(dvp, nm, vpp, pnp, flags, rdir, cr));
408 
409 	error = nfs4_trigger_mount(dvp, &newdvp);
410 	if (error)
411 		return (error);
412 
413 	error = VOP_LOOKUP(newdvp, nm, vpp, pnp, flags, rdir, cr);
414 	VN_RELE(newdvp);
415 
416 	return (error);
417 }
418 
419 static int
420 nfs4_trigger_create(vnode_t *dvp, char *nm, struct vattr *va,
421 		enum vcexcl exclusive, int mode, vnode_t **vpp, cred_t *cr,
422 		int flags)
423 {
424 	int error;
425 	vnode_t *newdvp;
426 
427 	error = nfs4_trigger_mount(dvp, &newdvp);
428 	if (error)
429 		return (error);
430 
431 	error = VOP_CREATE(newdvp, nm, va, exclusive, mode, vpp, cr, flags);
432 	VN_RELE(newdvp);
433 
434 	return (error);
435 }
436 
437 static int
438 nfs4_trigger_remove(vnode_t *dvp, char *nm, cred_t *cr)
439 {
440 	int error;
441 	vnode_t *newdvp;
442 
443 	error = nfs4_trigger_mount(dvp, &newdvp);
444 	if (error)
445 		return (error);
446 
447 	error = VOP_REMOVE(newdvp, nm, cr);
448 	VN_RELE(newdvp);
449 
450 	return (error);
451 }
452 
453 static int
454 nfs4_trigger_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr)
455 {
456 	int error;
457 	vnode_t *newtdvp;
458 
459 	error = nfs4_trigger_mount(tdvp, &newtdvp);
460 	if (error)
461 		return (error);
462 
463 	/*
464 	 * We don't check whether svp is a stub. Let the NFSv4 code
465 	 * detect that error, and return accordingly.
466 	 */
467 	error = VOP_LINK(newtdvp, svp, tnm, cr);
468 	VN_RELE(newtdvp);
469 
470 	return (error);
471 }
472 
473 static int
474 nfs4_trigger_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
475 		cred_t *cr)
476 {
477 	int error;
478 	vnode_t *newsdvp;
479 	rnode4_t *tdrp = VTOR4(tdvp);
480 
481 	/*
482 	 * We know that sdvp is a stub, otherwise we would not be here.
483 	 *
484 	 * If tdvp is also be a stub, there are two possibilities: it
485 	 * is either the same stub as sdvp [i.e. VN_CMP(sdvp, tdvp)]
486 	 * or it is a different stub [!VN_CMP(sdvp, tdvp)].
487 	 *
488 	 * In the former case, just trigger sdvp, and treat tdvp as
489 	 * though it were not a stub.
490 	 *
491 	 * In the latter case, it might be a different stub for the
492 	 * same server fs as sdvp, or for a different server fs.
493 	 * Regardless, from the client perspective this would still
494 	 * be a cross-filesystem rename, and should not be allowed,
495 	 * so return EXDEV, without triggering either mount.
496 	 */
497 	if (RP_ISSTUB(tdrp) && !VN_CMP(sdvp, tdvp))
498 		return (EXDEV);
499 
500 	error = nfs4_trigger_mount(sdvp, &newsdvp);
501 	if (error)
502 		return (error);
503 
504 	error = VOP_RENAME(newsdvp, snm, tdvp, tnm, cr);
505 
506 	VN_RELE(newsdvp);
507 
508 	return (error);
509 }
510 
511 static int
512 nfs4_trigger_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp,
513 		cred_t *cr)
514 {
515 	int error;
516 	vnode_t *newdvp;
517 
518 	error = nfs4_trigger_mount(dvp, &newdvp);
519 	if (error)
520 		return (error);
521 
522 	error = VOP_MKDIR(newdvp, nm, va, vpp, cr);
523 	VN_RELE(newdvp);
524 
525 	return (error);
526 }
527 
528 static int
529 nfs4_trigger_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr)
530 {
531 	int error;
532 	vnode_t *newdvp;
533 
534 	error = nfs4_trigger_mount(dvp, &newdvp);
535 	if (error)
536 		return (error);
537 
538 	error = VOP_RMDIR(newdvp, nm, cdir, cr);
539 	VN_RELE(newdvp);
540 
541 	return (error);
542 }
543 
544 static int
545 nfs4_trigger_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm,
546 	cred_t *cr)
547 {
548 	int error;
549 	vnode_t *newdvp;
550 
551 	error = nfs4_trigger_mount(dvp, &newdvp);
552 	if (error)
553 		return (error);
554 
555 	error = VOP_SYMLINK(newdvp, lnm, tva, tnm, cr);
556 	VN_RELE(newdvp);
557 
558 	return (error);
559 }
560 
561 static int
562 nfs4_trigger_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr)
563 {
564 	int error;
565 	vnode_t *newvp;
566 
567 	error = nfs4_trigger_mount(vp, &newvp);
568 	if (error)
569 		return (error);
570 
571 	error = VOP_READLINK(newvp, uiop, cr);
572 	VN_RELE(newvp);
573 
574 	return (error);
575 }
576 
577 /* end of trigger vnode ops */
578 
579 
/*
 * Mount upon a trigger vnode; for mirror-mounts, etc.
 *
 * The mount may have already occurred, via another thread. If not,
 * assemble the location information - which may require fetching - and
 * perform the mount.
 *
 * Sets newvp to be the root of the fs that is now covering vp. Note
 * that we return with VN_HELD(*newvp).
 *
 * The caller is responsible for passing the VOP onto the covering fs.
 *
 * Lock ordering: mi_lock is taken first to examine/install
 * mi_ephemeral_tree; net_tree_lock is then held across the whole mount
 * attempt, with net_cnt_lock used only for short refcnt/status updates.
 */
static int
nfs4_trigger_mount(vnode_t *vp, vnode_t **newvpp)
{
	int			 error;
	vfs_t			*vfsp;
	rnode4_t		*rp = VTOR4(vp);
	mntinfo4_t		*mi = VTOMI4(vp);
	domount_args_t		*dma;

	nfs4_ephemeral_tree_t	*net;

	/* TRUE once net_tree_lock is held and must be dropped at done: */
	bool_t			must_unlock = FALSE;
	/* TRUE if this thread created the tree (clears BUILDING below) */
	bool_t			is_building = FALSE;

	cred_t			*zcred;

	nfs4_trigger_globals_t	*ntg;

	zone_t			*zone = curproc->p_zone;

	ASSERT(RP_ISSTUB(rp));

	/* for now, we only support mirror-mounts */
	ASSERT(RP_ISSTUB_MIRRORMOUNT(rp));

	*newvpp = NULL;

	/*
	 * Has the mount already occurred?
	 */
	error = vn_vfsrlock_wait(vp);
	if (error)
		goto done;
	vfsp = vn_mountedvfs(vp);
	if (vfsp != NULL) {
		/* the mount has already occurred */
		error = VFS_ROOT(vfsp, newvpp);
		if (!error) {
			/* need to update the reference time  */
			mutex_enter(&mi->mi_lock);
			if (mi->mi_ephemeral)
				mi->mi_ephemeral->ne_ref_time =
				    gethrestime_sec();
			mutex_exit(&mi->mi_lock);
		}

		vn_vfsunlock(vp);
		goto done;
	}
	vn_vfsunlock(vp);

	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
	ASSERT(ntg != NULL);

	mutex_enter(&mi->mi_lock);

	/*
	 * We need to lock down the ephemeral tree.
	 */
	if (mi->mi_ephemeral_tree == NULL) {
		/* First ephemeral mount under this fs: create the tree. */
		net = kmem_zalloc(sizeof (*net), KM_SLEEP);
		mutex_init(&net->net_tree_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&net->net_cnt_lock, NULL, MUTEX_DEFAULT, NULL);
		net->net_refcnt = 1;
		net->net_status = NFS4_EPHEMERAL_TREE_BUILDING;
		is_building = TRUE;

		/*
		 * We need to add it to the zone specific list for
		 * automatic unmounting and harvesting of deadwood.
		 */
		mutex_enter(&ntg->ntg_forest_lock);
		if (ntg->ntg_forest != NULL)
			net->net_next = ntg->ntg_forest;
		ntg->ntg_forest = net;
		mutex_exit(&ntg->ntg_forest_lock);

		/*
		 * No lock order confusion with mi_lock because no
		 * other node could have grabbed net_tree_lock.
		 */
		mutex_enter(&net->net_tree_lock);
		mi->mi_ephemeral_tree = net;
		net->net_mount = mi;
		mutex_exit(&mi->mi_lock);
	} else {
		/* Tree already exists: take a reference, then lock it. */
		net = mi->mi_ephemeral_tree;
		mutex_exit(&mi->mi_lock);

		mutex_enter(&net->net_cnt_lock);
		net->net_refcnt++;
		mutex_exit(&net->net_cnt_lock);

		/*
		 * Note that we do not do any checks to
		 * see if the parent has been nuked.
		 * We count on the vfs layer having protected
		 * us from feet shooters.
		 */
		mutex_enter(&net->net_tree_lock);
	}

	mutex_enter(&net->net_cnt_lock);
	net->net_status |= NFS4_EPHEMERAL_TREE_MOUNTING;
	mutex_exit(&net->net_cnt_lock);

	must_unlock = TRUE;

	dma = nfs4_trigger_domount_args_create(vp);
	if (dma == NULL) {
		error = EINVAL;
		goto done;
	}

	/*
	 * Need to be root for this call to make mount work.
	 * Note that since we define mirror mounts to work
	 * for any user, we allow the mount to proceed. And
	 * we realize that the server will perform security
	 * checks to make sure that the client is allowed
	 * access. Finally, once the mount takes place,
	 * directory permissions will ensure that the
	 * content is secure.
	 */
	zcred = zone_get_kcred(getzoneid());
	ASSERT(zcred != NULL);

	error = nfs4_trigger_domount(vp, dma, &vfsp, zcred);
	nfs4_trigger_domount_args_destroy(dma, vp);

	crfree(zcred);

	if (!error)
		error = VFS_ROOT(vfsp, newvpp);
done:
	if (must_unlock) {
		/* Undo MOUNTING/BUILDING status and drop our tree ref. */
		mutex_enter(&net->net_cnt_lock);
		net->net_status &= ~NFS4_EPHEMERAL_TREE_MOUNTING;
		if (is_building)
			net->net_status &= ~NFS4_EPHEMERAL_TREE_BUILDING;
		net->net_refcnt--;
		mutex_exit(&net->net_cnt_lock);

		mutex_exit(&net->net_tree_lock);
	}

	/* Defensive: never report success without a root vnode to return. */
	if (!error && (newvpp == NULL || *newvpp == NULL))
		error = ENOSYS;

	return (error);
}
743 
/*
 * Collect together both the generic & mount-type specific args.
 *
 * Builds one nfs_args per responsive server (for RO failover), a
 * comma-separated hostlist, and the esi for the server that will
 * actually be contacted.  Loops until at least one server responds,
 * so this can block indefinitely if all servers are down.  Returns
 * NULL only if esi creation fails for the current server.
 */
static domount_args_t *
nfs4_trigger_domount_args_create(vnode_t *vp)
{
	int nointr;
	char *hostlist;
	servinfo4_t *svp;
	struct nfs_args *nargs, *nargs_head;
	enum clnt_stat status;
	ephemeral_servinfo_t *esi, *esi_first;
	domount_args_t *dma;
	mntinfo4_t *mi = VTOMI4(vp);

	nointr = !(mi->mi_flags & MI4_INT);
	hostlist = kmem_zalloc(MAXPATHLEN, KM_SLEEP);

	svp = mi->mi_curr_serv;
	/* check if the current server is responding */
	status = nfs4_trigger_ping_server(svp, nointr);
	if (status == RPC_SUCCESS) {
		esi_first = nfs4_trigger_esi_create(vp, svp);
		if (esi_first == NULL) {
			kmem_free(hostlist, MAXPATHLEN);
			return (NULL);
		}

		(void) strlcpy(hostlist, esi_first->esi_hostname, MAXPATHLEN);

		nargs_head = nfs4_trigger_nargs_create(mi, svp, esi_first);
	} else {
		/* current server did not respond */
		esi_first = NULL;
		nargs_head = NULL;
	}
	nargs = nargs_head;

	/*
	 * NFS RO failover.
	 *
	 * If we have multiple servinfo4 structures, linked via sv_next,
	 * we must create one nfs_args for each, linking the nfs_args via
	 * nfs_ext_u.nfs_extB.next.
	 *
	 * We need to build a corresponding esi for each, too, but that is
	 * used solely for building nfs_args, and may be immediately
	 * discarded, as domount() requires the info from just one esi,
	 * but all the nfs_args.
	 *
	 * Currently, the NFS mount code will hang if not all servers
	 * requested are available. To avoid that, we need to ping each
	 * server, here, and remove it from the list if it is not
	 * responding. This has the side-effect of that server then
	 * being permanently unavailable for this failover mount, even if
	 * it recovers. That's unfortunate, but the best we can do until
	 * the mount code path is fixed.
	 */

	/*
	 * If the current server was down, loop indefinitely until we find
	 * at least one responsive server.
	 */
	do {
		/* no locking needed for sv_next; it is only set at fs mount */
		for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
			struct nfs_args *next;

			/*
			 * nargs_head: the head of the nfs_args list
			 * nargs: the current tail of the list
			 * next: the newly-created element to be added
			 */

			/*
			 * We've already tried the current server, above;
			 * if it was responding, we have already included it
			 * and it may now be ignored.
			 *
			 * Otherwise, try it again, since it may now have
			 * recovered.
			 */
			if (svp == mi->mi_curr_serv && esi_first != NULL)
				continue;

			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
			if (svp->sv_flags & SV4_NOTINUSE) {
				nfs_rw_exit(&svp->sv_lock);
				continue;
			}
			nfs_rw_exit(&svp->sv_lock);

			/* check if the server is responding */
			status = nfs4_trigger_ping_server(svp, nointr);
			/* if the server did not respond, ignore it */
			if (status != RPC_SUCCESS)
				continue;

			esi = nfs4_trigger_esi_create(vp, svp);
			if (esi == NULL)
				continue;

			/*
			 * If the original current server (mi_curr_serv)
			 * was down when when we first tried it,
			 * (i.e. esi_first == NULL),
			 * we select this new server (svp) to be the server
			 * that we will actually contact (esi_first).
			 *
			 * Note that it's possible that mi_curr_serv == svp,
			 * if that mi_curr_serv was down but has now recovered.
			 */
			next = nfs4_trigger_nargs_create(mi, svp, esi);
			if (esi_first == NULL) {
				ASSERT(nargs == NULL);
				ASSERT(nargs_head == NULL);
				nargs_head = next;
				esi_first = esi;
				(void) strlcpy(hostlist,
				    esi_first->esi_hostname, MAXPATHLEN);
			} else {
				ASSERT(nargs_head != NULL);
				nargs->nfs_ext_u.nfs_extB.next = next;
				(void) strlcat(hostlist, ",", MAXPATHLEN);
				(void) strlcat(hostlist, esi->esi_hostname,
				    MAXPATHLEN);
				/* esi was only needed for hostname & nargs */
				nfs4_trigger_esi_destroy(esi, vp);
			}

			nargs = next;
		}

		/* if we've had no response at all, wait a second */
		if (esi_first == NULL)
			delay(drv_usectohz(1000000));

	} while (esi_first == NULL);
	ASSERT(nargs_head != NULL);

	/* esi_first and the nfs_args list are owned by dma from here on. */
	dma = kmem_zalloc(sizeof (domount_args_t), KM_SLEEP);
	dma->dma_esi = esi_first;
	dma->dma_hostlist = hostlist;
	dma->dma_nargs = nargs_head;

	return (dma);
}
891 
892 static void
893 nfs4_trigger_domount_args_destroy(domount_args_t *dma, vnode_t *vp)
894 {
895 	if (dma != NULL) {
896 		if (dma->dma_esi != NULL && vp != NULL)
897 			nfs4_trigger_esi_destroy(dma->dma_esi, vp);
898 
899 		if (dma->dma_hostlist != NULL)
900 			kmem_free(dma->dma_hostlist, MAXPATHLEN);
901 
902 		if (dma->dma_nargs != NULL) {
903 			struct nfs_args *nargs = dma->dma_nargs;
904 
905 			do {
906 				struct nfs_args *next =
907 				    nargs->nfs_ext_u.nfs_extB.next;
908 
909 				nfs4_trigger_nargs_destroy(nargs);
910 				nargs = next;
911 			} while (nargs != NULL);
912 		}
913 
914 		kmem_free(dma, sizeof (domount_args_t));
915 	}
916 }
917 
918 /*
919  * The ephemeral_servinfo_t struct contains basic information we will need to
920  * perform the mount. Whilst the structure is generic across different
921  * types of ephemeral mount, the way we gather its contents differs.
922  */
923 static ephemeral_servinfo_t *
924 nfs4_trigger_esi_create(vnode_t *vp, servinfo4_t *svp)
925 {
926 	ephemeral_servinfo_t *esi;
927 	rnode4_t *rp = VTOR4(vp);
928 
929 	ASSERT(RP_ISSTUB(rp));
930 
931 	/* Call the ephemeral type-specific routine */
932 	if (RP_ISSTUB_MIRRORMOUNT(rp))
933 		esi = nfs4_trigger_esi_create_mirrormount(vp, svp);
934 	else
935 		esi = NULL;
936 
937 	/* for now, we only support mirror-mounts */
938 	ASSERT(esi != NULL);
939 
940 	return (esi);
941 }
942 
943 static void
944 nfs4_trigger_esi_destroy(ephemeral_servinfo_t *esi, vnode_t *vp)
945 {
946 	rnode4_t *rp = VTOR4(vp);
947 
948 	ASSERT(RP_ISSTUB(rp));
949 
950 	/* for now, we only support mirror-mounts */
951 	ASSERT(RP_ISSTUB_MIRRORMOUNT(rp));
952 
953 	/* Currently, no need for an ephemeral type-specific routine */
954 
955 	/*
956 	 * The contents of ephemeral_servinfo_t goes into nfs_args,
957 	 * and will be handled by nfs4_trigger_nargs_destroy().
958 	 * We need only free the structure itself.
959 	 */
960 	if (esi != NULL)
961 		kmem_free(esi, sizeof (ephemeral_servinfo_t));
962 }
963 
964 /*
965  * Some of this may turn out to be common with other ephemeral types,
966  * in which case it should be moved to nfs4_trigger_esi_create(), or a
967  * common function called.
968  */
static ephemeral_servinfo_t *
nfs4_trigger_esi_create_mirrormount(vnode_t *vp, servinfo4_t *svp)
{
	char			*stubpath;
	struct knetconfig	*sikncp, *svkncp;
	struct netbuf		*bufp;
	ephemeral_servinfo_t	*esi;

	esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP);

	/* initially set to be our type of ephemeral mount; may be added to */
	esi->esi_mount_flags = NFSMNT_MIRRORMOUNT;

	/*
	 * We're copying info from the stub rnode's servinfo4, but
	 * we must create new copies, not pointers, since this information
	 * is to be associated with the new mount, which will be
	 * unmounted (and its structures freed) separately
	 */

	/*
	 * Sizes passed to kmem_[z]alloc here must match those freed
	 * in nfs4_free_args()
	 */

	/*
	 * We hold sv_lock across kmem_zalloc() calls that may sleep, but this
	 * is difficult to avoid: as we need to read svp to calculate the
	 * sizes to be allocated.
	 */
	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);

	/*
	 * kmem_zalloc() hands back zero-filled memory, so strcat() onto
	 * the fresh buffer starts at byte 0 and stays NUL-terminated.
	 */
	esi->esi_hostname = kmem_zalloc(strlen(svp->sv_hostname) + 1, KM_SLEEP);
	(void) strcat(esi->esi_hostname, svp->sv_hostname);

	/* deep-copy the parent mount's server address */
	esi->esi_addr = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
	bufp = esi->esi_addr;
	bufp->len = svp->sv_addr.len;
	bufp->maxlen = svp->sv_addr.maxlen;
	bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
	bcopy(svp->sv_addr.buf, bufp->buf, bufp->len);

	/*
	 * Deep-copy the parent's knetconfig. The protofmly/proto strings
	 * are fixed-size KNC_STRSIZE buffers, matching what
	 * nfs4_free_args() will later free.
	 */
	esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP);
	sikncp = esi->esi_knconf;
	svkncp = svp->sv_knconf;
	sikncp->knc_semantics = svkncp->knc_semantics;
	sikncp->knc_protofmly = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
	(void) strcat((char *)sikncp->knc_protofmly,
	    (char *)svkncp->knc_protofmly);
	sikncp->knc_proto = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
	(void) strcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto);
	sikncp->knc_rdev = svkncp->knc_rdev;

	/*
	 * Used when AUTH_DH is negotiated.
	 *
	 * This is ephemeral mount-type specific, since it contains the
	 * server's time-sync syncaddr.
	 */
	if (svp->sv_dhsec) {
		struct netbuf *bufp;
		sec_data_t *sdata;
		dh_k4_clntdata_t *data;

		sdata = svp->sv_dhsec;
		data = (dh_k4_clntdata_t *)sdata->data;
		ASSERT(sdata->rpcflavor == AUTH_DH);

		bufp = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
		bufp->len = data->syncaddr.len;
		bufp->maxlen = data->syncaddr.maxlen;
		bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
		bcopy(data->syncaddr.buf, bufp->buf, bufp->len);
		esi->esi_syncaddr = bufp;

		if (data->netname != NULL) {
			int nmlen = data->netnamelen;

			/*
			 * We need to copy from a dh_k4_clntdata_t
			 * netname/netnamelen pair to a NUL-terminated
			 * netname string suitable for putting in nfs_args,
			 * where the latter has no netnamelen field.
			 */
			esi->esi_netname = kmem_zalloc(nmlen + 1, KM_SLEEP);
			bcopy(data->netname, esi->esi_netname, nmlen);
		}
	} else {
		esi->esi_syncaddr = NULL;
		esi->esi_netname = NULL;
	}

	stubpath = fn_path(VTOSV(vp)->sv_name);
	/* step over initial '.', to avoid e.g. sv_path: "/tank./ws" */
	ASSERT(*stubpath == '.');
	stubpath += 1;

	/* for nfs_args->fh */
	esi->esi_path_len = strlen(svp->sv_path) + strlen(stubpath) + 1;
	esi->esi_path = kmem_zalloc(esi->esi_path_len, KM_SLEEP);
	(void) strcat(esi->esi_path, svp->sv_path);
	(void) strcat(esi->esi_path, stubpath);

	/* back up to the start of the buffer fn_path() gave us */
	stubpath -= 1;
	/* stubpath allocated by fn_path() */
	kmem_free(stubpath, strlen(stubpath) + 1);

	nfs_rw_exit(&svp->sv_lock);

	return (esi);
}
1080 
1081 /*
1082  * Assemble the args, and call the generic VFS mount function to
1083  * finally perform the ephemeral mount.
1084  */
static int
nfs4_trigger_domount(vnode_t *stubvp, domount_args_t *dma, vfs_t **vfsp,
    cred_t *cr)
{
	struct mounta	*uap;
	char		*mntpt, *orig_path, *path;
	const char	*orig_mntpt;
	int		retval;
	int		mntpt_len;
	int		spec_len;
	zone_t		*zone = curproc->p_zone;
	bool_t		has_leading_slash;

	vfs_t			*stubvfsp = stubvp->v_vfsp;
	ephemeral_servinfo_t	*esi = dma->dma_esi;
	struct nfs_args		*nargs = dma->dma_nargs;

	/* first, construct the mount point for the ephemeral mount */
	orig_path = path = fn_path(VTOSV(stubvp)->sv_name);
	orig_mntpt = (char *)refstr_value(stubvfsp->vfs_mntpt);

	/*
	 * Skip over an initial '.', as in
	 * nfs4_trigger_esi_create_mirrormount(). Note that 'path' keeps
	 * pointing at the start of the allocation so it can be freed below.
	 */
	if (*orig_path == '.')
		orig_path++;

	/*
	 * Get rid of zone's root path
	 */
	if (zone != global_zone) {
		/*
		 * -1 for trailing '/' and -1 for EOS.
		 */
		if (strncmp(zone->zone_rootpath, orig_mntpt,
		    zone->zone_rootpathlen - 1) == 0) {
			orig_mntpt += (zone->zone_rootpathlen - 2);
		}
	}

	/* mount point = <parent mntpt><stub path>; zeroed buffer + strcat */
	mntpt_len = strlen(orig_mntpt) + strlen(orig_path);
	mntpt = kmem_zalloc(mntpt_len + 1, KM_SLEEP);
	(void) strcat(mntpt, orig_mntpt);
	(void) strcat(mntpt, orig_path);

	kmem_free(path, strlen(path) + 1);
	path = esi->esi_path;
	if (*path == '.')
		path++;
	if (path[0] == '/' && path[1] == '/')
		path++;
	has_leading_slash = (*path == '/');

	spec_len = strlen(dma->dma_hostlist);
	spec_len += strlen(path);

	/* We are going to have to add this in */
	if (!has_leading_slash)
		spec_len++;

	/* We need to get the ':' for dma_hostlist:esi_path */
	spec_len++;

	uap = kmem_zalloc(sizeof (struct mounta), KM_SLEEP);
	uap->spec = kmem_zalloc(spec_len + 1, KM_SLEEP);
	(void) snprintf(uap->spec, spec_len + 1, "%s:%s%s", dma->dma_hostlist,
	    has_leading_slash ? "" : "/", path);

	uap->dir = mntpt;

	uap->flags = MS_SYSSPACE | MS_DATA;
	/* fstype-independent mount options not covered elsewhere */
	/* copy parent's mount(1M) "-m" flag */
	if (stubvfsp->vfs_flag & VFS_NOMNTTAB)
		uap->flags |= MS_NOMNTTAB;

	uap->fstype = MNTTYPE_NFS4;
	uap->dataptr = (char *)nargs;
	/* not needed for MS_SYSSPACE */
	uap->datalen = 0;

	/* use optptr to pass in extra mount options */
	uap->flags |= MS_OPTIONSTR;
	uap->optptr = nfs4_trigger_create_mntopts(stubvfsp);
	if (uap->optptr == NULL) {
		retval = EINVAL;
		goto done;
	}
	/* domount() expects us to count the trailing NUL */
	uap->optlen = strlen(uap->optptr) + 1;

	retval = domount(NULL, uap, stubvp, cr, vfsp);
	if (retval == 0)
		VFS_RELE(*vfsp);
done:
	if (uap->optptr)
		nfs4_trigger_destroy_mntopts(uap->optptr);

	kmem_free(uap->spec, spec_len + 1);
	kmem_free(uap, sizeof (struct mounta));
	kmem_free(mntpt, mntpt_len + 1);

	return (retval);
}
1186 
1187 /*
1188  * Build an nfs_args structure for passing to domount().
1189  *
1190  * Ephemeral mount-type specific data comes from the ephemeral_servinfo_t;
1191  * generic data - common to all ephemeral mount types - is read directly
1192  * from the parent mount's servinfo4_t and mntinfo4_t, via the stub vnode.
1193  */
static struct nfs_args *
nfs4_trigger_nargs_create(mntinfo4_t *mi, servinfo4_t *svp,
    ephemeral_servinfo_t *esi)
{
	sec_data_t *secdata;
	struct nfs_args *nargs;

	/* setup the nfs args */
	nargs = kmem_zalloc(sizeof (struct nfs_args), KM_SLEEP);

	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);

	/*
	 * NB: these nargs fields point directly at the esi's copies,
	 * not at fresh duplicates; see nfs4_trigger_nargs_destroy()
	 * for how that shared ownership is resolved.
	 */
	nargs->addr = esi->esi_addr;

	/* for AUTH_DH by negotiation */
	if (esi->esi_syncaddr || esi->esi_netname) {
		nargs->flags |= NFSMNT_SECURE;
		nargs->syncaddr = esi->esi_syncaddr;
		nargs->netname = esi->esi_netname;
	}

	nargs->flags |= NFSMNT_KNCONF;
	nargs->knconf = esi->esi_knconf;
	nargs->flags |= NFSMNT_HOSTNAME;
	nargs->hostname = esi->esi_hostname;
	nargs->fh = esi->esi_path;

	/* general mount settings, all copied from parent mount */
	mutex_enter(&mi->mi_lock);

	if (!(mi->mi_flags & MI4_HARD))
		nargs->flags |= NFSMNT_SOFT;

	nargs->flags |= NFSMNT_WSIZE | NFSMNT_RSIZE | NFSMNT_TIMEO |
	    NFSMNT_RETRANS;
	nargs->wsize = mi->mi_stsize;
	nargs->rsize = mi->mi_tsize;
	nargs->timeo = mi->mi_timeo;
	nargs->retrans = mi->mi_retrans;

	if (mi->mi_flags & MI4_INT)
		nargs->flags |= NFSMNT_INT;
	if (mi->mi_flags & MI4_NOAC)
		nargs->flags |= NFSMNT_NOAC;

	/* HR2SEC: convert the mi's values to the seconds nfs_args wants */
	nargs->flags |= NFSMNT_ACREGMIN | NFSMNT_ACREGMAX | NFSMNT_ACDIRMIN |
	    NFSMNT_ACDIRMAX;
	nargs->acregmin = HR2SEC(mi->mi_acregmin);
	nargs->acregmax = HR2SEC(mi->mi_acregmax);
	nargs->acdirmin = HR2SEC(mi->mi_acdirmin);
	nargs->acdirmax = HR2SEC(mi->mi_acdirmax);

	if (mi->mi_flags & MI4_NOCTO)
		nargs->flags |= NFSMNT_NOCTO;
	if (mi->mi_flags & MI4_GRPID)
		nargs->flags |= NFSMNT_GRPID;
	if (mi->mi_flags & MI4_LLOCK)
		nargs->flags |= NFSMNT_LLOCK;
	if (mi->mi_flags & MI4_NOPRINT)
		nargs->flags |= NFSMNT_NOPRINT;
	if (mi->mi_flags & MI4_DIRECTIO)
		nargs->flags |= NFSMNT_DIRECTIO;
	if (mi->mi_flags & MI4_PUBLIC)
		nargs->flags |= NFSMNT_PUBLIC;

	mutex_exit(&mi->mi_lock);

	/* add any specific flags for this type of ephemeral mount */
	nargs->flags |= esi->esi_mount_flags;

	/*
	 * Security data & negotiation policy.
	 *
	 * We need to preserve the parent mount's preference for security
	 * negotiation, translating SV4_TRYSECDEFAULT -> NFSMNT_SECDEFAULT.
	 *
	 * If SV4_TRYSECDEFAULT is not set, that indicates that a specific
	 * security flavour was requested, with data in sv_secdata, and that
	 * no negotiation should occur. If this specified flavour fails, that's
	 * it. We will copy sv_secdata, and not set NFSMNT_SECDEFAULT.
	 *
	 * If SV4_TRYSECDEFAULT is set, then we start with a passed-in
	 * default flavour, in sv_secdata, but then negotiate a new flavour.
	 * Possible flavours are recorded in an array in sv_secinfo, with
	 * currently in-use flavour pointed to by sv_currsec.
	 *
	 * If sv_currsec is set, i.e. if negotiation has already occurred,
	 * we will copy sv_currsec. Otherwise, copy sv_secdata. Regardless,
	 * we will set NFSMNT_SECDEFAULT, to enable negotiation.
	 */
	if (svp->sv_flags & SV4_TRYSECDEFAULT) {
		/* enable negotiation for ephemeral mount */
		nargs->flags |= NFSMNT_SECDEFAULT;

		/*
		 * As a starting point for negotiation, copy parent
		 * mount's negotiated flavour (sv_currsec) if available,
		 * or its passed-in flavour (sv_secdata) if not.
		 */
		if (svp->sv_currsec != NULL)
			secdata = copy_sec_data(svp->sv_currsec);
		else if (svp->sv_secdata != NULL)
			secdata = copy_sec_data(svp->sv_secdata);
		else
			secdata = NULL;
	} else {
		/* do not enable negotiation; copy parent's passed-in flavour */
		if (svp->sv_secdata != NULL)
			secdata = copy_sec_data(svp->sv_secdata);
		else
			secdata = NULL;
	}

	nfs_rw_exit(&svp->sv_lock);

	nargs->flags |= NFSMNT_NEWARGS;
	nargs->nfs_args_ext = NFS_ARGS_EXTB;
	nargs->nfs_ext_u.nfs_extB.secdata = secdata;

	/* for NFS RO failover; caller will set if necessary */
	nargs->nfs_ext_u.nfs_extB.next = NULL;

	return (nargs);
}
1318 
1319 static void
1320 nfs4_trigger_nargs_destroy(struct nfs_args *nargs)
1321 {
1322 	/*
1323 	 * Either the mount failed, in which case the data is not needed, or
1324 	 * nfs4_mount() has either taken copies of what it needs or,
1325 	 * where it has merely copied the ptr, it has set *our* ptr to NULL,
1326 	 * whereby nfs4_free_args() will ignore it.
1327 	 */
1328 	nfs4_free_args(nargs);
1329 	kmem_free(nargs, sizeof (struct nfs_args));
1330 }
1331 
1332 /*
1333  * When we finally get into the mounting, we need to add this
1334  * node to the ephemeral tree.
1335  *
1336  * This is called from nfs4_mount().
1337  */
void
nfs4_record_ephemeral_mount(mntinfo4_t *mi, vnode_t *mvp)
{
	mntinfo4_t		*mi_parent;
	nfs4_ephemeral_t	*eph;
	nfs4_ephemeral_tree_t	*net;

	nfs4_ephemeral_t	*prior;
	nfs4_ephemeral_t	*child;

	nfs4_ephemeral_t	*peer;

	nfs4_trigger_globals_t	*ntg;
	zone_t			*zone = curproc->p_zone;

	mi_parent = VTOMI4(mvp);

	/*
	 * Get this before grabbing anything else!
	 */
	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
	if (!ntg->ntg_thread_started) {
		nfs4_ephemeral_start_harvester(ntg);
	}

	/* lock ordering: the parent's mi_lock before the child's */
	mutex_enter(&mi_parent->mi_lock);
	mutex_enter(&mi->mi_lock);

	/*
	 * We need to tack together the ephemeral mount
	 * with this new mntinfo.
	 */
	eph = kmem_zalloc(sizeof (*eph), KM_SLEEP);
	eph->ne_mount = mi;
	eph->ne_ref_time = gethrestime_sec();

	/*
	 * We need to tell the ephemeral mount when
	 * to time out.
	 */
	eph->ne_mount_to = ntg->ntg_mount_to;

	mi->mi_flags |= MI4_EPHEMERAL;
	mi->mi_ephemeral = eph;

	/* the new mount joins the parent's (already existing) tree */
	net = mi->mi_ephemeral_tree =
	    mi_parent->mi_ephemeral_tree;
	ASSERT(net != NULL);

	/*
	 * If the enclosing mntinfo4 is also ephemeral,
	 * then we need to point to its enclosing parent.
	 * Else the enclosing mntinfo4 is the enclosing parent.
	 *
	 * We also need to weave this ephemeral node
	 * into the tree.
	 */
	if (mi_parent->mi_flags & MI4_EPHEMERAL) {
		/*
		 * We need to decide if we are
		 * the root node of this branch
		 * or if we are a sibling of this
		 * branch.
		 */
		prior = mi_parent->mi_ephemeral;
		ASSERT(prior != NULL);
		if (prior->ne_child == NULL) {
			prior->ne_child = eph;
		} else {
			/* push eph at the head of prior's child list */
			child = prior->ne_child;

			prior->ne_child = eph;
			eph->ne_peer = child;

			child->ne_prior = eph;
		}

		eph->ne_prior = prior;
	} else {
		/*
		 * The parent mntinfo4 is the non-ephemeral
		 * root of the ephemeral tree. We
		 * need to decide if we are the root
		 * node of that tree or if we are a
		 * sibling of the root node.
		 *
		 * We are the root if there is no
		 * other node.
		 */
		if (net->net_root == NULL) {
			net->net_root = eph;
		} else {
			/* push eph at the head of the root's peer list */
			eph->ne_peer = peer = net->net_root;
			ASSERT(peer != NULL);
			net->net_root = eph;

			peer->ne_prior = eph;
		}

		eph->ne_prior = NULL;
	}

	mutex_exit(&mi->mi_lock);
	mutex_exit(&mi_parent->mi_lock);
}
1443 
1444 /*
1445  * Commit the changes to the ephemeral tree for removing this node.
1446  */
1447 static void
1448 nfs4_ephemeral_umount_cleanup(nfs4_ephemeral_t *eph)
1449 {
1450 	nfs4_ephemeral_t	*e = eph;
1451 	nfs4_ephemeral_t	*peer;
1452 	nfs4_ephemeral_t	*prior;
1453 
1454 	peer = eph->ne_peer;
1455 	prior = e->ne_prior;
1456 
1457 	/*
1458 	 * If this branch root was not the
1459 	 * tree root, then we need to fix back pointers.
1460 	 */
1461 	if (prior) {
1462 		if (prior->ne_child == e) {
1463 			prior->ne_child = peer;
1464 		} else {
1465 			prior->ne_peer = peer;
1466 		}
1467 
1468 		if (peer)
1469 			peer->ne_prior = prior;
1470 	} else if (peer) {
1471 		peer->ne_mount->mi_ephemeral_tree->net_root = peer;
1472 		peer->ne_prior = NULL;
1473 	} else {
1474 		e->ne_mount->mi_ephemeral_tree->net_root = NULL;
1475 	}
1476 }
1477 
1478 /*
1479  * We want to avoid recursion at all costs. So we need to
1480  * unroll the tree. We do this by a depth first traversal to
1481  * leaf nodes. We blast away the leaf and work our way back
1482  * up and down the tree.
1483  */
static int
nfs4_ephemeral_unmount_engine(nfs4_ephemeral_t *eph,
    int isTreeRoot, int flag, cred_t *cr)
{
	nfs4_ephemeral_t	*e = eph;
	nfs4_ephemeral_t	*prior;
	mntinfo4_t		*mi;
	vfs_t			*vfsp;
	int			error;

	/*
	 * We use the loop while unrolling the ephemeral tree.
	 */
	for (;;) {
		/*
		 * First we walk down the child.
		 */
		if (e->ne_child) {
			prior = e;
			e = e->ne_child;
			continue;
		}

		/*
		 * If we are the root of the branch we are removing,
		 * we end it here. But if the branch is the root of
		 * the tree, we have to forge on. We do not consider
		 * the peer list for the root because while it may
		 * be okay to remove, it is both extra work and a
		 * potential for a false-positive error to stall the
		 * unmount attempt.
		 */
		if (e == eph && isTreeRoot == FALSE)
			return (0);

		/*
		 * Next we walk down the peer list.
		 */
		if (e->ne_peer) {
			prior = e;
			e = e->ne_peer;
			continue;
		}

		/*
		 * We can only remove the node passed in by the
		 * caller if it is the root of the ephemeral tree.
		 * Otherwise, the caller will remove it.
		 */
		if (e == eph && isTreeRoot == FALSE)
			return (0);

		/*
		 * Okay, we have a leaf node, time
		 * to prune it!
		 *
		 * Note that prior can only be NULL if
		 * and only if it is the root of the
		 * ephemeral tree.
		 */
		prior = e->ne_prior;

		mi = e->ne_mount;
		mutex_enter(&mi->mi_lock);
		vfsp = mi->mi_vfsp;

		/*
		 * Cleared by umount2_engine.
		 */
		VFS_HOLD(vfsp);

		/*
		 * Inform nfs4_unmount to not recursively
		 * descend into this node's children when it
		 * gets processed.
		 */
		mi->mi_flags |= MI4_EPHEMERAL_RECURSED;
		mutex_exit(&mi->mi_lock);

		error = umount2_engine(vfsp, flag, cr, FALSE);
		if (error) {
			/*
			 * We need to reenable nfs4_unmount's ability
			 * to recursively descend on this node.
			 */
			mutex_enter(&mi->mi_lock);
			mi->mi_flags &= ~MI4_EPHEMERAL_RECURSED;
			mutex_exit(&mi->mi_lock);

			/* first failure aborts the whole traversal */
			return (error);
		}

		/*
		 * If we are the current node, we do not want to
		 * touch anything else. At this point, the only
		 * way the current node can have survived to here
		 * is if it is the root of the ephemeral tree and
		 * we are unmounting the enclosing mntinfo4.
		 */
		if (e == eph) {
			ASSERT(prior == NULL);
			return (0);
		}

		/*
		 * Stitch up the prior node. Note that since
		 * we have handled the root of the tree, prior
		 * must be non-NULL.
		 */
		ASSERT(prior != NULL);
		if (prior->ne_child == e) {
			prior->ne_child = NULL;
		} else {
			ASSERT(prior->ne_peer == e);

			prior->ne_peer = NULL;
		}

		/* climb back up and try to prune the prior node next */
		e = prior;
	}

	/* NOTREACHED */
}
1607 
1608 /*
1609  * Common code to safely release net_cnt_lock and net_tree_lock
1610  */
1611 void
1612 nfs4_ephemeral_umount_unlock(bool_t *pmust_unlock,
1613     nfs4_ephemeral_tree_t **pnet)
1614 {
1615 	nfs4_ephemeral_tree_t	*net = *pnet;
1616 
1617 	if (*pmust_unlock) {
1618 		mutex_enter(&net->net_cnt_lock);
1619 		net->net_refcnt--;
1620 		net->net_status &= ~NFS4_EPHEMERAL_TREE_UMOUNTING;
1621 		mutex_exit(&net->net_cnt_lock);
1622 
1623 		mutex_exit(&net->net_tree_lock);
1624 
1625 		*pmust_unlock = FALSE;
1626 	}
1627 }
1628 
/*
 * While we may have removed any child or sibling nodes of this
 * ephemeral node, we can not nuke it until we know that there
 * were no active vnodes on it. This will do that final
 * work once we know it is not busy.
 */
1635 void
1636 nfs4_ephemeral_umount_activate(mntinfo4_t *mi, bool_t *pmust_unlock,
1637     nfs4_ephemeral_tree_t **pnet)
1638 {
1639 	/*
1640 	 * Now we need to get rid of the ephemeral data if it exists.
1641 	 */
1642 	mutex_enter(&mi->mi_lock);
1643 	if (mi->mi_ephemeral) {
1644 		/*
1645 		 * If we are the root node of an ephemeral branch
1646 		 * which is being removed, then we need to fixup
1647 		 * pointers into and out of the node.
1648 		 */
1649 		if (!(mi->mi_flags & MI4_EPHEMERAL_RECURSED))
1650 			nfs4_ephemeral_umount_cleanup(mi->mi_ephemeral);
1651 
1652 		ASSERT(mi->mi_ephemeral != NULL);
1653 
1654 		kmem_free(mi->mi_ephemeral, sizeof (*mi->mi_ephemeral));
1655 		mi->mi_ephemeral = NULL;
1656 	}
1657 	mutex_exit(&mi->mi_lock);
1658 
1659 	nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);
1660 }
1661 
1662 /*
1663  * Unmount an ephemeral node.
1664  */
int
nfs4_ephemeral_umount(mntinfo4_t *mi, int flag, cred_t *cr,
    bool_t *pmust_unlock, nfs4_ephemeral_tree_t **pnet)
{
	int			error = 0;
	nfs4_ephemeral_t	*eph;
	nfs4_ephemeral_tree_t	*net;
	int			is_derooting = FALSE;
	int			is_recursed = FALSE;
	int			was_locked = FALSE;

	/*
	 * The active vnodes on this file system may be ephemeral
	 * children. We need to check for and try to unmount them
	 * here. If any can not be unmounted, we are going
	 * to return EBUSY.
	 */
	mutex_enter(&mi->mi_lock);

	/*
	 * If an ephemeral tree, we need to check to see if
	 * the lock is already held. If it is, then we need
	 * to see if we are being called as a result of
	 * the recursive removal of some node of the tree or
	 * if we are another attempt to remove the tree.
	 *
	 * mi_flags & MI4_EPHEMERAL indicates an ephemeral
	 * node. mi_ephemeral being non-NULL also does this.
	 *
	 * mi_ephemeral_tree being non-NULL is sufficient
	 * to also indicate either it is an ephemeral node
	 * or the enclosing mntinfo4.
	 *
	 * Do we need MI4_EPHEMERAL? Yes, it is useful for
	 * when we delete the ephemeral node and need to
	 * differentiate from an ephemeral node and the
	 * enclosing root node.
	 */
	*pnet = net = mi->mi_ephemeral_tree;
	eph = mi->mi_ephemeral;
	if (net) {
		/* derooting == unmounting the tree's enclosing mntinfo4 */
		is_recursed = mi->mi_flags & MI4_EPHEMERAL_RECURSED;
		is_derooting = (eph == NULL);
		mutex_exit(&mi->mi_lock);

		/*
		 * If this is not recursion, then we need to
		 * grab a ref count.
		 *
		 * But wait, we also do not want to do that
		 * if a harvester thread has already grabbed
		 * the lock.
		 */
		if (!is_recursed) {
			mutex_enter(&net->net_cnt_lock);
			if (net->net_status &
			    NFS4_EPHEMERAL_TREE_LOCKED)
				was_locked = TRUE;
			else
				net->net_refcnt++;
			mutex_exit(&net->net_cnt_lock);
		}

		/*
		 * If we grab the lock, it means that no other
		 * operation is working on the tree. If we don't
		 * grab it, we need to decide if this is because
		 * we are a recursive call or a new operation.
		 *
		 * If we are a recursive call, we proceed without
		 * the lock.
		 *
		 * Else we have to wait until the lock becomes free.
		 */
		if (was_locked == FALSE &&
		    !mutex_tryenter(&net->net_tree_lock)) {
			if (!is_recursed) {
				mutex_enter(&net->net_cnt_lock);
				if (net->net_status &
				    (NFS4_EPHEMERAL_TREE_DEROOTING
				    | NFS4_EPHEMERAL_TREE_INVALID)) {
					/* drop the ref we took above */
					net->net_refcnt--;
					mutex_exit(&net->net_cnt_lock);
					goto is_busy;
				}
				mutex_exit(&net->net_cnt_lock);

				/*
				 * We can't hold any other locks whilst
				 * we wait on this to free up.
				 */
				mutex_enter(&net->net_tree_lock);

				/*
				 * Note that while mi->mi_ephemeral
				 * may change and thus we have to
				 * update eph, it is the case that
				 * we have tied down net and
				 * do not care if mi->mi_ephemeral_tree
				 * has changed.
				 */
				mutex_enter(&mi->mi_lock);
				eph = mi->mi_ephemeral;
				mutex_exit(&mi->mi_lock);

				/*
				 * Okay, we need to see if either the
				 * tree got nuked or the current node
				 * got nuked. Both of which will cause
				 * an error.
				 *
				 * Note that a subsequent retry of the
				 * umount shall work.
				 */
				mutex_enter(&net->net_cnt_lock);
				if (net->net_status &
				    NFS4_EPHEMERAL_TREE_INVALID ||
				    (!is_derooting && eph == NULL)) {
					net->net_refcnt--;
					mutex_exit(&net->net_cnt_lock);
					mutex_exit(&net->net_tree_lock);
					goto is_busy;
				}
				mutex_exit(&net->net_cnt_lock);
				*pmust_unlock = TRUE;
			}
		} else if (was_locked == FALSE) {
			/*
			 * If we grab it right away, everything must
			 * be great!
			 */
			*pmust_unlock = TRUE;
		}

		/*
		 * Only once we have grabbed the lock can we mark what we
		 * are planning on doing to the ephemeral tree.
		 */
		if (*pmust_unlock) {
			mutex_enter(&net->net_cnt_lock);
			net->net_status |= NFS4_EPHEMERAL_TREE_UMOUNTING;

			/*
			 * Check to see if we are nuking the root.
			 */
			if (is_derooting)
				net->net_status |=
				    NFS4_EPHEMERAL_TREE_DEROOTING;
			mutex_exit(&net->net_cnt_lock);
		}

		if (!is_derooting) {
			/*
			 * Only work on children if the caller has not already
			 * done so.
			 */
			if (!is_recursed) {
				ASSERT(eph != NULL);

				error = nfs4_ephemeral_unmount_engine(eph,
				    FALSE, flag, cr);
				if (error)
					goto is_busy;
			}
		} else {
			eph = net->net_root;

			/*
			 * Only work if there is something there.
			 */
			if (eph) {
				error = nfs4_ephemeral_unmount_engine(eph, TRUE,
				    flag, cr);
				if (error) {
					mutex_enter(&net->net_cnt_lock);
					net->net_status &=
					    ~NFS4_EPHEMERAL_TREE_DEROOTING;
					mutex_exit(&net->net_cnt_lock);
					goto is_busy;
				}

				/*
				 * Nothing else which goes wrong will
				 * invalidate the blowing away of the
				 * ephemeral tree.
				 */
				net->net_root = NULL;
			}

			/*
			 * We have derooted and we have caused the tree to be
			 * invalid.
			 */
			mutex_enter(&net->net_cnt_lock);
			net->net_status &= ~NFS4_EPHEMERAL_TREE_DEROOTING;
			net->net_status |= NFS4_EPHEMERAL_TREE_INVALID;
			net->net_refcnt--;
			mutex_exit(&net->net_cnt_lock);

			/*
			 * At this point, the tree should no
			 * longer be associated with the
			 * mntinfo4. We need to pull it off
			 * there and let the harvester take
			 * care of it once the refcnt drops.
			 */
			mutex_enter(&mi->mi_lock);
			mi->mi_ephemeral_tree = NULL;
			mutex_exit(&mi->mi_lock);
		}
	} else {
		/* no ephemeral tree at all; nothing to do */
		mutex_exit(&mi->mi_lock);
	}

	return (0);

is_busy:

	nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);

	return (error);
}
1887 
1888 /*
1889  * Do the umount and record any error in the parent.
1890  */
1891 static void
1892 nfs4_ephemeral_record_umount(vfs_t *vfsp, int flag,
1893     nfs4_ephemeral_t *e, nfs4_ephemeral_t *prior)
1894 {
1895 	int	error;
1896 
1897 	error = umount2_engine(vfsp, flag, kcred, FALSE);
1898 	if (error) {
1899 		if (prior) {
1900 			if (prior->ne_child == e)
1901 				prior->ne_state |=
1902 				    NFS4_EPHEMERAL_CHILD_ERROR;
1903 			else
1904 				prior->ne_state |=
1905 				    NFS4_EPHEMERAL_PEER_ERROR;
1906 		}
1907 	}
1908 }
1909 
1910 /*
1911  * For each tree in the forest (where the forest is in
1912  * effect all of the ephemeral trees for this zone),
1913  * scan to see if a node can be unmounted. Note that
1914  * unlike nfs4_ephemeral_unmount_engine(), we do
1915  * not process the current node before children or
1916  * siblings. I.e., if a node can be unmounted, we
1917  * do not recursively check to see if the nodes
1918  * hanging off of it can also be unmounted.
1919  *
1920  * Instead, we delve down deep to try and remove the
1921  * children first. Then, because we share code with
1922  * nfs4_ephemeral_unmount_engine(), we will try
1923  * them again. This could be a performance issue in
1924  * the future.
1925  *
1926  * Also note that unlike nfs4_ephemeral_unmount_engine(),
1927  * we do not halt on an error. We will not remove the
1928  * current node, but we will keep on trying to remove
1929  * the others.
1930  *
1931  * force indicates that we want the unmount to occur
1932  * even if there is something blocking it.
1933  *
1934  * time_check indicates that we want to see if the
1935  * mount has expired past mount_to or not. Typically
1936  * we want to do this and only on a shutdown of the
1937  * zone would we want to ignore the check.
1938  */
static void
nfs4_ephemeral_harvest_forest(nfs4_trigger_globals_t *ntg,
    bool_t force, bool_t time_check)
{
	nfs4_ephemeral_tree_t	*net;
	nfs4_ephemeral_tree_t	*prev = NULL;
	nfs4_ephemeral_tree_t	*next;
	nfs4_ephemeral_t	*e;
	nfs4_ephemeral_t	*prior;
	time_t			now = gethrestime_sec();

	/* list of dead trees to free once the forest lock is dropped */
	nfs4_ephemeral_tree_t	*harvest = NULL;

	int			flag;

	mntinfo4_t		*mi;
	vfs_t			*vfsp;

	if (force)
		flag = MS_FORCE;
	else
		flag = 0;

	mutex_enter(&ntg->ntg_forest_lock);
	for (net = ntg->ntg_forest; net != NULL; net = next) {
		next = net->net_next;

		/* keep the tree alive while we work on it */
		mutex_enter(&net->net_cnt_lock);
		net->net_refcnt++;
		mutex_exit(&net->net_cnt_lock);

		mutex_enter(&net->net_tree_lock);

		/*
		 * Let the unmount code know that the
		 * tree is already locked!
		 */
		mutex_enter(&net->net_cnt_lock);
		net->net_status |= NFS4_EPHEMERAL_TREE_LOCKED;
		mutex_exit(&net->net_cnt_lock);

		/*
		 * If the intent is force all ephemeral nodes to
		 * be unmounted in this zone, we can short circuit a
		 * lot of tree traversal and simply zap the root node.
		 */
		if (force) {
			if (net->net_root) {
				mi = net->net_root->ne_mount;
				vfsp = mi->mi_vfsp;

				/*
				 * Cleared by umount2_engine.
				 */
				VFS_HOLD(vfsp);

				(void) umount2_engine(vfsp, flag,
				    kcred, FALSE);

				goto check_done;
			}
		}

		/*
		 * Iterative depth-first traversal, driven by per-node
		 * ne_state instead of recursion: visit children, then
		 * peers, then process the node itself.
		 */
		e = net->net_root;
		if (e)
			e->ne_state = NFS4_EPHEMERAL_VISIT_CHILD;

		while (e) {
			if (e->ne_state == NFS4_EPHEMERAL_VISIT_CHILD) {
				e->ne_state = NFS4_EPHEMERAL_VISIT_SIBLING;
				if (e->ne_child) {
					e = e->ne_child;
					e->ne_state =
					    NFS4_EPHEMERAL_VISIT_CHILD;
				}

				continue;
			} else if (e->ne_state ==
			    NFS4_EPHEMERAL_VISIT_SIBLING) {
				e->ne_state = NFS4_EPHEMERAL_PROCESS_ME;
				if (e->ne_peer) {
					e = e->ne_peer;
					e->ne_state =
					    NFS4_EPHEMERAL_VISIT_CHILD;
				}

				continue;
			} else if (e->ne_state ==
			    NFS4_EPHEMERAL_CHILD_ERROR) {
				prior = e->ne_prior;

				/*
				 * If a child reported an error, do
				 * not bother trying to unmount.
				 *
				 * If your prior node is a parent,
				 * pass the error up such that they
				 * also do not try to unmount.
				 *
				 * However, if your prior is a sibling,
				 * let them try to unmount if they can.
				 */
				if (prior) {
					if (prior->ne_child == e)
						prior->ne_state |=
						    NFS4_EPHEMERAL_CHILD_ERROR;
					else
						prior->ne_state |=
						    NFS4_EPHEMERAL_PEER_ERROR;
				}

				/*
				 * Clear the error and if needed, process peers.
				 *
				 * Once we mask out the error, we know whether
				 * or not we have to process another node.
				 */
				e->ne_state &= ~NFS4_EPHEMERAL_CHILD_ERROR;
				if (e->ne_state == NFS4_EPHEMERAL_PROCESS_ME)
					e = prior;

				continue;
			} else if (e->ne_state ==
			    NFS4_EPHEMERAL_PEER_ERROR) {
				prior = e->ne_prior;

				if (prior) {
					if (prior->ne_child == e)
						prior->ne_state =
						    NFS4_EPHEMERAL_CHILD_ERROR;
					else
						prior->ne_state =
						    NFS4_EPHEMERAL_PEER_ERROR;
				}

				/*
				 * Clear the error from this node and do the
				 * correct processing.
				 */
				e->ne_state &= ~NFS4_EPHEMERAL_PEER_ERROR;
				continue;
			}

			prior = e->ne_prior;
			e->ne_state = NFS4_EPHEMERAL_OK;

			/*
			 * It must be the case that we need to process
			 * this node.
			 */
			if (!time_check ||
			    now - e->ne_ref_time > e->ne_mount_to) {
				mi = e->ne_mount;
				vfsp = mi->mi_vfsp;

				/*
				 * Cleared by umount2_engine.
				 */
				VFS_HOLD(vfsp);

				/*
				 * Note that we effectively work down to the
				 * leaf nodes first, try to unmount them,
				 * then work our way back up into the leaf
				 * nodes.
				 *
				 * Also note that we deal with a lot of
				 * complexity by sharing the work with
				 * the manual unmount code.
				 */
				nfs4_ephemeral_record_umount(vfsp, flag,
				    e, prior);
			}

			e = prior;
		}

check_done:

		/*
		 * Are we done with this tree?
		 */
		mutex_enter(&net->net_cnt_lock);
		if (net->net_refcnt == 1 &&
		    net->net_status & NFS4_EPHEMERAL_TREE_INVALID) {
			/*
			 * Ours was the last reference and the tree has
			 * been invalidated: unlink it from the forest
			 * and queue it for destruction below.
			 */
			net->net_refcnt--;
			net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
			mutex_exit(&net->net_cnt_lock);
			mutex_exit(&net->net_tree_lock);

			if (prev)
				prev->net_next = net->net_next;
			else
				ntg->ntg_forest = net->net_next;

			net->net_next = harvest;
			harvest = net;
			continue;
		}

		net->net_refcnt--;
		net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
		mutex_exit(&net->net_cnt_lock);
		mutex_exit(&net->net_tree_lock);

		prev = net;
	}
	mutex_exit(&ntg->ntg_forest_lock);

	/* destroy the harvested trees outside of the forest lock */
	for (net = harvest; net != NULL; net = next) {
		next = net->net_next;

		mutex_destroy(&net->net_tree_lock);
		mutex_destroy(&net->net_cnt_lock);
		kmem_free(net, sizeof (*net));
	}
}
2156 
2157 /*
2158  * This is the thread which decides when the harvesting
2159  * can proceed and when to kill it off for this zone.
2160  */
2161 static void
2162 nfs4_ephemeral_harvester(nfs4_trigger_globals_t *ntg)
2163 {
2164 	clock_t		timeleft;
2165 	zone_t		*zone = curproc->p_zone;
2166 
2167 	for (;;) {
2168 		timeleft = zone_status_timedwait(zone, lbolt +
2169 		    nfs4_trigger_thread_timer * hz, ZONE_IS_SHUTTING_DOWN);
2170 
2171 		/*
2172 		 * zone is exiting...
2173 		 */
2174 		if (timeleft != -1) {
2175 			ASSERT(zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN);
2176 			zthread_exit();
2177 			/* NOTREACHED */
2178 		}
2179 
2180 		/*
2181 		 * Only bother scanning if there is potential
2182 		 * work to be done.
2183 		 */
2184 		if (ntg->ntg_forest == NULL)
2185 			continue;
2186 
2187 		/*
2188 		 * Now scan the list and get rid of everything which
2189 		 * is old.
2190 		 */
2191 		nfs4_ephemeral_harvest_forest(ntg, FALSE, TRUE);
2192 	}
2193 
2194 	/* NOTREACHED */
2195 }
2196 
2197 /*
2198  * The zone specific glue needed to start the unmount harvester.
2199  *
2200  * Note that we want to avoid holding the mutex as long as possible,
2201  * hence the multiple checks.
2202  *
2203  * The caller should avoid us getting down here in the first
2204  * place.
2205  */
static void
nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *ntg)
{
	/*
	 * It got started before we got here...
	 *
	 * NOTE(review): this is an unlocked read of ntg_thread_started
	 * (double-checked locking); correctness rests on the locked
	 * re-check below catching any race.
	 */
	if (ntg->ntg_thread_started)
		return;

	mutex_enter(&nfs4_ephemeral_thread_lock);

	/* Re-check under the lock: another thread may have won the race. */
	if (ntg->ntg_thread_started) {
		mutex_exit(&nfs4_ephemeral_thread_lock);
		return;
	}

	/*
	 * Start the unmounter harvester thread for this zone.
	 */
	(void) zthread_create(NULL, 0, nfs4_ephemeral_harvester,
	    ntg, 0, minclsyspri);

	/* Mark it started while still holding the lock. */
	ntg->ntg_thread_started = TRUE;
	mutex_exit(&nfs4_ephemeral_thread_lock);
}
2231 
2232 /*ARGSUSED*/
2233 static void *
2234 nfs4_ephemeral_zsd_create(zoneid_t zoneid)
2235 {
2236 	nfs4_trigger_globals_t	*ntg;
2237 
2238 	ntg = kmem_zalloc(sizeof (*ntg), KM_SLEEP);
2239 	ntg->ntg_thread_started = FALSE;
2240 
2241 	/*
2242 	 * This is the default....
2243 	 */
2244 	ntg->ntg_mount_to = nfs4_trigger_thread_timer;
2245 
2246 	mutex_init(&ntg->ntg_forest_lock, NULL,
2247 	    MUTEX_DEFAULT, NULL);
2248 
2249 	return (ntg);
2250 }
2251 
2252 /*
2253  * Try a nice gentle walk down the forest and convince
2254  * all of the trees to gracefully give it up.
2255  */
2256 /*ARGSUSED*/
2257 static void
2258 nfs4_ephemeral_zsd_shutdown(zoneid_t zoneid, void *arg)
2259 {
2260 	nfs4_trigger_globals_t	*ntg = arg;
2261 
2262 	if (!ntg)
2263 		return;
2264 
2265 	nfs4_ephemeral_harvest_forest(ntg, FALSE, FALSE);
2266 }
2267 
2268 /*
2269  * Race along the forest and rip all of the trees out by
2270  * their rootballs!
2271  */
2272 /*ARGSUSED*/
2273 static void
2274 nfs4_ephemeral_zsd_destroy(zoneid_t zoneid, void *arg)
2275 {
2276 	nfs4_trigger_globals_t	*ntg = arg;
2277 
2278 	if (!ntg)
2279 		return;
2280 
2281 	nfs4_ephemeral_harvest_forest(ntg, TRUE, FALSE);
2282 
2283 	mutex_destroy(&ntg->ntg_forest_lock);
2284 	kmem_free(ntg, sizeof (*ntg));
2285 }
2286 
2287 /*
2288  * This is the zone independent cleanup needed for
 * ephemeral mount processing.
2290  */
void
nfs4_ephemeral_fini(void)
{
	/* Tear down in the reverse order of nfs4_ephemeral_init(). */
	(void) zone_key_delete(nfs4_ephemeral_key);
	mutex_destroy(&nfs4_ephemeral_thread_lock);
}
2297 
2298 /*
2299  * This is the zone independent initialization needed for
 * ephemeral mount processing.
2301  */
void
nfs4_ephemeral_init(void)
{
	/* Protects the lazy start of the per-zone harvester thread. */
	mutex_init(&nfs4_ephemeral_thread_lock, NULL, MUTEX_DEFAULT,
	    NULL);

	/*
	 * Register the per-zone create/shutdown/destroy callbacks
	 * for the ephemeral mount state.
	 */
	zone_key_create(&nfs4_ephemeral_key, nfs4_ephemeral_zsd_create,
	    nfs4_ephemeral_zsd_shutdown, nfs4_ephemeral_zsd_destroy);
}
2311 
2312 /*
2313  * nfssys() calls this function to set the per-zone
2314  * value of mount_to to drive when an ephemeral mount is
2315  * timed out. Each mount will grab a copy of this value
2316  * when mounted.
2317  */
2318 void
2319 nfs4_ephemeral_set_mount_to(uint_t mount_to)
2320 {
2321 	nfs4_trigger_globals_t	*ntg;
2322 	zone_t			*zone = curproc->p_zone;
2323 
2324 	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
2325 
2326 	ntg->ntg_mount_to = mount_to;
2327 }
2328 
2329 /*
2330  * Walk the list of v4 mount options; if they are currently set in vfsp,
2331  * append them to a new comma-separated mount option string, and return it.
2332  *
2333  * Caller should free by calling nfs4_trigger_destroy_mntopts().
2334  */
2335 static char *
2336 nfs4_trigger_create_mntopts(vfs_t *vfsp)
2337 {
2338 	uint_t i;
2339 	char *mntopts;
2340 	struct vfssw *vswp;
2341 	mntopts_t *optproto;
2342 
2343 	mntopts = kmem_zalloc(MAX_MNTOPT_STR, KM_SLEEP);
2344 
2345 	/* get the list of applicable mount options for v4; locks *vswp */
2346 	vswp = vfs_getvfssw(MNTTYPE_NFS4);
2347 	optproto = &vswp->vsw_optproto;
2348 
2349 	for (i = 0; i < optproto->mo_count; i++) {
2350 		struct mntopt *mop = &optproto->mo_list[i];
2351 
2352 		if (mop->mo_flags & MO_EMPTY)
2353 			continue;
2354 
2355 		if (nfs4_trigger_add_mntopt(mntopts, mop->mo_name, vfsp)) {
2356 			kmem_free(mntopts, MAX_MNTOPT_STR);
2357 			vfs_unrefvfssw(vswp);
2358 			return (NULL);
2359 		}
2360 	}
2361 
2362 	vfs_unrefvfssw(vswp);
2363 
2364 	/*
2365 	 * MNTOPT_XATTR is not in the v4 mount opt proto list,
2366 	 * and it may only be passed via MS_OPTIONSTR, so we
2367 	 * must handle it here.
2368 	 *
2369 	 * Ideally, it would be in the list, but NFS does not specify its
2370 	 * own opt proto list, it uses instead the default one. Since
2371 	 * not all filesystems support extended attrs, it would not be
2372 	 * appropriate to add it there.
2373 	 */
2374 	if (nfs4_trigger_add_mntopt(mntopts, MNTOPT_XATTR, vfsp) ||
2375 	    nfs4_trigger_add_mntopt(mntopts, MNTOPT_NOXATTR, vfsp)) {
2376 		kmem_free(mntopts, MAX_MNTOPT_STR);
2377 		return (NULL);
2378 	}
2379 
2380 	return (mntopts);
2381 }
2382 
2383 static void
2384 nfs4_trigger_destroy_mntopts(char *mntopts)
2385 {
2386 	if (mntopts)
2387 		kmem_free(mntopts, MAX_MNTOPT_STR);
2388 }
2389 
2390 /*
2391  * Check a single mount option (optname). Add to mntopts if it is set in VFS.
2392  */
2393 static int
2394 nfs4_trigger_add_mntopt(char *mntopts, char *optname, vfs_t *vfsp)
2395 {
2396 	if (mntopts == NULL || optname == NULL || vfsp == NULL)
2397 		return (EINVAL);
2398 
2399 	if (vfs_optionisset(vfsp, optname, NULL)) {
2400 		size_t mntoptslen = strlen(mntopts);
2401 		size_t optnamelen = strlen(optname);
2402 
2403 		/* +1 for ',', +1 for NUL */
2404 		if (mntoptslen + optnamelen + 2 > MAX_MNTOPT_STR)
2405 			return (EOVERFLOW);
2406 
2407 		/* first or subsequent mount option? */
2408 		if (*mntopts != '\0')
2409 			(void) strcat(mntopts, ",");
2410 
2411 		(void) strcat(mntopts, optname);
2412 	}
2413 
2414 	return (0);
2415 }
2416 
/*
 * Ping the server described by svp with a NULL RPC (RFS_NULL) to see
 * whether it is responding.  Returns the clnt_stat from the call, or
 * RPC_FAILED if a client handle could not be created.  When nointr is
 * set, signals are blocked for the duration of the call.
 */
static enum clnt_stat
nfs4_trigger_ping_server(servinfo4_t *svp, int nointr)
{
	int retries, error;
	uint_t max_msgsize;
	enum clnt_stat status;
	CLIENT *cl;
	struct timeval timeout;

	/* as per recov_newserver() */
	max_msgsize = 0;
	retries = 1;
	timeout.tv_sec = 2;
	timeout.tv_usec = 0;

	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, NFS_PROGRAM,
	    NFS_V4, max_msgsize, retries, CRED(), &cl);
	if (error)
		return (RPC_FAILED);

	/* Suppress signal interruption of the call if requested. */
	if (nointr)
		cl->cl_nosignal = TRUE;
	status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, xdr_void, NULL,
	    timeout);
	if (nointr)
		cl->cl_nosignal = FALSE;

	/* Release the auth handle and the client handle, in that order. */
	AUTH_DESTROY(cl->cl_auth);
	CLNT_DESTROY(cl);

	return (status);
}
2449