1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27/*
28 * Support for ephemeral mounts, e.g. mirror-mounts. These mounts are
29 * triggered from a "stub" rnode via a special set of vnodeops.
30 */
31
32#include <sys/param.h>
33#include <sys/types.h>
34#include <sys/systm.h>
35#include <sys/cred.h>
36#include <sys/time.h>
37#include <sys/vnode.h>
38#include <sys/vfs.h>
39#include <sys/vfs_opreg.h>
40#include <sys/file.h>
41#include <sys/filio.h>
42#include <sys/uio.h>
43#include <sys/buf.h>
44#include <sys/mman.h>
45#include <sys/pathname.h>
46#include <sys/dirent.h>
47#include <sys/debug.h>
48#include <sys/vmsystm.h>
49#include <sys/fcntl.h>
50#include <sys/flock.h>
51#include <sys/swap.h>
52#include <sys/errno.h>
53#include <sys/strsubr.h>
54#include <sys/sysmacros.h>
55#include <sys/kmem.h>
56#include <sys/mount.h>
57#include <sys/cmn_err.h>
58#include <sys/pathconf.h>
59#include <sys/utsname.h>
60#include <sys/dnlc.h>
61#include <sys/acl.h>
62#include <sys/systeminfo.h>
63#include <sys/policy.h>
64#include <sys/sdt.h>
65#include <sys/list.h>
66#include <sys/stat.h>
67#include <sys/mntent.h>
68#include <sys/priv.h>
69
70#include <rpc/types.h>
71#include <rpc/auth.h>
72#include <rpc/clnt.h>
73
74#include <nfs/nfs.h>
75#include <nfs/nfs_clnt.h>
76#include <nfs/nfs_acl.h>
77#include <nfs/lm.h>
78#include <nfs/nfs4.h>
79#include <nfs/nfs4_kprot.h>
80#include <nfs/rnode4.h>
81#include <nfs/nfs4_clnt.h>
82#include <nfs/nfsid_map.h>
83#include <nfs/nfs4_idmap_impl.h>
84
85#include <vm/hat.h>
86#include <vm/as.h>
87#include <vm/page.h>
88#include <vm/pvn.h>
89#include <vm/seg.h>
90#include <vm/seg_map.h>
91#include <vm/seg_kpm.h>
92#include <vm/seg_vn.h>
93
94#include <fs/fs_subr.h>
95
96#include <sys/ddi.h>
97#include <sys/int_fmtio.h>
98
99#include <sys/sunddi.h>
100
101#include <sys/priv_names.h>
102
103extern zone_key_t	nfs4clnt_zone_key;
104extern zone_key_t	nfsidmap_zone_key;
105
106/*
 * Timer used by the thread that automatically unmounts ephemeral mounts.
108 */
109static int nfs4_trigger_thread_timer = 20;	/* in seconds */
110
111/*
 * Default for the ephemeral mount timeout (presumably seeds ntg_mount_to).
113 */
114static uint_t nfs4_trigger_mount_to = 240;
115
/*
 * Per-zone state for ephemeral mounts; nfs4_trigger_mount() fetches it
 * with zone_getspecific(nfs4_ephemeral_key, zone).
 */
typedef struct nfs4_trigger_globals {
	/* held while linking a new tree onto ntg_forest */
	kmutex_t		ntg_forest_lock;
	/* NOTE(review): presumably the mount timeout; units not shown here */
	uint_t			ntg_mount_to;
	/* NOTE(review): presumably set once the harvester thread is running */
	int			ntg_thread_started;
	/* head of the list of ephemeral trees, chained through net_next */
	nfs4_ephemeral_tree_t	*ntg_forest;
} nfs4_trigger_globals_t;
122
123kmutex_t	nfs4_ephemeral_thread_lock;
124
125zone_key_t	nfs4_ephemeral_key = ZONE_KEY_UNINITIALIZED;
126
127static void	nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *);
128
129/*
130 * Used for ephemeral mounts; contains data either duplicated from
131 * servinfo4_t, or hand-crafted, depending on type of ephemeral mount.
132 *
133 * It's intended that this structure is used solely for ephemeral
134 * mount-type specific data, for passing this data to
135 * nfs4_trigger_nargs_create().
136 */
/*
 * Only the structure itself is freed by nfs4_trigger_esi_destroy(); per the
 * comment there, the contents go into nfs_args and are released by
 * nfs4_trigger_nargs_destroy().
 */
typedef struct ephemeral_servinfo {
	/* copied into the comma-separated host list for the mount */
	char			*esi_hostname;
	char			*esi_netname;
	char			*esi_path;
	/* NOTE(review): presumably the length of esi_path; confirm */
	int			esi_path_len;
	int			esi_mount_flags;
	struct netbuf		*esi_addr;
	struct netbuf		*esi_syncaddr;
	struct knetconfig	*esi_knconf;
} ephemeral_servinfo_t;
147
148/*
149 * Collect together the mount-type specific and generic data args.
150 */
/*
 * Built by nfs4_trigger_domount_args_create() and released by
 * nfs4_trigger_domount_args_destroy().
 */
typedef struct domount_args {
	/* esi of the server selected as the one to contact */
	ephemeral_servinfo_t	*dma_esi;
	/* MAXPATHLEN-sized buffer */
	char			*dma_hostlist; /* comma-sep. for RO failover */
	/* head of the nfs_args list linked via nfs_ext_u.nfs_extB.next */
	struct nfs_args		*dma_nargs;
} domount_args_t;
156
157
158/*
159 * The vnode ops functions for a trigger stub vnode
160 */
161static int nfs4_trigger_open(vnode_t **, int, cred_t *, caller_context_t *);
162static int nfs4_trigger_getattr(vnode_t *, struct vattr *, int, cred_t *,
163    caller_context_t *);
164static int nfs4_trigger_setattr(vnode_t *, struct vattr *, int, cred_t *,
165    caller_context_t *);
166static int nfs4_trigger_access(vnode_t *, int, int, cred_t *,
167    caller_context_t *);
168static int nfs4_trigger_readlink(vnode_t *, struct uio *, cred_t *,
169    caller_context_t *);
170static int nfs4_trigger_lookup(vnode_t *, char *, vnode_t **,
171    struct pathname *, int, vnode_t *, cred_t *, caller_context_t *,
172    int *, pathname_t *);
173static int nfs4_trigger_create(vnode_t *, char *, struct vattr *,
174    enum vcexcl, int, vnode_t **, cred_t *, int, caller_context_t *,
175    vsecattr_t *);
176static int nfs4_trigger_remove(vnode_t *, char *, cred_t *, caller_context_t *,
177    int);
178static int nfs4_trigger_link(vnode_t *, vnode_t *, char *, cred_t *,
179    caller_context_t *, int);
180static int nfs4_trigger_rename(vnode_t *, char *, vnode_t *, char *,
181    cred_t *, caller_context_t *, int);
182static int nfs4_trigger_mkdir(vnode_t *, char *, struct vattr *,
183    vnode_t **, cred_t *, caller_context_t *, int, vsecattr_t *vsecp);
184static int nfs4_trigger_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
185    caller_context_t *, int);
186static int nfs4_trigger_symlink(vnode_t *, char *, struct vattr *, char *,
187    cred_t *, caller_context_t *, int);
188static int nfs4_trigger_cmp(vnode_t *, vnode_t *, caller_context_t *);
189
190/*
191 * Regular NFSv4 vnodeops that we need to reference directly
192 */
193extern int	nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *,
194		    caller_context_t *);
195extern void	nfs4_inactive(vnode_t *, cred_t *, caller_context_t *);
196extern int	nfs4_rwlock(vnode_t *, int, caller_context_t *);
197extern void	nfs4_rwunlock(vnode_t *, int, caller_context_t *);
198extern int	nfs4_lookup(vnode_t *, char *, vnode_t **,
199		    struct pathname *, int, vnode_t *, cred_t *,
200		    caller_context_t *, int *, pathname_t *);
201extern int	nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *,
202		    caller_context_t *);
203extern int	nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
204		    caller_context_t *);
205extern int	nfs4_fid(vnode_t *, fid_t *, caller_context_t *);
206extern int	nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *);
207
208static int	nfs4_trigger_mount(vnode_t *, cred_t *, vnode_t **);
209static int	nfs4_trigger_domount(vnode_t *, domount_args_t *, vfs_t **,
210    cred_t *, vnode_t **);
211static int 	nfs4_trigger_domount_args_create(vnode_t *, cred_t *,
212    domount_args_t **dmap);
213static void	nfs4_trigger_domount_args_destroy(domount_args_t *dma,
214    vnode_t *vp);
215static ephemeral_servinfo_t *nfs4_trigger_esi_create(vnode_t *, servinfo4_t *,
216    cred_t *);
217static void	nfs4_trigger_esi_destroy(ephemeral_servinfo_t *, vnode_t *);
218static ephemeral_servinfo_t *nfs4_trigger_esi_create_mirrormount(vnode_t *,
219    servinfo4_t *);
220static ephemeral_servinfo_t *nfs4_trigger_esi_create_referral(vnode_t *,
221    cred_t *);
222static struct nfs_args 	*nfs4_trigger_nargs_create(mntinfo4_t *, servinfo4_t *,
223    ephemeral_servinfo_t *);
224static void	nfs4_trigger_nargs_destroy(struct nfs_args *);
225static char	*nfs4_trigger_create_mntopts(vfs_t *);
226static void	nfs4_trigger_destroy_mntopts(char *);
227static int 	nfs4_trigger_add_mntopt(char *, char *, vfs_t *);
228static enum clnt_stat nfs4_trigger_ping_server(servinfo4_t *, int);
229static enum clnt_stat nfs4_ping_server_common(struct knetconfig *,
230    struct netbuf *, int);
231
232extern int	umount2_engine(vfs_t *, int, cred_t *, int);
233
234vnodeops_t *nfs4_trigger_vnodeops;
235
236/*
237 * These are the vnodeops that we must define for stub vnodes.
238 *
239 *
240 * Many of the VOPs defined for NFSv4 do not need to be defined here,
241 * for various reasons. This will result in the VFS default function being
242 * used:
243 *
244 * - These VOPs require a previous VOP_OPEN to have occurred. That will have
245 *   lost the reference to the stub vnode, meaning these should not be called:
246 *       close, read, write, ioctl, readdir, seek.
247 *
248 * - These VOPs are meaningless for vnodes without data pages. Since the
249 *   stub vnode is of type VDIR, these should not be called:
250 *       space, getpage, putpage, map, addmap, delmap, pageio, fsync.
251 *
252 * - These VOPs are otherwise not applicable, and should not be called:
253 *       dump, setsecattr.
254 *
255 *
256 * These VOPs we do not want to define, but nor do we want the VFS default
257 * action. Instead, we specify the VFS error function, with fs_error(), but
258 * note that fs_error() is not actually called. Instead it results in the
259 * use of the error function defined for the particular VOP, in vn_ops_table[]:
260 *
261 * -   frlock, dispose, shrlock.
262 *
263 *
264 * These VOPs we define to use the corresponding regular NFSv4 vnodeop.
265 * NOTE: if any of these ops involve an OTW call with the stub FH, then
266 * that call must be wrapped with save_mnt_secinfo()/check_mnt_secinfo()
267 * to protect the security data in the servinfo4_t for the "parent"
268 * filesystem that contains the stub.
269 *
270 * - These VOPs should not trigger a mount, so that "ls -l" does not:
271 *       pathconf, getsecattr.
272 *
273 * - These VOPs would not make sense to trigger:
274 *       inactive, rwlock, rwunlock, fid, realvp.
275 */
const fs_operation_def_t nfs4_trigger_vnodeops_template[] = {
	/* stub-specific implementations defined in this file */
	VOPNAME_OPEN,		{ .vop_open = nfs4_trigger_open },
	VOPNAME_GETATTR,	{ .vop_getattr = nfs4_trigger_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = nfs4_trigger_setattr },
	VOPNAME_ACCESS,		{ .vop_access = nfs4_trigger_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = nfs4_trigger_lookup },
	VOPNAME_CREATE,		{ .vop_create = nfs4_trigger_create },
	VOPNAME_REMOVE,		{ .vop_remove = nfs4_trigger_remove },
	VOPNAME_LINK,		{ .vop_link = nfs4_trigger_link },
	VOPNAME_RENAME,		{ .vop_rename = nfs4_trigger_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = nfs4_trigger_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = nfs4_trigger_rmdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = nfs4_trigger_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = nfs4_trigger_readlink },
	/* handed straight to the regular NFSv4 vnodeops */
	VOPNAME_INACTIVE, 	{ .vop_inactive = nfs4_inactive },
	VOPNAME_FID,		{ .vop_fid = nfs4_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs4_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs4_rwunlock },
	VOPNAME_REALVP,		{ .vop_realvp = nfs4_realvp },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs4_getsecattr },
	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs4_pathconf },
	/* fs_error selects the per-VOP error function, as described above */
	VOPNAME_FRLOCK,		{ .error = fs_error },
	VOPNAME_DISPOSE,	{ .error = fs_error },
	VOPNAME_SHRLOCK,	{ .error = fs_error },
	/* common fs_vnevent_support() routine */
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	/* final entry of the table */
	NULL, NULL
};
303
304static void
305nfs4_ephemeral_tree_incr(nfs4_ephemeral_tree_t *net)
306{
307	ASSERT(mutex_owned(&net->net_cnt_lock));
308	net->net_refcnt++;
309	ASSERT(net->net_refcnt != 0);
310}
311
312static void
313nfs4_ephemeral_tree_hold(nfs4_ephemeral_tree_t *net)
314{
315	mutex_enter(&net->net_cnt_lock);
316	nfs4_ephemeral_tree_incr(net);
317	mutex_exit(&net->net_cnt_lock);
318}
319
320/*
321 * We need a safe way to decrement the refcnt whilst the
322 * lock is being held.
323 */
324static void
325nfs4_ephemeral_tree_decr(nfs4_ephemeral_tree_t *net)
326{
327	ASSERT(mutex_owned(&net->net_cnt_lock));
328	ASSERT(net->net_refcnt != 0);
329	net->net_refcnt--;
330}
331
332static void
333nfs4_ephemeral_tree_rele(nfs4_ephemeral_tree_t *net)
334{
335	mutex_enter(&net->net_cnt_lock);
336	nfs4_ephemeral_tree_decr(net);
337	mutex_exit(&net->net_cnt_lock);
338}
339
340/*
341 * Trigger ops for stub vnodes; for mirror mounts, etc.
342 *
343 * The general idea is that a "triggering" op will first call
344 * nfs4_trigger_mount(), which will find out whether a mount has already
345 * been triggered.
346 *
347 * If it has, then nfs4_trigger_mount() sets newvp to the root vnode
348 * of the covering vfs.
349 *
350 * If a mount has not yet been triggered, nfs4_trigger_mount() will do so,
351 * and again set newvp, as above.
352 *
353 * The triggering op may then re-issue the VOP by calling it on newvp.
354 *
355 * Note that some ops may perform custom action, and may or may not need
356 * to trigger a mount.
357 *
358 * Some ops need to call the regular NFSv4 vnodeop for a stub vnode. We
359 * obviously can't do this with VOP_<whatever>, since it's a stub vnode
360 * and that would just recurse. Instead, we call the v4 op directly,
361 * by name.  This is OK, since we know that the vnode is for NFSv4,
362 * otherwise it couldn't be a stub.
363 *
364 */
365
366static int
367nfs4_trigger_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
368{
369	int error;
370	vnode_t *newvp;
371
372	error = nfs4_trigger_mount(*vpp, cr, &newvp);
373	if (error)
374		return (error);
375
376	/* Release the stub vnode, as we're losing the reference to it */
377	VN_RELE(*vpp);
378
379	/* Give the caller the root vnode of the newly-mounted fs */
380	*vpp = newvp;
381
382	/* return with VN_HELD(newvp) */
383	return (VOP_OPEN(vpp, flag, cr, ct));
384}
385
386void
387nfs4_fake_attrs(vnode_t *vp, struct vattr *vap)
388{
389	uint_t mask;
390	timespec_t now;
391
392	/*
393	 * Set some attributes here for referrals.
394	 */
395	mask = vap->va_mask;
396	bzero(vap, sizeof (struct vattr));
397	vap->va_mask	= mask;
398	vap->va_uid	= 0;
399	vap->va_gid	= 0;
400	vap->va_nlink	= 1;
401	vap->va_size	= 1;
402	gethrestime(&now);
403	vap->va_atime	= now;
404	vap->va_mtime	= now;
405	vap->va_ctime	= now;
406	vap->va_type	= VDIR;
407	vap->va_mode	= 0555;
408	vap->va_fsid	= vp->v_vfsp->vfs_dev;
409	vap->va_rdev	= 0;
410	vap->va_blksize	= MAXBSIZE;
411	vap->va_nblocks	= 1;
412	vap->va_seq	= 0;
413}
414
415/*
416 * For the majority of cases, nfs4_trigger_getattr() will not trigger
417 * a mount. However, if ATTR_TRIGGER is set, we are being informed
418 * that we need to force the mount before we attempt to determine
419 * the attributes. The intent is an atomic operation for security
420 * testing.
421 *
422 * If we're not triggering a mount, we can still inquire about the
423 * actual attributes from the server in the mirror mount case,
424 * and will return manufactured attributes for a referral (see
425 * the 'create' branch of find_referral_stubvp()).
426 */
427static int
428nfs4_trigger_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
429    caller_context_t *ct)
430{
431	int error;
432
433	if (flags & ATTR_TRIGGER) {
434		vnode_t	*newvp;
435
436		error = nfs4_trigger_mount(vp, cr, &newvp);
437		if (error)
438			return (error);
439
440		error = VOP_GETATTR(newvp, vap, flags, cr, ct);
441		VN_RELE(newvp);
442
443	} else if (RP_ISSTUB_MIRRORMOUNT(VTOR4(vp))) {
444
445		error = nfs4_getattr(vp, vap, flags, cr, ct);
446
447	} else if (RP_ISSTUB_REFERRAL(VTOR4(vp))) {
448
449		nfs4_fake_attrs(vp, vap);
450		error = 0;
451	}
452
453	return (error);
454}
455
456static int
457nfs4_trigger_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
458    caller_context_t *ct)
459{
460	int error;
461	vnode_t *newvp;
462
463	error = nfs4_trigger_mount(vp, cr, &newvp);
464	if (error)
465		return (error);
466
467	error = VOP_SETATTR(newvp, vap, flags, cr, ct);
468	VN_RELE(newvp);
469
470	return (error);
471}
472
473static int
474nfs4_trigger_access(vnode_t *vp, int mode, int flags, cred_t *cr,
475    caller_context_t *ct)
476{
477	int error;
478	vnode_t *newvp;
479
480	error = nfs4_trigger_mount(vp, cr, &newvp);
481	if (error)
482		return (error);
483
484	error = VOP_ACCESS(newvp, mode, flags, cr, ct);
485	VN_RELE(newvp);
486
487	return (error);
488}
489
490static int
491nfs4_trigger_lookup(vnode_t *dvp, char *nm, vnode_t **vpp,
492    struct pathname *pnp, int flags, vnode_t *rdir, cred_t *cr,
493    caller_context_t *ct, int *deflags, pathname_t *rpnp)
494{
495	int error;
496	vnode_t *newdvp;
497	rnode4_t *drp = VTOR4(dvp);
498
499	ASSERT(RP_ISSTUB(drp));
500
501	/*
502	 * It's not legal to lookup ".." for an fs root, so we mustn't pass
503	 * that up. Instead, pass onto the regular op, regardless of whether
504	 * we've triggered a mount.
505	 */
506	if (strcmp(nm, "..") == 0)
507		if (RP_ISSTUB_MIRRORMOUNT(drp)) {
508			return (nfs4_lookup(dvp, nm, vpp, pnp, flags, rdir, cr,
509			    ct, deflags, rpnp));
510		} else if (RP_ISSTUB_REFERRAL(drp)) {
511			/* Return the parent vnode */
512			return (vtodv(dvp, vpp, cr, TRUE));
513		}
514
515	error = nfs4_trigger_mount(dvp, cr, &newdvp);
516	if (error)
517		return (error);
518
519	error = VOP_LOOKUP(newdvp, nm, vpp, pnp, flags, rdir, cr, ct,
520	    deflags, rpnp);
521	VN_RELE(newdvp);
522
523	return (error);
524}
525
526static int
527nfs4_trigger_create(vnode_t *dvp, char *nm, struct vattr *va,
528    enum vcexcl exclusive, int mode, vnode_t **vpp, cred_t *cr,
529    int flags, caller_context_t *ct, vsecattr_t *vsecp)
530{
531	int error;
532	vnode_t *newdvp;
533
534	error = nfs4_trigger_mount(dvp, cr, &newdvp);
535	if (error)
536		return (error);
537
538	error = VOP_CREATE(newdvp, nm, va, exclusive, mode, vpp, cr,
539	    flags, ct, vsecp);
540	VN_RELE(newdvp);
541
542	return (error);
543}
544
545static int
546nfs4_trigger_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct,
547    int flags)
548{
549	int error;
550	vnode_t *newdvp;
551
552	error = nfs4_trigger_mount(dvp, cr, &newdvp);
553	if (error)
554		return (error);
555
556	error = VOP_REMOVE(newdvp, nm, cr, ct, flags);
557	VN_RELE(newdvp);
558
559	return (error);
560}
561
562static int
563nfs4_trigger_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
564    caller_context_t *ct, int flags)
565{
566	int error;
567	vnode_t *newtdvp;
568
569	error = nfs4_trigger_mount(tdvp, cr, &newtdvp);
570	if (error)
571		return (error);
572
573	/*
574	 * We don't check whether svp is a stub. Let the NFSv4 code
575	 * detect that error, and return accordingly.
576	 */
577	error = VOP_LINK(newtdvp, svp, tnm, cr, ct, flags);
578	VN_RELE(newtdvp);
579
580	return (error);
581}
582
583static int
584nfs4_trigger_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
585    cred_t *cr, caller_context_t *ct, int flags)
586{
587	int error;
588	vnode_t *newsdvp;
589	rnode4_t *tdrp = VTOR4(tdvp);
590
591	/*
592	 * We know that sdvp is a stub, otherwise we would not be here.
593	 *
594	 * If tdvp is also be a stub, there are two possibilities: it
595	 * is either the same stub as sdvp [i.e. VN_CMP(sdvp, tdvp)]
596	 * or it is a different stub [!VN_CMP(sdvp, tdvp)].
597	 *
598	 * In the former case, just trigger sdvp, and treat tdvp as
599	 * though it were not a stub.
600	 *
601	 * In the latter case, it might be a different stub for the
602	 * same server fs as sdvp, or for a different server fs.
603	 * Regardless, from the client perspective this would still
604	 * be a cross-filesystem rename, and should not be allowed,
605	 * so return EXDEV, without triggering either mount.
606	 */
607	if (RP_ISSTUB(tdrp) && !VN_CMP(sdvp, tdvp))
608		return (EXDEV);
609
610	error = nfs4_trigger_mount(sdvp, cr, &newsdvp);
611	if (error)
612		return (error);
613
614	error = VOP_RENAME(newsdvp, snm, tdvp, tnm, cr, ct, flags);
615
616	VN_RELE(newsdvp);
617
618	return (error);
619}
620
621/* ARGSUSED */
622static int
623nfs4_trigger_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp,
624    cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp)
625{
626	int error;
627	vnode_t *newdvp;
628
629	error = nfs4_trigger_mount(dvp, cr, &newdvp);
630	if (error)
631		return (error);
632
633	error = VOP_MKDIR(newdvp, nm, va, vpp, cr, ct, flags, vsecp);
634	VN_RELE(newdvp);
635
636	return (error);
637}
638
639static int
640nfs4_trigger_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
641    caller_context_t *ct, int flags)
642{
643	int error;
644	vnode_t *newdvp;
645
646	error = nfs4_trigger_mount(dvp, cr, &newdvp);
647	if (error)
648		return (error);
649
650	error = VOP_RMDIR(newdvp, nm, cdir, cr, ct, flags);
651	VN_RELE(newdvp);
652
653	return (error);
654}
655
656static int
657nfs4_trigger_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm,
658    cred_t *cr, caller_context_t *ct, int flags)
659{
660	int error;
661	vnode_t *newdvp;
662
663	error = nfs4_trigger_mount(dvp, cr, &newdvp);
664	if (error)
665		return (error);
666
667	error = VOP_SYMLINK(newdvp, lnm, tva, tnm, cr, ct, flags);
668	VN_RELE(newdvp);
669
670	return (error);
671}
672
673static int
674nfs4_trigger_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr,
675    caller_context_t *ct)
676{
677	int error;
678	vnode_t *newvp;
679
680	error = nfs4_trigger_mount(vp, cr, &newvp);
681	if (error)
682		return (error);
683
684	error = VOP_READLINK(newvp, uiop, cr, ct);
685	VN_RELE(newvp);
686
687	return (error);
688}
689
690/* end of trigger vnode ops */
691
692/*
693 * See if the mount has already been done by another caller.
694 */
695static int
696nfs4_trigger_mounted_already(vnode_t *vp, vnode_t **newvpp,
697    bool_t *was_mounted, vfs_t **vfsp)
698{
699	int		error;
700	mntinfo4_t	*mi = VTOMI4(vp);
701
702	*was_mounted = FALSE;
703
704	error = vn_vfsrlock_wait(vp);
705	if (error)
706		return (error);
707
708	*vfsp = vn_mountedvfs(vp);
709	if (*vfsp != NULL) {
710		/* the mount has already occurred */
711		error = VFS_ROOT(*vfsp, newvpp);
712		if (!error) {
713			/* need to update the reference time  */
714			mutex_enter(&mi->mi_lock);
715			if (mi->mi_ephemeral)
716				mi->mi_ephemeral->ne_ref_time =
717				    gethrestime_sec();
718			mutex_exit(&mi->mi_lock);
719
720			*was_mounted = TRUE;
721		}
722	}
723
724	vn_vfsunlock(vp);
725	return (0);
726}
727
/*
 * Mount upon a trigger vnode; for mirror-mounts, referrals, etc.
 *
 * The mount may have already occurred, via another thread. If not,
 * assemble the location information - which may require fetching - and
 * perform the mount.
 *
 * Sets newvp to be the root of the fs that is now covering vp. Note
 * that we return with VN_HELD(*newvp).
 *
 * The caller is responsible for passing the VOP onto the covering fs.
 *
 * Returns 0 on success. Otherwise returns:
 *   - the error from nfs4_trigger_mounted_already(),
 *     nfs4_trigger_domount_args_create() or nfs4_trigger_domount();
 *   - EIO if the existing ephemeral tree has a
 *     NFS4_EPHEMERAL_TREE_PROCESSING status bit set;
 *   - EINVAL if crdup() returns NULL;
 *   - ENOSYS if nothing failed but no vnode was produced.
 *
 * Locking: net_tree_lock is taken before the mount is attempted and is
 * dropped at "done" (must_unlock); net_cnt_lock protects net_status and
 * the reference count.
 */
static int
nfs4_trigger_mount(vnode_t *vp, cred_t *cr, vnode_t **newvpp)
{
	int			 error;
	vfs_t			*vfsp;
	rnode4_t		*rp = VTOR4(vp);
	mntinfo4_t		*mi = VTOMI4(vp);
	domount_args_t		*dma;

	nfs4_ephemeral_tree_t	*net;

	/* TRUE once net_tree_lock is held and MOUNTING has been set */
	bool_t			must_unlock = FALSE;
	/* TRUE when this call allocated the tree for mi */
	bool_t			is_building = FALSE;
	bool_t			was_mounted = FALSE;

	cred_t			*mcred = NULL;

	nfs4_trigger_globals_t	*ntg;

	zone_t			*zone = curproc->p_zone;

	ASSERT(RP_ISSTUB(rp));

	*newvpp = NULL;

	/*
	 * Has the mount already occurred?
	 */
	error = nfs4_trigger_mounted_already(vp, newvpp,
	    &was_mounted, &vfsp);
	if (error || was_mounted)
		goto done;

	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
	ASSERT(ntg != NULL);

	mutex_enter(&mi->mi_lock);

	/*
	 * We need to lock down the ephemeral tree.
	 */
	if (mi->mi_ephemeral_tree == NULL) {
		/* No tree yet for this mntinfo: build one, with one ref. */
		net = kmem_zalloc(sizeof (*net), KM_SLEEP);
		mutex_init(&net->net_tree_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&net->net_cnt_lock, NULL, MUTEX_DEFAULT, NULL);
		net->net_refcnt = 1;
		net->net_status = NFS4_EPHEMERAL_TREE_BUILDING;
		is_building = TRUE;

		/*
		 * We need to add it to the zone specific list for
		 * automatic unmounting and harvesting of deadwood.
		 */
		mutex_enter(&ntg->ntg_forest_lock);
		if (ntg->ntg_forest != NULL)
			net->net_next = ntg->ntg_forest;
		ntg->ntg_forest = net;
		mutex_exit(&ntg->ntg_forest_lock);

		/*
		 * No lock order confusion with mi_lock because no
		 * other node could have grabbed net_tree_lock.
		 */
		mutex_enter(&net->net_tree_lock);
		mi->mi_ephemeral_tree = net;
		net->net_mount = mi;
		mutex_exit(&mi->mi_lock);

		/* the tree now points at mi, so hold mi and its vfs */
		MI4_HOLD(mi);
		VFS_HOLD(mi->mi_vfsp);
	} else {
		/* Existing tree: take a reference before dropping mi_lock. */
		net = mi->mi_ephemeral_tree;
		nfs4_ephemeral_tree_hold(net);

		mutex_exit(&mi->mi_lock);

		mutex_enter(&net->net_tree_lock);

		/*
		 * We can only proceed if the tree is neither locked
		 * nor being torn down.
		 */
		mutex_enter(&net->net_cnt_lock);
		if (net->net_status & NFS4_EPHEMERAL_TREE_PROCESSING) {
			/* undo the hold taken above and back out */
			nfs4_ephemeral_tree_decr(net);
			mutex_exit(&net->net_cnt_lock);
			mutex_exit(&net->net_tree_lock);

			return (EIO);
		}
		mutex_exit(&net->net_cnt_lock);
	}

	mutex_enter(&net->net_cnt_lock);
	net->net_status |= NFS4_EPHEMERAL_TREE_MOUNTING;
	mutex_exit(&net->net_cnt_lock);

	/* from here on, "done" must clear MOUNTING and drop net_tree_lock */
	must_unlock = TRUE;

	error = nfs4_trigger_domount_args_create(vp, cr, &dma);
	if (error)
		goto done;

	/*
	 * Note that since we define mirror mounts to work
	 * for any user, we simply extend the privileges of
	 * the user's credentials to allow the mount to
	 * proceed.
	 */
	mcred = crdup(cr);
	if (mcred == NULL) {
		error = EINVAL;
		nfs4_trigger_domount_args_destroy(dma, vp);
		goto done;
	}

	crset_zone_privall(mcred);
	if (is_system_labeled())
		(void) setpflags(NET_MAC_AWARE, 1, mcred);

	error = nfs4_trigger_domount(vp, dma, &vfsp, mcred, newvpp);
	nfs4_trigger_domount_args_destroy(dma, vp);

	DTRACE_PROBE2(nfs4clnt__func__referral__mount,
	    vnode_t *, vp, int, error);

	crfree(mcred);

done:

	if (must_unlock) {
		mutex_enter(&net->net_cnt_lock);
		net->net_status &= ~NFS4_EPHEMERAL_TREE_MOUNTING;

		/*
		 * REFCNT: If we are the root of the tree, then we need
		 * to keep a reference because we malloced the tree and
		 * this is where we tied it to our mntinfo.
		 *
		 * If we are not the root of the tree, then our tie to
		 * the mntinfo occurred elsewhere and we need to
		 * decrement the reference to the tree.
		 */
		if (is_building)
			net->net_status &= ~NFS4_EPHEMERAL_TREE_BUILDING;
		else
			nfs4_ephemeral_tree_decr(net);
		mutex_exit(&net->net_cnt_lock);

		mutex_exit(&net->net_tree_lock);
	}

	/* success without a vnode to hand back is reported as ENOSYS */
	if (!error && (newvpp == NULL || *newvpp == NULL))
		error = ENOSYS;

	return (error);
}
897
898/*
899 * Collect together both the generic & mount-type specific args.
900 */
901static int
902nfs4_trigger_domount_args_create(vnode_t *vp, cred_t *cr, domount_args_t **dmap)
903{
904	int nointr;
905	char *hostlist;
906	servinfo4_t *svp;
907	struct nfs_args *nargs, *nargs_head;
908	enum clnt_stat status;
909	ephemeral_servinfo_t *esi, *esi_first;
910	domount_args_t *dma;
911	mntinfo4_t *mi = VTOMI4(vp);
912
913	nointr = !(mi->mi_flags & MI4_INT);
914	hostlist = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
915
916	svp = mi->mi_curr_serv;
917	/* check if the current server is responding */
918	status = nfs4_trigger_ping_server(svp, nointr);
919	if (status == RPC_SUCCESS) {
920		esi_first = nfs4_trigger_esi_create(vp, svp, cr);
921		if (esi_first == NULL) {
922			kmem_free(hostlist, MAXPATHLEN);
923			return (EINVAL);
924		}
925
926		(void) strlcpy(hostlist, esi_first->esi_hostname, MAXPATHLEN);
927
928		nargs_head = nfs4_trigger_nargs_create(mi, svp, esi_first);
929	} else {
930		/* current server did not respond */
931		esi_first = NULL;
932		nargs_head = NULL;
933	}
934	nargs = nargs_head;
935
936	/*
937	 * NFS RO failover.
938	 *
939	 * If we have multiple servinfo4 structures, linked via sv_next,
940	 * we must create one nfs_args for each, linking the nfs_args via
941	 * nfs_ext_u.nfs_extB.next.
942	 *
943	 * We need to build a corresponding esi for each, too, but that is
944	 * used solely for building nfs_args, and may be immediately
945	 * discarded, as domount() requires the info from just one esi,
946	 * but all the nfs_args.
947	 *
948	 * Currently, the NFS mount code will hang if not all servers
949	 * requested are available. To avoid that, we need to ping each
950	 * server, here, and remove it from the list if it is not
951	 * responding. This has the side-effect of that server then
952	 * being permanently unavailable for this failover mount, even if
953	 * it recovers. That's unfortunate, but the best we can do until
954	 * the mount code path is fixed.
955	 */
956
957	/*
958	 * If the current server was down, loop indefinitely until we find
959	 * at least one responsive server.
960	 */
961	do {
962		/* no locking needed for sv_next; it is only set at fs mount */
963		for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
964			struct nfs_args *next;
965
966			/*
967			 * nargs_head: the head of the nfs_args list
968			 * nargs: the current tail of the list
969			 * next: the newly-created element to be added
970			 */
971
972			/*
973			 * We've already tried the current server, above;
974			 * if it was responding, we have already included it
975			 * and it may now be ignored.
976			 *
977			 * Otherwise, try it again, since it may now have
978			 * recovered.
979			 */
980			if (svp == mi->mi_curr_serv && esi_first != NULL)
981				continue;
982
983			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
984			if (svp->sv_flags & SV4_NOTINUSE) {
985				nfs_rw_exit(&svp->sv_lock);
986				continue;
987			}
988			nfs_rw_exit(&svp->sv_lock);
989
990			/* check if the server is responding */
991			status = nfs4_trigger_ping_server(svp, nointr);
992			if (status == RPC_INTR) {
993				kmem_free(hostlist, MAXPATHLEN);
994				nfs4_trigger_esi_destroy(esi_first, vp);
995				nargs = nargs_head;
996				while (nargs != NULL) {
997					next = nargs->nfs_ext_u.nfs_extB.next;
998					nfs4_trigger_nargs_destroy(nargs);
999					nargs = next;
1000				}
1001				return (EINTR);
1002			} else if (status != RPC_SUCCESS) {
1003				/* if the server did not respond, ignore it */
1004				continue;
1005			}
1006
1007			esi = nfs4_trigger_esi_create(vp, svp, cr);
1008			if (esi == NULL)
1009				continue;
1010
1011			/*
1012			 * If the original current server (mi_curr_serv)
1013			 * was down when when we first tried it,
1014			 * (i.e. esi_first == NULL),
1015			 * we select this new server (svp) to be the server
1016			 * that we will actually contact (esi_first).
1017			 *
1018			 * Note that it's possible that mi_curr_serv == svp,
1019			 * if that mi_curr_serv was down but has now recovered.
1020			 */
1021			next = nfs4_trigger_nargs_create(mi, svp, esi);
1022			if (esi_first == NULL) {
1023				ASSERT(nargs == NULL);
1024				ASSERT(nargs_head == NULL);
1025				nargs_head = next;
1026				esi_first = esi;
1027				(void) strlcpy(hostlist,
1028				    esi_first->esi_hostname, MAXPATHLEN);
1029			} else {
1030				ASSERT(nargs_head != NULL);
1031				nargs->nfs_ext_u.nfs_extB.next = next;
1032				(void) strlcat(hostlist, ",", MAXPATHLEN);
1033				(void) strlcat(hostlist, esi->esi_hostname,
1034				    MAXPATHLEN);
1035				/* esi was only needed for hostname & nargs */
1036				nfs4_trigger_esi_destroy(esi, vp);
1037			}
1038
1039			nargs = next;
1040		}
1041
1042		/* if we've had no response at all, wait a second */
1043		if (esi_first == NULL)
1044			delay(drv_usectohz(1000000));
1045
1046	} while (esi_first == NULL);
1047	ASSERT(nargs_head != NULL);
1048
1049	dma = kmem_zalloc(sizeof (domount_args_t), KM_SLEEP);
1050	dma->dma_esi = esi_first;
1051	dma->dma_hostlist = hostlist;
1052	dma->dma_nargs = nargs_head;
1053	*dmap = dma;
1054
1055	return (0);
1056}
1057
1058static void
1059nfs4_trigger_domount_args_destroy(domount_args_t *dma, vnode_t *vp)
1060{
1061	if (dma != NULL) {
1062		if (dma->dma_esi != NULL && vp != NULL)
1063			nfs4_trigger_esi_destroy(dma->dma_esi, vp);
1064
1065		if (dma->dma_hostlist != NULL)
1066			kmem_free(dma->dma_hostlist, MAXPATHLEN);
1067
1068		if (dma->dma_nargs != NULL) {
1069			struct nfs_args *nargs = dma->dma_nargs;
1070
1071			do {
1072				struct nfs_args *next =
1073				    nargs->nfs_ext_u.nfs_extB.next;
1074
1075				nfs4_trigger_nargs_destroy(nargs);
1076				nargs = next;
1077			} while (nargs != NULL);
1078		}
1079
1080		kmem_free(dma, sizeof (domount_args_t));
1081	}
1082}
1083
1084/*
1085 * The ephemeral_servinfo_t struct contains basic information we will need to
1086 * perform the mount. Whilst the structure is generic across different
1087 * types of ephemeral mount, the way we gather its contents differs.
1088 */
1089static ephemeral_servinfo_t *
1090nfs4_trigger_esi_create(vnode_t *vp, servinfo4_t *svp, cred_t *cr)
1091{
1092	ephemeral_servinfo_t *esi;
1093	rnode4_t *rp = VTOR4(vp);
1094
1095	ASSERT(RP_ISSTUB(rp));
1096
1097	/* Call the ephemeral type-specific routine */
1098	if (RP_ISSTUB_MIRRORMOUNT(rp))
1099		esi = nfs4_trigger_esi_create_mirrormount(vp, svp);
1100	else if (RP_ISSTUB_REFERRAL(rp))
1101		esi = nfs4_trigger_esi_create_referral(vp, cr);
1102	else
1103		esi = NULL;
1104	return (esi);
1105}
1106
1107static void
1108nfs4_trigger_esi_destroy(ephemeral_servinfo_t *esi, vnode_t *vp)
1109{
1110	rnode4_t *rp = VTOR4(vp);
1111
1112	ASSERT(RP_ISSTUB(rp));
1113
1114	/* Currently, no need for an ephemeral type-specific routine */
1115
1116	/*
1117	 * The contents of ephemeral_servinfo_t goes into nfs_args,
1118	 * and will be handled by nfs4_trigger_nargs_destroy().
1119	 * We need only free the structure itself.
1120	 */
1121	if (esi != NULL)
1122		kmem_free(esi, sizeof (ephemeral_servinfo_t));
1123}
1124
1125/*
1126 * Some of this may turn out to be common with other ephemeral types,
1127 * in which case it should be moved to nfs4_trigger_esi_create(), or a
1128 * common function called.
1129 */
1130
1131/*
1132 * Mirror mounts case - should have all data available
1133 */
1134static ephemeral_servinfo_t *
1135nfs4_trigger_esi_create_mirrormount(vnode_t *vp, servinfo4_t *svp)
1136{
1137	char			*stubpath;
1138	struct knetconfig	*sikncp, *svkncp;
1139	struct netbuf		*bufp;
1140	ephemeral_servinfo_t	*esi;
1141
1142	esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP);
1143
1144	/* initially set to be our type of ephemeral mount; may be added to */
1145	esi->esi_mount_flags = NFSMNT_MIRRORMOUNT;
1146
1147	/*
1148	 * We're copying info from the stub rnode's servinfo4, but
1149	 * we must create new copies, not pointers, since this information
1150	 * is to be associated with the new mount, which will be
1151	 * unmounted (and its structures freed) separately
1152	 */
1153
1154	/*
1155	 * Sizes passed to kmem_[z]alloc here must match those freed
1156	 * in nfs4_free_args()
1157	 */
1158
1159	/*
1160	 * We hold sv_lock across kmem_zalloc() calls that may sleep, but this
1161	 * is difficult to avoid: as we need to read svp to calculate the
1162	 * sizes to be allocated.
1163	 */
1164	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1165
1166	esi->esi_hostname = kmem_zalloc(strlen(svp->sv_hostname) + 1, KM_SLEEP);
1167	(void) strcat(esi->esi_hostname, svp->sv_hostname);
1168
1169	esi->esi_addr = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
1170	bufp = esi->esi_addr;
1171	bufp->len = svp->sv_addr.len;
1172	bufp->maxlen = svp->sv_addr.maxlen;
1173	bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
1174	bcopy(svp->sv_addr.buf, bufp->buf, bufp->len);
1175
1176	esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP);
1177	sikncp = esi->esi_knconf;
1178	svkncp = svp->sv_knconf;
1179	sikncp->knc_semantics = svkncp->knc_semantics;
1180	sikncp->knc_protofmly = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1181	(void) strcat((char *)sikncp->knc_protofmly,
1182	    (char *)svkncp->knc_protofmly);
1183	sikncp->knc_proto = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1184	(void) strcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto);
1185	sikncp->knc_rdev = svkncp->knc_rdev;
1186
1187	/*
1188	 * Used when AUTH_DH is negotiated.
1189	 *
1190	 * This is ephemeral mount-type specific, since it contains the
1191	 * server's time-sync syncaddr.
1192	 */
1193	if (svp->sv_dhsec) {
1194		struct netbuf *bufp;
1195		sec_data_t *sdata;
1196		dh_k4_clntdata_t *data;
1197
1198		sdata = svp->sv_dhsec;
1199		data = (dh_k4_clntdata_t *)sdata->data;
1200		ASSERT(sdata->rpcflavor == AUTH_DH);
1201
1202		bufp = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
1203		bufp->len = data->syncaddr.len;
1204		bufp->maxlen = data->syncaddr.maxlen;
1205		bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
1206		bcopy(data->syncaddr.buf, bufp->buf, bufp->len);
1207		esi->esi_syncaddr = bufp;
1208
1209		if (data->netname != NULL) {
1210			int nmlen = data->netnamelen;
1211
1212			/*
1213			 * We need to copy from a dh_k4_clntdata_t
1214			 * netname/netnamelen pair to a NUL-terminated
1215			 * netname string suitable for putting in nfs_args,
1216			 * where the latter has no netnamelen field.
1217			 */
1218			esi->esi_netname = kmem_zalloc(nmlen + 1, KM_SLEEP);
1219			bcopy(data->netname, esi->esi_netname, nmlen);
1220		}
1221	} else {
1222		esi->esi_syncaddr = NULL;
1223		esi->esi_netname = NULL;
1224	}
1225
1226	stubpath = fn_path(VTOSV(vp)->sv_name);
1227	/* step over initial '.', to avoid e.g. sv_path: "/tank./ws" */
1228	ASSERT(*stubpath == '.');
1229	stubpath += 1;
1230
1231	/* for nfs_args->fh */
1232	esi->esi_path_len = strlen(stubpath) + 1;
1233	if (strcmp(svp->sv_path, "/") != 0)
1234		esi->esi_path_len += strlen(svp->sv_path);
1235	esi->esi_path = kmem_zalloc(esi->esi_path_len, KM_SLEEP);
1236	if (strcmp(svp->sv_path, "/") != 0)
1237		(void) strcat(esi->esi_path, svp->sv_path);
1238	(void) strcat(esi->esi_path, stubpath);
1239
1240	stubpath -= 1;
1241	/* stubpath allocated by fn_path() */
1242	kmem_free(stubpath, strlen(stubpath) + 1);
1243
1244	nfs_rw_exit(&svp->sv_lock);
1245
1246	return (esi);
1247}
1248
1249/*
1250 * Makes an upcall to NFSMAPID daemon to resolve hostname of NFS server to
1251 * get network information required to do the mount call.
1252 */
1253int
1254nfs4_callmapid(utf8string *server, struct nfs_fsl_info *resp)
1255{
1256	door_arg_t	door_args;
1257	door_handle_t	dh;
1258	XDR		xdr;
1259	refd_door_args_t *xdr_argsp;
1260	refd_door_res_t  *orig_resp;
1261	k_sigset_t	smask;
1262	int		xdr_len = 0;
1263	int 		res_len = 16; /* length of an ip adress */
1264	int		orig_reslen = res_len;
1265	int		error = 0;
1266	struct nfsidmap_globals *nig;
1267
1268	if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)
1269		return (ECONNREFUSED);
1270
1271	nig = zone_getspecific(nfsidmap_zone_key, nfs_zone());
1272	ASSERT(nig != NULL);
1273
1274	mutex_enter(&nig->nfsidmap_daemon_lock);
1275	dh = nig->nfsidmap_daemon_dh;
1276	if (dh == NULL) {
1277		mutex_exit(&nig->nfsidmap_daemon_lock);
1278		cmn_err(CE_NOTE,
1279		    "nfs4_callmapid: nfsmapid daemon not " \
1280		    "running unable to resolve host name\n");
1281		return (EINVAL);
1282	}
1283	door_ki_hold(dh);
1284	mutex_exit(&nig->nfsidmap_daemon_lock);
1285
1286	xdr_len = xdr_sizeof(&(xdr_utf8string), server);
1287
1288	xdr_argsp = kmem_zalloc(xdr_len + sizeof (*xdr_argsp), KM_SLEEP);
1289	xdr_argsp->xdr_len = xdr_len;
1290	xdr_argsp->cmd = NFSMAPID_SRV_NETINFO;
1291
1292	xdrmem_create(&xdr, (char *)&xdr_argsp->xdr_arg,
1293	    xdr_len, XDR_ENCODE);
1294
1295	if (!xdr_utf8string(&xdr, server)) {
1296		kmem_free(xdr_argsp, xdr_len + sizeof (*xdr_argsp));
1297		door_ki_rele(dh);
1298		return (1);
1299	}
1300
1301	if (orig_reslen)
1302		orig_resp = kmem_alloc(orig_reslen, KM_SLEEP);
1303
1304	door_args.data_ptr = (char *)xdr_argsp;
1305	door_args.data_size = sizeof (*xdr_argsp) + xdr_argsp->xdr_len;
1306	door_args.desc_ptr = NULL;
1307	door_args.desc_num = 0;
1308	door_args.rbuf = orig_resp ? (char *)orig_resp : NULL;
1309	door_args.rsize = res_len;
1310
1311	sigintr(&smask, 1);
1312	error = door_ki_upcall(dh, &door_args);
1313	sigunintr(&smask);
1314
1315	door_ki_rele(dh);
1316
1317	kmem_free(xdr_argsp, xdr_len + sizeof (*xdr_argsp));
1318	if (error) {
1319		kmem_free(orig_resp, orig_reslen);
1320		/*
1321		 * There is no door to connect to. The referral daemon
1322		 * must not be running yet.
1323		 */
1324		cmn_err(CE_WARN,
1325		    "nfsmapid not running cannot resolve host name");
1326		goto out;
1327	}
1328
1329	/*
1330	 * If the results buffer passed back are not the same as
1331	 * what was sent free the old buffer and use the new one.
1332	 */
1333	if (orig_resp && orig_reslen) {
1334		refd_door_res_t *door_resp;
1335
1336		door_resp = (refd_door_res_t *)door_args.rbuf;
1337		if ((void *)door_args.rbuf != orig_resp)
1338			kmem_free(orig_resp, orig_reslen);
1339		if (door_resp->res_status == 0) {
1340			xdrmem_create(&xdr, (char *)&door_resp->xdr_res,
1341			    door_resp->xdr_len, XDR_DECODE);
1342			bzero(resp, sizeof (struct nfs_fsl_info));
1343			if (!xdr_nfs_fsl_info(&xdr, resp)) {
1344				DTRACE_PROBE2(
1345				    nfs4clnt__debug__referral__upcall__xdrfail,
1346				    struct nfs_fsl_info *, resp,
1347				    char *, "nfs4_callmapid");
1348				error = EINVAL;
1349			}
1350		} else {
1351			DTRACE_PROBE2(
1352			    nfs4clnt__debug__referral__upcall__badstatus,
1353			    int, door_resp->res_status,
1354			    char *, "nfs4_callmapid");
1355			error = door_resp->res_status;
1356		}
1357		kmem_free(door_args.rbuf, door_args.rsize);
1358	}
1359out:
1360	DTRACE_PROBE2(nfs4clnt__func__referral__upcall,
1361	    char *, server, int, error);
1362	return (error);
1363}
1364
1365/*
1366 * Fetches the fs_locations attribute. Typically called
1367 * from a Replication/Migration/Referrals/Mirror-mount context
1368 *
1369 * Fills in the attributes in garp. The caller is assumed
1370 * to have allocated memory for garp.
1371 *
1372 * lock: if set do not lock s_recovlock and mi_recovlock mutex,
1373 *	 it's already done by caller. Otherwise lock these mutexes
1374 *	 before doing the rfs4call().
1375 *
1376 * Returns
1377 * 	1	 for success
1378 * 	0	 for failure
1379 */
1380int
1381nfs4_fetch_locations(mntinfo4_t *mi, nfs4_sharedfh_t *sfh, char *nm,
1382    cred_t *cr, nfs4_ga_res_t *garp, COMPOUND4res_clnt *callres, bool_t lock)
1383{
1384	COMPOUND4args_clnt args;
1385	COMPOUND4res_clnt res;
1386	nfs_argop4 *argop;
1387	int argoplist_size = 3 * sizeof (nfs_argop4);
1388	nfs4_server_t *sp = NULL;
1389	int doqueue = 1;
1390	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
1391	int retval = 1;
1392	struct nfs4_clnt *nfscl;
1393
1394	if (lock == TRUE)
1395		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
1396	else
1397		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
1398		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
1399
1400	sp = find_nfs4_server(mi);
1401	if (lock == TRUE)
1402		nfs_rw_exit(&mi->mi_recovlock);
1403
1404	if (sp != NULL)
1405		mutex_exit(&sp->s_lock);
1406
1407	if (lock == TRUE) {
1408		if (sp != NULL)
1409			(void) nfs_rw_enter_sig(&sp->s_recovlock,
1410			    RW_WRITER, 0);
1411		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
1412	} else {
1413		if (sp != NULL) {
1414			ASSERT(nfs_rw_lock_held(&sp->s_recovlock, RW_READER) ||
1415			    nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
1416		}
1417	}
1418
1419	/*
1420	 * Do we want to do the setup for recovery here?
1421	 *
1422	 * We know that the server responded to a null ping a very
1423	 * short time ago, and we know that we intend to do a
1424	 * single stateless operation - we want to fetch attributes,
1425	 * so we know we can't encounter errors about state.  If
1426	 * something goes wrong with the GETATTR, like not being
1427	 * able to get a response from the server or getting any
1428	 * kind of FH error, we should fail the mount.
1429	 *
1430	 * We may want to re-visited this at a later time.
1431	 */
1432	argop = kmem_alloc(argoplist_size, KM_SLEEP);
1433
1434	args.ctag = TAG_GETATTR_FSLOCATION;
1435	/* PUTFH LOOKUP GETATTR */
1436	args.array_len = 3;
1437	args.array = argop;
1438
1439	/* 0. putfh file */
1440	argop[0].argop = OP_CPUTFH;
1441	argop[0].nfs_argop4_u.opcputfh.sfh = sfh;
1442
1443	/* 1. lookup name, can't be dotdot */
1444	argop[1].argop = OP_CLOOKUP;
1445	argop[1].nfs_argop4_u.opclookup.cname = nm;
1446
1447	/* 2. file attrs */
1448	argop[2].argop = OP_GETATTR;
1449	argop[2].nfs_argop4_u.opgetattr.attr_request =
1450	    FATTR4_FSID_MASK | FATTR4_FS_LOCATIONS_MASK |
1451	    FATTR4_MOUNTED_ON_FILEID_MASK;
1452	argop[2].nfs_argop4_u.opgetattr.mi = mi;
1453
1454	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
1455
1456	if (lock == TRUE) {
1457		nfs_rw_exit(&mi->mi_recovlock);
1458		if (sp != NULL)
1459			nfs_rw_exit(&sp->s_recovlock);
1460	}
1461
1462	nfscl = zone_getspecific(nfs4clnt_zone_key, nfs_zone());
1463	nfscl->nfscl_stat.referrals.value.ui64++;
1464	DTRACE_PROBE3(nfs4clnt__func__referral__fsloc,
1465	    nfs4_sharedfh_t *, sfh, char *, nm, nfs4_error_t *, &e);
1466
1467	if (e.error != 0) {
1468		if (sp != NULL)
1469			nfs4_server_rele(sp);
1470		kmem_free(argop, argoplist_size);
1471		return (0);
1472	}
1473
1474	/*
1475	 * Check for all possible error conditions.
1476	 * For valid replies without an ops array or for illegal
1477	 * replies, return a failure.
1478	 */
1479	if (res.status != NFS4_OK || res.array_len < 3 ||
1480	    res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) {
1481		retval = 0;
1482		goto exit;
1483	}
1484
1485	/*
1486	 * There isn't much value in putting the attributes
1487	 * in the attr cache since fs_locations4 aren't
1488	 * encountered very frequently, so just make them
1489	 * available to the caller.
1490	 */
1491	*garp = res.array[2].nfs_resop4_u.opgetattr.ga_res;
1492
1493	DTRACE_PROBE2(nfs4clnt__debug__referral__fsloc,
1494	    nfs4_ga_res_t *, garp, char *, "nfs4_fetch_locations");
1495
1496	/* No fs_locations? -- return a failure */
1497	if (garp->n4g_ext_res == NULL ||
1498	    garp->n4g_ext_res->n4g_fslocations.locations_val == NULL) {
1499		retval = 0;
1500		goto exit;
1501	}
1502
1503	if (!garp->n4g_fsid_valid)
1504		retval = 0;
1505
1506exit:
1507	if (retval == 0) {
1508		/* the call was ok but failed validating the call results */
1509		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1510	} else {
1511		ASSERT(callres != NULL);
1512		*callres = res;
1513	}
1514
1515	if (sp != NULL)
1516		nfs4_server_rele(sp);
1517	kmem_free(argop, argoplist_size);
1518	return (retval);
1519}
1520
1521/* tunable to disable referral mounts */
1522int nfs4_no_referrals = 0;
1523
1524/*
1525 * Returns NULL if the vnode cannot be created or found.
1526 */
1527vnode_t *
1528find_referral_stubvp(vnode_t *dvp, char *nm, cred_t *cr)
1529{
1530	nfs_fh4 *stub_fh, *dfh;
1531	nfs4_sharedfh_t *sfhp;
1532	char *newfhval;
1533	vnode_t *vp = NULL;
1534	fattr4_mounted_on_fileid mnt_on_fileid;
1535	nfs4_ga_res_t garp;
1536	mntinfo4_t *mi;
1537	COMPOUND4res_clnt callres;
1538	hrtime_t t;
1539
1540	if (nfs4_no_referrals)
1541		return (NULL);
1542
1543	/*
1544	 * Get the mounted_on_fileid, unique on that server::fsid
1545	 */
1546	mi = VTOMI4(dvp);
1547	if (nfs4_fetch_locations(mi, VTOR4(dvp)->r_fh, nm, cr,
1548	    &garp, &callres, FALSE) == 0)
1549		return (NULL);
1550	mnt_on_fileid = garp.n4g_mon_fid;
1551	xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1552
1553	/*
1554	 * Build a fake filehandle from the dir FH and the mounted_on_fileid
1555	 */
1556	dfh = &VTOR4(dvp)->r_fh->sfh_fh;
1557	stub_fh = kmem_alloc(sizeof (nfs_fh4), KM_SLEEP);
1558	stub_fh->nfs_fh4_val = kmem_alloc(dfh->nfs_fh4_len +
1559	    sizeof (fattr4_mounted_on_fileid), KM_SLEEP);
1560	newfhval = stub_fh->nfs_fh4_val;
1561
1562	/* copy directory's file handle */
1563	bcopy(dfh->nfs_fh4_val, newfhval, dfh->nfs_fh4_len);
1564	stub_fh->nfs_fh4_len = dfh->nfs_fh4_len;
1565	newfhval = newfhval + dfh->nfs_fh4_len;
1566
1567	/* Add mounted_on_fileid. Use bcopy to avoid alignment problem */
1568	bcopy((char *)&mnt_on_fileid, newfhval,
1569	    sizeof (fattr4_mounted_on_fileid));
1570	stub_fh->nfs_fh4_len += sizeof (fattr4_mounted_on_fileid);
1571
1572	sfhp = sfh4_put(stub_fh, VTOMI4(dvp), NULL);
1573	kmem_free(stub_fh->nfs_fh4_val, dfh->nfs_fh4_len +
1574	    sizeof (fattr4_mounted_on_fileid));
1575	kmem_free(stub_fh, sizeof (nfs_fh4));
1576	if (sfhp == NULL)
1577		return (NULL);
1578
1579	t = gethrtime();
1580	garp.n4g_va.va_type = VDIR;
1581	vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t,
1582	    cr, dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
1583
1584	if (vp != NULL)
1585		vp->v_type = VDIR;
1586
1587	sfh4_rele(&sfhp);
1588	return (vp);
1589}
1590
1591int
1592nfs4_setup_referral(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
1593{
1594	vnode_t *nvp;
1595	rnode4_t *rp;
1596
1597	if ((nvp = find_referral_stubvp(dvp, nm, cr)) == NULL)
1598		return (EINVAL);
1599
1600	rp = VTOR4(nvp);
1601	mutex_enter(&rp->r_statelock);
1602	r4_stub_referral(rp);
1603	mutex_exit(&rp->r_statelock);
1604	dnlc_enter(dvp, nm, nvp);
1605
1606	if (*vpp != NULL)
1607		VN_RELE(*vpp);	/* no longer need this vnode */
1608
1609	*vpp = nvp;
1610
1611	return (0);
1612}
1613
1614/*
1615 * Fetch the location information and resolve the new server.
1616 * Caller needs to free up the XDR data which is returned.
1617 * Input: mount info, shared filehandle, nodename
1618 * Return: Index to the result or Error(-1)
1619 * Output: FsLocations Info, Resolved Server Info.
1620 */
1621int
1622nfs4_process_referral(mntinfo4_t *mi, nfs4_sharedfh_t *sfh,
1623    char *nm, cred_t *cr, nfs4_ga_res_t *grp, COMPOUND4res_clnt *res,
1624    struct nfs_fsl_info *fsloc)
1625{
1626	fs_location4 *fsp;
1627	struct nfs_fsl_info nfsfsloc;
1628	int ret, i, error;
1629	nfs4_ga_res_t garp;
1630	COMPOUND4res_clnt callres;
1631	struct knetconfig *knc;
1632
1633	ret = nfs4_fetch_locations(mi, sfh, nm, cr, &garp, &callres, TRUE);
1634	if (ret == 0)
1635		return (-1);
1636
1637	/*
1638	 * As a lame attempt to figuring out if we're
1639	 * handling a migration event or a referral,
1640	 * look for rnodes with this fsid in the rnode
1641	 * cache.
1642	 *
1643	 * If we can find one or more such rnodes, it
1644	 * means we're handling a migration event and
1645	 * we want to bail out in that case.
1646	 */
1647	if (r4find_by_fsid(mi, &garp.n4g_fsid)) {
1648		DTRACE_PROBE3(nfs4clnt__debug__referral__migration,
1649		    mntinfo4_t *, mi, nfs4_ga_res_t *, &garp,
1650		    char *, "nfs4_process_referral");
1651		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1652		return (-1);
1653	}
1654
1655	/*
1656	 * Find the first responsive server to mount.  When we find
1657	 * one, fsp will point to it.
1658	 */
1659	for (i = 0; i < garp.n4g_ext_res->n4g_fslocations.locations_len; i++) {
1660
1661		fsp = &garp.n4g_ext_res->n4g_fslocations.locations_val[i];
1662		if (fsp->server_len == 0 || fsp->server_val == NULL)
1663			continue;
1664
1665		error = nfs4_callmapid(fsp->server_val, &nfsfsloc);
1666		if (error != 0)
1667			continue;
1668
1669		error = nfs4_ping_server_common(nfsfsloc.knconf,
1670		    nfsfsloc.addr, !(mi->mi_flags & MI4_INT));
1671		if (error == RPC_SUCCESS)
1672			break;
1673
1674		DTRACE_PROBE2(nfs4clnt__debug__referral__srvaddr,
1675		    sockaddr_in *, (struct sockaddr_in *)nfsfsloc.addr->buf,
1676		    char *, "nfs4_process_referral");
1677
1678		xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
1679	}
1680	knc = nfsfsloc.knconf;
1681	if ((i >= garp.n4g_ext_res->n4g_fslocations.locations_len) ||
1682	    (knc->knc_protofmly == NULL) || (knc->knc_proto == NULL)) {
1683		DTRACE_PROBE2(nfs4clnt__debug__referral__nofsloc,
1684		    nfs4_ga_res_t *, &garp, char *, "nfs4_process_referral");
1685		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1686		return (-1);
1687	}
1688
1689	/* Send the results back */
1690	*fsloc = nfsfsloc;
1691	*grp = garp;
1692	*res = callres;
1693	return (i);
1694}
1695
1696/*
1697 * Referrals case - need to fetch referral data and then upcall to
1698 * user-level to get complete mount data.
1699 */
1700static ephemeral_servinfo_t *
1701nfs4_trigger_esi_create_referral(vnode_t *vp, cred_t *cr)
1702{
1703	struct knetconfig	*sikncp, *svkncp;
1704	struct netbuf		*bufp;
1705	ephemeral_servinfo_t	*esi;
1706	vnode_t			*dvp;
1707	rnode4_t		*drp;
1708	fs_location4		*fsp;
1709	struct nfs_fsl_info	nfsfsloc;
1710	nfs4_ga_res_t		garp;
1711	char			*p;
1712	char			fn[MAXNAMELEN];
1713	int			i, index = -1;
1714	mntinfo4_t		*mi;
1715	COMPOUND4res_clnt	callres;
1716
1717	/*
1718	 * If we're passed in a stub vnode that
1719	 * isn't a "referral" stub, bail out
1720	 * and return a failure
1721	 */
1722	if (!RP_ISSTUB_REFERRAL(VTOR4(vp)))
1723		return (NULL);
1724
1725	if (vtodv(vp, &dvp, CRED(), TRUE) != 0)
1726		return (NULL);
1727
1728	drp = VTOR4(dvp);
1729	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) {
1730		VN_RELE(dvp);
1731		return (NULL);
1732	}
1733
1734	if (vtoname(vp, fn, MAXNAMELEN) != 0) {
1735		nfs_rw_exit(&drp->r_rwlock);
1736		VN_RELE(dvp);
1737		return (NULL);
1738	}
1739
1740	mi = VTOMI4(dvp);
1741	index = nfs4_process_referral(mi, drp->r_fh, fn, cr,
1742	    &garp, &callres, &nfsfsloc);
1743	nfs_rw_exit(&drp->r_rwlock);
1744	VN_RELE(dvp);
1745	if (index < 0)
1746		return (NULL);
1747
1748	fsp = &garp.n4g_ext_res->n4g_fslocations.locations_val[index];
1749	esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP);
1750
1751	/* initially set to be our type of ephemeral mount; may be added to */
1752	esi->esi_mount_flags = NFSMNT_REFERRAL;
1753
1754	esi->esi_hostname =
1755	    kmem_zalloc(fsp->server_val->utf8string_len + 1, KM_SLEEP);
1756	bcopy(fsp->server_val->utf8string_val, esi->esi_hostname,
1757	    fsp->server_val->utf8string_len);
1758	esi->esi_hostname[fsp->server_val->utf8string_len] = '\0';
1759
1760	bufp = kmem_alloc(sizeof (struct netbuf), KM_SLEEP);
1761	bufp->len = nfsfsloc.addr->len;
1762	bufp->maxlen = nfsfsloc.addr->maxlen;
1763	bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
1764	bcopy(nfsfsloc.addr->buf, bufp->buf, bufp->len);
1765	esi->esi_addr = bufp;
1766
1767	esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP);
1768	sikncp = esi->esi_knconf;
1769
1770	DTRACE_PROBE2(nfs4clnt__debug__referral__nfsfsloc,
1771	    struct nfs_fsl_info *, &nfsfsloc,
1772	    char *, "nfs4_trigger_esi_create_referral");
1773
1774	svkncp = nfsfsloc.knconf;
1775	sikncp->knc_semantics = svkncp->knc_semantics;
1776	sikncp->knc_protofmly = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1777	(void) strlcat((char *)sikncp->knc_protofmly,
1778	    (char *)svkncp->knc_protofmly, KNC_STRSIZE);
1779	sikncp->knc_proto = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1780	(void) strlcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto,
1781	    KNC_STRSIZE);
1782	sikncp->knc_rdev = svkncp->knc_rdev;
1783
1784	DTRACE_PROBE2(nfs4clnt__debug__referral__knetconf,
1785	    struct knetconfig *, sikncp,
1786	    char *, "nfs4_trigger_esi_create_referral");
1787
1788	esi->esi_netname = kmem_zalloc(nfsfsloc.netnm_len, KM_SLEEP);
1789	bcopy(nfsfsloc.netname, esi->esi_netname, nfsfsloc.netnm_len);
1790	esi->esi_syncaddr = NULL;
1791
1792	esi->esi_path = p = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1793	esi->esi_path_len = MAXPATHLEN;
1794	*p++ = '/';
1795	for (i = 0; i < fsp->rootpath.pathname4_len; i++) {
1796		component4 *comp;
1797
1798		comp = &fsp->rootpath.pathname4_val[i];
1799		/* If no space, null the string and bail */
1800		if ((p - esi->esi_path) + comp->utf8string_len + 1 > MAXPATHLEN)
1801			goto err;
1802		bcopy(comp->utf8string_val, p, comp->utf8string_len);
1803		p += comp->utf8string_len;
1804		*p++ = '/';
1805	}
1806	if (fsp->rootpath.pathname4_len != 0)
1807		*(p - 1) = '\0';
1808	else
1809		*p = '\0';
1810	p = esi->esi_path;
1811	esi->esi_path = strdup(p);
1812	esi->esi_path_len = strlen(p) + 1;
1813	kmem_free(p, MAXPATHLEN);
1814
1815	/* Allocated in nfs4_process_referral() */
1816	xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
1817	xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1818
1819	return (esi);
1820err:
1821	kmem_free(esi->esi_path, esi->esi_path_len);
1822	kmem_free(esi->esi_hostname, fsp->server_val->utf8string_len + 1);
1823	kmem_free(esi->esi_addr->buf, esi->esi_addr->len);
1824	kmem_free(esi->esi_addr, sizeof (struct netbuf));
1825	kmem_free(esi->esi_knconf->knc_protofmly, KNC_STRSIZE);
1826	kmem_free(esi->esi_knconf->knc_proto, KNC_STRSIZE);
1827	kmem_free(esi->esi_knconf, sizeof (*esi->esi_knconf));
1828	kmem_free(esi->esi_netname, nfsfsloc.netnm_len);
1829	kmem_free(esi, sizeof (ephemeral_servinfo_t));
1830	xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
1831	xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1832	return (NULL);
1833}
1834
1835/*
1836 * Assemble the args, and call the generic VFS mount function to
1837 * finally perform the ephemeral mount.
1838 */
1839static int
1840nfs4_trigger_domount(vnode_t *stubvp, domount_args_t *dma, vfs_t **vfsp,
1841    cred_t *cr, vnode_t **newvpp)
1842{
1843	struct mounta	*uap;
1844	char		*mntpt, *orig_path, *path;
1845	const char	*orig_mntpt;
1846	int		retval;
1847	int		mntpt_len;
1848	int		spec_len;
1849	zone_t		*zone = curproc->p_zone;
1850	bool_t		has_leading_slash;
1851	int		i;
1852
1853	vfs_t			*stubvfsp = stubvp->v_vfsp;
1854	ephemeral_servinfo_t	*esi = dma->dma_esi;
1855	struct nfs_args		*nargs = dma->dma_nargs;
1856
1857	/* first, construct the mount point for the ephemeral mount */
1858	orig_path = path = fn_path(VTOSV(stubvp)->sv_name);
1859	orig_mntpt = (char *)refstr_value(stubvfsp->vfs_mntpt);
1860
1861	if (*orig_path == '.')
1862		orig_path++;
1863
1864	/*
1865	 * Get rid of zone's root path
1866	 */
1867	if (zone != global_zone) {
1868		/*
1869		 * -1 for trailing '/' and -1 for EOS.
1870		 */
1871		if (strncmp(zone->zone_rootpath, orig_mntpt,
1872		    zone->zone_rootpathlen - 1) == 0) {
1873			orig_mntpt += (zone->zone_rootpathlen - 2);
1874		}
1875	}
1876
1877	mntpt_len = strlen(orig_mntpt) + strlen(orig_path);
1878	mntpt = kmem_zalloc(mntpt_len + 1, KM_SLEEP);
1879	(void) strcat(mntpt, orig_mntpt);
1880	(void) strcat(mntpt, orig_path);
1881
1882	kmem_free(path, strlen(path) + 1);
1883	path = esi->esi_path;
1884	if (*path == '.')
1885		path++;
1886	if (path[0] == '/' && path[1] == '/')
1887		path++;
1888	has_leading_slash = (*path == '/');
1889
1890	spec_len = strlen(dma->dma_hostlist);
1891	spec_len += strlen(path);
1892
1893	/* We are going to have to add this in */
1894	if (!has_leading_slash)
1895		spec_len++;
1896
1897	/* We need to get the ':' for dma_hostlist:esi_path */
1898	spec_len++;
1899
1900	uap = kmem_zalloc(sizeof (struct mounta), KM_SLEEP);
1901	uap->spec = kmem_zalloc(spec_len + 1, KM_SLEEP);
1902	(void) snprintf(uap->spec, spec_len + 1, "%s:%s%s", dma->dma_hostlist,
1903	    has_leading_slash ? "" : "/", path);
1904
1905	uap->dir = mntpt;
1906
1907	uap->flags = MS_SYSSPACE | MS_DATA;
1908	/* fstype-independent mount options not covered elsewhere */
1909	/* copy parent's mount(1M) "-m" flag */
1910	if (stubvfsp->vfs_flag & VFS_NOMNTTAB)
1911		uap->flags |= MS_NOMNTTAB;
1912
1913	uap->fstype = MNTTYPE_NFS4;
1914	uap->dataptr = (char *)nargs;
1915	/* not needed for MS_SYSSPACE */
1916	uap->datalen = 0;
1917
1918	/* use optptr to pass in extra mount options */
1919	uap->flags |= MS_OPTIONSTR;
1920	uap->optptr = nfs4_trigger_create_mntopts(stubvfsp);
1921	if (uap->optptr == NULL) {
1922		retval = EINVAL;
1923		goto done;
1924	}
1925
1926	/* domount() expects us to count the trailing NUL */
1927	uap->optlen = strlen(uap->optptr) + 1;
1928
1929	/*
1930	 * If we get EBUSY, we try again once to see if we can perform
1931	 * the mount. We do this because of a spurious race condition.
1932	 */
1933	for (i = 0; i < 2; i++) {
1934		int	error;
1935		bool_t	was_mounted;
1936
1937		retval = domount(NULL, uap, stubvp, cr, vfsp);
1938		if (retval == 0) {
1939			retval = VFS_ROOT(*vfsp, newvpp);
1940			VFS_RELE(*vfsp);
1941			break;
1942		} else if (retval != EBUSY) {
1943			break;
1944		}
1945
1946		/*
1947		 * We might find it mounted by the other racer...
1948		 */
1949		error = nfs4_trigger_mounted_already(stubvp,
1950		    newvpp, &was_mounted, vfsp);
1951		if (error) {
1952			goto done;
1953		} else if (was_mounted) {
1954			retval = 0;
1955			break;
1956		}
1957	}
1958
1959done:
1960	if (uap->optptr)
1961		nfs4_trigger_destroy_mntopts(uap->optptr);
1962
1963	kmem_free(uap->spec, spec_len + 1);
1964	kmem_free(uap, sizeof (struct mounta));
1965	kmem_free(mntpt, mntpt_len + 1);
1966
1967	return (retval);
1968}
1969
1970/*
1971 * Build an nfs_args structure for passing to domount().
1972 *
1973 * Ephemeral mount-type specific data comes from the ephemeral_servinfo_t;
1974 * generic data - common to all ephemeral mount types - is read directly
1975 * from the parent mount's servinfo4_t and mntinfo4_t, via the stub vnode.
1976 */
1977static struct nfs_args *
1978nfs4_trigger_nargs_create(mntinfo4_t *mi, servinfo4_t *svp,
1979    ephemeral_servinfo_t *esi)
1980{
1981	sec_data_t *secdata;
1982	struct nfs_args *nargs;
1983
1984	/* setup the nfs args */
1985	nargs = kmem_zalloc(sizeof (struct nfs_args), KM_SLEEP);
1986
1987	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1988
1989	nargs->addr = esi->esi_addr;
1990
1991	/* for AUTH_DH by negotiation */
1992	if (esi->esi_syncaddr || esi->esi_netname) {
1993		nargs->flags |= NFSMNT_SECURE;
1994		nargs->syncaddr = esi->esi_syncaddr;
1995		nargs->netname = esi->esi_netname;
1996	}
1997
1998	nargs->flags |= NFSMNT_KNCONF;
1999	nargs->knconf = esi->esi_knconf;
2000	nargs->flags |= NFSMNT_HOSTNAME;
2001	nargs->hostname = esi->esi_hostname;
2002	nargs->fh = esi->esi_path;
2003
2004	/* general mount settings, all copied from parent mount */
2005	mutex_enter(&mi->mi_lock);
2006
2007	if (!(mi->mi_flags & MI4_HARD))
2008		nargs->flags |= NFSMNT_SOFT;
2009
2010	nargs->flags |= NFSMNT_WSIZE | NFSMNT_RSIZE | NFSMNT_TIMEO |
2011	    NFSMNT_RETRANS;
2012	nargs->wsize = mi->mi_stsize;
2013	nargs->rsize = mi->mi_tsize;
2014	nargs->timeo = mi->mi_timeo;
2015	nargs->retrans = mi->mi_retrans;
2016
2017	if (mi->mi_flags & MI4_INT)
2018		nargs->flags |= NFSMNT_INT;
2019	if (mi->mi_flags & MI4_NOAC)
2020		nargs->flags |= NFSMNT_NOAC;
2021
2022	nargs->flags |= NFSMNT_ACREGMIN | NFSMNT_ACREGMAX | NFSMNT_ACDIRMIN |
2023	    NFSMNT_ACDIRMAX;
2024	nargs->acregmin = HR2SEC(mi->mi_acregmin);
2025	nargs->acregmax = HR2SEC(mi->mi_acregmax);
2026	nargs->acdirmin = HR2SEC(mi->mi_acdirmin);
2027	nargs->acdirmax = HR2SEC(mi->mi_acdirmax);
2028
2029	/* add any specific flags for this type of ephemeral mount */
2030	nargs->flags |= esi->esi_mount_flags;
2031
2032	if (mi->mi_flags & MI4_NOCTO)
2033		nargs->flags |= NFSMNT_NOCTO;
2034	if (mi->mi_flags & MI4_GRPID)
2035		nargs->flags |= NFSMNT_GRPID;
2036	if (mi->mi_flags & MI4_LLOCK)
2037		nargs->flags |= NFSMNT_LLOCK;
2038	if (mi->mi_flags & MI4_NOPRINT)
2039		nargs->flags |= NFSMNT_NOPRINT;
2040	if (mi->mi_flags & MI4_DIRECTIO)
2041		nargs->flags |= NFSMNT_DIRECTIO;
2042	if (mi->mi_flags & MI4_PUBLIC && nargs->flags & NFSMNT_MIRRORMOUNT)
2043		nargs->flags |= NFSMNT_PUBLIC;
2044
2045	/* Do some referral-specific option tweaking */
2046	if (nargs->flags & NFSMNT_REFERRAL) {
2047		nargs->flags &= ~NFSMNT_DORDMA;
2048		nargs->flags |= NFSMNT_TRYRDMA;
2049	}
2050
2051	mutex_exit(&mi->mi_lock);
2052
2053	/*
2054	 * Security data & negotiation policy.
2055	 *
2056	 * For mirror mounts, we need to preserve the parent mount's
2057	 * preference for security negotiation, translating SV4_TRYSECDEFAULT
2058	 * to NFSMNT_SECDEFAULT if present.
2059	 *
2060	 * For referrals, we always want security negotiation and will
2061	 * set NFSMNT_SECDEFAULT and we will not copy current secdata.
2062	 * The reason is that we can't negotiate down from a parent's
2063	 * Kerberos flavor to AUTH_SYS.
2064	 *
2065	 * If SV4_TRYSECDEFAULT is not set, that indicates that a specific
2066	 * security flavour was requested, with data in sv_secdata, and that
2067	 * no negotiation should occur. If this specified flavour fails, that's
2068	 * it. We will copy sv_secdata, and not set NFSMNT_SECDEFAULT.
2069	 *
2070	 * If SV4_TRYSECDEFAULT is set, then we start with a passed-in
2071	 * default flavour, in sv_secdata, but then negotiate a new flavour.
2072	 * Possible flavours are recorded in an array in sv_secinfo, with
2073	 * currently in-use flavour pointed to by sv_currsec.
2074	 *
2075	 * If sv_currsec is set, i.e. if negotiation has already occurred,
2076	 * we will copy sv_currsec. Otherwise, copy sv_secdata. Regardless,
2077	 * we will set NFSMNT_SECDEFAULT, to enable negotiation.
2078	 */
2079	if (nargs->flags & NFSMNT_REFERRAL) {
2080		/* enable negotiation for referral mount */
2081		nargs->flags |= NFSMNT_SECDEFAULT;
2082		secdata = kmem_alloc(sizeof (sec_data_t), KM_SLEEP);
2083		secdata->secmod = secdata->rpcflavor = AUTH_SYS;
2084		secdata->data = NULL;
2085	} else if (svp->sv_flags & SV4_TRYSECDEFAULT) {
2086		/* enable negotiation for mirror mount */
2087		nargs->flags |= NFSMNT_SECDEFAULT;
2088
2089		/*
2090		 * As a starting point for negotiation, copy parent
2091		 * mount's negotiated flavour (sv_currsec) if available,
2092		 * or its passed-in flavour (sv_secdata) if not.
2093		 */
2094		if (svp->sv_currsec != NULL)
2095			secdata = copy_sec_data(svp->sv_currsec);
2096		else if (svp->sv_secdata != NULL)
2097			secdata = copy_sec_data(svp->sv_secdata);
2098		else
2099			secdata = NULL;
2100	} else {
2101		/* do not enable negotiation; copy parent's passed-in flavour */
2102		if (svp->sv_secdata != NULL)
2103			secdata = copy_sec_data(svp->sv_secdata);
2104		else
2105			secdata = NULL;
2106	}
2107
2108	nfs_rw_exit(&svp->sv_lock);
2109
2110	nargs->flags |= NFSMNT_NEWARGS;
2111	nargs->nfs_args_ext = NFS_ARGS_EXTB;
2112	nargs->nfs_ext_u.nfs_extB.secdata = secdata;
2113
2114	/* for NFS RO failover; caller will set if necessary */
2115	nargs->nfs_ext_u.nfs_extB.next = NULL;
2116
2117	return (nargs);
2118}
2119
2120static void
2121nfs4_trigger_nargs_destroy(struct nfs_args *nargs)
2122{
2123	/*
2124	 * Either the mount failed, in which case the data is not needed, or
2125	 * nfs4_mount() has either taken copies of what it needs or,
2126	 * where it has merely copied the ptr, it has set *our* ptr to NULL,
2127	 * whereby nfs4_free_args() will ignore it.
2128	 */
2129	nfs4_free_args(nargs);
2130	kmem_free(nargs, sizeof (struct nfs_args));
2131}
2132
2133/*
2134 * When we finally get into the mounting, we need to add this
2135 * node to the ephemeral tree.
2136 *
2137 * This is called from nfs4_mount().
2138 */
2139int
2140nfs4_record_ephemeral_mount(mntinfo4_t *mi, vnode_t *mvp)
2141{
2142	mntinfo4_t		*mi_parent;
2143	nfs4_ephemeral_t	*eph;
2144	nfs4_ephemeral_tree_t	*net;
2145
2146	nfs4_ephemeral_t	*prior;
2147	nfs4_ephemeral_t	*child;
2148
2149	nfs4_ephemeral_t	*peer;
2150
2151	nfs4_trigger_globals_t	*ntg;
2152	zone_t			*zone = curproc->p_zone;
2153
2154	int			rc = 0;
2155
2156	mi_parent = VTOMI4(mvp);
2157
2158	/*
2159	 * Get this before grabbing anything else!
2160	 */
2161	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
2162	if (!ntg->ntg_thread_started) {
2163		nfs4_ephemeral_start_harvester(ntg);
2164	}
2165
2166	mutex_enter(&mi_parent->mi_lock);
2167	mutex_enter(&mi->mi_lock);
2168
2169	net = mi->mi_ephemeral_tree =
2170	    mi_parent->mi_ephemeral_tree;
2171
2172	/*
2173	 * If the mi_ephemeral_tree is NULL, then it
2174	 * means that either the harvester or a manual
2175	 * umount has cleared the tree out right before
2176	 * we got here.
2177	 *
2178	 * There is nothing we can do here, so return
2179	 * to the caller and let them decide whether they
2180	 * try again.
2181	 */
2182	if (net == NULL) {
2183		mutex_exit(&mi->mi_lock);
2184		mutex_exit(&mi_parent->mi_lock);
2185
2186		return (EBUSY);
2187	}
2188
2189	/*
2190	 * We've just tied the mntinfo to the tree, so
2191	 * now we bump the refcnt and hold it there until
2192	 * this mntinfo is removed from the tree.
2193	 */
2194	nfs4_ephemeral_tree_hold(net);
2195
2196	/*
2197	 * We need to tack together the ephemeral mount
2198	 * with this new mntinfo.
2199	 */
2200	eph = kmem_zalloc(sizeof (*eph), KM_SLEEP);
2201	eph->ne_mount = mi;
2202	MI4_HOLD(mi);
2203	VFS_HOLD(mi->mi_vfsp);
2204	eph->ne_ref_time = gethrestime_sec();
2205
2206	/*
2207	 * We need to tell the ephemeral mount when
2208	 * to time out.
2209	 */
2210	eph->ne_mount_to = ntg->ntg_mount_to;
2211
2212	mi->mi_ephemeral = eph;
2213
2214	/*
2215	 * If the enclosing mntinfo4 is also ephemeral,
2216	 * then we need to point to its enclosing parent.
2217	 * Else the enclosing mntinfo4 is the enclosing parent.
2218	 *
2219	 * We also need to weave this ephemeral node
2220	 * into the tree.
2221	 */
2222	if (mi_parent->mi_flags & MI4_EPHEMERAL) {
2223		/*
2224		 * We need to decide if we are
2225		 * the root node of this branch
2226		 * or if we are a sibling of this
2227		 * branch.
2228		 */
2229		prior = mi_parent->mi_ephemeral;
2230		if (prior == NULL) {
2231			/*
2232			 * Race condition, clean up, and
2233			 * let caller handle mntinfo.
2234			 */
2235			mi->mi_flags &= ~MI4_EPHEMERAL;
2236			mi->mi_ephemeral = NULL;
2237			kmem_free(eph, sizeof (*eph));
2238			VFS_RELE(mi->mi_vfsp);
2239			MI4_RELE(mi);
2240			nfs4_ephemeral_tree_rele(net);
2241			rc = EBUSY;
2242		} else {
2243			if (prior->ne_child == NULL) {
2244				prior->ne_child = eph;
2245			} else {
2246				child = prior->ne_child;
2247
2248				prior->ne_child = eph;
2249				eph->ne_peer = child;
2250
2251				child->ne_prior = eph;
2252			}
2253
2254			eph->ne_prior = prior;
2255		}
2256	} else {
2257		/*
2258		 * The parent mntinfo4 is the non-ephemeral
2259		 * root of the ephemeral tree. We
2260		 * need to decide if we are the root
2261		 * node of that tree or if we are a
2262		 * sibling of the root node.
2263		 *
2264		 * We are the root if there is no
2265		 * other node.
2266		 */
2267		if (net->net_root == NULL) {
2268			net->net_root = eph;
2269		} else {
2270			eph->ne_peer = peer = net->net_root;
2271			ASSERT(peer != NULL);
2272			net->net_root = eph;
2273
2274			peer->ne_prior = eph;
2275		}
2276
2277		eph->ne_prior = NULL;
2278	}
2279
2280	mutex_exit(&mi->mi_lock);
2281	mutex_exit(&mi_parent->mi_lock);
2282
2283	return (rc);
2284}
2285
2286/*
2287 * Commit the changes to the ephemeral tree for removing this node.
2288 */
2289static void
2290nfs4_ephemeral_umount_cleanup(nfs4_ephemeral_t *eph)
2291{
2292	nfs4_ephemeral_t	*e = eph;
2293	nfs4_ephemeral_t	*peer;
2294	nfs4_ephemeral_t	*prior;
2295
2296	peer = eph->ne_peer;
2297	prior = e->ne_prior;
2298
2299	/*
2300	 * If this branch root was not the
2301	 * tree root, then we need to fix back pointers.
2302	 */
2303	if (prior) {
2304		if (prior->ne_child == e) {
2305			prior->ne_child = peer;
2306		} else {
2307			prior->ne_peer = peer;
2308		}
2309
2310		if (peer)
2311			peer->ne_prior = prior;
2312	} else if (peer) {
2313		peer->ne_mount->mi_ephemeral_tree->net_root = peer;
2314		peer->ne_prior = NULL;
2315	} else {
2316		e->ne_mount->mi_ephemeral_tree->net_root = NULL;
2317	}
2318}
2319
2320/*
2321 * We want to avoid recursion at all costs. So we need to
2322 * unroll the tree. We do this by a depth first traversal to
2323 * leaf nodes. We blast away the leaf and work our way back
2324 * up and down the tree.
2325 */
2326static int
2327nfs4_ephemeral_unmount_engine(nfs4_ephemeral_t *eph,
2328    int isTreeRoot, int flag, cred_t *cr)
2329{
2330	nfs4_ephemeral_t	*e = eph;
2331	nfs4_ephemeral_t	*prior;
2332	mntinfo4_t		*mi;
2333	vfs_t			*vfsp;
2334	int			error;
2335
2336	/*
2337	 * We use the loop while unrolling the ephemeral tree.
2338	 */
2339	for (;;) {
2340		/*
2341		 * First we walk down the child.
2342		 */
2343		if (e->ne_child) {
2344			prior = e;
2345			e = e->ne_child;
2346			continue;
2347		}
2348
2349		/*
2350		 * If we are the root of the branch we are removing,
2351		 * we end it here. But if the branch is the root of
2352		 * the tree, we have to forge on. We do not consider
2353		 * the peer list for the root because while it may
2354		 * be okay to remove, it is both extra work and a
2355		 * potential for a false-positive error to stall the
2356		 * unmount attempt.
2357		 */
2358		if (e == eph && isTreeRoot == FALSE)
2359			return (0);
2360
2361		/*
2362		 * Next we walk down the peer list.
2363		 */
2364		if (e->ne_peer) {
2365			prior = e;
2366			e = e->ne_peer;
2367			continue;
2368		}
2369
2370		/*
2371		 * We can only remove the node passed in by the
2372		 * caller if it is the root of the ephemeral tree.
2373		 * Otherwise, the caller will remove it.
2374		 */
2375		if (e == eph && isTreeRoot == FALSE)
2376			return (0);
2377
2378		/*
2379		 * Okay, we have a leaf node, time
2380		 * to prune it!
2381		 *
2382		 * Note that prior can only be NULL if
2383		 * and only if it is the root of the
2384		 * ephemeral tree.
2385		 */
2386		prior = e->ne_prior;
2387
2388		mi = e->ne_mount;
2389		mutex_enter(&mi->mi_lock);
2390		vfsp = mi->mi_vfsp;
2391		ASSERT(vfsp != NULL);
2392
2393		/*
2394		 * Cleared by umount2_engine.
2395		 */
2396		VFS_HOLD(vfsp);
2397
2398		/*
2399		 * Inform nfs4_unmount to not recursively
2400		 * descend into this node's children when it
2401		 * gets processed.
2402		 */
2403		mi->mi_flags |= MI4_EPHEMERAL_RECURSED;
2404		mutex_exit(&mi->mi_lock);
2405
2406		error = umount2_engine(vfsp, flag, cr, FALSE);
2407		if (error) {
2408			/*
2409			 * We need to reenable nfs4_unmount's ability
2410			 * to recursively descend on this node.
2411			 */
2412			mutex_enter(&mi->mi_lock);
2413			mi->mi_flags &= ~MI4_EPHEMERAL_RECURSED;
2414			mutex_exit(&mi->mi_lock);
2415
2416			return (error);
2417		}
2418
2419		/*
2420		 * If we are the current node, we do not want to
2421		 * touch anything else. At this point, the only
2422		 * way the current node can have survived to here
2423		 * is if it is the root of the ephemeral tree and
2424		 * we are unmounting the enclosing mntinfo4.
2425		 */
2426		if (e == eph) {
2427			ASSERT(prior == NULL);
2428			return (0);
2429		}
2430
2431		/*
2432		 * Stitch up the prior node. Note that since
2433		 * we have handled the root of the tree, prior
2434		 * must be non-NULL.
2435		 */
2436		ASSERT(prior != NULL);
2437		if (prior->ne_child == e) {
2438			prior->ne_child = NULL;
2439		} else {
2440			ASSERT(prior->ne_peer == e);
2441
2442			prior->ne_peer = NULL;
2443		}
2444
2445		e = prior;
2446	}
2447
2448	/* NOTREACHED */
2449}
2450
2451/*
2452 * Common code to safely release net_cnt_lock and net_tree_lock
2453 */
2454void
2455nfs4_ephemeral_umount_unlock(bool_t *pmust_unlock,
2456    nfs4_ephemeral_tree_t **pnet)
2457{
2458	nfs4_ephemeral_tree_t	*net = *pnet;
2459
2460	if (*pmust_unlock) {
2461		mutex_enter(&net->net_cnt_lock);
2462		net->net_status &= ~NFS4_EPHEMERAL_TREE_UMOUNTING;
2463		mutex_exit(&net->net_cnt_lock);
2464
2465		mutex_exit(&net->net_tree_lock);
2466
2467		*pmust_unlock = FALSE;
2468	}
2469}
2470
2471/*
2472 * While we may have removed any child or sibling nodes of this
2473 * ephemeral node, we can not nuke it until we know that there
2474 * were no actived vnodes on it. This will do that final
2475 * work once we know it is not busy.
2476 */
2477void
2478nfs4_ephemeral_umount_activate(mntinfo4_t *mi, bool_t *pmust_unlock,
2479    nfs4_ephemeral_tree_t **pnet)
2480{
2481	/*
2482	 * Now we need to get rid of the ephemeral data if it exists.
2483	 */
2484	mutex_enter(&mi->mi_lock);
2485	if (mi->mi_ephemeral) {
2486		/*
2487		 * If we are the root node of an ephemeral branch
2488		 * which is being removed, then we need to fixup
2489		 * pointers into and out of the node.
2490		 */
2491		if (!(mi->mi_flags & MI4_EPHEMERAL_RECURSED))
2492			nfs4_ephemeral_umount_cleanup(mi->mi_ephemeral);
2493
2494		nfs4_ephemeral_tree_rele(*pnet);
2495		ASSERT(mi->mi_ephemeral != NULL);
2496
2497		kmem_free(mi->mi_ephemeral, sizeof (*mi->mi_ephemeral));
2498		mi->mi_ephemeral = NULL;
2499		VFS_RELE(mi->mi_vfsp);
2500		MI4_RELE(mi);
2501	}
2502	mutex_exit(&mi->mi_lock);
2503
2504	nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);
2505}
2506
2507/*
2508 * Unmount an ephemeral node.
2509 *
2510 * Note that if this code fails, then it must unlock.
2511 *
2512 * If it succeeds, then the caller must be prepared to do so.
2513 */
2514int
2515nfs4_ephemeral_umount(mntinfo4_t *mi, int flag, cred_t *cr,
2516    bool_t *pmust_unlock, nfs4_ephemeral_tree_t **pnet)
2517{
2518	int			error = 0;
2519	nfs4_ephemeral_t	*eph;
2520	nfs4_ephemeral_tree_t	*net;
2521	int			is_derooting = FALSE;
2522	int			is_recursed = FALSE;
2523	int			was_locked = FALSE;
2524
2525	/*
2526	 * Make sure to set the default state for cleaning
2527	 * up the tree in the caller (and on the way out).
2528	 */
2529	*pmust_unlock = FALSE;
2530
2531	/*
2532	 * The active vnodes on this file system may be ephemeral
2533	 * children. We need to check for and try to unmount them
2534	 * here. If any can not be unmounted, we are going
2535	 * to return EBUSY.
2536	 */
2537	mutex_enter(&mi->mi_lock);
2538
2539	/*
2540	 * If an ephemeral tree, we need to check to see if
2541	 * the lock is already held. If it is, then we need
2542	 * to see if we are being called as a result of
2543	 * the recursive removal of some node of the tree or
2544	 * if we are another attempt to remove the tree.
2545	 *
2546	 * mi_flags & MI4_EPHEMERAL indicates an ephemeral
2547	 * node. mi_ephemeral being non-NULL also does this.
2548	 *
2549	 * mi_ephemeral_tree being non-NULL is sufficient
2550	 * to also indicate either it is an ephemeral node
2551	 * or the enclosing mntinfo4.
2552	 *
2553	 * Do we need MI4_EPHEMERAL? Yes, it is useful for
2554	 * when we delete the ephemeral node and need to
2555	 * differentiate from an ephemeral node and the
2556	 * enclosing root node.
2557	 */
2558	*pnet = net = mi->mi_ephemeral_tree;
2559	if (net == NULL) {
2560		mutex_exit(&mi->mi_lock);
2561		return (0);
2562	}
2563
2564	eph = mi->mi_ephemeral;
2565	is_recursed = mi->mi_flags & MI4_EPHEMERAL_RECURSED;
2566	is_derooting = (eph == NULL);
2567
2568	mutex_enter(&net->net_cnt_lock);
2569
2570	/*
2571	 * If this is not recursion, then we need to
2572	 * check to see if a harvester thread has
2573	 * already grabbed the lock.
2574	 *
2575	 * After we exit this branch, we may not
2576	 * blindly return, we need to jump to
2577	 * is_busy!
2578	 */
2579	if (!is_recursed) {
2580		if (net->net_status &
2581		    NFS4_EPHEMERAL_TREE_LOCKED) {
2582			/*
2583			 * If the tree is locked, we need
2584			 * to decide whether we are the
2585			 * harvester or some explicit call
2586			 * for a umount. The only way that
2587			 * we are the harvester is if
2588			 * MS_SYSSPACE is set.
2589			 *
2590			 * We only let the harvester through
2591			 * at this point.
2592			 *
2593			 * We return EBUSY so that the
2594			 * caller knows something is
2595			 * going on. Note that by that
2596			 * time, the umount in the other
2597			 * thread may have already occured.
2598			 */
2599			if (!(flag & MS_SYSSPACE)) {
2600				mutex_exit(&net->net_cnt_lock);
2601				mutex_exit(&mi->mi_lock);
2602
2603				return (EBUSY);
2604			}
2605
2606			was_locked = TRUE;
2607		}
2608	}
2609
2610	mutex_exit(&net->net_cnt_lock);
2611	mutex_exit(&mi->mi_lock);
2612
2613	/*
2614	 * If we are not the harvester, we need to check
2615	 * to see if we need to grab the tree lock.
2616	 */
2617	if (was_locked == FALSE) {
2618		/*
2619		 * If we grab the lock, it means that no other
2620		 * operation is working on the tree. If we don't
2621		 * grab it, we need to decide if this is because
2622		 * we are a recursive call or a new operation.
2623		 */
2624		if (mutex_tryenter(&net->net_tree_lock)) {
2625			*pmust_unlock = TRUE;
2626		} else {
2627			/*
2628			 * If we are a recursive call, we can
2629			 * proceed without the lock.
2630			 * Otherwise we have to wait until
2631			 * the lock becomes free.
2632			 */
2633			if (!is_recursed) {
2634				mutex_enter(&net->net_cnt_lock);
2635				if (net->net_status &
2636				    (NFS4_EPHEMERAL_TREE_DEROOTING
2637				    | NFS4_EPHEMERAL_TREE_INVALID)) {
2638					mutex_exit(&net->net_cnt_lock);
2639					goto is_busy;
2640				}
2641				mutex_exit(&net->net_cnt_lock);
2642
2643				/*
2644				 * We can't hold any other locks whilst
2645				 * we wait on this to free up.
2646				 */
2647				mutex_enter(&net->net_tree_lock);
2648
2649				/*
2650				 * Note that while mi->mi_ephemeral
2651				 * may change and thus we have to
2652				 * update eph, it is the case that
2653				 * we have tied down net and
2654				 * do not care if mi->mi_ephemeral_tree
2655				 * has changed.
2656				 */
2657				mutex_enter(&mi->mi_lock);
2658				eph = mi->mi_ephemeral;
2659				mutex_exit(&mi->mi_lock);
2660
2661				/*
2662				 * Okay, we need to see if either the
2663				 * tree got nuked or the current node
2664				 * got nuked. Both of which will cause
2665				 * an error.
2666				 *
2667				 * Note that a subsequent retry of the
2668				 * umount shall work.
2669				 */
2670				mutex_enter(&net->net_cnt_lock);
2671				if (net->net_status &
2672				    NFS4_EPHEMERAL_TREE_INVALID ||
2673				    (!is_derooting && eph == NULL)) {
2674					mutex_exit(&net->net_cnt_lock);
2675					mutex_exit(&net->net_tree_lock);
2676					goto is_busy;
2677				}
2678				mutex_exit(&net->net_cnt_lock);
2679				*pmust_unlock = TRUE;
2680			}
2681		}
2682	}
2683
2684	/*
2685	 * Only once we have grabbed the lock can we mark what we
2686	 * are planning on doing to the ephemeral tree.
2687	 */
2688	if (*pmust_unlock) {
2689		mutex_enter(&net->net_cnt_lock);
2690		net->net_status |= NFS4_EPHEMERAL_TREE_UMOUNTING;
2691
2692		/*
2693		 * Check to see if we are nuking the root.
2694		 */
2695		if (is_derooting)
2696			net->net_status |=
2697			    NFS4_EPHEMERAL_TREE_DEROOTING;
2698		mutex_exit(&net->net_cnt_lock);
2699	}
2700
2701	if (!is_derooting) {
2702		/*
2703		 * Only work on children if the caller has not already
2704		 * done so.
2705		 */
2706		if (!is_recursed) {
2707			ASSERT(eph != NULL);
2708
2709			error = nfs4_ephemeral_unmount_engine(eph,
2710			    FALSE, flag, cr);
2711			if (error)
2712				goto is_busy;
2713		}
2714	} else {
2715		eph = net->net_root;
2716
2717		/*
2718		 * Only work if there is something there.
2719		 */
2720		if (eph) {
2721			error = nfs4_ephemeral_unmount_engine(eph, TRUE,
2722			    flag, cr);
2723			if (error) {
2724				mutex_enter(&net->net_cnt_lock);
2725				net->net_status &=
2726				    ~NFS4_EPHEMERAL_TREE_DEROOTING;
2727				mutex_exit(&net->net_cnt_lock);
2728				goto is_busy;
2729			}
2730
2731			/*
2732			 * Nothing else which goes wrong will
2733			 * invalidate the blowing away of the
2734			 * ephmeral tree.
2735			 */
2736			net->net_root = NULL;
2737		}
2738
2739		/*
2740		 * We have derooted and we have caused the tree to be
2741		 * invalidated.
2742		 */
2743		mutex_enter(&net->net_cnt_lock);
2744		net->net_status &= ~NFS4_EPHEMERAL_TREE_DEROOTING;
2745		net->net_status |= NFS4_EPHEMERAL_TREE_INVALID;
2746		DTRACE_NFSV4_1(nfs4clnt__dbg__ephemeral__tree__derooting,
2747		    uint_t, net->net_refcnt);
2748
2749		/*
2750		 * We will not finalize this node, so safe to
2751		 * release it.
2752		 */
2753		nfs4_ephemeral_tree_decr(net);
2754		mutex_exit(&net->net_cnt_lock);
2755
2756		if (was_locked == FALSE)
2757			mutex_exit(&net->net_tree_lock);
2758
2759		/*
2760		 * We have just blown away any notation of this
2761		 * tree being locked or having a refcnt.
2762		 * We can't let the caller try to clean things up.
2763		 */
2764		*pmust_unlock = FALSE;
2765
2766		/*
2767		 * At this point, the tree should no longer be
2768		 * associated with the mntinfo4. We need to pull
2769		 * it off there and let the harvester take
2770		 * care of it once the refcnt drops.
2771		 */
2772		mutex_enter(&mi->mi_lock);
2773		mi->mi_ephemeral_tree = NULL;
2774		mutex_exit(&mi->mi_lock);
2775	}
2776
2777	return (0);
2778
2779is_busy:
2780
2781	nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);
2782
2783	return (error);
2784}
2785
2786/*
2787 * Do the umount and record any error in the parent.
2788 */
2789static void
2790nfs4_ephemeral_record_umount(vfs_t *vfsp, int flag,
2791    nfs4_ephemeral_t *e, nfs4_ephemeral_t *prior)
2792{
2793	int	error;
2794
2795	/*
2796	 * Only act on if the fs is still mounted.
2797	 */
2798	if (vfsp == NULL)
2799		return;
2800
2801	error = umount2_engine(vfsp, flag, kcred, FALSE);
2802	if (error) {
2803		if (prior) {
2804			if (prior->ne_child == e)
2805				prior->ne_state |=
2806				    NFS4_EPHEMERAL_CHILD_ERROR;
2807			else
2808				prior->ne_state |=
2809				    NFS4_EPHEMERAL_PEER_ERROR;
2810		}
2811	}
2812}
2813
2814/*
2815 * For each tree in the forest (where the forest is in
2816 * effect all of the ephemeral trees for this zone),
2817 * scan to see if a node can be unmounted. Note that
2818 * unlike nfs4_ephemeral_unmount_engine(), we do
2819 * not process the current node before children or
2820 * siblings. I.e., if a node can be unmounted, we
2821 * do not recursively check to see if the nodes
2822 * hanging off of it can also be unmounted.
2823 *
2824 * Instead, we delve down deep to try and remove the
2825 * children first. Then, because we share code with
2826 * nfs4_ephemeral_unmount_engine(), we will try
2827 * them again. This could be a performance issue in
2828 * the future.
2829 *
2830 * Also note that unlike nfs4_ephemeral_unmount_engine(),
2831 * we do not halt on an error. We will not remove the
2832 * current node, but we will keep on trying to remove
2833 * the others.
2834 *
2835 * force indicates that we want the unmount to occur
2836 * even if there is something blocking it.
2837 *
2838 * time_check indicates that we want to see if the
2839 * mount has expired past mount_to or not. Typically
2840 * we want to do this and only on a shutdown of the
2841 * zone would we want to ignore the check.
2842 */
2843static void
2844nfs4_ephemeral_harvest_forest(nfs4_trigger_globals_t *ntg,
2845    bool_t force, bool_t time_check)
2846{
2847	nfs4_ephemeral_tree_t	*net;
2848	nfs4_ephemeral_tree_t	*prev = NULL;
2849	nfs4_ephemeral_tree_t	*next;
2850	nfs4_ephemeral_t	*e;
2851	nfs4_ephemeral_t	*prior;
2852	time_t			now = gethrestime_sec();
2853
2854	nfs4_ephemeral_tree_t	*harvest = NULL;
2855
2856	int			flag;
2857
2858	mntinfo4_t		*mi;
2859	vfs_t			*vfsp;
2860
2861	if (force)
2862		flag = MS_FORCE | MS_SYSSPACE;
2863	else
2864		flag = MS_SYSSPACE;
2865
2866	mutex_enter(&ntg->ntg_forest_lock);
2867	for (net = ntg->ntg_forest; net != NULL; net = next) {
2868		next = net->net_next;
2869
2870		nfs4_ephemeral_tree_hold(net);
2871
2872		mutex_enter(&net->net_tree_lock);
2873
2874		/*
2875		 * Let the unmount code know that the
2876		 * tree is already locked!
2877		 */
2878		mutex_enter(&net->net_cnt_lock);
2879		net->net_status |= NFS4_EPHEMERAL_TREE_LOCKED;
2880		mutex_exit(&net->net_cnt_lock);
2881
2882		/*
2883		 * If the intent is force all ephemeral nodes to
2884		 * be unmounted in this zone, we can short circuit a
2885		 * lot of tree traversal and simply zap the root node.
2886		 */
2887		if (force) {
2888			if (net->net_root) {
2889				mi = net->net_root->ne_mount;
2890
2891				vfsp = mi->mi_vfsp;
2892				ASSERT(vfsp != NULL);
2893
2894				/*
2895				 * Cleared by umount2_engine.
2896				 */
2897				VFS_HOLD(vfsp);
2898
2899				(void) umount2_engine(vfsp, flag,
2900				    kcred, FALSE);
2901
2902				goto check_done;
2903			}
2904		}
2905
2906		e = net->net_root;
2907		if (e)
2908			e->ne_state = NFS4_EPHEMERAL_VISIT_CHILD;
2909
2910		while (e) {
2911			if (e->ne_state == NFS4_EPHEMERAL_VISIT_CHILD) {
2912				e->ne_state = NFS4_EPHEMERAL_VISIT_SIBLING;
2913				if (e->ne_child) {
2914					e = e->ne_child;
2915					e->ne_state =
2916					    NFS4_EPHEMERAL_VISIT_CHILD;
2917				}
2918
2919				continue;
2920			} else if (e->ne_state ==
2921			    NFS4_EPHEMERAL_VISIT_SIBLING) {
2922				e->ne_state = NFS4_EPHEMERAL_PROCESS_ME;
2923				if (e->ne_peer) {
2924					e = e->ne_peer;
2925					e->ne_state =
2926					    NFS4_EPHEMERAL_VISIT_CHILD;
2927				}
2928
2929				continue;
2930			} else if (e->ne_state ==
2931			    NFS4_EPHEMERAL_CHILD_ERROR) {
2932				prior = e->ne_prior;
2933
2934				/*
2935				 * If a child reported an error, do
2936				 * not bother trying to unmount.
2937				 *
2938				 * If your prior node is a parent,
2939				 * pass the error up such that they
2940				 * also do not try to unmount.
2941				 *
2942				 * However, if your prior is a sibling,
2943				 * let them try to unmount if they can.
2944				 */
2945				if (prior) {
2946					if (prior->ne_child == e)
2947						prior->ne_state |=
2948						    NFS4_EPHEMERAL_CHILD_ERROR;
2949					else
2950						prior->ne_state |=
2951						    NFS4_EPHEMERAL_PEER_ERROR;
2952				}
2953
2954				/*
2955				 * Clear the error and if needed, process peers.
2956				 *
2957				 * Once we mask out the error, we know whether
2958				 * or we have to process another node.
2959				 */
2960				e->ne_state &= ~NFS4_EPHEMERAL_CHILD_ERROR;
2961				if (e->ne_state == NFS4_EPHEMERAL_PROCESS_ME)
2962					e = prior;
2963
2964				continue;
2965			} else if (e->ne_state ==
2966			    NFS4_EPHEMERAL_PEER_ERROR) {
2967				prior = e->ne_prior;
2968
2969				if (prior) {
2970					if (prior->ne_child == e)
2971						prior->ne_state =
2972						    NFS4_EPHEMERAL_CHILD_ERROR;
2973					else
2974						prior->ne_state =
2975						    NFS4_EPHEMERAL_PEER_ERROR;
2976				}
2977
2978				/*
2979				 * Clear the error from this node and do the
2980				 * correct processing.
2981				 */
2982				e->ne_state &= ~NFS4_EPHEMERAL_PEER_ERROR;
2983				continue;
2984			}
2985
2986			prior = e->ne_prior;
2987			e->ne_state = NFS4_EPHEMERAL_OK;
2988
2989			/*
2990			 * It must be the case that we need to process
2991			 * this node.
2992			 */
2993			if (!time_check ||
2994			    now - e->ne_ref_time > e->ne_mount_to) {
2995				mi = e->ne_mount;
2996				vfsp = mi->mi_vfsp;
2997
2998				/*
2999				 * Cleared by umount2_engine.
3000				 */
3001				if (vfsp != NULL)
3002					VFS_HOLD(vfsp);
3003
3004				/*
3005				 * Note that we effectively work down to the
3006				 * leaf nodes first, try to unmount them,
3007				 * then work our way back up into the leaf
3008				 * nodes.
3009				 *
3010				 * Also note that we deal with a lot of
3011				 * complexity by sharing the work with
3012				 * the manual unmount code.
3013				 */
3014				nfs4_ephemeral_record_umount(vfsp, flag,
3015				    e, prior);
3016			}
3017
3018			e = prior;
3019		}
3020
3021check_done:
3022
3023		/*
3024		 * At this point we are done processing this tree.
3025		 *
3026		 * If the tree is invalid and we were the only reference
3027		 * to it, then we push it on the local linked list
3028		 * to remove it at the end. We avoid that action now
3029		 * to keep the tree processing going along at a fair clip.
3030		 *
3031		 * Else, even if we were the only reference, we
3032		 * allow it to be reused as needed.
3033		 */
3034		mutex_enter(&net->net_cnt_lock);
3035		nfs4_ephemeral_tree_decr(net);
3036		if (net->net_refcnt == 0 &&
3037		    net->net_status & NFS4_EPHEMERAL_TREE_INVALID) {
3038			net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
3039			mutex_exit(&net->net_cnt_lock);
3040			mutex_exit(&net->net_tree_lock);
3041
3042			if (prev)
3043				prev->net_next = net->net_next;
3044			else
3045				ntg->ntg_forest = net->net_next;
3046
3047			net->net_next = harvest;
3048			harvest = net;
3049
3050			VFS_RELE(net->net_mount->mi_vfsp);
3051			MI4_RELE(net->net_mount);
3052
3053			continue;
3054		}
3055
3056		net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
3057		mutex_exit(&net->net_cnt_lock);
3058		mutex_exit(&net->net_tree_lock);
3059
3060		prev = net;
3061	}
3062	mutex_exit(&ntg->ntg_forest_lock);
3063
3064	for (net = harvest; net != NULL; net = next) {
3065		next = net->net_next;
3066
3067		mutex_destroy(&net->net_tree_lock);
3068		mutex_destroy(&net->net_cnt_lock);
3069		kmem_free(net, sizeof (*net));
3070	}
3071}
3072
3073/*
3074 * This is the thread which decides when the harvesting
3075 * can proceed and when to kill it off for this zone.
3076 */
3077static void
3078nfs4_ephemeral_harvester(nfs4_trigger_globals_t *ntg)
3079{
3080	clock_t		timeleft;
3081	zone_t		*zone = curproc->p_zone;
3082
3083	for (;;) {
3084		timeleft = zone_status_timedwait(zone, ddi_get_lbolt() +
3085		    nfs4_trigger_thread_timer * hz, ZONE_IS_SHUTTING_DOWN);
3086
3087		/*
3088		 * zone is exiting...
3089		 */
3090		if (timeleft != -1) {
3091			ASSERT(zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN);
3092			zthread_exit();
3093			/* NOTREACHED */
3094		}
3095
3096		/*
3097		 * Only bother scanning if there is potential
3098		 * work to be done.
3099		 */
3100		if (ntg->ntg_forest == NULL)
3101			continue;
3102
3103		/*
3104		 * Now scan the list and get rid of everything which
3105		 * is old.
3106		 */
3107		nfs4_ephemeral_harvest_forest(ntg, FALSE, TRUE);
3108	}
3109
3110	/* NOTREACHED */
3111}
3112
3113/*
3114 * The zone specific glue needed to start the unmount harvester.
3115 *
3116 * Note that we want to avoid holding the mutex as long as possible,
3117 * hence the multiple checks.
3118 *
3119 * The caller should avoid us getting down here in the first
3120 * place.
3121 */
3122static void
3123nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *ntg)
3124{
3125	/*
3126	 * It got started before we got here...
3127	 */
3128	if (ntg->ntg_thread_started)
3129		return;
3130
3131	mutex_enter(&nfs4_ephemeral_thread_lock);
3132
3133	if (ntg->ntg_thread_started) {
3134		mutex_exit(&nfs4_ephemeral_thread_lock);
3135		return;
3136	}
3137
3138	/*
3139	 * Start the unmounter harvester thread for this zone.
3140	 */
3141	(void) zthread_create(NULL, 0, nfs4_ephemeral_harvester,
3142	    ntg, 0, minclsyspri);
3143
3144	ntg->ntg_thread_started = TRUE;
3145	mutex_exit(&nfs4_ephemeral_thread_lock);
3146}
3147
3148/*ARGSUSED*/
3149static void *
3150nfs4_ephemeral_zsd_create(zoneid_t zoneid)
3151{
3152	nfs4_trigger_globals_t	*ntg;
3153
3154	ntg = kmem_zalloc(sizeof (*ntg), KM_SLEEP);
3155	ntg->ntg_thread_started = FALSE;
3156
3157	/*
3158	 * This is the default....
3159	 */
3160	ntg->ntg_mount_to = nfs4_trigger_mount_to;
3161
3162	mutex_init(&ntg->ntg_forest_lock, NULL,
3163	    MUTEX_DEFAULT, NULL);
3164
3165	return (ntg);
3166}
3167
3168/*
3169 * Try a nice gentle walk down the forest and convince
3170 * all of the trees to gracefully give it up.
3171 */
3172/*ARGSUSED*/
3173static void
3174nfs4_ephemeral_zsd_shutdown(zoneid_t zoneid, void *arg)
3175{
3176	nfs4_trigger_globals_t	*ntg = arg;
3177
3178	if (!ntg)
3179		return;
3180
3181	nfs4_ephemeral_harvest_forest(ntg, FALSE, FALSE);
3182}
3183
3184/*
3185 * Race along the forest and rip all of the trees out by
3186 * their rootballs!
3187 */
3188/*ARGSUSED*/
3189static void
3190nfs4_ephemeral_zsd_destroy(zoneid_t zoneid, void *arg)
3191{
3192	nfs4_trigger_globals_t	*ntg = arg;
3193
3194	if (!ntg)
3195		return;
3196
3197	nfs4_ephemeral_harvest_forest(ntg, TRUE, FALSE);
3198
3199	mutex_destroy(&ntg->ntg_forest_lock);
3200	kmem_free(ntg, sizeof (*ntg));
3201}
3202
3203/*
3204 * This is the zone independent cleanup needed for
3205 * emphemeral mount processing.
3206 */
3207void
3208nfs4_ephemeral_fini(void)
3209{
3210	(void) zone_key_delete(nfs4_ephemeral_key);
3211	mutex_destroy(&nfs4_ephemeral_thread_lock);
3212}
3213
3214/*
3215 * This is the zone independent initialization needed for
3216 * emphemeral mount processing.
3217 */
3218void
3219nfs4_ephemeral_init(void)
3220{
3221	mutex_init(&nfs4_ephemeral_thread_lock, NULL, MUTEX_DEFAULT,
3222	    NULL);
3223
3224	zone_key_create(&nfs4_ephemeral_key, nfs4_ephemeral_zsd_create,
3225	    nfs4_ephemeral_zsd_shutdown, nfs4_ephemeral_zsd_destroy);
3226}
3227
3228/*
3229 * nfssys() calls this function to set the per-zone
3230 * value of mount_to to drive when an ephemeral mount is
3231 * timed out. Each mount will grab a copy of this value
3232 * when mounted.
3233 */
3234void
3235nfs4_ephemeral_set_mount_to(uint_t mount_to)
3236{
3237	nfs4_trigger_globals_t	*ntg;
3238	zone_t			*zone = curproc->p_zone;
3239
3240	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
3241
3242	ntg->ntg_mount_to = mount_to;
3243}
3244
3245/*
3246 * Walk the list of v4 mount options; if they are currently set in vfsp,
3247 * append them to a new comma-separated mount option string, and return it.
3248 *
3249 * Caller should free by calling nfs4_trigger_destroy_mntopts().
3250 */
3251static char *
3252nfs4_trigger_create_mntopts(vfs_t *vfsp)
3253{
3254	uint_t i;
3255	char *mntopts;
3256	struct vfssw *vswp;
3257	mntopts_t *optproto;
3258
3259	mntopts = kmem_zalloc(MAX_MNTOPT_STR, KM_SLEEP);
3260
3261	/* get the list of applicable mount options for v4; locks *vswp */
3262	vswp = vfs_getvfssw(MNTTYPE_NFS4);
3263	optproto = &vswp->vsw_optproto;
3264
3265	for (i = 0; i < optproto->mo_count; i++) {
3266		struct mntopt *mop = &optproto->mo_list[i];
3267
3268		if (mop->mo_flags & MO_EMPTY)
3269			continue;
3270
3271		if (nfs4_trigger_add_mntopt(mntopts, mop->mo_name, vfsp)) {
3272			kmem_free(mntopts, MAX_MNTOPT_STR);
3273			vfs_unrefvfssw(vswp);
3274			return (NULL);
3275		}
3276	}
3277
3278	vfs_unrefvfssw(vswp);
3279
3280	/*
3281	 * MNTOPT_XATTR is not in the v4 mount opt proto list,
3282	 * and it may only be passed via MS_OPTIONSTR, so we
3283	 * must handle it here.
3284	 *
3285	 * Ideally, it would be in the list, but NFS does not specify its
3286	 * own opt proto list, it uses instead the default one. Since
3287	 * not all filesystems support extended attrs, it would not be
3288	 * appropriate to add it there.
3289	 */
3290	if (nfs4_trigger_add_mntopt(mntopts, MNTOPT_XATTR, vfsp) ||
3291	    nfs4_trigger_add_mntopt(mntopts, MNTOPT_NOXATTR, vfsp)) {
3292		kmem_free(mntopts, MAX_MNTOPT_STR);
3293		return (NULL);
3294	}
3295
3296	return (mntopts);
3297}
3298
3299static void
3300nfs4_trigger_destroy_mntopts(char *mntopts)
3301{
3302	if (mntopts)
3303		kmem_free(mntopts, MAX_MNTOPT_STR);
3304}
3305
3306/*
3307 * Check a single mount option (optname). Add to mntopts if it is set in VFS.
3308 */
3309static int
3310nfs4_trigger_add_mntopt(char *mntopts, char *optname, vfs_t *vfsp)
3311{
3312	if (mntopts == NULL || optname == NULL || vfsp == NULL)
3313		return (EINVAL);
3314
3315	if (vfs_optionisset(vfsp, optname, NULL)) {
3316		size_t mntoptslen = strlen(mntopts);
3317		size_t optnamelen = strlen(optname);
3318
3319		/* +1 for ',', +1 for NUL */
3320		if (mntoptslen + optnamelen + 2 > MAX_MNTOPT_STR)
3321			return (EOVERFLOW);
3322
3323		/* first or subsequent mount option? */
3324		if (*mntopts != '\0')
3325			(void) strcat(mntopts, ",");
3326
3327		(void) strcat(mntopts, optname);
3328	}
3329
3330	return (0);
3331}
3332
3333static enum clnt_stat
3334nfs4_ping_server_common(struct knetconfig *knc, struct netbuf *addr, int nointr)
3335{
3336	int retries;
3337	uint_t max_msgsize;
3338	enum clnt_stat status;
3339	CLIENT *cl;
3340	struct timeval timeout;
3341
3342	/* as per recov_newserver() */
3343	max_msgsize = 0;
3344	retries = 1;
3345	timeout.tv_sec = 2;
3346	timeout.tv_usec = 0;
3347
3348	if (clnt_tli_kcreate(knc, addr, NFS_PROGRAM, NFS_V4,
3349	    max_msgsize, retries, CRED(), &cl) != 0)
3350		return (RPC_FAILED);
3351
3352	if (nointr)
3353		cl->cl_nosignal = TRUE;
3354	status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, xdr_void, NULL,
3355	    timeout);
3356	if (nointr)
3357		cl->cl_nosignal = FALSE;
3358
3359	AUTH_DESTROY(cl->cl_auth);
3360	CLNT_DESTROY(cl);
3361
3362	return (status);
3363}
3364
3365static enum clnt_stat
3366nfs4_trigger_ping_server(servinfo4_t *svp, int nointr)
3367{
3368	return (nfs4_ping_server_common(svp->sv_knconf, &svp->sv_addr, nointr));
3369}
3370