/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Support for ephemeral mounts, e.g. mirror-mounts. These mounts are
 * triggered from a "stub" rnode via a special set of vnodeops.
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/file.h>
#include <sys/filio.h>
#include <sys/uio.h>
#include <sys/buf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/dirent.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/swap.h>
#include <sys/errno.h>
#include <sys/strsubr.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/mount.h>
#include <sys/cmn_err.h>
#include <sys/pathconf.h>
#include <sys/utsname.h>
#include <sys/dnlc.h>
#include <sys/acl.h>
#include <sys/systeminfo.h>
#include <sys/policy.h>
#include <sys/sdt.h>
#include <sys/list.h>
#include <sys/stat.h>
#include <sys/mntent.h>
#include <sys/priv.h>

#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs_clnt.h>
#include <nfs/nfs_acl.h>
#include <nfs/lm.h>
#include <nfs/nfs4.h>
#include <nfs/nfs4_kprot.h>
#include <nfs/rnode4.h>
#include <nfs/nfs4_clnt.h>
#include <nfs/nfsid_map.h>
#include <nfs/nfs4_idmap_impl.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kpm.h>
#include <vm/seg_vn.h>

#include <fs/fs_subr.h>

#include <sys/ddi.h>
#include <sys/int_fmtio.h>
#include <sys/sunddi.h>
#include <sys/priv_names.h>

extern zone_key_t nfs4clnt_zone_key;
extern zone_key_t nfsidmap_zone_key;

/*
 * The automatic unmounter thread stuff!
 */
static int nfs4_trigger_thread_timer = 20;	/* in seconds */

/*
 * Just a default....
 */
static uint_t nfs4_trigger_mount_to = 240;

typedef struct nfs4_trigger_globals {
	kmutex_t		ntg_forest_lock;
	uint_t			ntg_mount_to;
	int			ntg_thread_started;
	nfs4_ephemeral_tree_t	*ntg_forest;
} nfs4_trigger_globals_t;

kmutex_t	nfs4_ephemeral_thread_lock;

zone_key_t	nfs4_ephemeral_key = ZONE_KEY_UNINITIALIZED;

static void	nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *);

/*
 * Used for ephemeral mounts; contains data either duplicated from
 * servinfo4_t, or hand-crafted, depending on type of ephemeral mount.
 *
 * It's intended that this structure is used solely for ephemeral
 * mount-type specific data, for passing this data to
 * nfs4_trigger_nargs_create().
 */
typedef struct ephemeral_servinfo {
	char			*esi_hostname;
	char			*esi_netname;
	char			*esi_path;
	int			esi_path_len;
	int			esi_mount_flags;
	struct netbuf		*esi_addr;
	struct netbuf		*esi_syncaddr;
	struct knetconfig	*esi_knconf;
} ephemeral_servinfo_t;

/*
 * Collect together the mount-type specific and generic data args.
 */
typedef struct domount_args {
	ephemeral_servinfo_t	*dma_esi;
	char			*dma_hostlist;	/* comma-sep.
for RO failover */ struct nfs_args *dma_nargs; } domount_args_t; /* * The vnode ops functions for a trigger stub vnode */ static int nfs4_trigger_open(vnode_t **, int, cred_t *, caller_context_t *); static int nfs4_trigger_getattr(vnode_t *, struct vattr *, int, cred_t *, caller_context_t *); static int nfs4_trigger_setattr(vnode_t *, struct vattr *, int, cred_t *, caller_context_t *); static int nfs4_trigger_access(vnode_t *, int, int, cred_t *, caller_context_t *); static int nfs4_trigger_readlink(vnode_t *, struct uio *, cred_t *, caller_context_t *); static int nfs4_trigger_lookup(vnode_t *, char *, vnode_t **, struct pathname *, int, vnode_t *, cred_t *, caller_context_t *, int *, pathname_t *); static int nfs4_trigger_create(vnode_t *, char *, struct vattr *, enum vcexcl, int, vnode_t **, cred_t *, int, caller_context_t *, vsecattr_t *); static int nfs4_trigger_remove(vnode_t *, char *, cred_t *, caller_context_t *, int); static int nfs4_trigger_link(vnode_t *, vnode_t *, char *, cred_t *, caller_context_t *, int); static int nfs4_trigger_rename(vnode_t *, char *, vnode_t *, char *, cred_t *, caller_context_t *, int); static int nfs4_trigger_mkdir(vnode_t *, char *, struct vattr *, vnode_t **, cred_t *, caller_context_t *, int, vsecattr_t *vsecp); static int nfs4_trigger_rmdir(vnode_t *, char *, vnode_t *, cred_t *, caller_context_t *, int); static int nfs4_trigger_symlink(vnode_t *, char *, struct vattr *, char *, cred_t *, caller_context_t *, int); static int nfs4_trigger_cmp(vnode_t *, vnode_t *, caller_context_t *); /* * Regular NFSv4 vnodeops that we need to reference directly */ extern int nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *, caller_context_t *); extern void nfs4_inactive(vnode_t *, cred_t *, caller_context_t *); extern int nfs4_rwlock(vnode_t *, int, caller_context_t *); extern void nfs4_rwunlock(vnode_t *, int, caller_context_t *); extern int nfs4_lookup(vnode_t *, char *, vnode_t **, struct pathname *, int, vnode_t *, cred_t *, caller_context_t *, int *, pathname_t *); extern int nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *, caller_context_t *); extern int nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *, caller_context_t *); extern int nfs4_fid(vnode_t *, fid_t *, caller_context_t *); extern int nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *); static int nfs4_trigger_mount(vnode_t *, cred_t *, vnode_t **); static int nfs4_trigger_domount(vnode_t *, domount_args_t *, vfs_t **, cred_t *, vnode_t **); static int nfs4_trigger_domount_args_create(vnode_t *, cred_t *, domount_args_t **dmap); static void nfs4_trigger_domount_args_destroy(domount_args_t *dma, vnode_t *vp); static ephemeral_servinfo_t *nfs4_trigger_esi_create(vnode_t *, servinfo4_t *, cred_t *); static void nfs4_trigger_esi_destroy(ephemeral_servinfo_t *, vnode_t *); static ephemeral_servinfo_t *nfs4_trigger_esi_create_mirrormount(vnode_t *, servinfo4_t *); static ephemeral_servinfo_t *nfs4_trigger_esi_create_referral(vnode_t *, cred_t *); static struct nfs_args *nfs4_trigger_nargs_create(mntinfo4_t *, servinfo4_t *, ephemeral_servinfo_t *); static void nfs4_trigger_nargs_destroy(struct nfs_args *); static char *nfs4_trigger_create_mntopts(vfs_t *); static void nfs4_trigger_destroy_mntopts(char *); static int nfs4_trigger_add_mntopt(char *, char *, vfs_t *); static enum clnt_stat nfs4_trigger_ping_server(servinfo4_t *, int); static enum clnt_stat nfs4_ping_server_common(struct knetconfig *, struct netbuf *, int); extern int umount2_engine(vfs_t *, int, cred_t *, int); vnodeops_t 
*nfs4_trigger_vnodeops; /* * These are the vnodeops that we must define for stub vnodes. * * * Many of the VOPs defined for NFSv4 do not need to be defined here, * for various reasons. This will result in the VFS default function being * used: * * - These VOPs require a previous VOP_OPEN to have occurred. That will have * lost the reference to the stub vnode, meaning these should not be called: * close, read, write, ioctl, readdir, seek. * * - These VOPs are meaningless for vnodes without data pages. Since the * stub vnode is of type VDIR, these should not be called: * space, getpage, putpage, map, addmap, delmap, pageio, fsync. * * - These VOPs are otherwise not applicable, and should not be called: * dump, setsecattr. * * * These VOPs we do not want to define, but nor do we want the VFS default * action. Instead, we specify the VFS error function, with fs_error(), but * note that fs_error() is not actually called. Instead it results in the * use of the error function defined for the particular VOP, in vn_ops_table[]: * * - frlock, dispose, shrlock. * * * These VOPs we define to use the corresponding regular NFSv4 vnodeop. * NOTE: if any of these ops involve an OTW call with the stub FH, then * that call must be wrapped with save_mnt_secinfo()/check_mnt_secinfo() * to protect the security data in the servinfo4_t for the "parent" * filesystem that contains the stub. * * - These VOPs should not trigger a mount, so that "ls -l" does not: * pathconf, getsecattr. * * - These VOPs would not make sense to trigger: * inactive, rwlock, rwunlock, fid, realvp. */ const fs_operation_def_t nfs4_trigger_vnodeops_template[] = { VOPNAME_OPEN, { .vop_open = nfs4_trigger_open }, VOPNAME_GETATTR, { .vop_getattr = nfs4_trigger_getattr }, VOPNAME_SETATTR, { .vop_setattr = nfs4_trigger_setattr }, VOPNAME_ACCESS, { .vop_access = nfs4_trigger_access }, VOPNAME_LOOKUP, { .vop_lookup = nfs4_trigger_lookup }, VOPNAME_CREATE, { .vop_create = nfs4_trigger_create }, VOPNAME_REMOVE, { .vop_remove = nfs4_trigger_remove }, VOPNAME_LINK, { .vop_link = nfs4_trigger_link }, VOPNAME_RENAME, { .vop_rename = nfs4_trigger_rename }, VOPNAME_MKDIR, { .vop_mkdir = nfs4_trigger_mkdir }, VOPNAME_RMDIR, { .vop_rmdir = nfs4_trigger_rmdir }, VOPNAME_SYMLINK, { .vop_symlink = nfs4_trigger_symlink }, VOPNAME_READLINK, { .vop_readlink = nfs4_trigger_readlink }, VOPNAME_INACTIVE, { .vop_inactive = nfs4_inactive }, VOPNAME_FID, { .vop_fid = nfs4_fid }, VOPNAME_RWLOCK, { .vop_rwlock = nfs4_rwlock }, VOPNAME_RWUNLOCK, { .vop_rwunlock = nfs4_rwunlock }, VOPNAME_REALVP, { .vop_realvp = nfs4_realvp }, VOPNAME_GETSECATTR, { .vop_getsecattr = nfs4_getsecattr }, VOPNAME_PATHCONF, { .vop_pathconf = nfs4_pathconf }, VOPNAME_FRLOCK, { .error = fs_error }, VOPNAME_DISPOSE, { .error = fs_error }, VOPNAME_SHRLOCK, { .error = fs_error }, VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, NULL, NULL }; static void nfs4_ephemeral_tree_incr(nfs4_ephemeral_tree_t *net) { ASSERT(mutex_owned(&net->net_cnt_lock)); net->net_refcnt++; ASSERT(net->net_refcnt != 0); } static void nfs4_ephemeral_tree_hold(nfs4_ephemeral_tree_t *net) { mutex_enter(&net->net_cnt_lock); nfs4_ephemeral_tree_incr(net); mutex_exit(&net->net_cnt_lock); } /* * We need a safe way to decrement the refcnt whilst the * lock is being held. 
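 *
 * For example (a sketch of code appearing later in this file),
 * nfs4_trigger_mount() checks the tree's status and drops its
 * reference atomically under net_cnt_lock:
 *
 *	mutex_enter(&net->net_cnt_lock);
 *	if (net->net_status & NFS4_EPHEMERAL_TREE_PROCESSING) {
 *		nfs4_ephemeral_tree_decr(net);
 *		mutex_exit(&net->net_cnt_lock);
 *		(bail out)
 *	}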
 */
static void
nfs4_ephemeral_tree_decr(nfs4_ephemeral_tree_t *net)
{
	ASSERT(mutex_owned(&net->net_cnt_lock));
	ASSERT(net->net_refcnt != 0);
	net->net_refcnt--;
}

static void
nfs4_ephemeral_tree_rele(nfs4_ephemeral_tree_t *net)
{
	mutex_enter(&net->net_cnt_lock);
	nfs4_ephemeral_tree_decr(net);
	mutex_exit(&net->net_cnt_lock);
}

/*
 * Trigger ops for stub vnodes; for mirror mounts, etc.
 *
 * The general idea is that a "triggering" op will first call
 * nfs4_trigger_mount(), which will find out whether a mount has already
 * been triggered.
 *
 * If it has, then nfs4_trigger_mount() sets newvp to the root vnode
 * of the covering vfs.
 *
 * If a mount has not yet been triggered, nfs4_trigger_mount() will do so,
 * and again set newvp, as above.
 *
 * The triggering op may then re-issue the VOP by calling it on newvp.
 *
 * Note that some ops may perform custom action, and may or may not need
 * to trigger a mount.
 *
 * Some ops need to call the regular NFSv4 vnodeop for a stub vnode. We
 * obviously can't do this with VOP_<whatever>, since it's a stub vnode
 * and that would just recurse. Instead, we call the v4 op directly,
 * by name. This is OK, since we know that the vnode is for NFSv4,
 * otherwise it couldn't be a stub.
 *
 */
static int
nfs4_trigger_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	int error;
	vnode_t *newvp;

	error = nfs4_trigger_mount(*vpp, cr, &newvp);
	if (error)
		return (error);

	/* Release the stub vnode, as we're losing the reference to it */
	VN_RELE(*vpp);

	/* Give the caller the root vnode of the newly-mounted fs */
	*vpp = newvp;

	/* return with VN_HELD(newvp) */
	return (VOP_OPEN(vpp, flag, cr, ct));
}

void
nfs4_fake_attrs(vnode_t *vp, struct vattr *vap)
{
	uint_t mask;
	timespec_t now;

	/*
	 * Set some attributes here for referrals.
	 */
	mask = vap->va_mask;
	bzero(vap, sizeof (struct vattr));
	vap->va_mask = mask;
	vap->va_uid = 0;
	vap->va_gid = 0;
	vap->va_nlink = 1;
	vap->va_size = 1;
	gethrestime(&now);
	vap->va_atime = now;
	vap->va_mtime = now;
	vap->va_ctime = now;
	vap->va_type = VDIR;
	vap->va_mode = 0555;
	vap->va_fsid = vp->v_vfsp->vfs_dev;
	vap->va_rdev = 0;
	vap->va_blksize = MAXBSIZE;
	vap->va_nblocks = 1;
	vap->va_seq = 0;
}

/*
 * For the majority of cases, nfs4_trigger_getattr() will not trigger
 * a mount. However, if ATTR_TRIGGER is set, we are being informed
 * that we need to force the mount before we attempt to determine
 * the attributes. The intent is an atomic operation for security
 * testing.
 *
 * If we're not triggering a mount, we can still inquire about the
 * actual attributes from the server in the mirror mount case,
 * and will return manufactured attributes for a referral (see
 * the 'create' branch of find_referral_stubvp()).
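 *
 * As an illustrative sketch of the general triggering pattern
 * described further above (nfs4_trigger_setattr() below is a real
 * instance of it):
 *
 *	error = nfs4_trigger_mount(vp, cr, &newvp);
 *	if (error)
 *		return (error);
 *	error = VOP_SETATTR(newvp, vap, flags, cr, ct);
 *	VN_RELE(newvp);
 *	return (error);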
 */
static int
nfs4_trigger_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	int error;

	if (flags & ATTR_TRIGGER) {
		vnode_t *newvp;

		error = nfs4_trigger_mount(vp, cr, &newvp);
		if (error)
			return (error);

		error = VOP_GETATTR(newvp, vap, flags, cr, ct);
		VN_RELE(newvp);
	} else if (RP_ISSTUB_MIRRORMOUNT(VTOR4(vp))) {
		error = nfs4_getattr(vp, vap, flags, cr, ct);
	} else if (RP_ISSTUB_REFERRAL(VTOR4(vp))) {
		nfs4_fake_attrs(vp, vap);
		error = 0;
	}

	return (error);
}

static int
nfs4_trigger_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	vnode_t *newvp;

	error = nfs4_trigger_mount(vp, cr, &newvp);
	if (error)
		return (error);

	error = VOP_SETATTR(newvp, vap, flags, cr, ct);
	VN_RELE(newvp);

	return (error);
}

static int
nfs4_trigger_access(vnode_t *vp, int mode, int flags, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	vnode_t *newvp;

	error = nfs4_trigger_mount(vp, cr, &newvp);
	if (error)
		return (error);

	error = VOP_ACCESS(newvp, mode, flags, cr, ct);
	VN_RELE(newvp);

	return (error);
}

static int
nfs4_trigger_lookup(vnode_t *dvp, char *nm, vnode_t **vpp,
    struct pathname *pnp, int flags, vnode_t *rdir, cred_t *cr,
    caller_context_t *ct, int *deflags, pathname_t *rpnp)
{
	int error;
	vnode_t *newdvp;
	rnode4_t *drp = VTOR4(dvp);

	ASSERT(RP_ISSTUB(drp));

	/*
	 * It's not legal to lookup ".." for an fs root, so we mustn't pass
	 * that up. Instead, pass onto the regular op, regardless of whether
	 * we've triggered a mount.
	 */
	if (strcmp(nm, "..") == 0) {
		if (RP_ISSTUB_MIRRORMOUNT(drp)) {
			return (nfs4_lookup(dvp, nm, vpp, pnp, flags, rdir,
			    cr, ct, deflags, rpnp));
		} else if (RP_ISSTUB_REFERRAL(drp)) {
			/* Return the parent vnode */
			return (vtodv(dvp, vpp, cr, TRUE));
		}
	}

	error = nfs4_trigger_mount(dvp, cr, &newdvp);
	if (error)
		return (error);

	error = VOP_LOOKUP(newdvp, nm, vpp, pnp, flags, rdir, cr, ct,
	    deflags, rpnp);
	VN_RELE(newdvp);

	return (error);
}

static int
nfs4_trigger_create(vnode_t *dvp, char *nm, struct vattr *va,
    enum vcexcl exclusive, int mode, vnode_t **vpp, cred_t *cr,
    int flags, caller_context_t *ct, vsecattr_t *vsecp)
{
	int error;
	vnode_t *newdvp;

	error = nfs4_trigger_mount(dvp, cr, &newdvp);
	if (error)
		return (error);

	error = VOP_CREATE(newdvp, nm, va, exclusive, mode, vpp, cr,
	    flags, ct, vsecp);
	VN_RELE(newdvp);

	return (error);
}

static int
nfs4_trigger_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct,
    int flags)
{
	int error;
	vnode_t *newdvp;

	error = nfs4_trigger_mount(dvp, cr, &newdvp);
	if (error)
		return (error);

	error = VOP_REMOVE(newdvp, nm, cr, ct, flags);
	VN_RELE(newdvp);

	return (error);
}

static int
nfs4_trigger_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
    caller_context_t *ct, int flags)
{
	int error;
	vnode_t *newtdvp;

	error = nfs4_trigger_mount(tdvp, cr, &newtdvp);
	if (error)
		return (error);

	/*
	 * We don't check whether svp is a stub. Let the NFSv4 code
	 * detect that error, and return accordingly.
	 */
	error = VOP_LINK(newtdvp, svp, tnm, cr, ct, flags);
	VN_RELE(newtdvp);

	return (error);
}

static int
nfs4_trigger_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
    cred_t *cr, caller_context_t *ct, int flags)
{
	int error;
	vnode_t *newsdvp;
	rnode4_t *tdrp = VTOR4(tdvp);

	/*
	 * We know that sdvp is a stub, otherwise we would not be here.
	 *
	 * If tdvp is also a stub, there are two possibilities: it
	 * is either the same stub as sdvp [i.e. VN_CMP(sdvp, tdvp)]
	 * or it is a different stub [!VN_CMP(sdvp, tdvp)].
* * In the former case, just trigger sdvp, and treat tdvp as * though it were not a stub. * * In the latter case, it might be a different stub for the * same server fs as sdvp, or for a different server fs. * Regardless, from the client perspective this would still * be a cross-filesystem rename, and should not be allowed, * so return EXDEV, without triggering either mount. */ if (RP_ISSTUB(tdrp) && !VN_CMP(sdvp, tdvp)) return (EXDEV); error = nfs4_trigger_mount(sdvp, cr, &newsdvp); if (error) return (error); error = VOP_RENAME(newsdvp, snm, tdvp, tnm, cr, ct, flags); VN_RELE(newsdvp); return (error); } /* ARGSUSED */ static int nfs4_trigger_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp) { int error; vnode_t *newdvp; error = nfs4_trigger_mount(dvp, cr, &newdvp); if (error) return (error); error = VOP_MKDIR(newdvp, nm, va, vpp, cr, ct, flags, vsecp); VN_RELE(newdvp); return (error); } static int nfs4_trigger_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr, caller_context_t *ct, int flags) { int error; vnode_t *newdvp; error = nfs4_trigger_mount(dvp, cr, &newdvp); if (error) return (error); error = VOP_RMDIR(newdvp, nm, cdir, cr, ct, flags); VN_RELE(newdvp); return (error); } static int nfs4_trigger_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr, caller_context_t *ct, int flags) { int error; vnode_t *newdvp; error = nfs4_trigger_mount(dvp, cr, &newdvp); if (error) return (error); error = VOP_SYMLINK(newdvp, lnm, tva, tnm, cr, ct, flags); VN_RELE(newdvp); return (error); } static int nfs4_trigger_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct) { int error; vnode_t *newvp; error = nfs4_trigger_mount(vp, cr, &newvp); if (error) return (error); error = VOP_READLINK(newvp, uiop, cr, ct); VN_RELE(newvp); return (error); } /* end of trigger vnode ops */ /* * See if the mount has already been done by another caller. */ static int nfs4_trigger_mounted_already(vnode_t *vp, vnode_t **newvpp, bool_t *was_mounted, vfs_t **vfsp) { int error; mntinfo4_t *mi = VTOMI4(vp); *was_mounted = FALSE; error = vn_vfsrlock_wait(vp); if (error) return (error); *vfsp = vn_mountedvfs(vp); if (*vfsp != NULL) { /* the mount has already occurred */ error = VFS_ROOT(*vfsp, newvpp); if (!error) { /* need to update the reference time */ mutex_enter(&mi->mi_lock); if (mi->mi_ephemeral) mi->mi_ephemeral->ne_ref_time = gethrestime_sec(); mutex_exit(&mi->mi_lock); *was_mounted = TRUE; } } vn_vfsunlock(vp); return (0); } /* * Mount upon a trigger vnode; for mirror-mounts, referrals, etc. * * The mount may have already occurred, via another thread. If not, * assemble the location information - which may require fetching - and * perform the mount. * * Sets newvp to be the root of the fs that is now covering vp. Note * that we return with VN_HELD(*newvp). * * The caller is responsible for passing the VOP onto the covering fs. */ static int nfs4_trigger_mount(vnode_t *vp, cred_t *cr, vnode_t **newvpp) { int error; vfs_t *vfsp; rnode4_t *rp = VTOR4(vp); mntinfo4_t *mi = VTOMI4(vp); domount_args_t *dma; nfs4_ephemeral_tree_t *net; bool_t must_unlock = FALSE; bool_t is_building = FALSE; bool_t was_mounted = FALSE; cred_t *mcred = NULL; nfs4_trigger_globals_t *ntg; zone_t *zone = curproc->p_zone; ASSERT(RP_ISSTUB(rp)); *newvpp = NULL; /* * Has the mount already occurred? 
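	 * (If so, nfs4_trigger_mounted_already() sets was_mounted and
	 * returns the held root vnode of the covering vfs, so we are done.)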
	 */
	error = nfs4_trigger_mounted_already(vp, newvpp, &was_mounted, &vfsp);
	if (error || was_mounted)
		goto done;

	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
	ASSERT(ntg != NULL);

	mutex_enter(&mi->mi_lock);

	/*
	 * We need to lock down the ephemeral tree.
	 */
	if (mi->mi_ephemeral_tree == NULL) {
		net = kmem_zalloc(sizeof (*net), KM_SLEEP);
		mutex_init(&net->net_tree_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&net->net_cnt_lock, NULL, MUTEX_DEFAULT, NULL);
		net->net_refcnt = 1;
		net->net_status = NFS4_EPHEMERAL_TREE_BUILDING;
		is_building = TRUE;

		/*
		 * We need to add it to the zone specific list for
		 * automatic unmounting and harvesting of deadwood.
		 */
		mutex_enter(&ntg->ntg_forest_lock);
		if (ntg->ntg_forest != NULL)
			net->net_next = ntg->ntg_forest;
		ntg->ntg_forest = net;
		mutex_exit(&ntg->ntg_forest_lock);

		/*
		 * No lock order confusion with mi_lock because no
		 * other node could have grabbed net_tree_lock.
		 */
		mutex_enter(&net->net_tree_lock);
		mi->mi_ephemeral_tree = net;
		net->net_mount = mi;
		mutex_exit(&mi->mi_lock);

		MI4_HOLD(mi);
		VFS_HOLD(mi->mi_vfsp);
	} else {
		net = mi->mi_ephemeral_tree;
		nfs4_ephemeral_tree_hold(net);

		mutex_exit(&mi->mi_lock);

		mutex_enter(&net->net_tree_lock);

		/*
		 * We can only proceed if the tree is neither locked
		 * nor being torn down.
		 */
		mutex_enter(&net->net_cnt_lock);
		if (net->net_status & NFS4_EPHEMERAL_TREE_PROCESSING) {
			nfs4_ephemeral_tree_decr(net);
			mutex_exit(&net->net_cnt_lock);
			mutex_exit(&net->net_tree_lock);

			return (EIO);
		}
		mutex_exit(&net->net_cnt_lock);
	}

	mutex_enter(&net->net_cnt_lock);
	net->net_status |= NFS4_EPHEMERAL_TREE_MOUNTING;
	mutex_exit(&net->net_cnt_lock);

	must_unlock = TRUE;

	error = nfs4_trigger_domount_args_create(vp, cr, &dma);
	if (error)
		goto done;

	/*
	 * Note that since we define mirror mounts to work
	 * for any user, we simply extend the privileges of
	 * the user's credentials to allow the mount to
	 * proceed.
	 */
	mcred = crdup(cr);
	if (mcred == NULL) {
		error = EINVAL;
		nfs4_trigger_domount_args_destroy(dma, vp);
		goto done;
	}

	crset_zone_privall(mcred);
	if (is_system_labeled())
		(void) setpflags(NET_MAC_AWARE, 1, mcred);

	error = nfs4_trigger_domount(vp, dma, &vfsp, mcred, newvpp);
	nfs4_trigger_domount_args_destroy(dma, vp);

	DTRACE_PROBE2(nfs4clnt__func__referral__mount,
	    vnode_t *, vp, int, error);

	crfree(mcred);

done:

	if (must_unlock) {
		mutex_enter(&net->net_cnt_lock);
		net->net_status &= ~NFS4_EPHEMERAL_TREE_MOUNTING;

		/*
		 * REFCNT: If we are the root of the tree, then we need
		 * to keep a reference because we malloced the tree and
		 * this is where we tied it to our mntinfo.
		 *
		 * If we are not the root of the tree, then our tie to
		 * the mntinfo occurred elsewhere and we need to
		 * decrement the reference to the tree.
		 */
		if (is_building)
			net->net_status &= ~NFS4_EPHEMERAL_TREE_BUILDING;
		else
			nfs4_ephemeral_tree_decr(net);
		mutex_exit(&net->net_cnt_lock);

		mutex_exit(&net->net_tree_lock);
	}

	if (!error && (newvpp == NULL || *newvpp == NULL))
		error = ENOSYS;

	return (error);
}

/*
 * Collect together both the generic & mount-type specific args.
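 *
 * A sketch of the result for a mount with RO failover across several
 * servers: dma_esi describes the server actually contacted, while the
 * per-server nfs_args are chained:
 *
 *	dma_nargs -> nargs(serverA)
 *	               .nfs_ext_u.nfs_extB.next -> nargs(serverB) -> NULL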
 */
static int
nfs4_trigger_domount_args_create(vnode_t *vp, cred_t *cr, domount_args_t **dmap)
{
	int nointr;
	char *hostlist;
	servinfo4_t *svp;
	struct nfs_args *nargs, *nargs_head;
	enum clnt_stat status;
	ephemeral_servinfo_t *esi, *esi_first;
	domount_args_t *dma;
	mntinfo4_t *mi = VTOMI4(vp);

	nointr = !(mi->mi_flags & MI4_INT);
	hostlist = kmem_zalloc(MAXPATHLEN, KM_SLEEP);

	svp = mi->mi_curr_serv;
	/* check if the current server is responding */
	status = nfs4_trigger_ping_server(svp, nointr);
	if (status == RPC_SUCCESS) {
		esi_first = nfs4_trigger_esi_create(vp, svp, cr);
		if (esi_first == NULL) {
			kmem_free(hostlist, MAXPATHLEN);
			return (EINVAL);
		}

		(void) strlcpy(hostlist, esi_first->esi_hostname, MAXPATHLEN);

		nargs_head = nfs4_trigger_nargs_create(mi, svp, esi_first);
	} else {
		/* current server did not respond */
		esi_first = NULL;
		nargs_head = NULL;
	}
	nargs = nargs_head;

	/*
	 * NFS RO failover.
	 *
	 * If we have multiple servinfo4 structures, linked via sv_next,
	 * we must create one nfs_args for each, linking the nfs_args via
	 * nfs_ext_u.nfs_extB.next.
	 *
	 * We need to build a corresponding esi for each, too, but that is
	 * used solely for building nfs_args, and may be immediately
	 * discarded, as domount() requires the info from just one esi,
	 * but all the nfs_args.
	 *
	 * Currently, the NFS mount code will hang if not all servers
	 * requested are available. To avoid that, we need to ping each
	 * server, here, and remove it from the list if it is not
	 * responding. This has the side-effect of that server then
	 * being permanently unavailable for this failover mount, even if
	 * it recovers. That's unfortunate, but the best we can do until
	 * the mount code path is fixed.
	 */

	/*
	 * If the current server was down, loop indefinitely until we find
	 * at least one responsive server.
	 */
	do {
		/* no locking needed for sv_next; it is only set at fs mount */
		for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
			struct nfs_args *next;

			/*
			 * nargs_head: the head of the nfs_args list
			 * nargs: the current tail of the list
			 * next: the newly-created element to be added
			 */

			/*
			 * We've already tried the current server, above;
			 * if it was responding, we have already included it
			 * and it may now be ignored.
			 *
			 * Otherwise, try it again, since it may now have
			 * recovered.
			 */
			if (svp == mi->mi_curr_serv && esi_first != NULL)
				continue;

			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
			if (svp->sv_flags & SV4_NOTINUSE) {
				nfs_rw_exit(&svp->sv_lock);
				continue;
			}
			nfs_rw_exit(&svp->sv_lock);

			/* check if the server is responding */
			status = nfs4_trigger_ping_server(svp, nointr);
			if (status == RPC_INTR) {
				kmem_free(hostlist, MAXPATHLEN);
				nfs4_trigger_esi_destroy(esi_first, vp);
				nargs = nargs_head;
				while (nargs != NULL) {
					next = nargs->nfs_ext_u.nfs_extB.next;
					nfs4_trigger_nargs_destroy(nargs);
					nargs = next;
				}
				return (EINTR);
			} else if (status != RPC_SUCCESS) {
				/* if the server did not respond, ignore it */
				continue;
			}

			esi = nfs4_trigger_esi_create(vp, svp, cr);
			if (esi == NULL)
				continue;

			/*
			 * If the original current server (mi_curr_serv)
			 * was down when we first tried it,
			 * (i.e. esi_first == NULL),
			 * we select this new server (svp) to be the server
			 * that we will actually contact (esi_first).
			 *
			 * Note that it's possible that mi_curr_serv == svp,
			 * if that mi_curr_serv was down but has now recovered.
			 */
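			/*
			 * Link this server's nfs_args onto the chain and
			 * append its name to the comma-separated hostlist
			 * (e.g. "serverA" becomes "serverA,serverB").
			 */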
*/ next = nfs4_trigger_nargs_create(mi, svp, esi); if (esi_first == NULL) { ASSERT(nargs == NULL); ASSERT(nargs_head == NULL); nargs_head = next; esi_first = esi; (void) strlcpy(hostlist, esi_first->esi_hostname, MAXPATHLEN); } else { ASSERT(nargs_head != NULL); nargs->nfs_ext_u.nfs_extB.next = next; (void) strlcat(hostlist, ",", MAXPATHLEN); (void) strlcat(hostlist, esi->esi_hostname, MAXPATHLEN); /* esi was only needed for hostname & nargs */ nfs4_trigger_esi_destroy(esi, vp); } nargs = next; } /* if we've had no response at all, wait a second */ if (esi_first == NULL) delay(drv_usectohz(1000000)); } while (esi_first == NULL); ASSERT(nargs_head != NULL); dma = kmem_zalloc(sizeof (domount_args_t), KM_SLEEP); dma->dma_esi = esi_first; dma->dma_hostlist = hostlist; dma->dma_nargs = nargs_head; *dmap = dma; return (0); } static void nfs4_trigger_domount_args_destroy(domount_args_t *dma, vnode_t *vp) { if (dma != NULL) { if (dma->dma_esi != NULL && vp != NULL) nfs4_trigger_esi_destroy(dma->dma_esi, vp); if (dma->dma_hostlist != NULL) kmem_free(dma->dma_hostlist, MAXPATHLEN); if (dma->dma_nargs != NULL) { struct nfs_args *nargs = dma->dma_nargs; do { struct nfs_args *next = nargs->nfs_ext_u.nfs_extB.next; nfs4_trigger_nargs_destroy(nargs); nargs = next; } while (nargs != NULL); } kmem_free(dma, sizeof (domount_args_t)); } } /* * The ephemeral_servinfo_t struct contains basic information we will need to * perform the mount. Whilst the structure is generic across different * types of ephemeral mount, the way we gather its contents differs. */ static ephemeral_servinfo_t * nfs4_trigger_esi_create(vnode_t *vp, servinfo4_t *svp, cred_t *cr) { ephemeral_servinfo_t *esi; rnode4_t *rp = VTOR4(vp); ASSERT(RP_ISSTUB(rp)); /* Call the ephemeral type-specific routine */ if (RP_ISSTUB_MIRRORMOUNT(rp)) esi = nfs4_trigger_esi_create_mirrormount(vp, svp); else if (RP_ISSTUB_REFERRAL(rp)) esi = nfs4_trigger_esi_create_referral(vp, cr); else esi = NULL; return (esi); } static void nfs4_trigger_esi_destroy(ephemeral_servinfo_t *esi, vnode_t *vp) { rnode4_t *rp = VTOR4(vp); ASSERT(RP_ISSTUB(rp)); /* Currently, no need for an ephemeral type-specific routine */ /* * The contents of ephemeral_servinfo_t goes into nfs_args, * and will be handled by nfs4_trigger_nargs_destroy(). * We need only free the structure itself. */ if (esi != NULL) kmem_free(esi, sizeof (ephemeral_servinfo_t)); } /* * Some of this may turn out to be common with other ephemeral types, * in which case it should be moved to nfs4_trigger_esi_create(), or a * common function called. */ /* * Mirror mounts case - should have all data available */ static ephemeral_servinfo_t * nfs4_trigger_esi_create_mirrormount(vnode_t *vp, servinfo4_t *svp) { char *stubpath; struct knetconfig *sikncp, *svkncp; struct netbuf *bufp; ephemeral_servinfo_t *esi; esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP); /* initially set to be our type of ephemeral mount; may be added to */ esi->esi_mount_flags = NFSMNT_MIRRORMOUNT; /* * We're copying info from the stub rnode's servinfo4, but * we must create new copies, not pointers, since this information * is to be associated with the new mount, which will be * unmounted (and its structures freed) separately */ /* * Sizes passed to kmem_[z]alloc here must match those freed * in nfs4_free_args() */ /* * We hold sv_lock across kmem_zalloc() calls that may sleep, but this * is difficult to avoid: as we need to read svp to calculate the * sizes to be allocated. 
*/ (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); esi->esi_hostname = kmem_zalloc(strlen(svp->sv_hostname) + 1, KM_SLEEP); (void) strcat(esi->esi_hostname, svp->sv_hostname); esi->esi_addr = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP); bufp = esi->esi_addr; bufp->len = svp->sv_addr.len; bufp->maxlen = svp->sv_addr.maxlen; bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP); bcopy(svp->sv_addr.buf, bufp->buf, bufp->len); esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP); sikncp = esi->esi_knconf; svkncp = svp->sv_knconf; sikncp->knc_semantics = svkncp->knc_semantics; sikncp->knc_protofmly = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP); (void) strcat((char *)sikncp->knc_protofmly, (char *)svkncp->knc_protofmly); sikncp->knc_proto = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP); (void) strcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto); sikncp->knc_rdev = svkncp->knc_rdev; /* * Used when AUTH_DH is negotiated. * * This is ephemeral mount-type specific, since it contains the * server's time-sync syncaddr. */ if (svp->sv_dhsec) { struct netbuf *bufp; sec_data_t *sdata; dh_k4_clntdata_t *data; sdata = svp->sv_dhsec; data = (dh_k4_clntdata_t *)sdata->data; ASSERT(sdata->rpcflavor == AUTH_DH); bufp = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP); bufp->len = data->syncaddr.len; bufp->maxlen = data->syncaddr.maxlen; bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP); bcopy(data->syncaddr.buf, bufp->buf, bufp->len); esi->esi_syncaddr = bufp; if (data->netname != NULL) { int nmlen = data->netnamelen; /* * We need to copy from a dh_k4_clntdata_t * netname/netnamelen pair to a NUL-terminated * netname string suitable for putting in nfs_args, * where the latter has no netnamelen field. */ esi->esi_netname = kmem_zalloc(nmlen + 1, KM_SLEEP); bcopy(data->netname, esi->esi_netname, nmlen); } } else { esi->esi_syncaddr = NULL; esi->esi_netname = NULL; } stubpath = fn_path(VTOSV(vp)->sv_name); /* step over initial '.', to avoid e.g. sv_path: "/tank./ws" */ ASSERT(*stubpath == '.'); stubpath += 1; /* for nfs_args->fh */ esi->esi_path_len = strlen(stubpath) + 1; if (strcmp(svp->sv_path, "/") != 0) esi->esi_path_len += strlen(svp->sv_path); esi->esi_path = kmem_zalloc(esi->esi_path_len, KM_SLEEP); if (strcmp(svp->sv_path, "/") != 0) (void) strcat(esi->esi_path, svp->sv_path); (void) strcat(esi->esi_path, stubpath); stubpath -= 1; /* stubpath allocated by fn_path() */ kmem_free(stubpath, strlen(stubpath) + 1); nfs_rw_exit(&svp->sv_lock); return (esi); } /* * Makes an upcall to NFSMAPID daemon to resolve hostname of NFS server to * get network information required to do the mount call. 
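 *
 * Sketch of the round trip performed below: the server name is
 * XDR-encoded into a door argument buffer, handed to the nfsmapid
 * daemon via door_ki_upcall(), and the daemon's reply is XDR-decoded
 * into the caller's nfs_fsl_info:
 *
 *	utf8string server --xdr--> refd_door_args_t --door upcall-->
 *	    nfsmapid --> refd_door_res_t --xdr--> struct nfs_fsl_info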
 */
int
nfs4_callmapid(utf8string *server, struct nfs_fsl_info *resp)
{
	door_arg_t	door_args;
	door_handle_t	dh;
	XDR	xdr;
	refd_door_args_t *xdr_argsp;
	refd_door_res_t *orig_resp;
	k_sigset_t	smask;
	int	xdr_len = 0;
	int	res_len = 16;	/* length of an IP address */
	int	orig_reslen = res_len;
	int	error = 0;
	struct nfsidmap_globals *nig;

	if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)
		return (ECONNREFUSED);

	nig = zone_getspecific(nfsidmap_zone_key, nfs_zone());
	ASSERT(nig != NULL);

	mutex_enter(&nig->nfsidmap_daemon_lock);
	dh = nig->nfsidmap_daemon_dh;
	if (dh == NULL) {
		mutex_exit(&nig->nfsidmap_daemon_lock);
		cmn_err(CE_NOTE, "nfs4_callmapid: nfsmapid daemon not "
		    "running; unable to resolve host name\n");
		return (EINVAL);
	}
	door_ki_hold(dh);
	mutex_exit(&nig->nfsidmap_daemon_lock);

	xdr_len = xdr_sizeof(&(xdr_utf8string), server);

	xdr_argsp = kmem_zalloc(xdr_len + sizeof (*xdr_argsp), KM_SLEEP);
	xdr_argsp->xdr_len = xdr_len;
	xdr_argsp->cmd = NFSMAPID_SRV_NETINFO;

	xdrmem_create(&xdr, (char *)&xdr_argsp->xdr_arg,
	    xdr_len, XDR_ENCODE);

	if (!xdr_utf8string(&xdr, server)) {
		kmem_free(xdr_argsp, xdr_len + sizeof (*xdr_argsp));
		door_ki_rele(dh);
		return (1);
	}

	if (orig_reslen)
		orig_resp = kmem_alloc(orig_reslen, KM_SLEEP);

	door_args.data_ptr = (char *)xdr_argsp;
	door_args.data_size = sizeof (*xdr_argsp) + xdr_argsp->xdr_len;
	door_args.desc_ptr = NULL;
	door_args.desc_num = 0;
	door_args.rbuf = orig_resp ? (char *)orig_resp : NULL;
	door_args.rsize = res_len;

	sigintr(&smask, 1);
	error = door_ki_upcall(dh, &door_args);
	sigunintr(&smask);

	door_ki_rele(dh);

	kmem_free(xdr_argsp, xdr_len + sizeof (*xdr_argsp));
	if (error) {
		kmem_free(orig_resp, orig_reslen);
		/*
		 * There is no door to connect to. The referral daemon
		 * must not be running yet.
		 */
		cmn_err(CE_WARN,
		    "nfsmapid not running; cannot resolve host name");
		goto out;
	}

	/*
	 * If the results buffer passed back is not the same as
	 * what was sent, free the old buffer and use the new one.
	 */
	if (orig_resp && orig_reslen) {
		refd_door_res_t *door_resp;

		door_resp = (refd_door_res_t *)door_args.rbuf;
		if ((void *)door_args.rbuf != orig_resp)
			kmem_free(orig_resp, orig_reslen);

		if (door_resp->res_status == 0) {
			xdrmem_create(&xdr, (char *)&door_resp->xdr_res,
			    door_resp->xdr_len, XDR_DECODE);
			bzero(resp, sizeof (struct nfs_fsl_info));
			if (!xdr_nfs_fsl_info(&xdr, resp)) {
				DTRACE_PROBE2(
				    nfs4clnt__debug__referral__upcall__xdrfail,
				    struct nfs_fsl_info *, resp,
				    char *, "nfs4_callmapid");
				error = EINVAL;
			}
		} else {
			DTRACE_PROBE2(
			    nfs4clnt__debug__referral__upcall__badstatus,
			    int, door_resp->res_status,
			    char *, "nfs4_callmapid");
			error = door_resp->res_status;
		}
		kmem_free(door_args.rbuf, door_args.rsize);
	}
out:
	DTRACE_PROBE2(nfs4clnt__func__referral__upcall,
	    char *, server, int, error);
	return (error);
}

/*
 * Fetches the fs_locations attribute. Typically called
 * from a Replication/Migration/Referrals/Mirror-mount context.
 *
 * Fills in the attributes in garp. The caller is assumed
 * to have allocated memory for garp.
 *
 * lock: if set, take s_recovlock and mi_recovlock before
 *	 doing the rfs4call(); otherwise the caller has already
 *	 acquired them.
 *
 * Returns
 *	1 for success
 *	0 for failure
 */
int
nfs4_fetch_locations(mntinfo4_t *mi, nfs4_sharedfh_t *sfh, char *nm,
    cred_t *cr, nfs4_ga_res_t *garp, COMPOUND4res_clnt *callres, bool_t lock)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	nfs_argop4 *argop;
	int argoplist_size = 3 * sizeof (nfs_argop4);
	nfs4_server_t *sp = NULL;
	int doqueue = 1;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	int retval = 1;
	struct nfs4_clnt *nfscl;

	if (lock == TRUE)
		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
	else
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));

	sp = find_nfs4_server(mi);
	if (lock == TRUE)
		nfs_rw_exit(&mi->mi_recovlock);

	if (sp != NULL)
		mutex_exit(&sp->s_lock);

	if (lock == TRUE) {
		if (sp != NULL)
			(void) nfs_rw_enter_sig(&sp->s_recovlock,
			    RW_WRITER, 0);
		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
	} else {
		if (sp != NULL) {
			ASSERT(nfs_rw_lock_held(&sp->s_recovlock, RW_READER) ||
			    nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
		}
	}

	/*
	 * Do we want to do the setup for recovery here?
	 *
	 * We know that the server responded to a null ping a very
	 * short time ago, and we know that we intend to do a
	 * single stateless operation - we want to fetch attributes,
	 * so we know we can't encounter errors about state. If
	 * something goes wrong with the GETATTR, like not being
	 * able to get a response from the server or getting any
	 * kind of FH error, we should fail the mount.
	 *
	 * We may want to revisit this at a later time.
	 */
	argop = kmem_alloc(argoplist_size, KM_SLEEP);

	args.ctag = TAG_GETATTR_FSLOCATION;
	/* PUTFH LOOKUP GETATTR */
	args.array_len = 3;
	args.array = argop;

	/* 0. putfh file */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = sfh;

	/* 1. lookup name, can't be dotdot */
	argop[1].argop = OP_CLOOKUP;
	argop[1].nfs_argop4_u.opclookup.cname = nm;

	/* 2. file attrs */
	argop[2].argop = OP_GETATTR;
	argop[2].nfs_argop4_u.opgetattr.attr_request =
	    FATTR4_FSID_MASK | FATTR4_FS_LOCATIONS_MASK |
	    FATTR4_MOUNTED_ON_FILEID_MASK;
	argop[2].nfs_argop4_u.opgetattr.mi = mi;

	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);

	if (lock == TRUE) {
		nfs_rw_exit(&mi->mi_recovlock);
		if (sp != NULL)
			nfs_rw_exit(&sp->s_recovlock);
	}

	nfscl = zone_getspecific(nfs4clnt_zone_key, nfs_zone());
	nfscl->nfscl_stat.referrals.value.ui64++;
	DTRACE_PROBE3(nfs4clnt__func__referral__fsloc,
	    nfs4_sharedfh_t *, sfh, char *, nm, nfs4_error_t *, &e);

	if (e.error != 0) {
		if (sp != NULL)
			nfs4_server_rele(sp);
		kmem_free(argop, argoplist_size);
		return (0);
	}

	/*
	 * Check for all possible error conditions.
	 * For valid replies without an ops array or for illegal
	 * replies, return a failure.
	 */
	if (res.status != NFS4_OK || res.array_len < 3 ||
	    res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) {
		retval = 0;
		goto exit;
	}

	/*
	 * There isn't much value in putting the attributes
	 * in the attr cache since fs_locations4 aren't
	 * encountered very frequently, so just make them
	 * available to the caller.
	 */
	*garp = res.array[2].nfs_resop4_u.opgetattr.ga_res;

	DTRACE_PROBE2(nfs4clnt__debug__referral__fsloc,
	    nfs4_ga_res_t *, garp, char *, "nfs4_fetch_locations");
	/* No fs_locations? -- return a failure */
	if (garp->n4g_ext_res == NULL ||
	    garp->n4g_ext_res->n4g_fslocations.locations_val == NULL) {
		retval = 0;
		goto exit;
	}

	if (!garp->n4g_fsid_valid)
		retval = 0;

exit:
	if (retval == 0) {
		/* the call was ok but failed validating the call results */
		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
	} else {
		ASSERT(callres != NULL);
		*callres = res;
	}

	if (sp != NULL)
		nfs4_server_rele(sp);
	kmem_free(argop, argoplist_size);
	return (retval);
}

/* tunable to disable referral mounts */
int nfs4_no_referrals = 0;

/*
 * Returns NULL if the vnode cannot be created or found.
 */
vnode_t *
find_referral_stubvp(vnode_t *dvp, char *nm, cred_t *cr)
{
	nfs_fh4 *stub_fh, *dfh;
	nfs4_sharedfh_t *sfhp;
	char *newfhval;
	vnode_t *vp = NULL;
	fattr4_mounted_on_fileid mnt_on_fileid;
	nfs4_ga_res_t garp;
	mntinfo4_t *mi;
	COMPOUND4res_clnt callres;
	hrtime_t t;

	if (nfs4_no_referrals)
		return (NULL);

	/*
	 * Get the mounted_on_fileid, unique on that server::fsid
	 */
	mi = VTOMI4(dvp);
	if (nfs4_fetch_locations(mi, VTOR4(dvp)->r_fh, nm, cr,
	    &garp, &callres, FALSE) == 0)
		return (NULL);
	mnt_on_fileid = garp.n4g_mon_fid;
	xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);

	/*
	 * Build a fake filehandle from the dir FH and the mounted_on_fileid
	 */
	dfh = &VTOR4(dvp)->r_fh->sfh_fh;
	stub_fh = kmem_alloc(sizeof (nfs_fh4), KM_SLEEP);
	stub_fh->nfs_fh4_val = kmem_alloc(dfh->nfs_fh4_len +
	    sizeof (fattr4_mounted_on_fileid), KM_SLEEP);
	newfhval = stub_fh->nfs_fh4_val;

	/* copy directory's file handle */
	bcopy(dfh->nfs_fh4_val, newfhval, dfh->nfs_fh4_len);
	stub_fh->nfs_fh4_len = dfh->nfs_fh4_len;
	newfhval = newfhval + dfh->nfs_fh4_len;

	/* Add mounted_on_fileid. Use bcopy to avoid alignment problem */
	bcopy((char *)&mnt_on_fileid, newfhval,
	    sizeof (fattr4_mounted_on_fileid));
	stub_fh->nfs_fh4_len += sizeof (fattr4_mounted_on_fileid);

	sfhp = sfh4_put(stub_fh, VTOMI4(dvp), NULL);
	kmem_free(stub_fh->nfs_fh4_val, dfh->nfs_fh4_len +
	    sizeof (fattr4_mounted_on_fileid));
	kmem_free(stub_fh, sizeof (nfs_fh4));
	if (sfhp == NULL)
		return (NULL);

	t = gethrtime();
	garp.n4g_va.va_type = VDIR;
	vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t, cr,
	    dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));

	if (vp != NULL)
		vp->v_type = VDIR;

	sfh4_rele(&sfhp);
	return (vp);
}

int
nfs4_setup_referral(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
{
	vnode_t *nvp;
	rnode4_t *rp;

	if ((nvp = find_referral_stubvp(dvp, nm, cr)) == NULL)
		return (EINVAL);

	rp = VTOR4(nvp);
	mutex_enter(&rp->r_statelock);
	r4_stub_referral(rp);
	mutex_exit(&rp->r_statelock);
	dnlc_enter(dvp, nm, nvp);

	if (*vpp != NULL)
		VN_RELE(*vpp);	/* no longer need this vnode */

	*vpp = nvp;

	return (0);
}

/*
 * Fetch the location information and resolve the new server.
 * Caller needs to free up the XDR data which is returned.
 * Input: mount info, shared filehandle, nodename
 * Return: Index to the result or Error (-1)
 * Output: FsLocations Info, Resolved Server Info.
 */
int
nfs4_process_referral(mntinfo4_t *mi, nfs4_sharedfh_t *sfh,
    char *nm, cred_t *cr, nfs4_ga_res_t *grp, COMPOUND4res_clnt *res,
    struct nfs_fsl_info *fsloc)
{
	fs_location4 *fsp;
	struct nfs_fsl_info nfsfsloc;
	int ret, i, error;
	nfs4_ga_res_t garp;
	COMPOUND4res_clnt callres;
	struct knetconfig *knc;

	ret = nfs4_fetch_locations(mi, sfh, nm, cr, &garp, &callres, TRUE);
	if (ret == 0)
		return (-1);

	/*
	 * As a lame attempt to figure out if we're
	 * handling a migration event or a referral,
	 * look for rnodes with this fsid in the rnode
	 * cache.
* * If we can find one or more such rnodes, it * means we're handling a migration event and * we want to bail out in that case. */ if (r4find_by_fsid(mi, &garp.n4g_fsid)) { DTRACE_PROBE3(nfs4clnt__debug__referral__migration, mntinfo4_t *, mi, nfs4_ga_res_t *, &garp, char *, "nfs4_process_referral"); xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres); return (-1); } /* * Find the first responsive server to mount. When we find * one, fsp will point to it. */ for (i = 0; i < garp.n4g_ext_res->n4g_fslocations.locations_len; i++) { fsp = &garp.n4g_ext_res->n4g_fslocations.locations_val[i]; if (fsp->server_len == 0 || fsp->server_val == NULL) continue; error = nfs4_callmapid(fsp->server_val, &nfsfsloc); if (error != 0) continue; error = nfs4_ping_server_common(nfsfsloc.knconf, nfsfsloc.addr, !(mi->mi_flags & MI4_INT)); if (error == RPC_SUCCESS) break; DTRACE_PROBE2(nfs4clnt__debug__referral__srvaddr, sockaddr_in *, (struct sockaddr_in *)nfsfsloc.addr->buf, char *, "nfs4_process_referral"); xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc); } knc = nfsfsloc.knconf; if ((i >= garp.n4g_ext_res->n4g_fslocations.locations_len) || (knc->knc_protofmly == NULL) || (knc->knc_proto == NULL)) { DTRACE_PROBE2(nfs4clnt__debug__referral__nofsloc, nfs4_ga_res_t *, &garp, char *, "nfs4_process_referral"); xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres); return (-1); } /* Send the results back */ *fsloc = nfsfsloc; *grp = garp; *res = callres; return (i); } /* * Referrals case - need to fetch referral data and then upcall to * user-level to get complete mount data. */ static ephemeral_servinfo_t * nfs4_trigger_esi_create_referral(vnode_t *vp, cred_t *cr) { struct knetconfig *sikncp, *svkncp; struct netbuf *bufp; ephemeral_servinfo_t *esi; vnode_t *dvp; rnode4_t *drp; fs_location4 *fsp; struct nfs_fsl_info nfsfsloc; nfs4_ga_res_t garp; char *p; char fn[MAXNAMELEN]; int i, index = -1; mntinfo4_t *mi; COMPOUND4res_clnt callres; /* * If we're passed in a stub vnode that * isn't a "referral" stub, bail out * and return a failure */ if (!RP_ISSTUB_REFERRAL(VTOR4(vp))) return (NULL); if (vtodv(vp, &dvp, CRED(), TRUE) != 0) return (NULL); drp = VTOR4(dvp); if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) { VN_RELE(dvp); return (NULL); } if (vtoname(vp, fn, MAXNAMELEN) != 0) { nfs_rw_exit(&drp->r_rwlock); VN_RELE(dvp); return (NULL); } mi = VTOMI4(dvp); index = nfs4_process_referral(mi, drp->r_fh, fn, cr, &garp, &callres, &nfsfsloc); nfs_rw_exit(&drp->r_rwlock); VN_RELE(dvp); if (index < 0) return (NULL); fsp = &garp.n4g_ext_res->n4g_fslocations.locations_val[index]; esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP); /* initially set to be our type of ephemeral mount; may be added to */ esi->esi_mount_flags = NFSMNT_REFERRAL; esi->esi_hostname = kmem_zalloc(fsp->server_val->utf8string_len + 1, KM_SLEEP); bcopy(fsp->server_val->utf8string_val, esi->esi_hostname, fsp->server_val->utf8string_len); esi->esi_hostname[fsp->server_val->utf8string_len] = '\0'; bufp = kmem_alloc(sizeof (struct netbuf), KM_SLEEP); bufp->len = nfsfsloc.addr->len; bufp->maxlen = nfsfsloc.addr->maxlen; bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP); bcopy(nfsfsloc.addr->buf, bufp->buf, bufp->len); esi->esi_addr = bufp; esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP); sikncp = esi->esi_knconf; DTRACE_PROBE2(nfs4clnt__debug__referral__nfsfsloc, struct nfs_fsl_info *, &nfsfsloc, char *, "nfs4_trigger_esi_create_referral"); svkncp = nfsfsloc.knconf; sikncp->knc_semantics = svkncp->knc_semantics; sikncp->knc_protofmly 
= (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP); (void) strlcat((char *)sikncp->knc_protofmly, (char *)svkncp->knc_protofmly, KNC_STRSIZE); sikncp->knc_proto = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP); (void) strlcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto, KNC_STRSIZE); sikncp->knc_rdev = svkncp->knc_rdev; DTRACE_PROBE2(nfs4clnt__debug__referral__knetconf, struct knetconfig *, sikncp, char *, "nfs4_trigger_esi_create_referral"); esi->esi_netname = kmem_zalloc(nfsfsloc.netnm_len, KM_SLEEP); bcopy(nfsfsloc.netname, esi->esi_netname, nfsfsloc.netnm_len); esi->esi_syncaddr = NULL; esi->esi_path = p = kmem_zalloc(MAXPATHLEN, KM_SLEEP); esi->esi_path_len = MAXPATHLEN; *p++ = '/'; for (i = 0; i < fsp->rootpath.pathname4_len; i++) { component4 *comp; comp = &fsp->rootpath.pathname4_val[i]; /* If no space, null the string and bail */ if ((p - esi->esi_path) + comp->utf8string_len + 1 > MAXPATHLEN) goto err; bcopy(comp->utf8string_val, p, comp->utf8string_len); p += comp->utf8string_len; *p++ = '/'; } if (fsp->rootpath.pathname4_len != 0) *(p - 1) = '\0'; else *p = '\0'; p = esi->esi_path; esi->esi_path = strdup(p); esi->esi_path_len = strlen(p) + 1; kmem_free(p, MAXPATHLEN); /* Allocated in nfs4_process_referral() */ xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc); xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres); return (esi); err: kmem_free(esi->esi_path, esi->esi_path_len); kmem_free(esi->esi_hostname, fsp->server_val->utf8string_len + 1); kmem_free(esi->esi_addr->buf, esi->esi_addr->len); kmem_free(esi->esi_addr, sizeof (struct netbuf)); kmem_free(esi->esi_knconf->knc_protofmly, KNC_STRSIZE); kmem_free(esi->esi_knconf->knc_proto, KNC_STRSIZE); kmem_free(esi->esi_knconf, sizeof (*esi->esi_knconf)); kmem_free(esi->esi_netname, nfsfsloc.netnm_len); kmem_free(esi, sizeof (ephemeral_servinfo_t)); xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc); xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres); return (NULL); } /* * Assemble the args, and call the generic VFS mount function to * finally perform the ephemeral mount. */ static int nfs4_trigger_domount(vnode_t *stubvp, domount_args_t *dma, vfs_t **vfsp, cred_t *cr, vnode_t **newvpp) { struct mounta *uap; char *mntpt, *orig_path, *path; const char *orig_mntpt; int retval; int mntpt_len; int spec_len; zone_t *zone = curproc->p_zone; bool_t has_leading_slash; int i; vfs_t *stubvfsp = stubvp->v_vfsp; ephemeral_servinfo_t *esi = dma->dma_esi; struct nfs_args *nargs = dma->dma_nargs; /* first, construct the mount point for the ephemeral mount */ orig_path = path = fn_path(VTOSV(stubvp)->sv_name); orig_mntpt = (char *)refstr_value(stubvfsp->vfs_mntpt); if (*orig_path == '.') orig_path++; /* * Get rid of zone's root path */ if (zone != global_zone) { /* * -1 for trailing '/' and -1 for EOS. 
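		 *
		 * For example (hypothetical paths), with zone_rootpath
		 * "/zones/z1/root/" and an enclosing mount point of
		 * "/zones/z1/root/a/b", orig_mntpt is advanced to
		 * point at "/a/b".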
*/ if (strncmp(zone->zone_rootpath, orig_mntpt, zone->zone_rootpathlen - 1) == 0) { orig_mntpt += (zone->zone_rootpathlen - 2); } } mntpt_len = strlen(orig_mntpt) + strlen(orig_path); mntpt = kmem_zalloc(mntpt_len + 1, KM_SLEEP); (void) strcat(mntpt, orig_mntpt); (void) strcat(mntpt, orig_path); kmem_free(path, strlen(path) + 1); path = esi->esi_path; if (*path == '.') path++; if (path[0] == '/' && path[1] == '/') path++; has_leading_slash = (*path == '/'); spec_len = strlen(dma->dma_hostlist); spec_len += strlen(path); /* We are going to have to add this in */ if (!has_leading_slash) spec_len++; /* We need to get the ':' for dma_hostlist:esi_path */ spec_len++; uap = kmem_zalloc(sizeof (struct mounta), KM_SLEEP); uap->spec = kmem_zalloc(spec_len + 1, KM_SLEEP); (void) snprintf(uap->spec, spec_len + 1, "%s:%s%s", dma->dma_hostlist, has_leading_slash ? "" : "/", path); uap->dir = mntpt; uap->flags = MS_SYSSPACE | MS_DATA; /* fstype-independent mount options not covered elsewhere */ /* copy parent's mount(8) "-m" flag */ if (stubvfsp->vfs_flag & VFS_NOMNTTAB) uap->flags |= MS_NOMNTTAB; uap->fstype = MNTTYPE_NFS4; uap->dataptr = (char *)nargs; /* not needed for MS_SYSSPACE */ uap->datalen = 0; /* use optptr to pass in extra mount options */ uap->flags |= MS_OPTIONSTR; uap->optptr = nfs4_trigger_create_mntopts(stubvfsp); if (uap->optptr == NULL) { retval = EINVAL; goto done; } /* domount() expects us to count the trailing NUL */ uap->optlen = strlen(uap->optptr) + 1; /* * If we get EBUSY, we try again once to see if we can perform * the mount. We do this because of a spurious race condition. */ for (i = 0; i < 2; i++) { int error; bool_t was_mounted; retval = domount(NULL, uap, stubvp, cr, vfsp); if (retval == 0) { retval = VFS_ROOT(*vfsp, newvpp); VFS_RELE(*vfsp); break; } else if (retval != EBUSY) { break; } /* * We might find it mounted by the other racer... */ error = nfs4_trigger_mounted_already(stubvp, newvpp, &was_mounted, vfsp); if (error) { goto done; } else if (was_mounted) { retval = 0; break; } } done: if (uap->optptr) nfs4_trigger_destroy_mntopts(uap->optptr); kmem_free(uap->spec, spec_len + 1); kmem_free(uap, sizeof (struct mounta)); kmem_free(mntpt, mntpt_len + 1); return (retval); } /* * Build an nfs_args structure for passing to domount(). * * Ephemeral mount-type specific data comes from the ephemeral_servinfo_t; * generic data - common to all ephemeral mount types - is read directly * from the parent mount's servinfo4_t and mntinfo4_t, via the stub vnode. 
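 *
 * A sketch of the key assignments made below:
 *
 *	nargs->addr     = esi->esi_addr;	(ephemeral-type specific)
 *	nargs->knconf   = esi->esi_knconf;
 *	nargs->hostname = esi->esi_hostname;
 *	nargs->fh       = esi->esi_path;
 *	nargs->wsize    = mi->mi_stsize;	(copied from parent mount)
 *	nargs->rsize    = mi->mi_tsize;
 */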
*/ static struct nfs_args * nfs4_trigger_nargs_create(mntinfo4_t *mi, servinfo4_t *svp, ephemeral_servinfo_t *esi) { sec_data_t *secdata; struct nfs_args *nargs; /* setup the nfs args */ nargs = kmem_zalloc(sizeof (struct nfs_args), KM_SLEEP); (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); nargs->addr = esi->esi_addr; /* for AUTH_DH by negotiation */ if (esi->esi_syncaddr || esi->esi_netname) { nargs->flags |= NFSMNT_SECURE; nargs->syncaddr = esi->esi_syncaddr; nargs->netname = esi->esi_netname; } nargs->flags |= NFSMNT_KNCONF; nargs->knconf = esi->esi_knconf; nargs->flags |= NFSMNT_HOSTNAME; nargs->hostname = esi->esi_hostname; nargs->fh = esi->esi_path; /* general mount settings, all copied from parent mount */ mutex_enter(&mi->mi_lock); if (!(mi->mi_flags & MI4_HARD)) nargs->flags |= NFSMNT_SOFT; nargs->flags |= NFSMNT_WSIZE | NFSMNT_RSIZE | NFSMNT_TIMEO | NFSMNT_RETRANS; nargs->wsize = mi->mi_stsize; nargs->rsize = mi->mi_tsize; nargs->timeo = mi->mi_timeo; nargs->retrans = mi->mi_retrans; if (mi->mi_flags & MI4_INT) nargs->flags |= NFSMNT_INT; if (mi->mi_flags & MI4_NOAC) nargs->flags |= NFSMNT_NOAC; nargs->flags |= NFSMNT_ACREGMIN | NFSMNT_ACREGMAX | NFSMNT_ACDIRMIN | NFSMNT_ACDIRMAX; nargs->acregmin = HR2SEC(mi->mi_acregmin); nargs->acregmax = HR2SEC(mi->mi_acregmax); nargs->acdirmin = HR2SEC(mi->mi_acdirmin); nargs->acdirmax = HR2SEC(mi->mi_acdirmax); /* add any specific flags for this type of ephemeral mount */ nargs->flags |= esi->esi_mount_flags; if (mi->mi_flags & MI4_NOCTO) nargs->flags |= NFSMNT_NOCTO; if (mi->mi_flags & MI4_GRPID) nargs->flags |= NFSMNT_GRPID; if (mi->mi_flags & MI4_LLOCK) nargs->flags |= NFSMNT_LLOCK; if (mi->mi_flags & MI4_NOPRINT) nargs->flags |= NFSMNT_NOPRINT; if (mi->mi_flags & MI4_DIRECTIO) nargs->flags |= NFSMNT_DIRECTIO; if (mi->mi_flags & MI4_PUBLIC && nargs->flags & NFSMNT_MIRRORMOUNT) nargs->flags |= NFSMNT_PUBLIC; /* Do some referral-specific option tweaking */ if (nargs->flags & NFSMNT_REFERRAL) { nargs->flags &= ~NFSMNT_DORDMA; nargs->flags |= NFSMNT_TRYRDMA; } mutex_exit(&mi->mi_lock); /* * Security data & negotiation policy. * * For mirror mounts, we need to preserve the parent mount's * preference for security negotiation, translating SV4_TRYSECDEFAULT * to NFSMNT_SECDEFAULT if present. * * For referrals, we always want security negotiation and will * set NFSMNT_SECDEFAULT and we will not copy current secdata. * The reason is that we can't negotiate down from a parent's * Kerberos flavor to AUTH_SYS. * * If SV4_TRYSECDEFAULT is not set, that indicates that a specific * security flavour was requested, with data in sv_secdata, and that * no negotiation should occur. If this specified flavour fails, that's * it. We will copy sv_secdata, and not set NFSMNT_SECDEFAULT. * * If SV4_TRYSECDEFAULT is set, then we start with a passed-in * default flavour, in sv_secdata, but then negotiate a new flavour. * Possible flavours are recorded in an array in sv_secinfo, with * currently in-use flavour pointed to by sv_currsec. * * If sv_currsec is set, i.e. if negotiation has already occurred, * we will copy sv_currsec. Otherwise, copy sv_secdata. Regardless, * we will set NFSMNT_SECDEFAULT, to enable negotiation. 
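	 *
	 * In summary, the cases handled below are:
	 *
	 *	referral:			NFSMNT_SECDEFAULT, fresh
	 *					AUTH_SYS secdata, no copy
	 *	mirror, SV4_TRYSECDEFAULT:	NFSMNT_SECDEFAULT, copy
	 *					sv_currsec (else sv_secdata)
	 *	mirror, specific flavour:	no negotiation, copy sv_secdata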
*/ if (nargs->flags & NFSMNT_REFERRAL) { /* enable negotiation for referral mount */ nargs->flags |= NFSMNT_SECDEFAULT; secdata = kmem_alloc(sizeof (sec_data_t), KM_SLEEP); secdata->secmod = secdata->rpcflavor = AUTH_SYS; secdata->data = NULL; } else if (svp->sv_flags & SV4_TRYSECDEFAULT) { /* enable negotiation for mirror mount */ nargs->flags |= NFSMNT_SECDEFAULT; /* * As a starting point for negotiation, copy parent * mount's negotiated flavour (sv_currsec) if available, * or its passed-in flavour (sv_secdata) if not. */ if (svp->sv_currsec != NULL) secdata = copy_sec_data(svp->sv_currsec); else if (svp->sv_secdata != NULL) secdata = copy_sec_data(svp->sv_secdata); else secdata = NULL; } else { /* do not enable negotiation; copy parent's passed-in flavour */ if (svp->sv_secdata != NULL) secdata = copy_sec_data(svp->sv_secdata); else secdata = NULL; } nfs_rw_exit(&svp->sv_lock); nargs->flags |= NFSMNT_NEWARGS; nargs->nfs_args_ext = NFS_ARGS_EXTB; nargs->nfs_ext_u.nfs_extB.secdata = secdata; /* for NFS RO failover; caller will set if necessary */ nargs->nfs_ext_u.nfs_extB.next = NULL; return (nargs); } static void nfs4_trigger_nargs_destroy(struct nfs_args *nargs) { /* * Either the mount failed, in which case the data is not needed, or * nfs4_mount() has either taken copies of what it needs or, * where it has merely copied the ptr, it has set *our* ptr to NULL, * whereby nfs4_free_args() will ignore it. */ nfs4_free_args(nargs); kmem_free(nargs, sizeof (struct nfs_args)); } /* * When we finally get into the mounting, we need to add this * node to the ephemeral tree. * * This is called from nfs4_mount(). */ int nfs4_record_ephemeral_mount(mntinfo4_t *mi, vnode_t *mvp) { mntinfo4_t *mi_parent; nfs4_ephemeral_t *eph; nfs4_ephemeral_tree_t *net; nfs4_ephemeral_t *prior; nfs4_ephemeral_t *child; nfs4_ephemeral_t *peer; nfs4_trigger_globals_t *ntg; zone_t *zone = curproc->p_zone; int rc = 0; mi_parent = VTOMI4(mvp); /* * Get this before grabbing anything else! */ ntg = zone_getspecific(nfs4_ephemeral_key, zone); if (!ntg->ntg_thread_started) { nfs4_ephemeral_start_harvester(ntg); } mutex_enter(&mi_parent->mi_lock); mutex_enter(&mi->mi_lock); net = mi->mi_ephemeral_tree = mi_parent->mi_ephemeral_tree; /* * If the mi_ephemeral_tree is NULL, then it * means that either the harvester or a manual * umount has cleared the tree out right before * we got here. * * There is nothing we can do here, so return * to the caller and let them decide whether they * try again. */ if (net == NULL) { mutex_exit(&mi->mi_lock); mutex_exit(&mi_parent->mi_lock); return (EBUSY); } /* * We've just tied the mntinfo to the tree, so * now we bump the refcnt and hold it there until * this mntinfo is removed from the tree. */ nfs4_ephemeral_tree_hold(net); /* * We need to tack together the ephemeral mount * with this new mntinfo. */ eph = kmem_zalloc(sizeof (*eph), KM_SLEEP); eph->ne_mount = mi; MI4_HOLD(mi); VFS_HOLD(mi->mi_vfsp); eph->ne_ref_time = gethrestime_sec(); /* * We need to tell the ephemeral mount when * to time out. */ eph->ne_mount_to = ntg->ntg_mount_to; mi->mi_ephemeral = eph; /* * If the enclosing mntinfo4 is also ephemeral, * then we need to point to its enclosing parent. * Else the enclosing mntinfo4 is the enclosing parent. * * We also need to weave this ephemeral node * into the tree. */ if (mi_parent->mi_flags & MI4_EPHEMERAL) { /* * We need to decide if we are * the root node of this branch * or if we are a sibling of this * branch. 
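		 *
		 * Sketch of the weave performed below when prior already
		 * has a child: the new node eph becomes the first child
		 * and the old child becomes its peer:
		 *
		 *	prior->ne_child = eph; eph->ne_peer = old child;
		 *	old child->ne_prior = eph; eph->ne_prior = prior;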
         */
        prior = mi_parent->mi_ephemeral;
        if (prior == NULL) {
            /*
             * Race condition, clean up, and
             * let caller handle mntinfo.
             */
            mi->mi_flags &= ~MI4_EPHEMERAL;
            mi->mi_ephemeral = NULL;
            kmem_free(eph, sizeof (*eph));
            VFS_RELE(mi->mi_vfsp);
            MI4_RELE(mi);
            nfs4_ephemeral_tree_rele(net);
            rc = EBUSY;
        } else {
            if (prior->ne_child == NULL) {
                prior->ne_child = eph;
            } else {
                child = prior->ne_child;

                prior->ne_child = eph;
                eph->ne_peer = child;
                child->ne_prior = eph;
            }
            eph->ne_prior = prior;
        }
    } else {
        /*
         * The parent mntinfo4 is the non-ephemeral
         * root of the ephemeral tree. We
         * need to decide if we are the root
         * node of that tree or if we are a
         * sibling of the root node.
         *
         * We are the root if there is no
         * other node.
         */
        if (net->net_root == NULL) {
            net->net_root = eph;
        } else {
            eph->ne_peer = peer = net->net_root;
            ASSERT(peer != NULL);
            net->net_root = eph;

            peer->ne_prior = eph;
        }

        eph->ne_prior = NULL;
    }

    mutex_exit(&mi->mi_lock);
    mutex_exit(&mi_parent->mi_lock);

    return (rc);
}

/*
 * Commit the changes to the ephemeral tree for removing this node.
 */
static void
nfs4_ephemeral_umount_cleanup(nfs4_ephemeral_t *eph)
{
    nfs4_ephemeral_t *e = eph;
    nfs4_ephemeral_t *peer;
    nfs4_ephemeral_t *prior;

    peer = eph->ne_peer;
    prior = e->ne_prior;

    /*
     * If this branch root was not the
     * tree root, then we need to fix back pointers.
     */
    if (prior) {
        if (prior->ne_child == e) {
            prior->ne_child = peer;
        } else {
            prior->ne_peer = peer;
        }

        if (peer)
            peer->ne_prior = prior;
    } else if (peer) {
        peer->ne_mount->mi_ephemeral_tree->net_root = peer;
        peer->ne_prior = NULL;
    } else {
        e->ne_mount->mi_ephemeral_tree->net_root = NULL;
    }
}

/*
 * We want to avoid recursion at all costs. So we need to
 * unroll the tree. We do this by a depth-first traversal to
 * leaf nodes. We blast away the leaf and work our way back
 * up and down the tree.
 */
static int
nfs4_ephemeral_unmount_engine(nfs4_ephemeral_t *eph, int isTreeRoot,
    int flag, cred_t *cr)
{
    nfs4_ephemeral_t *e = eph;
    nfs4_ephemeral_t *prior;
    mntinfo4_t *mi;
    vfs_t *vfsp;
    int error;

    /*
     * We use the loop while unrolling the ephemeral tree.
     */
    for (;;) {
        /*
         * First we walk down the child.
         */
        if (e->ne_child) {
            prior = e;
            e = e->ne_child;
            continue;
        }

        /*
         * If we are the root of the branch we are removing,
         * we end it here. But if the branch is the root of
         * the tree, we have to forge on. We do not consider
         * the peer list for the root because while it may
         * be okay to remove, it is both extra work and a
         * potential for a false-positive error to stall the
         * unmount attempt.
         */
        if (e == eph && isTreeRoot == FALSE)
            return (0);

        /*
         * Next we walk down the peer list.
         */
        if (e->ne_peer) {
            prior = e;
            e = e->ne_peer;
            continue;
        }

        /*
         * We can only remove the node passed in by the
         * caller if it is the root of the ephemeral tree.
         * Otherwise, the caller will remove it.
         */
        if (e == eph && isTreeRoot == FALSE)
            return (0);

        /*
         * Okay, we have a leaf node, time
         * to prune it!
         *
         * Note that prior can be NULL if
         * and only if e is the root of the
         * ephemeral tree.
         */
        prior = e->ne_prior;

        mi = e->ne_mount;
        mutex_enter(&mi->mi_lock);
        vfsp = mi->mi_vfsp;
        ASSERT(vfsp != NULL);

        /*
         * Cleared by umount2_engine.
         */
        VFS_HOLD(vfsp);

        /*
         * Inform nfs4_unmount to not recursively
         * descend into this node's children when it
         * gets processed.
         */
        mi->mi_flags |= MI4_EPHEMERAL_RECURSED;
        mutex_exit(&mi->mi_lock);

        error = umount2_engine(vfsp, flag, cr, FALSE);
        if (error) {
            /*
             * We need to reenable nfs4_unmount's ability
             * to recursively descend on this node.
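             * We also abandon the rest of the traversal here:
             * the error goes back to the caller and, presumably,
             * a later retry of the umount will process this
             * subtree from scratch.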
             */
            mutex_enter(&mi->mi_lock);
            mi->mi_flags &= ~MI4_EPHEMERAL_RECURSED;
            mutex_exit(&mi->mi_lock);

            return (error);
        }

        /*
         * If we are the current node, we do not want to
         * touch anything else. At this point, the only
         * way the current node can have survived to here
         * is if it is the root of the ephemeral tree and
         * we are unmounting the enclosing mntinfo4.
         */
        if (e == eph) {
            ASSERT(prior == NULL);
            return (0);
        }

        /*
         * Stitch up the prior node. Note that since
         * we have handled the root of the tree, prior
         * must be non-NULL.
         */
        ASSERT(prior != NULL);
        if (prior->ne_child == e) {
            prior->ne_child = NULL;
        } else {
            ASSERT(prior->ne_peer == e);

            prior->ne_peer = NULL;
        }

        e = prior;
    }

    /* NOTREACHED */
}

/*
 * Common code to safely release net_cnt_lock and net_tree_lock
 */
void
nfs4_ephemeral_umount_unlock(bool_t *pmust_unlock,
    nfs4_ephemeral_tree_t **pnet)
{
    nfs4_ephemeral_tree_t *net = *pnet;

    if (*pmust_unlock) {
        mutex_enter(&net->net_cnt_lock);
        net->net_status &= ~NFS4_EPHEMERAL_TREE_UMOUNTING;
        mutex_exit(&net->net_cnt_lock);

        mutex_exit(&net->net_tree_lock);

        *pmust_unlock = FALSE;
    }
}

/*
 * While we may have removed any child or sibling nodes of this
 * ephemeral node, we cannot nuke it until we know that there
 * were no active vnodes on it. This will do that final
 * work once we know it is not busy.
 */
void
nfs4_ephemeral_umount_activate(mntinfo4_t *mi, bool_t *pmust_unlock,
    nfs4_ephemeral_tree_t **pnet)
{
    /*
     * Now we need to get rid of the ephemeral data if it exists.
     */
    mutex_enter(&mi->mi_lock);
    if (mi->mi_ephemeral) {
        /*
         * If we are the root node of an ephemeral branch
         * which is being removed, then we need to fixup
         * pointers into and out of the node.
         */
        if (!(mi->mi_flags & MI4_EPHEMERAL_RECURSED))
            nfs4_ephemeral_umount_cleanup(mi->mi_ephemeral);

        nfs4_ephemeral_tree_rele(*pnet);
        ASSERT(mi->mi_ephemeral != NULL);

        kmem_free(mi->mi_ephemeral, sizeof (*mi->mi_ephemeral));
        mi->mi_ephemeral = NULL;
        VFS_RELE(mi->mi_vfsp);
        MI4_RELE(mi);
    }
    mutex_exit(&mi->mi_lock);

    nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);
}

/*
 * Unmount an ephemeral node.
 *
 * Note that if this code fails, then it must unlock.
 *
 * If it succeeds, then the caller must be prepared to do so.
 */
int
nfs4_ephemeral_umount(mntinfo4_t *mi, int flag, cred_t *cr,
    bool_t *pmust_unlock, nfs4_ephemeral_tree_t **pnet)
{
    int error = 0;
    nfs4_ephemeral_t *eph;
    nfs4_ephemeral_tree_t *net;
    int is_derooting = FALSE;
    int is_recursed = FALSE;
    int was_locked = FALSE;

    /*
     * Make sure to set the default state for cleaning
     * up the tree in the caller (and on the way out).
     */
    *pmust_unlock = FALSE;

    /*
     * The active vnodes on this file system may be ephemeral
     * children. We need to check for and try to unmount them
     * here. If any can not be unmounted, we are going
     * to return EBUSY.
     */
    mutex_enter(&mi->mi_lock);

    /*
     * If an ephemeral tree, we need to check to see if
     * the lock is already held. If it is, then we need
     * to see if we are being called as a result of
     * the recursive removal of some node of the tree or
     * if we are another attempt to remove the tree.
     *
     * mi_flags & MI4_EPHEMERAL indicates an ephemeral
     * node. mi_ephemeral being non-NULL also does this.
     *
     * mi_ephemeral_tree being non-NULL is sufficient
     * to also indicate either it is an ephemeral node
     * or the enclosing mntinfo4.
     *
     * Do we need MI4_EPHEMERAL? Yes, it is useful for
     * when we delete the ephemeral node and need to
     * differentiate between an ephemeral node and the
     * enclosing root node.
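     *
     * A rough summary of the indicators (restating the above):
     *
     *    mi_ephemeral_tree == NULL        not in any ephemeral tree
     *    tree set, mi_ephemeral == NULL   the enclosing, non-ephemeral
     *                                     root (the derooting case)
     *    tree set, mi_ephemeral != NULL   an ephemeral node proper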
     */
    *pnet = net = mi->mi_ephemeral_tree;
    if (net == NULL) {
        mutex_exit(&mi->mi_lock);
        return (0);
    }

    eph = mi->mi_ephemeral;
    is_recursed = mi->mi_flags & MI4_EPHEMERAL_RECURSED;
    is_derooting = (eph == NULL);

    mutex_enter(&net->net_cnt_lock);

    /*
     * If this is not recursion, then we need to
     * check to see if a harvester thread has
     * already grabbed the lock.
     *
     * After we exit this branch, we may not
     * blindly return; we need to jump to
     * is_busy!
     */
    if (!is_recursed) {
        if (net->net_status & NFS4_EPHEMERAL_TREE_LOCKED) {
            /*
             * If the tree is locked, we need
             * to decide whether we are the
             * harvester or some explicit call
             * for a umount. The only way that
             * we are the harvester is if
             * MS_SYSSPACE is set.
             *
             * We only let the harvester through
             * at this point.
             *
             * We return EBUSY so that the
             * caller knows something is
             * going on. Note that by that
             * time, the umount in the other
             * thread may have already occurred.
             */
            if (!(flag & MS_SYSSPACE)) {
                mutex_exit(&net->net_cnt_lock);
                mutex_exit(&mi->mi_lock);

                return (EBUSY);
            }

            was_locked = TRUE;
        }
    }

    mutex_exit(&net->net_cnt_lock);
    mutex_exit(&mi->mi_lock);

    /*
     * If we are not the harvester, we need to check
     * to see if we need to grab the tree lock.
     */
    if (was_locked == FALSE) {
        /*
         * If we grab the lock, it means that no other
         * operation is working on the tree. If we don't
         * grab it, we need to decide if this is because
         * we are a recursive call or a new operation.
         */
        if (mutex_tryenter(&net->net_tree_lock)) {
            *pmust_unlock = TRUE;
        } else {
            /*
             * If we are a recursive call, we can
             * proceed without the lock.
             * Otherwise we have to wait until
             * the lock becomes free.
             */
            if (!is_recursed) {
                mutex_enter(&net->net_cnt_lock);
                if (net->net_status &
                    (NFS4_EPHEMERAL_TREE_DEROOTING |
                    NFS4_EPHEMERAL_TREE_INVALID)) {
                    mutex_exit(&net->net_cnt_lock);
                    goto is_busy;
                }
                mutex_exit(&net->net_cnt_lock);

                /*
                 * We can't hold any other locks whilst
                 * we wait on this to free up.
                 */
                mutex_enter(&net->net_tree_lock);

                /*
                 * Note that while mi->mi_ephemeral
                 * may change and thus we have to
                 * update eph, it is the case that
                 * we have tied down net and
                 * do not care if mi->mi_ephemeral_tree
                 * has changed.
                 */
                mutex_enter(&mi->mi_lock);
                eph = mi->mi_ephemeral;
                mutex_exit(&mi->mi_lock);

                /*
                 * Okay, we need to see if either the
                 * tree got nuked or the current node
                 * got nuked. Either will cause an
                 * error.
                 *
                 * Note that a subsequent retry of the
                 * umount shall work.
                 */
                mutex_enter(&net->net_cnt_lock);
                if (net->net_status &
                    NFS4_EPHEMERAL_TREE_INVALID ||
                    (!is_derooting && eph == NULL)) {
                    mutex_exit(&net->net_cnt_lock);
                    mutex_exit(&net->net_tree_lock);
                    goto is_busy;
                }
                mutex_exit(&net->net_cnt_lock);
                *pmust_unlock = TRUE;
            }
        }
    }

    /*
     * Only once we have grabbed the lock can we mark what we
     * are planning on doing to the ephemeral tree.
     */
    if (*pmust_unlock) {
        mutex_enter(&net->net_cnt_lock);
        net->net_status |= NFS4_EPHEMERAL_TREE_UMOUNTING;

        /*
         * Check to see if we are nuking the root.
         */
        if (is_derooting)
            net->net_status |= NFS4_EPHEMERAL_TREE_DEROOTING;
        mutex_exit(&net->net_cnt_lock);
    }

    if (!is_derooting) {
        /*
         * Only work on children if the caller has not already
         * done so.
         */
        if (!is_recursed) {
            ASSERT(eph != NULL);

            error = nfs4_ephemeral_unmount_engine(eph,
                FALSE, flag, cr);
            if (error)
                goto is_busy;
        }
    } else {
        eph = net->net_root;

        /*
         * Only work if there is something there.
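         * (net_root may be NULL if, e.g., every ephemeral child
         * has already been unmounted and pruned from the tree.)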
         */
        if (eph) {
            error = nfs4_ephemeral_unmount_engine(eph, TRUE,
                flag, cr);
            if (error) {
                mutex_enter(&net->net_cnt_lock);
                net->net_status &=
                    ~NFS4_EPHEMERAL_TREE_DEROOTING;
                mutex_exit(&net->net_cnt_lock);

                goto is_busy;
            }

            /*
             * Nothing else which goes wrong will
             * invalidate the blowing away of the
             * ephemeral tree.
             */
            net->net_root = NULL;
        }

        /*
         * We have derooted and we have caused the tree to be
         * invalidated.
         */
        mutex_enter(&net->net_cnt_lock);
        net->net_status &= ~NFS4_EPHEMERAL_TREE_DEROOTING;
        net->net_status |= NFS4_EPHEMERAL_TREE_INVALID;
        DTRACE_NFSV4_1(nfs4clnt__dbg__ephemeral__tree__derooting,
            uint_t, net->net_refcnt);

        /*
         * We will not finalize this node, so it is safe to
         * release it.
         */
        nfs4_ephemeral_tree_decr(net);
        mutex_exit(&net->net_cnt_lock);

        if (was_locked == FALSE)
            mutex_exit(&net->net_tree_lock);

        /*
         * We have just blown away any notation of this
         * tree being locked or having a refcnt.
         * We can't let the caller try to clean things up.
         */
        *pmust_unlock = FALSE;

        /*
         * At this point, the tree should no longer be
         * associated with the mntinfo4. We need to pull
         * it off there and let the harvester take
         * care of it once the refcnt drops.
         */
        mutex_enter(&mi->mi_lock);
        mi->mi_ephemeral_tree = NULL;
        mutex_exit(&mi->mi_lock);
    }

    return (0);

is_busy:

    nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);

    return (error);
}

/*
 * Do the umount and record any error in the parent.
 */
static void
nfs4_ephemeral_record_umount(vfs_t *vfsp, int flag,
    nfs4_ephemeral_t *e, nfs4_ephemeral_t *prior)
{
    int error;

    /*
     * Only act if the fs is still mounted.
     */
    if (vfsp == NULL)
        return;

    error = umount2_engine(vfsp, flag, kcred, FALSE);
    if (error) {
        if (prior) {
            if (prior->ne_child == e)
                prior->ne_state |=
                    NFS4_EPHEMERAL_CHILD_ERROR;
            else
                prior->ne_state |=
                    NFS4_EPHEMERAL_PEER_ERROR;
        }
    }
}

/*
 * For each tree in the forest (where the forest is in
 * effect all of the ephemeral trees for this zone),
 * scan to see if a node can be unmounted. Note that
 * unlike nfs4_ephemeral_unmount_engine(), we do
 * not process the current node before children or
 * siblings. I.e., if a node can be unmounted, we
 * do not recursively check to see if the nodes
 * hanging off of it can also be unmounted.
 *
 * Instead, we delve down deep to try and remove the
 * children first. Then, because we share code with
 * nfs4_ephemeral_unmount_engine(), we will try
 * them again. This could be a performance issue in
 * the future.
 *
 * Also note that unlike nfs4_ephemeral_unmount_engine(),
 * we do not halt on an error. We will not remove the
 * current node, but we will keep on trying to remove
 * the others.
 *
 * force indicates that we want the unmount to occur
 * even if there is something blocking it.
 *
 * time_check indicates that we want to see if the
 * mount has expired past mount_to or not. Typically
 * we want to do this and only on a shutdown of the
 * zone would we want to ignore the check.
 */
static void
nfs4_ephemeral_harvest_forest(nfs4_trigger_globals_t *ntg,
    bool_t force, bool_t time_check)
{
    nfs4_ephemeral_tree_t *net;
    nfs4_ephemeral_tree_t *prev = NULL;
    nfs4_ephemeral_tree_t *next;
    nfs4_ephemeral_t *e;
    nfs4_ephemeral_t *prior;
    time_t now = gethrestime_sec();

    nfs4_ephemeral_tree_t *harvest = NULL;

    int flag;

    mntinfo4_t *mi;
    vfs_t *vfsp;

    if (force)
        flag = MS_FORCE | MS_SYSSPACE;
    else
        flag = MS_SYSSPACE;

    mutex_enter(&ntg->ntg_forest_lock);
    for (net = ntg->ntg_forest; net != NULL; net = next) {
        next = net->net_next;

        nfs4_ephemeral_tree_hold(net);

        mutex_enter(&net->net_tree_lock);

        /*
         * Let the unmount code know that the
         * tree is already locked!
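         *
         * (nfs4_ephemeral_umount() recognizes a caller that finds
         * NFS4_EPHEMERAL_TREE_LOCKED set and has MS_SYSSPACE in
         * its flags as the harvester, and lets it through without
         * retaking net_tree_lock.)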
         */
        mutex_enter(&net->net_cnt_lock);
        net->net_status |= NFS4_EPHEMERAL_TREE_LOCKED;
        mutex_exit(&net->net_cnt_lock);

        /*
         * If the intent is to force all ephemeral nodes to
         * be unmounted in this zone, we can short circuit a
         * lot of tree traversal and simply zap the root node.
         */
        if (force) {
            if (net->net_root) {
                mi = net->net_root->ne_mount;

                vfsp = mi->mi_vfsp;
                ASSERT(vfsp != NULL);

                /*
                 * Cleared by umount2_engine.
                 */
                VFS_HOLD(vfsp);

                (void) umount2_engine(vfsp, flag,
                    kcred, FALSE);

                goto check_done;
            }
        }

        e = net->net_root;
        if (e)
            e->ne_state = NFS4_EPHEMERAL_VISIT_CHILD;

        while (e) {
            if (e->ne_state == NFS4_EPHEMERAL_VISIT_CHILD) {
                e->ne_state = NFS4_EPHEMERAL_VISIT_SIBLING;
                if (e->ne_child) {
                    e = e->ne_child;
                    e->ne_state = NFS4_EPHEMERAL_VISIT_CHILD;
                }

                continue;
            } else if (e->ne_state ==
                NFS4_EPHEMERAL_VISIT_SIBLING) {
                e->ne_state = NFS4_EPHEMERAL_PROCESS_ME;
                if (e->ne_peer) {
                    e = e->ne_peer;
                    e->ne_state = NFS4_EPHEMERAL_VISIT_CHILD;
                }

                continue;
            } else if (e->ne_state ==
                NFS4_EPHEMERAL_CHILD_ERROR) {
                prior = e->ne_prior;

                /*
                 * If a child reported an error, do
                 * not bother trying to unmount.
                 *
                 * If your prior node is a parent,
                 * pass the error up such that they
                 * also do not try to unmount.
                 *
                 * However, if your prior is a sibling,
                 * let them try to unmount if they can.
                 */
                if (prior) {
                    if (prior->ne_child == e)
                        prior->ne_state |=
                            NFS4_EPHEMERAL_CHILD_ERROR;
                    else
                        prior->ne_state |=
                            NFS4_EPHEMERAL_PEER_ERROR;
                }

                /*
                 * Clear the error and, if needed,
                 * process peers.
                 *
                 * Once we mask out the error, we know
                 * whether or not we have to process
                 * another node.
                 */
                e->ne_state &= ~NFS4_EPHEMERAL_CHILD_ERROR;
                if (e->ne_state == NFS4_EPHEMERAL_PROCESS_ME)
                    e = prior;

                continue;
            } else if (e->ne_state ==
                NFS4_EPHEMERAL_PEER_ERROR) {
                prior = e->ne_prior;

                if (prior) {
                    if (prior->ne_child == e)
                        prior->ne_state =
                            NFS4_EPHEMERAL_CHILD_ERROR;
                    else
                        prior->ne_state =
                            NFS4_EPHEMERAL_PEER_ERROR;
                }

                /*
                 * Clear the error from this node and do the
                 * correct processing.
                 */
                e->ne_state &= ~NFS4_EPHEMERAL_PEER_ERROR;
                continue;
            }

            prior = e->ne_prior;
            e->ne_state = NFS4_EPHEMERAL_OK;

            /*
             * It must be the case that we need to process
             * this node.
             */
            if (!time_check ||
                now - e->ne_ref_time > e->ne_mount_to) {
                mi = e->ne_mount;
                vfsp = mi->mi_vfsp;

                /*
                 * Cleared by umount2_engine.
                 */
                if (vfsp != NULL)
                    VFS_HOLD(vfsp);

                /*
                 * Note that we effectively work down to the
                 * leaf nodes first, try to unmount them,
                 * then work our way back up into the leaf
                 * nodes.
                 *
                 * Also note that we deal with a lot of
                 * complexity by sharing the work with
                 * the manual unmount code.
                 */
                nfs4_ephemeral_record_umount(vfsp, flag,
                    e, prior);
            }

            e = prior;
        }

check_done:

        /*
         * At this point we are done processing this tree.
         *
         * If the tree is invalid and we were the only reference
         * to it, then we push it on the local linked list
         * to remove it at the end. We avoid that action now
         * to keep the tree processing going along at a fair clip.
         *
         * Else, even if we were the only reference, we
         * allow it to be reused as needed.
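         *
         * (The harvest list is walked and freed only after we
         * drop ntg_forest_lock, below, so the mutex_destroy()
         * and kmem_free() of each tree happen with no locks
         * held.)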
         */
        mutex_enter(&net->net_cnt_lock);
        nfs4_ephemeral_tree_decr(net);
        if (net->net_refcnt == 0 &&
            net->net_status & NFS4_EPHEMERAL_TREE_INVALID) {
            net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
            mutex_exit(&net->net_cnt_lock);
            mutex_exit(&net->net_tree_lock);

            if (prev)
                prev->net_next = net->net_next;
            else
                ntg->ntg_forest = net->net_next;

            net->net_next = harvest;
            harvest = net;

            VFS_RELE(net->net_mount->mi_vfsp);
            MI4_RELE(net->net_mount);

            continue;
        }

        net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
        mutex_exit(&net->net_cnt_lock);
        mutex_exit(&net->net_tree_lock);

        prev = net;
    }
    mutex_exit(&ntg->ntg_forest_lock);

    for (net = harvest; net != NULL; net = next) {
        next = net->net_next;

        mutex_destroy(&net->net_tree_lock);
        mutex_destroy(&net->net_cnt_lock);
        kmem_free(net, sizeof (*net));
    }
}

/*
 * This is the thread which decides when the harvesting
 * can proceed and when to kill it off for this zone.
 */
static void
nfs4_ephemeral_harvester(nfs4_trigger_globals_t *ntg)
{
    clock_t timeleft;
    zone_t *zone = curproc->p_zone;

    for (;;) {
        timeleft = zone_status_timedwait(zone, ddi_get_lbolt() +
            nfs4_trigger_thread_timer * hz, ZONE_IS_SHUTTING_DOWN);

        /*
         * zone is exiting...
         */
        if (timeleft != -1) {
            ASSERT(zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN);
            zthread_exit();
            /* NOTREACHED */
        }

        /*
         * Only bother scanning if there is potential
         * work to be done.
         */
        if (ntg->ntg_forest == NULL)
            continue;

        /*
         * Now scan the list and get rid of everything which
         * is old.
         */
        nfs4_ephemeral_harvest_forest(ntg, FALSE, TRUE);
    }

    /* NOTREACHED */
}

/*
 * The zone specific glue needed to start the unmount harvester.
 *
 * Note that we want to avoid holding the mutex for any longer
 * than necessary, hence the multiple checks.
 *
 * The caller should avoid us getting down here in the first
 * place.
 */
static void
nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *ntg)
{
    /*
     * It got started before we got here...
     */
    if (ntg->ntg_thread_started)
        return;

    mutex_enter(&nfs4_ephemeral_thread_lock);

    if (ntg->ntg_thread_started) {
        mutex_exit(&nfs4_ephemeral_thread_lock);
        return;
    }

    /*
     * Start the unmounter harvester thread for this zone.
     */
    (void) zthread_create(NULL, 0, nfs4_ephemeral_harvester,
        ntg, 0, minclsyspri);

    ntg->ntg_thread_started = TRUE;
    mutex_exit(&nfs4_ephemeral_thread_lock);
}

/*ARGSUSED*/
static void *
nfs4_ephemeral_zsd_create(zoneid_t zoneid)
{
    nfs4_trigger_globals_t *ntg;

    ntg = kmem_zalloc(sizeof (*ntg), KM_SLEEP);
    ntg->ntg_thread_started = FALSE;

    /*
     * This is the default....
     */
    ntg->ntg_mount_to = nfs4_trigger_mount_to;

    mutex_init(&ntg->ntg_forest_lock, NULL,
        MUTEX_DEFAULT, NULL);

    return (ntg);
}

/*
 * Try a nice gentle walk down the forest and convince
 * all of the trees to gracefully give it up.
 */
/*ARGSUSED*/
static void
nfs4_ephemeral_zsd_shutdown(zoneid_t zoneid, void *arg)
{
    nfs4_trigger_globals_t *ntg = arg;

    if (!ntg)
        return;

    nfs4_ephemeral_harvest_forest(ntg, FALSE, FALSE);
}

/*
 * Race along the forest and rip all of the trees out by
 * their rootballs!
 */
/*ARGSUSED*/
static void
nfs4_ephemeral_zsd_destroy(zoneid_t zoneid, void *arg)
{
    nfs4_trigger_globals_t *ntg = arg;

    if (!ntg)
        return;

    nfs4_ephemeral_harvest_forest(ntg, TRUE, FALSE);

    mutex_destroy(&ntg->ntg_forest_lock);
    kmem_free(ntg, sizeof (*ntg));
}

/*
 * This is the zone independent cleanup needed for
 * ephemeral mount processing.
 */
void
nfs4_ephemeral_fini(void)
{
    (void) zone_key_delete(nfs4_ephemeral_key);
    mutex_destroy(&nfs4_ephemeral_thread_lock);
}

/*
 * This is the zone independent initialization needed for
 * ephemeral mount processing.
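 *
 * It registers the zone hooks above; presumably it is called once
 * at client module initialization, paired with nfs4_ephemeral_fini().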
 */
void
nfs4_ephemeral_init(void)
{
    mutex_init(&nfs4_ephemeral_thread_lock, NULL, MUTEX_DEFAULT,
        NULL);

    zone_key_create(&nfs4_ephemeral_key, nfs4_ephemeral_zsd_create,
        nfs4_ephemeral_zsd_shutdown, nfs4_ephemeral_zsd_destroy);
}

/*
 * nfssys() calls this function to set the per-zone
 * value of mount_to to drive when an ephemeral mount is
 * timed out. Each mount will grab a copy of this value
 * when mounted.
 */
void
nfs4_ephemeral_set_mount_to(uint_t mount_to)
{
    nfs4_trigger_globals_t *ntg;
    zone_t *zone = curproc->p_zone;

    ntg = zone_getspecific(nfs4_ephemeral_key, zone);

    ntg->ntg_mount_to = mount_to;
}

/*
 * Walk the list of v4 mount options; if they are currently set in vfsp,
 * append them to a new comma-separated mount option string, and return it.
 *
 * Caller should free by calling nfs4_trigger_destroy_mntopts().
 */
static char *
nfs4_trigger_create_mntopts(vfs_t *vfsp)
{
    uint_t i;
    char *mntopts;
    struct vfssw *vswp;
    mntopts_t *optproto;

    mntopts = kmem_zalloc(MAX_MNTOPT_STR, KM_SLEEP);

    /* get the list of applicable mount options for v4; locks *vswp */
    vswp = vfs_getvfssw(MNTTYPE_NFS4);
    optproto = &vswp->vsw_optproto;

    for (i = 0; i < optproto->mo_count; i++) {
        struct mntopt *mop = &optproto->mo_list[i];

        if (mop->mo_flags & MO_EMPTY)
            continue;

        if (nfs4_trigger_add_mntopt(mntopts, mop->mo_name, vfsp)) {
            kmem_free(mntopts, MAX_MNTOPT_STR);
            vfs_unrefvfssw(vswp);
            return (NULL);
        }
    }

    vfs_unrefvfssw(vswp);

    /*
     * MNTOPT_XATTR is not in the v4 mount opt proto list,
     * and it may only be passed via MS_OPTIONSTR, so we
     * must handle it here.
     *
     * Ideally, it would be in the list, but NFS does not specify its
     * own opt proto list, it uses instead the default one. Since
     * not all filesystems support extended attrs, it would not be
     * appropriate to add it there.
     */
    if (nfs4_trigger_add_mntopt(mntopts, MNTOPT_XATTR, vfsp) ||
        nfs4_trigger_add_mntopt(mntopts, MNTOPT_NOXATTR, vfsp)) {
        kmem_free(mntopts, MAX_MNTOPT_STR);
        return (NULL);
    }

    return (mntopts);
}

static void
nfs4_trigger_destroy_mntopts(char *mntopts)
{
    if (mntopts)
        kmem_free(mntopts, MAX_MNTOPT_STR);
}

/*
 * Check a single mount option (optname). Add to mntopts if it is set in VFS.
 */
static int
nfs4_trigger_add_mntopt(char *mntopts, char *optname, vfs_t *vfsp)
{
    if (mntopts == NULL || optname == NULL || vfsp == NULL)
        return (EINVAL);

    if (vfs_optionisset(vfsp, optname, NULL)) {
        size_t mntoptslen = strlen(mntopts);
        size_t optnamelen = strlen(optname);

        /* +1 for ',', +1 for NUL */
        if (mntoptslen + optnamelen + 2 > MAX_MNTOPT_STR)
            return (EOVERFLOW);

        /* first or subsequent mount option? */
        if (*mntopts != '\0')
            (void) strcat(mntopts, ",");

        (void) strcat(mntopts, optname);
    }

    return (0);
}

static enum clnt_stat
nfs4_ping_server_common(struct knetconfig *knc, struct netbuf *addr,
    int nointr)
{
    int retries;
    uint_t max_msgsize;
    enum clnt_stat status;
    CLIENT *cl;
    struct timeval timeout;

    /* as per recov_newserver() */
    max_msgsize = 0;
    retries = 1;
    timeout.tv_sec = 2;
    timeout.tv_usec = 0;

    if (clnt_tli_kcreate(knc, addr, NFS_PROGRAM, NFS_V4,
        max_msgsize, retries, CRED(), &cl) != 0)
        return (RPC_FAILED);

    if (nointr)
        cl->cl_nosignal = TRUE;

    status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, xdr_void, NULL,
        timeout);

    if (nointr)
        cl->cl_nosignal = FALSE;

    AUTH_DESTROY(cl->cl_auth);
    CLNT_DESTROY(cl);

    return (status);
}

static enum clnt_stat
nfs4_trigger_ping_server(servinfo4_t *svp, int nointr)
{
    return (nfs4_ping_server_common(svp->sv_knconf, &svp->sv_addr,
        nointr));
}
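/*
 * Illustrative (assumed) use of the ping helpers above: before
 * committing to an ephemeral domount, a caller could probe the
 * candidate server with a NULL RPC, e.g.
 *
 *    if (nfs4_trigger_ping_server(svp, nointr) != RPC_SUCCESS)
 *        return (EACCES);
 *
 * (The EACCES choice is hypothetical, not mandated by this file.)
 * The probe is cheap: a single CLNT_CALL of RFS_NULL against
 * NFS_PROGRAM/NFS_V4 with a two-second timeout and one retry, as
 * set up in nfs4_ping_server_common().
 */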