1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * Support for ephemeral mounts, e.g. mirror-mounts. These mounts are 31 * triggered from a "stub" rnode via a special set of vnodeops. 32 */ 33 34 #include <sys/param.h> 35 #include <sys/types.h> 36 #include <sys/systm.h> 37 #include <sys/cred.h> 38 #include <sys/time.h> 39 #include <sys/vnode.h> 40 #include <sys/vfs.h> 41 #include <sys/vfs_opreg.h> 42 #include <sys/file.h> 43 #include <sys/filio.h> 44 #include <sys/uio.h> 45 #include <sys/buf.h> 46 #include <sys/mman.h> 47 #include <sys/pathname.h> 48 #include <sys/dirent.h> 49 #include <sys/debug.h> 50 #include <sys/vmsystm.h> 51 #include <sys/fcntl.h> 52 #include <sys/flock.h> 53 #include <sys/swap.h> 54 #include <sys/errno.h> 55 #include <sys/strsubr.h> 56 #include <sys/sysmacros.h> 57 #include <sys/kmem.h> 58 #include <sys/mount.h> 59 #include <sys/cmn_err.h> 60 #include <sys/pathconf.h> 61 #include <sys/utsname.h> 62 #include <sys/dnlc.h> 63 #include <sys/acl.h> 64 #include <sys/systeminfo.h> 65 #include <sys/policy.h> 66 #include <sys/sdt.h> 67 #include <sys/list.h> 68 #include <sys/stat.h> 69 #include <sys/mntent.h> 70 71 #include <rpc/types.h> 72 #include <rpc/auth.h> 73 #include <rpc/clnt.h> 74 75 #include <nfs/nfs.h> 76 #include <nfs/nfs_clnt.h> 77 #include <nfs/nfs_acl.h> 78 #include <nfs/lm.h> 79 #include <nfs/nfs4.h> 80 #include <nfs/nfs4_kprot.h> 81 #include <nfs/rnode4.h> 82 #include <nfs/nfs4_clnt.h> 83 84 #include <vm/hat.h> 85 #include <vm/as.h> 86 #include <vm/page.h> 87 #include <vm/pvn.h> 88 #include <vm/seg.h> 89 #include <vm/seg_map.h> 90 #include <vm/seg_kpm.h> 91 #include <vm/seg_vn.h> 92 93 #include <fs/fs_subr.h> 94 95 #include <sys/ddi.h> 96 #include <sys/int_fmtio.h> 97 98 #include <util/string.h> 99 100 /* 101 * The automatic unmounter thread stuff! 102 */ 103 static int nfs4_trigger_thread_timer = 20; /* in seconds */ 104 105 /* 106 * Just a default.... 107 */ 108 static uint_t nfs4_trigger_mount_to = 240; 109 110 typedef struct nfs4_trigger_globals { 111 kmutex_t ntg_forest_lock; 112 uint_t ntg_mount_to; 113 int ntg_thread_started; 114 nfs4_ephemeral_tree_t *ntg_forest; 115 } nfs4_trigger_globals_t; 116 117 kmutex_t nfs4_ephemeral_thread_lock; 118 119 zone_key_t nfs4_ephemeral_key = ZONE_KEY_UNINITIALIZED; 120 121 static void nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *); 122 123 /* 124 * Used for ephemeral mounts; contains data either duplicated from 125 * servinfo4_t, or hand-crafted, depending on type of ephemeral mount. 126 * 127 * It's intended that this structure is used solely for ephemeral 128 * mount-type specific data, for passing this data to 129 * nfs4_trigger_nargs_create(). 130 */ 131 typedef struct ephemeral_servinfo { 132 char *esi_hostname; 133 char *esi_netname; 134 char *esi_path; 135 int esi_path_len; 136 int esi_mount_flags; 137 struct netbuf *esi_addr; 138 struct netbuf *esi_syncaddr; 139 struct knetconfig *esi_knconf; 140 } ephemeral_servinfo_t; 141 142 /* 143 * Collect together the mount-type specific and generic data args. 144 */ 145 typedef struct domount_args { 146 ephemeral_servinfo_t *dma_esi; 147 char *dma_hostlist; /* comma-sep. for RO failover */ 148 struct nfs_args *dma_nargs; 149 } domount_args_t; 150 151 152 /* 153 * The vnode ops functions for a trigger stub vnode 154 */ 155 static int nfs4_trigger_open(vnode_t **, int, cred_t *); 156 static int nfs4_trigger_getattr(vnode_t *, struct vattr *, int, cred_t *); 157 static int nfs4_trigger_setattr(vnode_t *, struct vattr *, int, cred_t *, 158 caller_context_t *); 159 static int nfs4_trigger_access(vnode_t *, int, int, cred_t *); 160 static int nfs4_trigger_readlink(vnode_t *, struct uio *, cred_t *); 161 static int nfs4_trigger_lookup(vnode_t *, char *, vnode_t **, 162 struct pathname *, int, vnode_t *, cred_t *); 163 static int nfs4_trigger_create(vnode_t *, char *, struct vattr *, 164 enum vcexcl, int, vnode_t **, cred_t *, int); 165 static int nfs4_trigger_remove(vnode_t *, char *, cred_t *); 166 static int nfs4_trigger_link(vnode_t *, vnode_t *, char *, cred_t *); 167 static int nfs4_trigger_rename(vnode_t *, char *, vnode_t *, char *, 168 cred_t *); 169 static int nfs4_trigger_mkdir(vnode_t *, char *, struct vattr *, 170 vnode_t **, cred_t *); 171 static int nfs4_trigger_rmdir(vnode_t *, char *, vnode_t *, cred_t *); 172 static int nfs4_trigger_symlink(vnode_t *, char *, struct vattr *, char *, 173 cred_t *); 174 static int nfs4_trigger_cmp(vnode_t *, vnode_t *); 175 176 /* 177 * Regular NFSv4 vnodeops that we need to reference directly 178 */ 179 extern int nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *); 180 extern void nfs4_inactive(vnode_t *, cred_t *); 181 extern int nfs4_rwlock(vnode_t *, int, caller_context_t *); 182 extern void nfs4_rwunlock(vnode_t *, int, caller_context_t *); 183 extern int nfs4_lookup(vnode_t *, char *, vnode_t **, 184 struct pathname *, int, vnode_t *, cred_t *); 185 extern int nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *); 186 extern int nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *); 187 extern int nfs4_fid(vnode_t *, fid_t *); 188 extern int nfs4_realvp(vnode_t *, vnode_t **); 189 190 static int nfs4_trigger_mount(vnode_t *, vnode_t **); 191 static int nfs4_trigger_domount(vnode_t *, domount_args_t *, vfs_t **, 192 cred_t *); 193 static domount_args_t *nfs4_trigger_domount_args_create(vnode_t *); 194 static void nfs4_trigger_domount_args_destroy(domount_args_t *dma, 195 vnode_t *vp); 196 static ephemeral_servinfo_t *nfs4_trigger_esi_create(vnode_t *, servinfo4_t *); 197 static void nfs4_trigger_esi_destroy(ephemeral_servinfo_t *, vnode_t *); 198 static ephemeral_servinfo_t *nfs4_trigger_esi_create_mirrormount(vnode_t *, 199 servinfo4_t *); 200 static struct nfs_args *nfs4_trigger_nargs_create(mntinfo4_t *, servinfo4_t *, 201 ephemeral_servinfo_t *); 202 static void nfs4_trigger_nargs_destroy(struct nfs_args *); 203 static char *nfs4_trigger_create_mntopts(vfs_t *); 204 static void nfs4_trigger_destroy_mntopts(char *); 205 static int nfs4_trigger_add_mntopt(char *, char *, vfs_t *); 206 static enum clnt_stat nfs4_trigger_ping_server(servinfo4_t *, int); 207 208 extern int umount2_engine(vfs_t *, int, cred_t *, int); 209 210 211 vnodeops_t *nfs4_trigger_vnodeops; 212 213 /* 214 * These are the vnodeops that we must define for stub vnodes. 215 * 216 * 217 * Many of the VOPs defined for NFSv4 do not need to be defined here, 218 * for various reasons. This will result in the VFS default function being 219 * used: 220 * 221 * - These VOPs require a previous VOP_OPEN to have occurred. That will have 222 * lost the reference to the stub vnode, meaning these should not be called: 223 * close, read, write, ioctl, readdir, seek. 224 * 225 * - These VOPs are meaningless for vnodes without data pages. Since the 226 * stub vnode is of type VDIR, these should not be called: 227 * space, getpage, putpage, map, addmap, delmap, pageio, fsync. 228 * 229 * - These VOPs are otherwise not applicable, and should not be called: 230 * dump, setsecattr. 231 * 232 * 233 * These VOPs we do not want to define, but nor do we want the VFS default 234 * action. Instead, we specify the VFS error function, with fs_error(), but 235 * note that fs_error() is not actually called. Instead it results in the 236 * use of the error function defined for the particular VOP, in vn_ops_table[]: 237 * 238 * - frlock, dispose, shrlock. 239 * 240 * 241 * These VOPs we define to use the corresponding regular NFSv4 vnodeop. 242 * NOTE: if any of these ops involve an OTW call with the stub FH, then 243 * that call must be wrapped with save_mnt_secinfo()/check_mnt_secinfo() 244 * to protect the security data in the servinfo4_t for the "parent" 245 * filesystem that contains the stub. 246 * 247 * - These VOPs should not trigger a mount, so that "ls -l" does not: 248 * pathconf, getsecattr. 249 * 250 * - These VOPs would not make sense to trigger: 251 * inactive, rwlock, rwunlock, fid, realvp. 252 */ 253 const fs_operation_def_t nfs4_trigger_vnodeops_template[] = { 254 VOPNAME_OPEN, { .vop_open = nfs4_trigger_open }, 255 VOPNAME_GETATTR, { .vop_getattr = nfs4_trigger_getattr }, 256 VOPNAME_SETATTR, { .vop_setattr = nfs4_trigger_setattr }, 257 VOPNAME_ACCESS, { .vop_access = nfs4_trigger_access }, 258 VOPNAME_LOOKUP, { .vop_lookup = nfs4_trigger_lookup }, 259 VOPNAME_CREATE, { .vop_create = nfs4_trigger_create }, 260 VOPNAME_REMOVE, { .vop_remove = nfs4_trigger_remove }, 261 VOPNAME_LINK, { .vop_link = nfs4_trigger_link }, 262 VOPNAME_RENAME, { .vop_rename = nfs4_trigger_rename }, 263 VOPNAME_MKDIR, { .vop_mkdir = nfs4_trigger_mkdir }, 264 VOPNAME_RMDIR, { .vop_rmdir = nfs4_trigger_rmdir }, 265 VOPNAME_SYMLINK, { .vop_symlink = nfs4_trigger_symlink }, 266 VOPNAME_READLINK, { .vop_readlink = nfs4_trigger_readlink }, 267 VOPNAME_INACTIVE, { .vop_inactive = nfs4_inactive }, 268 VOPNAME_FID, { .vop_fid = nfs4_fid }, 269 VOPNAME_RWLOCK, { .vop_rwlock = nfs4_rwlock }, 270 VOPNAME_RWUNLOCK, { .vop_rwunlock = nfs4_rwunlock }, 271 VOPNAME_REALVP, { .vop_realvp = nfs4_realvp }, 272 VOPNAME_GETSECATTR, { .vop_getsecattr = nfs4_getsecattr }, 273 VOPNAME_PATHCONF, { .vop_pathconf = nfs4_pathconf }, 274 VOPNAME_FRLOCK, { .error = fs_error }, 275 VOPNAME_DISPOSE, { .error = fs_error }, 276 VOPNAME_SHRLOCK, { .error = fs_error }, 277 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 278 NULL, NULL 279 }; 280 281 /* 282 * Trigger ops for stub vnodes; for mirror mounts, etc. 283 * 284 * The general idea is that a "triggering" op will first call 285 * nfs4_trigger_mount(), which will find out whether a mount has already 286 * been triggered. 287 * 288 * If it has, then nfs4_trigger_mount() sets newvp to the root vnode 289 * of the covering vfs. 290 * 291 * If a mount has not yet been triggered, nfs4_trigger_mount() will do so, 292 * and again set newvp, as above. 293 * 294 * The triggering op may then re-issue the VOP by calling it on newvp. 295 * 296 * Note that some ops may perform custom action, and may or may not need 297 * to trigger a mount. 298 * 299 * Some ops need to call the regular NFSv4 vnodeop for a stub vnode. We 300 * obviously can't do this with VOP_<whatever>, since it's a stub vnode 301 * and that would just recurse. Instead, we call the v4 op directly, 302 * by name. This is OK, since we know that the vnode is for NFSv4, 303 * otherwise it couldn't be a stub. 304 * 305 */ 306 307 static int 308 nfs4_trigger_open(vnode_t **vpp, int flag, cred_t *cr) 309 { 310 int error; 311 vnode_t *newvp; 312 313 error = nfs4_trigger_mount(*vpp, &newvp); 314 if (error) 315 return (error); 316 317 /* Release the stub vnode, as we're losing the reference to it */ 318 VN_RELE(*vpp); 319 320 /* Give the caller the root vnode of the newly-mounted fs */ 321 *vpp = newvp; 322 323 /* return with VN_HELD(newvp) */ 324 return (VOP_OPEN(vpp, flag, cr)); 325 } 326 327 /* 328 * For the majority of cases, nfs4_trigger_getattr() will not trigger 329 * a mount. However, if ATTR_TRIGGER is set, we are being informed 330 * that we need to force the mount before we attempt to determine 331 * the attributes. The intent is an atomic operation for security 332 * testing. 333 */ 334 static int 335 nfs4_trigger_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr) 336 { 337 int error; 338 339 if (flags & ATTR_TRIGGER) { 340 vnode_t *newvp; 341 342 error = nfs4_trigger_mount(vp, &newvp); 343 if (error) 344 return (error); 345 346 error = VOP_GETATTR(newvp, vap, flags, cr); 347 VN_RELE(newvp); 348 } else { 349 error = nfs4_getattr(vp, vap, flags, cr); 350 } 351 352 return (error); 353 } 354 355 static int 356 nfs4_trigger_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 357 caller_context_t *ct) 358 { 359 int error; 360 vnode_t *newvp; 361 362 error = nfs4_trigger_mount(vp, &newvp); 363 if (error) 364 return (error); 365 366 error = VOP_SETATTR(newvp, vap, flags, cr, ct); 367 VN_RELE(newvp); 368 369 return (error); 370 } 371 372 static int 373 nfs4_trigger_access(vnode_t *vp, int mode, int flags, cred_t *cr) 374 { 375 int error; 376 vnode_t *newvp; 377 378 error = nfs4_trigger_mount(vp, &newvp); 379 if (error) 380 return (error); 381 382 error = VOP_ACCESS(newvp, mode, flags, cr); 383 VN_RELE(newvp); 384 385 return (error); 386 } 387 388 static int 389 nfs4_trigger_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, 390 int flags, vnode_t *rdir, cred_t *cr) 391 { 392 int error; 393 vnode_t *newdvp; 394 rnode4_t *drp = VTOR4(dvp); 395 396 ASSERT(RP_ISSTUB(drp)); 397 398 /* for now, we only support mirror-mounts */ 399 ASSERT(RP_ISSTUB_MIRRORMOUNT(drp)); 400 401 /* 402 * It's not legal to lookup ".." for an fs root, so we mustn't pass 403 * that up. Instead, pass onto the regular op, regardless of whether 404 * we've triggered a mount. 405 */ 406 if (strcmp(nm, "..") == 0) 407 return (nfs4_lookup(dvp, nm, vpp, pnp, flags, rdir, cr)); 408 409 error = nfs4_trigger_mount(dvp, &newdvp); 410 if (error) 411 return (error); 412 413 error = VOP_LOOKUP(newdvp, nm, vpp, pnp, flags, rdir, cr); 414 VN_RELE(newdvp); 415 416 return (error); 417 } 418 419 static int 420 nfs4_trigger_create(vnode_t *dvp, char *nm, struct vattr *va, 421 enum vcexcl exclusive, int mode, vnode_t **vpp, cred_t *cr, 422 int flags) 423 { 424 int error; 425 vnode_t *newdvp; 426 427 error = nfs4_trigger_mount(dvp, &newdvp); 428 if (error) 429 return (error); 430 431 error = VOP_CREATE(newdvp, nm, va, exclusive, mode, vpp, cr, flags); 432 VN_RELE(newdvp); 433 434 return (error); 435 } 436 437 static int 438 nfs4_trigger_remove(vnode_t *dvp, char *nm, cred_t *cr) 439 { 440 int error; 441 vnode_t *newdvp; 442 443 error = nfs4_trigger_mount(dvp, &newdvp); 444 if (error) 445 return (error); 446 447 error = VOP_REMOVE(newdvp, nm, cr); 448 VN_RELE(newdvp); 449 450 return (error); 451 } 452 453 static int 454 nfs4_trigger_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr) 455 { 456 int error; 457 vnode_t *newtdvp; 458 459 error = nfs4_trigger_mount(tdvp, &newtdvp); 460 if (error) 461 return (error); 462 463 /* 464 * We don't check whether svp is a stub. Let the NFSv4 code 465 * detect that error, and return accordingly. 466 */ 467 error = VOP_LINK(newtdvp, svp, tnm, cr); 468 VN_RELE(newtdvp); 469 470 return (error); 471 } 472 473 static int 474 nfs4_trigger_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, 475 cred_t *cr) 476 { 477 int error; 478 vnode_t *newsdvp; 479 rnode4_t *tdrp = VTOR4(tdvp); 480 481 /* 482 * We know that sdvp is a stub, otherwise we would not be here. 483 * 484 * If tdvp is also be a stub, there are two possibilities: it 485 * is either the same stub as sdvp [i.e. VN_CMP(sdvp, tdvp)] 486 * or it is a different stub [!VN_CMP(sdvp, tdvp)]. 487 * 488 * In the former case, just trigger sdvp, and treat tdvp as 489 * though it were not a stub. 490 * 491 * In the latter case, it might be a different stub for the 492 * same server fs as sdvp, or for a different server fs. 493 * Regardless, from the client perspective this would still 494 * be a cross-filesystem rename, and should not be allowed, 495 * so return EXDEV, without triggering either mount. 496 */ 497 if (RP_ISSTUB(tdrp) && !VN_CMP(sdvp, tdvp)) 498 return (EXDEV); 499 500 error = nfs4_trigger_mount(sdvp, &newsdvp); 501 if (error) 502 return (error); 503 504 error = VOP_RENAME(newsdvp, snm, tdvp, tnm, cr); 505 506 VN_RELE(newsdvp); 507 508 return (error); 509 } 510 511 static int 512 nfs4_trigger_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, 513 cred_t *cr) 514 { 515 int error; 516 vnode_t *newdvp; 517 518 error = nfs4_trigger_mount(dvp, &newdvp); 519 if (error) 520 return (error); 521 522 error = VOP_MKDIR(newdvp, nm, va, vpp, cr); 523 VN_RELE(newdvp); 524 525 return (error); 526 } 527 528 static int 529 nfs4_trigger_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr) 530 { 531 int error; 532 vnode_t *newdvp; 533 534 error = nfs4_trigger_mount(dvp, &newdvp); 535 if (error) 536 return (error); 537 538 error = VOP_RMDIR(newdvp, nm, cdir, cr); 539 VN_RELE(newdvp); 540 541 return (error); 542 } 543 544 static int 545 nfs4_trigger_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, 546 cred_t *cr) 547 { 548 int error; 549 vnode_t *newdvp; 550 551 error = nfs4_trigger_mount(dvp, &newdvp); 552 if (error) 553 return (error); 554 555 error = VOP_SYMLINK(newdvp, lnm, tva, tnm, cr); 556 VN_RELE(newdvp); 557 558 return (error); 559 } 560 561 static int 562 nfs4_trigger_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr) 563 { 564 int error; 565 vnode_t *newvp; 566 567 error = nfs4_trigger_mount(vp, &newvp); 568 if (error) 569 return (error); 570 571 error = VOP_READLINK(newvp, uiop, cr); 572 VN_RELE(newvp); 573 574 return (error); 575 } 576 577 /* end of trigger vnode ops */ 578 579 580 /* 581 * Mount upon a trigger vnode; for mirror-mounts, etc. 582 * 583 * The mount may have already occurred, via another thread. If not, 584 * assemble the location information - which may require fetching - and 585 * perform the mount. 586 * 587 * Sets newvp to be the root of the fs that is now covering vp. Note 588 * that we return with VN_HELD(*newvp). 589 * 590 * The caller is responsible for passing the VOP onto the covering fs. 591 */ 592 static int 593 nfs4_trigger_mount(vnode_t *vp, vnode_t **newvpp) 594 { 595 int error; 596 vfs_t *vfsp; 597 rnode4_t *rp = VTOR4(vp); 598 mntinfo4_t *mi = VTOMI4(vp); 599 domount_args_t *dma; 600 601 nfs4_ephemeral_tree_t *net; 602 603 bool_t must_unlock = FALSE; 604 bool_t is_building = FALSE; 605 606 cred_t *zcred; 607 608 nfs4_trigger_globals_t *ntg; 609 610 zone_t *zone = curproc->p_zone; 611 612 ASSERT(RP_ISSTUB(rp)); 613 614 /* for now, we only support mirror-mounts */ 615 ASSERT(RP_ISSTUB_MIRRORMOUNT(rp)); 616 617 *newvpp = NULL; 618 619 /* 620 * Has the mount already occurred? 621 */ 622 error = vn_vfsrlock_wait(vp); 623 if (error) 624 goto done; 625 vfsp = vn_mountedvfs(vp); 626 if (vfsp != NULL) { 627 /* the mount has already occurred */ 628 error = VFS_ROOT(vfsp, newvpp); 629 if (!error) { 630 /* need to update the reference time */ 631 mutex_enter(&mi->mi_lock); 632 if (mi->mi_ephemeral) 633 mi->mi_ephemeral->ne_ref_time = 634 gethrestime_sec(); 635 mutex_exit(&mi->mi_lock); 636 } 637 638 vn_vfsunlock(vp); 639 goto done; 640 } 641 vn_vfsunlock(vp); 642 643 ntg = zone_getspecific(nfs4_ephemeral_key, zone); 644 ASSERT(ntg != NULL); 645 646 mutex_enter(&mi->mi_lock); 647 648 /* 649 * We need to lock down the ephemeral tree. 650 */ 651 if (mi->mi_ephemeral_tree == NULL) { 652 net = kmem_zalloc(sizeof (*net), KM_SLEEP); 653 mutex_init(&net->net_tree_lock, NULL, MUTEX_DEFAULT, NULL); 654 mutex_init(&net->net_cnt_lock, NULL, MUTEX_DEFAULT, NULL); 655 net->net_refcnt = 1; 656 net->net_status = NFS4_EPHEMERAL_TREE_BUILDING; 657 is_building = TRUE; 658 659 /* 660 * We need to add it to the zone specific list for 661 * automatic unmounting and harvesting of deadwood. 662 */ 663 mutex_enter(&ntg->ntg_forest_lock); 664 if (ntg->ntg_forest != NULL) 665 net->net_next = ntg->ntg_forest; 666 ntg->ntg_forest = net; 667 mutex_exit(&ntg->ntg_forest_lock); 668 669 /* 670 * No lock order confusion with mi_lock because no 671 * other node could have grabbed net_tree_lock. 672 */ 673 mutex_enter(&net->net_tree_lock); 674 mi->mi_ephemeral_tree = net; 675 net->net_mount = mi; 676 mutex_exit(&mi->mi_lock); 677 } else { 678 net = mi->mi_ephemeral_tree; 679 mutex_exit(&mi->mi_lock); 680 681 mutex_enter(&net->net_cnt_lock); 682 net->net_refcnt++; 683 mutex_exit(&net->net_cnt_lock); 684 685 /* 686 * Note that we do not do any checks to 687 * see if the parent has been nuked. 688 * We count on the vfs layer having protected 689 * us from feet shooters. 690 */ 691 mutex_enter(&net->net_tree_lock); 692 } 693 694 mutex_enter(&net->net_cnt_lock); 695 net->net_status |= NFS4_EPHEMERAL_TREE_MOUNTING; 696 mutex_exit(&net->net_cnt_lock); 697 698 must_unlock = TRUE; 699 700 dma = nfs4_trigger_domount_args_create(vp); 701 if (dma == NULL) { 702 error = EINVAL; 703 goto done; 704 } 705 706 /* 707 * Need to be root for this call to make mount work. 708 * Note that since we define mirror mounts to work 709 * for any user, we allow the mount to proceed. And 710 * we realize that the server will perform security 711 * checks to make sure that the client is allowed 712 * access. Finally, once the mount takes place, 713 * directory permissions will ensure that the 714 * content is secure. 715 */ 716 zcred = zone_get_kcred(getzoneid()); 717 ASSERT(zcred != NULL); 718 719 error = nfs4_trigger_domount(vp, dma, &vfsp, zcred); 720 nfs4_trigger_domount_args_destroy(dma, vp); 721 722 crfree(zcred); 723 724 if (!error) 725 error = VFS_ROOT(vfsp, newvpp); 726 done: 727 if (must_unlock) { 728 mutex_enter(&net->net_cnt_lock); 729 net->net_status &= ~NFS4_EPHEMERAL_TREE_MOUNTING; 730 if (is_building) 731 net->net_status &= ~NFS4_EPHEMERAL_TREE_BUILDING; 732 net->net_refcnt--; 733 mutex_exit(&net->net_cnt_lock); 734 735 mutex_exit(&net->net_tree_lock); 736 } 737 738 if (!error && (newvpp == NULL || *newvpp == NULL)) 739 error = ENOSYS; 740 741 return (error); 742 } 743 744 /* 745 * Collect together both the generic & mount-type specific args. 746 */ 747 static domount_args_t * 748 nfs4_trigger_domount_args_create(vnode_t *vp) 749 { 750 int nointr; 751 char *hostlist; 752 servinfo4_t *svp; 753 struct nfs_args *nargs, *nargs_head; 754 enum clnt_stat status; 755 ephemeral_servinfo_t *esi, *esi_first; 756 domount_args_t *dma; 757 mntinfo4_t *mi = VTOMI4(vp); 758 759 nointr = !(mi->mi_flags & MI4_INT); 760 hostlist = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 761 762 svp = mi->mi_curr_serv; 763 /* check if the current server is responding */ 764 status = nfs4_trigger_ping_server(svp, nointr); 765 if (status == RPC_SUCCESS) { 766 esi_first = nfs4_trigger_esi_create(vp, svp); 767 if (esi_first == NULL) { 768 kmem_free(hostlist, MAXPATHLEN); 769 return (NULL); 770 } 771 772 (void) strlcpy(hostlist, esi_first->esi_hostname, MAXPATHLEN); 773 774 nargs_head = nfs4_trigger_nargs_create(mi, svp, esi_first); 775 } else { 776 /* current server did not respond */ 777 esi_first = NULL; 778 nargs_head = NULL; 779 } 780 nargs = nargs_head; 781 782 /* 783 * NFS RO failover. 784 * 785 * If we have multiple servinfo4 structures, linked via sv_next, 786 * we must create one nfs_args for each, linking the nfs_args via 787 * nfs_ext_u.nfs_extB.next. 788 * 789 * We need to build a corresponding esi for each, too, but that is 790 * used solely for building nfs_args, and may be immediately 791 * discarded, as domount() requires the info from just one esi, 792 * but all the nfs_args. 793 * 794 * Currently, the NFS mount code will hang if not all servers 795 * requested are available. To avoid that, we need to ping each 796 * server, here, and remove it from the list if it is not 797 * responding. This has the side-effect of that server then 798 * being permanently unavailable for this failover mount, even if 799 * it recovers. That's unfortunate, but the best we can do until 800 * the mount code path is fixed. 801 */ 802 803 /* 804 * If the current server was down, loop indefinitely until we find 805 * at least one responsive server. 806 */ 807 do { 808 /* no locking needed for sv_next; it is only set at fs mount */ 809 for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) { 810 struct nfs_args *next; 811 812 /* 813 * nargs_head: the head of the nfs_args list 814 * nargs: the current tail of the list 815 * next: the newly-created element to be added 816 */ 817 818 /* 819 * We've already tried the current server, above; 820 * if it was responding, we have already included it 821 * and it may now be ignored. 822 * 823 * Otherwise, try it again, since it may now have 824 * recovered. 825 */ 826 if (svp == mi->mi_curr_serv && esi_first != NULL) 827 continue; 828 829 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 830 if (svp->sv_flags & SV4_NOTINUSE) { 831 nfs_rw_exit(&svp->sv_lock); 832 continue; 833 } 834 nfs_rw_exit(&svp->sv_lock); 835 836 /* check if the server is responding */ 837 status = nfs4_trigger_ping_server(svp, nointr); 838 /* if the server did not respond, ignore it */ 839 if (status != RPC_SUCCESS) 840 continue; 841 842 esi = nfs4_trigger_esi_create(vp, svp); 843 if (esi == NULL) 844 continue; 845 846 /* 847 * If the original current server (mi_curr_serv) 848 * was down when when we first tried it, 849 * (i.e. esi_first == NULL), 850 * we select this new server (svp) to be the server 851 * that we will actually contact (esi_first). 852 * 853 * Note that it's possible that mi_curr_serv == svp, 854 * if that mi_curr_serv was down but has now recovered. 855 */ 856 next = nfs4_trigger_nargs_create(mi, svp, esi); 857 if (esi_first == NULL) { 858 ASSERT(nargs == NULL); 859 ASSERT(nargs_head == NULL); 860 nargs_head = next; 861 esi_first = esi; 862 (void) strlcpy(hostlist, 863 esi_first->esi_hostname, MAXPATHLEN); 864 } else { 865 ASSERT(nargs_head != NULL); 866 nargs->nfs_ext_u.nfs_extB.next = next; 867 (void) strlcat(hostlist, ",", MAXPATHLEN); 868 (void) strlcat(hostlist, esi->esi_hostname, 869 MAXPATHLEN); 870 /* esi was only needed for hostname & nargs */ 871 nfs4_trigger_esi_destroy(esi, vp); 872 } 873 874 nargs = next; 875 } 876 877 /* if we've had no response at all, wait a second */ 878 if (esi_first == NULL) 879 delay(drv_usectohz(1000000)); 880 881 } while (esi_first == NULL); 882 ASSERT(nargs_head != NULL); 883 884 dma = kmem_zalloc(sizeof (domount_args_t), KM_SLEEP); 885 dma->dma_esi = esi_first; 886 dma->dma_hostlist = hostlist; 887 dma->dma_nargs = nargs_head; 888 889 return (dma); 890 } 891 892 static void 893 nfs4_trigger_domount_args_destroy(domount_args_t *dma, vnode_t *vp) 894 { 895 if (dma != NULL) { 896 if (dma->dma_esi != NULL && vp != NULL) 897 nfs4_trigger_esi_destroy(dma->dma_esi, vp); 898 899 if (dma->dma_hostlist != NULL) 900 kmem_free(dma->dma_hostlist, MAXPATHLEN); 901 902 if (dma->dma_nargs != NULL) { 903 struct nfs_args *nargs = dma->dma_nargs; 904 905 do { 906 struct nfs_args *next = 907 nargs->nfs_ext_u.nfs_extB.next; 908 909 nfs4_trigger_nargs_destroy(nargs); 910 nargs = next; 911 } while (nargs != NULL); 912 } 913 914 kmem_free(dma, sizeof (domount_args_t)); 915 } 916 } 917 918 /* 919 * The ephemeral_servinfo_t struct contains basic information we will need to 920 * perform the mount. Whilst the structure is generic across different 921 * types of ephemeral mount, the way we gather its contents differs. 922 */ 923 static ephemeral_servinfo_t * 924 nfs4_trigger_esi_create(vnode_t *vp, servinfo4_t *svp) 925 { 926 ephemeral_servinfo_t *esi; 927 rnode4_t *rp = VTOR4(vp); 928 929 ASSERT(RP_ISSTUB(rp)); 930 931 /* Call the ephemeral type-specific routine */ 932 if (RP_ISSTUB_MIRRORMOUNT(rp)) 933 esi = nfs4_trigger_esi_create_mirrormount(vp, svp); 934 else 935 esi = NULL; 936 937 /* for now, we only support mirror-mounts */ 938 ASSERT(esi != NULL); 939 940 return (esi); 941 } 942 943 static void 944 nfs4_trigger_esi_destroy(ephemeral_servinfo_t *esi, vnode_t *vp) 945 { 946 rnode4_t *rp = VTOR4(vp); 947 948 ASSERT(RP_ISSTUB(rp)); 949 950 /* for now, we only support mirror-mounts */ 951 ASSERT(RP_ISSTUB_MIRRORMOUNT(rp)); 952 953 /* Currently, no need for an ephemeral type-specific routine */ 954 955 /* 956 * The contents of ephemeral_servinfo_t goes into nfs_args, 957 * and will be handled by nfs4_trigger_nargs_destroy(). 958 * We need only free the structure itself. 959 */ 960 if (esi != NULL) 961 kmem_free(esi, sizeof (ephemeral_servinfo_t)); 962 } 963 964 /* 965 * Some of this may turn out to be common with other ephemeral types, 966 * in which case it should be moved to nfs4_trigger_esi_create(), or a 967 * common function called. 968 */ 969 static ephemeral_servinfo_t * 970 nfs4_trigger_esi_create_mirrormount(vnode_t *vp, servinfo4_t *svp) 971 { 972 char *stubpath; 973 struct knetconfig *sikncp, *svkncp; 974 struct netbuf *bufp; 975 ephemeral_servinfo_t *esi; 976 977 esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP); 978 979 /* initially set to be our type of ephemeral mount; may be added to */ 980 esi->esi_mount_flags = NFSMNT_MIRRORMOUNT; 981 982 /* 983 * We're copying info from the stub rnode's servinfo4, but 984 * we must create new copies, not pointers, since this information 985 * is to be associated with the new mount, which will be 986 * unmounted (and its structures freed) separately 987 */ 988 989 /* 990 * Sizes passed to kmem_[z]alloc here must match those freed 991 * in nfs4_free_args() 992 */ 993 994 /* 995 * We hold sv_lock across kmem_zalloc() calls that may sleep, but this 996 * is difficult to avoid: as we need to read svp to calculate the 997 * sizes to be allocated. 998 */ 999 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 1000 1001 esi->esi_hostname = kmem_zalloc(strlen(svp->sv_hostname) + 1, KM_SLEEP); 1002 (void) strcat(esi->esi_hostname, svp->sv_hostname); 1003 1004 esi->esi_addr = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP); 1005 bufp = esi->esi_addr; 1006 bufp->len = svp->sv_addr.len; 1007 bufp->maxlen = svp->sv_addr.maxlen; 1008 bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP); 1009 bcopy(svp->sv_addr.buf, bufp->buf, bufp->len); 1010 1011 esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP); 1012 sikncp = esi->esi_knconf; 1013 svkncp = svp->sv_knconf; 1014 sikncp->knc_semantics = svkncp->knc_semantics; 1015 sikncp->knc_protofmly = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP); 1016 (void) strcat((char *)sikncp->knc_protofmly, 1017 (char *)svkncp->knc_protofmly); 1018 sikncp->knc_proto = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP); 1019 (void) strcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto); 1020 sikncp->knc_rdev = svkncp->knc_rdev; 1021 1022 /* 1023 * Used when AUTH_DH is negotiated. 1024 * 1025 * This is ephemeral mount-type specific, since it contains the 1026 * server's time-sync syncaddr. 1027 */ 1028 if (svp->sv_dhsec) { 1029 struct netbuf *bufp; 1030 sec_data_t *sdata; 1031 dh_k4_clntdata_t *data; 1032 1033 sdata = svp->sv_dhsec; 1034 data = (dh_k4_clntdata_t *)sdata->data; 1035 ASSERT(sdata->rpcflavor == AUTH_DH); 1036 1037 bufp = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP); 1038 bufp->len = data->syncaddr.len; 1039 bufp->maxlen = data->syncaddr.maxlen; 1040 bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP); 1041 bcopy(data->syncaddr.buf, bufp->buf, bufp->len); 1042 esi->esi_syncaddr = bufp; 1043 1044 if (data->netname != NULL) { 1045 int nmlen = data->netnamelen; 1046 1047 /* 1048 * We need to copy from a dh_k4_clntdata_t 1049 * netname/netnamelen pair to a NUL-terminated 1050 * netname string suitable for putting in nfs_args, 1051 * where the latter has no netnamelen field. 1052 */ 1053 esi->esi_netname = kmem_zalloc(nmlen + 1, KM_SLEEP); 1054 bcopy(data->netname, esi->esi_netname, nmlen); 1055 } 1056 } else { 1057 esi->esi_syncaddr = NULL; 1058 esi->esi_netname = NULL; 1059 } 1060 1061 stubpath = fn_path(VTOSV(vp)->sv_name); 1062 /* step over initial '.', to avoid e.g. sv_path: "/tank./ws" */ 1063 ASSERT(*stubpath == '.'); 1064 stubpath += 1; 1065 1066 /* for nfs_args->fh */ 1067 esi->esi_path_len = strlen(svp->sv_path) + strlen(stubpath) + 1; 1068 esi->esi_path = kmem_zalloc(esi->esi_path_len, KM_SLEEP); 1069 (void) strcat(esi->esi_path, svp->sv_path); 1070 (void) strcat(esi->esi_path, stubpath); 1071 1072 stubpath -= 1; 1073 /* stubpath allocated by fn_path() */ 1074 kmem_free(stubpath, strlen(stubpath) + 1); 1075 1076 nfs_rw_exit(&svp->sv_lock); 1077 1078 return (esi); 1079 } 1080 1081 /* 1082 * Assemble the args, and call the generic VFS mount function to 1083 * finally perform the ephemeral mount. 1084 */ 1085 static int 1086 nfs4_trigger_domount(vnode_t *stubvp, domount_args_t *dma, vfs_t **vfsp, 1087 cred_t *cr) 1088 { 1089 struct mounta *uap; 1090 char *mntpt, *orig_path, *path; 1091 const char *orig_mntpt; 1092 int retval; 1093 int mntpt_len; 1094 int spec_len; 1095 zone_t *zone = curproc->p_zone; 1096 bool_t has_leading_slash; 1097 1098 vfs_t *stubvfsp = stubvp->v_vfsp; 1099 ephemeral_servinfo_t *esi = dma->dma_esi; 1100 struct nfs_args *nargs = dma->dma_nargs; 1101 1102 /* first, construct the mount point for the ephemeral mount */ 1103 orig_path = path = fn_path(VTOSV(stubvp)->sv_name); 1104 orig_mntpt = (char *)refstr_value(stubvfsp->vfs_mntpt); 1105 1106 if (*orig_path == '.') 1107 orig_path++; 1108 1109 /* 1110 * Get rid of zone's root path 1111 */ 1112 if (zone != global_zone) { 1113 /* 1114 * -1 for trailing '/' and -1 for EOS. 1115 */ 1116 if (strncmp(zone->zone_rootpath, orig_mntpt, 1117 zone->zone_rootpathlen - 1) == 0) { 1118 orig_mntpt += (zone->zone_rootpathlen - 2); 1119 } 1120 } 1121 1122 mntpt_len = strlen(orig_mntpt) + strlen(orig_path); 1123 mntpt = kmem_zalloc(mntpt_len + 1, KM_SLEEP); 1124 (void) strcat(mntpt, orig_mntpt); 1125 (void) strcat(mntpt, orig_path); 1126 1127 kmem_free(path, strlen(path) + 1); 1128 path = esi->esi_path; 1129 if (*path == '.') 1130 path++; 1131 if (path[0] == '/' && path[1] == '/') 1132 path++; 1133 has_leading_slash = (*path == '/'); 1134 1135 spec_len = strlen(dma->dma_hostlist); 1136 spec_len += strlen(path); 1137 1138 /* We are going to have to add this in */ 1139 if (!has_leading_slash) 1140 spec_len++; 1141 1142 /* We need to get the ':' for dma_hostlist:esi_path */ 1143 spec_len++; 1144 1145 uap = kmem_zalloc(sizeof (struct mounta), KM_SLEEP); 1146 uap->spec = kmem_zalloc(spec_len + 1, KM_SLEEP); 1147 (void) snprintf(uap->spec, spec_len + 1, "%s:%s%s", dma->dma_hostlist, 1148 has_leading_slash ? "" : "/", path); 1149 1150 uap->dir = mntpt; 1151 1152 uap->flags = MS_SYSSPACE | MS_DATA; 1153 /* fstype-independent mount options not covered elsewhere */ 1154 /* copy parent's mount(1M) "-m" flag */ 1155 if (stubvfsp->vfs_flag & VFS_NOMNTTAB) 1156 uap->flags |= MS_NOMNTTAB; 1157 1158 uap->fstype = MNTTYPE_NFS4; 1159 uap->dataptr = (char *)nargs; 1160 /* not needed for MS_SYSSPACE */ 1161 uap->datalen = 0; 1162 1163 /* use optptr to pass in extra mount options */ 1164 uap->flags |= MS_OPTIONSTR; 1165 uap->optptr = nfs4_trigger_create_mntopts(stubvfsp); 1166 if (uap->optptr == NULL) { 1167 retval = EINVAL; 1168 goto done; 1169 } 1170 /* domount() expects us to count the trailing NUL */ 1171 uap->optlen = strlen(uap->optptr) + 1; 1172 1173 retval = domount(NULL, uap, stubvp, cr, vfsp); 1174 if (retval == 0) 1175 VFS_RELE(*vfsp); 1176 done: 1177 if (uap->optptr) 1178 nfs4_trigger_destroy_mntopts(uap->optptr); 1179 1180 kmem_free(uap->spec, spec_len + 1); 1181 kmem_free(uap, sizeof (struct mounta)); 1182 kmem_free(mntpt, mntpt_len + 1); 1183 1184 return (retval); 1185 } 1186 1187 /* 1188 * Build an nfs_args structure for passing to domount(). 1189 * 1190 * Ephemeral mount-type specific data comes from the ephemeral_servinfo_t; 1191 * generic data - common to all ephemeral mount types - is read directly 1192 * from the parent mount's servinfo4_t and mntinfo4_t, via the stub vnode. 1193 */ 1194 static struct nfs_args * 1195 nfs4_trigger_nargs_create(mntinfo4_t *mi, servinfo4_t *svp, 1196 ephemeral_servinfo_t *esi) 1197 { 1198 sec_data_t *secdata; 1199 struct nfs_args *nargs; 1200 1201 /* setup the nfs args */ 1202 nargs = kmem_zalloc(sizeof (struct nfs_args), KM_SLEEP); 1203 1204 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 1205 1206 nargs->addr = esi->esi_addr; 1207 1208 /* for AUTH_DH by negotiation */ 1209 if (esi->esi_syncaddr || esi->esi_netname) { 1210 nargs->flags |= NFSMNT_SECURE; 1211 nargs->syncaddr = esi->esi_syncaddr; 1212 nargs->netname = esi->esi_netname; 1213 } 1214 1215 nargs->flags |= NFSMNT_KNCONF; 1216 nargs->knconf = esi->esi_knconf; 1217 nargs->flags |= NFSMNT_HOSTNAME; 1218 nargs->hostname = esi->esi_hostname; 1219 nargs->fh = esi->esi_path; 1220 1221 /* general mount settings, all copied from parent mount */ 1222 mutex_enter(&mi->mi_lock); 1223 1224 if (!(mi->mi_flags & MI4_HARD)) 1225 nargs->flags |= NFSMNT_SOFT; 1226 1227 nargs->flags |= NFSMNT_WSIZE | NFSMNT_RSIZE | NFSMNT_TIMEO | 1228 NFSMNT_RETRANS; 1229 nargs->wsize = mi->mi_stsize; 1230 nargs->rsize = mi->mi_tsize; 1231 nargs->timeo = mi->mi_timeo; 1232 nargs->retrans = mi->mi_retrans; 1233 1234 if (mi->mi_flags & MI4_INT) 1235 nargs->flags |= NFSMNT_INT; 1236 if (mi->mi_flags & MI4_NOAC) 1237 nargs->flags |= NFSMNT_NOAC; 1238 1239 nargs->flags |= NFSMNT_ACREGMIN | NFSMNT_ACREGMAX | NFSMNT_ACDIRMIN | 1240 NFSMNT_ACDIRMAX; 1241 nargs->acregmin = HR2SEC(mi->mi_acregmin); 1242 nargs->acregmax = HR2SEC(mi->mi_acregmax); 1243 nargs->acdirmin = HR2SEC(mi->mi_acdirmin); 1244 nargs->acdirmax = HR2SEC(mi->mi_acdirmax); 1245 1246 if (mi->mi_flags & MI4_NOCTO) 1247 nargs->flags |= NFSMNT_NOCTO; 1248 if (mi->mi_flags & MI4_GRPID) 1249 nargs->flags |= NFSMNT_GRPID; 1250 if (mi->mi_flags & MI4_LLOCK) 1251 nargs->flags |= NFSMNT_LLOCK; 1252 if (mi->mi_flags & MI4_NOPRINT) 1253 nargs->flags |= NFSMNT_NOPRINT; 1254 if (mi->mi_flags & MI4_DIRECTIO) 1255 nargs->flags |= NFSMNT_DIRECTIO; 1256 if (mi->mi_flags & MI4_PUBLIC) 1257 nargs->flags |= NFSMNT_PUBLIC; 1258 1259 mutex_exit(&mi->mi_lock); 1260 1261 /* add any specific flags for this type of ephemeral mount */ 1262 nargs->flags |= esi->esi_mount_flags; 1263 1264 /* 1265 * Security data & negotiation policy. 1266 * 1267 * We need to preserve the parent mount's preference for security 1268 * negotiation, translating SV4_TRYSECDEFAULT -> NFSMNT_SECDEFAULT. 1269 * 1270 * If SV4_TRYSECDEFAULT is not set, that indicates that a specific 1271 * security flavour was requested, with data in sv_secdata, and that 1272 * no negotiation should occur. If this specified flavour fails, that's 1273 * it. We will copy sv_secdata, and not set NFSMNT_SECDEFAULT. 1274 * 1275 * If SV4_TRYSECDEFAULT is set, then we start with a passed-in 1276 * default flavour, in sv_secdata, but then negotiate a new flavour. 1277 * Possible flavours are recorded in an array in sv_secinfo, with 1278 * currently in-use flavour pointed to by sv_currsec. 1279 * 1280 * If sv_currsec is set, i.e. if negotiation has already occurred, 1281 * we will copy sv_currsec. Otherwise, copy sv_secdata. Regardless, 1282 * we will set NFSMNT_SECDEFAULT, to enable negotiation. 1283 */ 1284 if (svp->sv_flags & SV4_TRYSECDEFAULT) { 1285 /* enable negotiation for ephemeral mount */ 1286 nargs->flags |= NFSMNT_SECDEFAULT; 1287 1288 /* 1289 * As a starting point for negotiation, copy parent 1290 * mount's negotiated flavour (sv_currsec) if available, 1291 * or its passed-in flavour (sv_secdata) if not. 1292 */ 1293 if (svp->sv_currsec != NULL) 1294 secdata = copy_sec_data(svp->sv_currsec); 1295 else if (svp->sv_secdata != NULL) 1296 secdata = copy_sec_data(svp->sv_secdata); 1297 else 1298 secdata = NULL; 1299 } else { 1300 /* do not enable negotiation; copy parent's passed-in flavour */ 1301 if (svp->sv_secdata != NULL) 1302 secdata = copy_sec_data(svp->sv_secdata); 1303 else 1304 secdata = NULL; 1305 } 1306 1307 nfs_rw_exit(&svp->sv_lock); 1308 1309 nargs->flags |= NFSMNT_NEWARGS; 1310 nargs->nfs_args_ext = NFS_ARGS_EXTB; 1311 nargs->nfs_ext_u.nfs_extB.secdata = secdata; 1312 1313 /* for NFS RO failover; caller will set if necessary */ 1314 nargs->nfs_ext_u.nfs_extB.next = NULL; 1315 1316 return (nargs); 1317 } 1318 1319 static void 1320 nfs4_trigger_nargs_destroy(struct nfs_args *nargs) 1321 { 1322 /* 1323 * Either the mount failed, in which case the data is not needed, or 1324 * nfs4_mount() has either taken copies of what it needs or, 1325 * where it has merely copied the ptr, it has set *our* ptr to NULL, 1326 * whereby nfs4_free_args() will ignore it. 1327 */ 1328 nfs4_free_args(nargs); 1329 kmem_free(nargs, sizeof (struct nfs_args)); 1330 } 1331 1332 /* 1333 * When we finally get into the mounting, we need to add this 1334 * node to the ephemeral tree. 1335 * 1336 * This is called from nfs4_mount(). 1337 */ 1338 void 1339 nfs4_record_ephemeral_mount(mntinfo4_t *mi, vnode_t *mvp) 1340 { 1341 mntinfo4_t *mi_parent; 1342 nfs4_ephemeral_t *eph; 1343 nfs4_ephemeral_tree_t *net; 1344 1345 nfs4_ephemeral_t *prior; 1346 nfs4_ephemeral_t *child; 1347 1348 nfs4_ephemeral_t *peer; 1349 1350 nfs4_trigger_globals_t *ntg; 1351 zone_t *zone = curproc->p_zone; 1352 1353 mi_parent = VTOMI4(mvp); 1354 1355 /* 1356 * Get this before grabbing anything else! 1357 */ 1358 ntg = zone_getspecific(nfs4_ephemeral_key, zone); 1359 if (!ntg->ntg_thread_started) { 1360 nfs4_ephemeral_start_harvester(ntg); 1361 } 1362 1363 mutex_enter(&mi_parent->mi_lock); 1364 mutex_enter(&mi->mi_lock); 1365 1366 /* 1367 * We need to tack together the ephemeral mount 1368 * with this new mntinfo. 1369 */ 1370 eph = kmem_zalloc(sizeof (*eph), KM_SLEEP); 1371 eph->ne_mount = mi; 1372 eph->ne_ref_time = gethrestime_sec(); 1373 1374 /* 1375 * We need to tell the ephemeral mount when 1376 * to time out. 1377 */ 1378 eph->ne_mount_to = ntg->ntg_mount_to; 1379 1380 mi->mi_flags |= MI4_EPHEMERAL; 1381 mi->mi_ephemeral = eph; 1382 1383 net = mi->mi_ephemeral_tree = 1384 mi_parent->mi_ephemeral_tree; 1385 ASSERT(net != NULL); 1386 1387 /* 1388 * If the enclosing mntinfo4 is also ephemeral, 1389 * then we need to point to its enclosing parent. 1390 * Else the enclosing mntinfo4 is the enclosing parent. 1391 * 1392 * We also need to weave this ephemeral node 1393 * into the tree. 1394 */ 1395 if (mi_parent->mi_flags & MI4_EPHEMERAL) { 1396 /* 1397 * We need to decide if we are 1398 * the root node of this branch 1399 * or if we are a sibling of this 1400 * branch. 1401 */ 1402 prior = mi_parent->mi_ephemeral; 1403 ASSERT(prior != NULL); 1404 if (prior->ne_child == NULL) { 1405 prior->ne_child = eph; 1406 } else { 1407 child = prior->ne_child; 1408 1409 prior->ne_child = eph; 1410 eph->ne_peer = child; 1411 1412 child->ne_prior = eph; 1413 } 1414 1415 eph->ne_prior = prior; 1416 } else { 1417 /* 1418 * The parent mntinfo4 is the non-ephemeral 1419 * root of the ephemeral tree. We 1420 * need to decide if we are the root 1421 * node of that tree or if we are a 1422 * sibling of the root node. 1423 * 1424 * We are the root if there is no 1425 * other node. 1426 */ 1427 if (net->net_root == NULL) { 1428 net->net_root = eph; 1429 } else { 1430 eph->ne_peer = peer = net->net_root; 1431 ASSERT(peer != NULL); 1432 net->net_root = eph; 1433 1434 peer->ne_prior = eph; 1435 } 1436 1437 eph->ne_prior = NULL; 1438 } 1439 1440 mutex_exit(&mi->mi_lock); 1441 mutex_exit(&mi_parent->mi_lock); 1442 } 1443 1444 /* 1445 * Commit the changes to the ephemeral tree for removing this node. 1446 */ 1447 static void 1448 nfs4_ephemeral_umount_cleanup(nfs4_ephemeral_t *eph) 1449 { 1450 nfs4_ephemeral_t *e = eph; 1451 nfs4_ephemeral_t *peer; 1452 nfs4_ephemeral_t *prior; 1453 1454 peer = eph->ne_peer; 1455 prior = e->ne_prior; 1456 1457 /* 1458 * If this branch root was not the 1459 * tree root, then we need to fix back pointers. 1460 */ 1461 if (prior) { 1462 if (prior->ne_child == e) { 1463 prior->ne_child = peer; 1464 } else { 1465 prior->ne_peer = peer; 1466 } 1467 1468 if (peer) 1469 peer->ne_prior = prior; 1470 } else if (peer) { 1471 peer->ne_mount->mi_ephemeral_tree->net_root = peer; 1472 peer->ne_prior = NULL; 1473 } else { 1474 e->ne_mount->mi_ephemeral_tree->net_root = NULL; 1475 } 1476 } 1477 1478 /* 1479 * We want to avoid recursion at all costs. So we need to 1480 * unroll the tree. We do this by a depth first traversal to 1481 * leaf nodes. We blast away the leaf and work our way back 1482 * up and down the tree. 1483 */ 1484 static int 1485 nfs4_ephemeral_unmount_engine(nfs4_ephemeral_t *eph, 1486 int isTreeRoot, int flag, cred_t *cr) 1487 { 1488 nfs4_ephemeral_t *e = eph; 1489 nfs4_ephemeral_t *prior; 1490 mntinfo4_t *mi; 1491 vfs_t *vfsp; 1492 int error; 1493 1494 /* 1495 * We use the loop while unrolling the ephemeral tree. 1496 */ 1497 for (;;) { 1498 /* 1499 * First we walk down the child. 1500 */ 1501 if (e->ne_child) { 1502 prior = e; 1503 e = e->ne_child; 1504 continue; 1505 } 1506 1507 /* 1508 * If we are the root of the branch we are removing, 1509 * we end it here. But if the branch is the root of 1510 * the tree, we have to forge on. We do not consider 1511 * the peer list for the root because while it may 1512 * be okay to remove, it is both extra work and a 1513 * potential for a false-positive error to stall the 1514 * unmount attempt. 1515 */ 1516 if (e == eph && isTreeRoot == FALSE) 1517 return (0); 1518 1519 /* 1520 * Next we walk down the peer list. 1521 */ 1522 if (e->ne_peer) { 1523 prior = e; 1524 e = e->ne_peer; 1525 continue; 1526 } 1527 1528 /* 1529 * We can only remove the node passed in by the 1530 * caller if it is the root of the ephemeral tree. 1531 * Otherwise, the caller will remove it. 1532 */ 1533 if (e == eph && isTreeRoot == FALSE) 1534 return (0); 1535 1536 /* 1537 * Okay, we have a leaf node, time 1538 * to prune it! 1539 * 1540 * Note that prior can only be NULL if 1541 * and only if it is the root of the 1542 * ephemeral tree. 1543 */ 1544 prior = e->ne_prior; 1545 1546 mi = e->ne_mount; 1547 mutex_enter(&mi->mi_lock); 1548 vfsp = mi->mi_vfsp; 1549 1550 /* 1551 * Cleared by umount2_engine. 1552 */ 1553 VFS_HOLD(vfsp); 1554 1555 /* 1556 * Inform nfs4_unmount to not recursively 1557 * descend into this node's children when it 1558 * gets processed. 1559 */ 1560 mi->mi_flags |= MI4_EPHEMERAL_RECURSED; 1561 mutex_exit(&mi->mi_lock); 1562 1563 error = umount2_engine(vfsp, flag, cr, FALSE); 1564 if (error) { 1565 /* 1566 * We need to reenable nfs4_unmount's ability 1567 * to recursively descend on this node. 1568 */ 1569 mutex_enter(&mi->mi_lock); 1570 mi->mi_flags &= ~MI4_EPHEMERAL_RECURSED; 1571 mutex_exit(&mi->mi_lock); 1572 1573 return (error); 1574 } 1575 1576 /* 1577 * If we are the current node, we do not want to 1578 * touch anything else. At this point, the only 1579 * way the current node can have survived to here 1580 * is if it is the root of the ephemeral tree and 1581 * we are unmounting the enclosing mntinfo4. 1582 */ 1583 if (e == eph) { 1584 ASSERT(prior == NULL); 1585 return (0); 1586 } 1587 1588 /* 1589 * Stitch up the prior node. Note that since 1590 * we have handled the root of the tree, prior 1591 * must be non-NULL. 1592 */ 1593 ASSERT(prior != NULL); 1594 if (prior->ne_child == e) { 1595 prior->ne_child = NULL; 1596 } else { 1597 ASSERT(prior->ne_peer == e); 1598 1599 prior->ne_peer = NULL; 1600 } 1601 1602 e = prior; 1603 } 1604 1605 /* NOTREACHED */ 1606 } 1607 1608 /* 1609 * Common code to safely release net_cnt_lock and net_tree_lock 1610 */ 1611 void 1612 nfs4_ephemeral_umount_unlock(bool_t *pmust_unlock, 1613 nfs4_ephemeral_tree_t **pnet) 1614 { 1615 nfs4_ephemeral_tree_t *net = *pnet; 1616 1617 if (*pmust_unlock) { 1618 mutex_enter(&net->net_cnt_lock); 1619 net->net_refcnt--; 1620 net->net_status &= ~NFS4_EPHEMERAL_TREE_UMOUNTING; 1621 mutex_exit(&net->net_cnt_lock); 1622 1623 mutex_exit(&net->net_tree_lock); 1624 1625 *pmust_unlock = FALSE; 1626 } 1627 } 1628 1629 /* 1630 * While we may have removed any child or sibling nodes of this 1631 * ephemeral node, we can not nuke it until we know that there 1632 * were no actived vnodes on it. This will do that final 1633 * work once we know it is not busy. 1634 */ 1635 void 1636 nfs4_ephemeral_umount_activate(mntinfo4_t *mi, bool_t *pmust_unlock, 1637 nfs4_ephemeral_tree_t **pnet) 1638 { 1639 /* 1640 * Now we need to get rid of the ephemeral data if it exists. 1641 */ 1642 mutex_enter(&mi->mi_lock); 1643 if (mi->mi_ephemeral) { 1644 /* 1645 * If we are the root node of an ephemeral branch 1646 * which is being removed, then we need to fixup 1647 * pointers into and out of the node. 1648 */ 1649 if (!(mi->mi_flags & MI4_EPHEMERAL_RECURSED)) 1650 nfs4_ephemeral_umount_cleanup(mi->mi_ephemeral); 1651 1652 ASSERT(mi->mi_ephemeral != NULL); 1653 1654 kmem_free(mi->mi_ephemeral, sizeof (*mi->mi_ephemeral)); 1655 mi->mi_ephemeral = NULL; 1656 } 1657 mutex_exit(&mi->mi_lock); 1658 1659 nfs4_ephemeral_umount_unlock(pmust_unlock, pnet); 1660 } 1661 1662 /* 1663 * Unmount an ephemeral node. 1664 */ 1665 int 1666 nfs4_ephemeral_umount(mntinfo4_t *mi, int flag, cred_t *cr, 1667 bool_t *pmust_unlock, nfs4_ephemeral_tree_t **pnet) 1668 { 1669 int error = 0; 1670 nfs4_ephemeral_t *eph; 1671 nfs4_ephemeral_tree_t *net; 1672 int is_derooting = FALSE; 1673 int is_recursed = FALSE; 1674 int was_locked = FALSE; 1675 1676 /* 1677 * The active vnodes on this file system may be ephemeral 1678 * children. We need to check for and try to unmount them 1679 * here. If any can not be unmounted, we are going 1680 * to return EBUSY. 1681 */ 1682 mutex_enter(&mi->mi_lock); 1683 1684 /* 1685 * If an ephemeral tree, we need to check to see if 1686 * the lock is already held. If it is, then we need 1687 * to see if we are being called as a result of 1688 * the recursive removal of some node of the tree or 1689 * if we are another attempt to remove the tree. 1690 * 1691 * mi_flags & MI4_EPHEMERAL indicates an ephemeral 1692 * node. mi_ephemeral being non-NULL also does this. 1693 * 1694 * mi_ephemeral_tree being non-NULL is sufficient 1695 * to also indicate either it is an ephemeral node 1696 * or the enclosing mntinfo4. 1697 * 1698 * Do we need MI4_EPHEMERAL? Yes, it is useful for 1699 * when we delete the ephemeral node and need to 1700 * differentiate from an ephemeral node and the 1701 * enclosing root node. 1702 */ 1703 *pnet = net = mi->mi_ephemeral_tree; 1704 eph = mi->mi_ephemeral; 1705 if (net) { 1706 is_recursed = mi->mi_flags & MI4_EPHEMERAL_RECURSED; 1707 is_derooting = (eph == NULL); 1708 mutex_exit(&mi->mi_lock); 1709 1710 /* 1711 * If this is not recursion, then we need to 1712 * grab a ref count. 1713 * 1714 * But wait, we also do not want to do that 1715 * if a harvester thread has already grabbed 1716 * the lock. 1717 */ 1718 if (!is_recursed) { 1719 mutex_enter(&net->net_cnt_lock); 1720 if (net->net_status & 1721 NFS4_EPHEMERAL_TREE_LOCKED) 1722 was_locked = TRUE; 1723 else 1724 net->net_refcnt++; 1725 mutex_exit(&net->net_cnt_lock); 1726 } 1727 1728 /* 1729 * If we grab the lock, it means that no other 1730 * operation is working on the tree. If we don't 1731 * grab it, we need to decide if this is because 1732 * we are a recursive call or a new operation. 1733 * 1734 * If we are a recursive call, we proceed without 1735 * the lock. 1736 * 1737 * Else we have to wait until the lock becomes free. 1738 */ 1739 if (was_locked == FALSE && 1740 !mutex_tryenter(&net->net_tree_lock)) { 1741 if (!is_recursed) { 1742 mutex_enter(&net->net_cnt_lock); 1743 if (net->net_status & 1744 (NFS4_EPHEMERAL_TREE_DEROOTING 1745 | NFS4_EPHEMERAL_TREE_INVALID)) { 1746 net->net_refcnt--; 1747 mutex_exit(&net->net_cnt_lock); 1748 goto is_busy; 1749 } 1750 mutex_exit(&net->net_cnt_lock); 1751 1752 /* 1753 * We can't hold any other locks whilst 1754 * we wait on this to free up. 1755 */ 1756 mutex_enter(&net->net_tree_lock); 1757 1758 /* 1759 * Note that while mi->mi_ephemeral 1760 * may change and thus we have to 1761 * update eph, it is the case that 1762 * we have tied down net and 1763 * do not care if mi->mi_ephemeral_tree 1764 * has changed. 1765 */ 1766 mutex_enter(&mi->mi_lock); 1767 eph = mi->mi_ephemeral; 1768 mutex_exit(&mi->mi_lock); 1769 1770 /* 1771 * Okay, we need to see if either the 1772 * tree got nuked or the current node 1773 * got nuked. Both of which will cause 1774 * an error. 1775 * 1776 * Note that a subsequent retry of the 1777 * umount shall work. 1778 */ 1779 mutex_enter(&net->net_cnt_lock); 1780 if (net->net_status & 1781 NFS4_EPHEMERAL_TREE_INVALID || 1782 (!is_derooting && eph == NULL)) { 1783 net->net_refcnt--; 1784 mutex_exit(&net->net_cnt_lock); 1785 mutex_exit(&net->net_tree_lock); 1786 goto is_busy; 1787 } 1788 mutex_exit(&net->net_cnt_lock); 1789 *pmust_unlock = TRUE; 1790 } 1791 } else if (was_locked == FALSE) { 1792 /* 1793 * If we grab it right away, everything must 1794 * be great! 1795 */ 1796 *pmust_unlock = TRUE; 1797 } 1798 1799 /* 1800 * Only once we have grabbed the lock can we mark what we 1801 * are planning on doing to the ephemeral tree. 1802 */ 1803 if (*pmust_unlock) { 1804 mutex_enter(&net->net_cnt_lock); 1805 net->net_status |= NFS4_EPHEMERAL_TREE_UMOUNTING; 1806 1807 /* 1808 * Check to see if we are nuking the root. 1809 */ 1810 if (is_derooting) 1811 net->net_status |= 1812 NFS4_EPHEMERAL_TREE_DEROOTING; 1813 mutex_exit(&net->net_cnt_lock); 1814 } 1815 1816 if (!is_derooting) { 1817 /* 1818 * Only work on children if the caller has not already 1819 * done so. 1820 */ 1821 if (!is_recursed) { 1822 ASSERT(eph != NULL); 1823 1824 error = nfs4_ephemeral_unmount_engine(eph, 1825 FALSE, flag, cr); 1826 if (error) 1827 goto is_busy; 1828 } 1829 } else { 1830 eph = net->net_root; 1831 1832 /* 1833 * Only work if there is something there. 1834 */ 1835 if (eph) { 1836 error = nfs4_ephemeral_unmount_engine(eph, TRUE, 1837 flag, cr); 1838 if (error) { 1839 mutex_enter(&net->net_cnt_lock); 1840 net->net_status &= 1841 ~NFS4_EPHEMERAL_TREE_DEROOTING; 1842 mutex_exit(&net->net_cnt_lock); 1843 goto is_busy; 1844 } 1845 1846 /* 1847 * Nothing else which goes wrong will 1848 * invalidate the blowing away of the 1849 * ephmeral tree. 1850 */ 1851 net->net_root = NULL; 1852 } 1853 1854 /* 1855 * We have derooted and we have caused the tree to be 1856 * invalid. 1857 */ 1858 mutex_enter(&net->net_cnt_lock); 1859 net->net_status &= ~NFS4_EPHEMERAL_TREE_DEROOTING; 1860 net->net_status |= NFS4_EPHEMERAL_TREE_INVALID; 1861 net->net_refcnt--; 1862 mutex_exit(&net->net_cnt_lock); 1863 1864 /* 1865 * At this point, the tree should no 1866 * longer be associated with the 1867 * mntinfo4. We need to pull it off 1868 * there and let the harvester take 1869 * care of it once the refcnt drops. 1870 */ 1871 mutex_enter(&mi->mi_lock); 1872 mi->mi_ephemeral_tree = NULL; 1873 mutex_exit(&mi->mi_lock); 1874 } 1875 } else { 1876 mutex_exit(&mi->mi_lock); 1877 } 1878 1879 return (0); 1880 1881 is_busy: 1882 1883 nfs4_ephemeral_umount_unlock(pmust_unlock, pnet); 1884 1885 return (error); 1886 } 1887 1888 /* 1889 * Do the umount and record any error in the parent. 1890 */ 1891 static void 1892 nfs4_ephemeral_record_umount(vfs_t *vfsp, int flag, 1893 nfs4_ephemeral_t *e, nfs4_ephemeral_t *prior) 1894 { 1895 int error; 1896 1897 error = umount2_engine(vfsp, flag, kcred, FALSE); 1898 if (error) { 1899 if (prior) { 1900 if (prior->ne_child == e) 1901 prior->ne_state |= 1902 NFS4_EPHEMERAL_CHILD_ERROR; 1903 else 1904 prior->ne_state |= 1905 NFS4_EPHEMERAL_PEER_ERROR; 1906 } 1907 } 1908 } 1909 1910 /* 1911 * For each tree in the forest (where the forest is in 1912 * effect all of the ephemeral trees for this zone), 1913 * scan to see if a node can be unmounted. Note that 1914 * unlike nfs4_ephemeral_unmount_engine(), we do 1915 * not process the current node before children or 1916 * siblings. I.e., if a node can be unmounted, we 1917 * do not recursively check to see if the nodes 1918 * hanging off of it can also be unmounted. 1919 * 1920 * Instead, we delve down deep to try and remove the 1921 * children first. Then, because we share code with 1922 * nfs4_ephemeral_unmount_engine(), we will try 1923 * them again. This could be a performance issue in 1924 * the future. 1925 * 1926 * Also note that unlike nfs4_ephemeral_unmount_engine(), 1927 * we do not halt on an error. We will not remove the 1928 * current node, but we will keep on trying to remove 1929 * the others. 1930 * 1931 * force indicates that we want the unmount to occur 1932 * even if there is something blocking it. 1933 * 1934 * time_check indicates that we want to see if the 1935 * mount has expired past mount_to or not. Typically 1936 * we want to do this and only on a shutdown of the 1937 * zone would we want to ignore the check. 1938 */ 1939 static void 1940 nfs4_ephemeral_harvest_forest(nfs4_trigger_globals_t *ntg, 1941 bool_t force, bool_t time_check) 1942 { 1943 nfs4_ephemeral_tree_t *net; 1944 nfs4_ephemeral_tree_t *prev = NULL; 1945 nfs4_ephemeral_tree_t *next; 1946 nfs4_ephemeral_t *e; 1947 nfs4_ephemeral_t *prior; 1948 time_t now = gethrestime_sec(); 1949 1950 nfs4_ephemeral_tree_t *harvest = NULL; 1951 1952 int flag; 1953 1954 mntinfo4_t *mi; 1955 vfs_t *vfsp; 1956 1957 if (force) 1958 flag = MS_FORCE; 1959 else 1960 flag = 0; 1961 1962 mutex_enter(&ntg->ntg_forest_lock); 1963 for (net = ntg->ntg_forest; net != NULL; net = next) { 1964 next = net->net_next; 1965 1966 mutex_enter(&net->net_cnt_lock); 1967 net->net_refcnt++; 1968 mutex_exit(&net->net_cnt_lock); 1969 1970 mutex_enter(&net->net_tree_lock); 1971 1972 /* 1973 * Let the unmount code know that the 1974 * tree is already locked! 1975 */ 1976 mutex_enter(&net->net_cnt_lock); 1977 net->net_status |= NFS4_EPHEMERAL_TREE_LOCKED; 1978 mutex_exit(&net->net_cnt_lock); 1979 1980 /* 1981 * If the intent is force all ephemeral nodes to 1982 * be unmounted in this zone, we can short circuit a 1983 * lot of tree traversal and simply zap the root node. 1984 */ 1985 if (force) { 1986 if (net->net_root) { 1987 mi = net->net_root->ne_mount; 1988 vfsp = mi->mi_vfsp; 1989 1990 /* 1991 * Cleared by umount2_engine. 1992 */ 1993 VFS_HOLD(vfsp); 1994 1995 (void) umount2_engine(vfsp, flag, 1996 kcred, FALSE); 1997 1998 goto check_done; 1999 } 2000 } 2001 2002 e = net->net_root; 2003 if (e) 2004 e->ne_state = NFS4_EPHEMERAL_VISIT_CHILD; 2005 2006 while (e) { 2007 if (e->ne_state == NFS4_EPHEMERAL_VISIT_CHILD) { 2008 e->ne_state = NFS4_EPHEMERAL_VISIT_SIBLING; 2009 if (e->ne_child) { 2010 e = e->ne_child; 2011 e->ne_state = 2012 NFS4_EPHEMERAL_VISIT_CHILD; 2013 } 2014 2015 continue; 2016 } else if (e->ne_state == 2017 NFS4_EPHEMERAL_VISIT_SIBLING) { 2018 e->ne_state = NFS4_EPHEMERAL_PROCESS_ME; 2019 if (e->ne_peer) { 2020 e = e->ne_peer; 2021 e->ne_state = 2022 NFS4_EPHEMERAL_VISIT_CHILD; 2023 } 2024 2025 continue; 2026 } else if (e->ne_state == 2027 NFS4_EPHEMERAL_CHILD_ERROR) { 2028 prior = e->ne_prior; 2029 2030 /* 2031 * If a child reported an error, do 2032 * not bother trying to unmount. 2033 * 2034 * If your prior node is a parent, 2035 * pass the error up such that they 2036 * also do not try to unmount. 2037 * 2038 * However, if your prior is a sibling, 2039 * let them try to unmount if they can. 2040 */ 2041 if (prior) { 2042 if (prior->ne_child == e) 2043 prior->ne_state |= 2044 NFS4_EPHEMERAL_CHILD_ERROR; 2045 else 2046 prior->ne_state |= 2047 NFS4_EPHEMERAL_PEER_ERROR; 2048 } 2049 2050 /* 2051 * Clear the error and if needed, process peers. 2052 * 2053 * Once we mask out the error, we know whether 2054 * or we have to process another node. 2055 */ 2056 e->ne_state &= ~NFS4_EPHEMERAL_CHILD_ERROR; 2057 if (e->ne_state == NFS4_EPHEMERAL_PROCESS_ME) 2058 e = prior; 2059 2060 continue; 2061 } else if (e->ne_state == 2062 NFS4_EPHEMERAL_PEER_ERROR) { 2063 prior = e->ne_prior; 2064 2065 if (prior) { 2066 if (prior->ne_child == e) 2067 prior->ne_state = 2068 NFS4_EPHEMERAL_CHILD_ERROR; 2069 else 2070 prior->ne_state = 2071 NFS4_EPHEMERAL_PEER_ERROR; 2072 } 2073 2074 /* 2075 * Clear the error from this node and do the 2076 * correct processing. 2077 */ 2078 e->ne_state &= ~NFS4_EPHEMERAL_PEER_ERROR; 2079 continue; 2080 } 2081 2082 prior = e->ne_prior; 2083 e->ne_state = NFS4_EPHEMERAL_OK; 2084 2085 /* 2086 * It must be the case that we need to process 2087 * this node. 2088 */ 2089 if (!time_check || 2090 now - e->ne_ref_time > e->ne_mount_to) { 2091 mi = e->ne_mount; 2092 vfsp = mi->mi_vfsp; 2093 2094 /* 2095 * Cleared by umount2_engine. 2096 */ 2097 VFS_HOLD(vfsp); 2098 2099 /* 2100 * Note that we effectively work down to the 2101 * leaf nodes first, try to unmount them, 2102 * then work our way back up into the leaf 2103 * nodes. 2104 * 2105 * Also note that we deal with a lot of 2106 * complexity by sharing the work with 2107 * the manual unmount code. 2108 */ 2109 nfs4_ephemeral_record_umount(vfsp, flag, 2110 e, prior); 2111 } 2112 2113 e = prior; 2114 } 2115 2116 check_done: 2117 2118 /* 2119 * Are we done with this tree? 2120 */ 2121 mutex_enter(&net->net_cnt_lock); 2122 if (net->net_refcnt == 1 && 2123 net->net_status & NFS4_EPHEMERAL_TREE_INVALID) { 2124 net->net_refcnt--; 2125 net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED; 2126 mutex_exit(&net->net_cnt_lock); 2127 mutex_exit(&net->net_tree_lock); 2128 2129 if (prev) 2130 prev->net_next = net->net_next; 2131 else 2132 ntg->ntg_forest = net->net_next; 2133 2134 net->net_next = harvest; 2135 harvest = net; 2136 continue; 2137 } 2138 2139 net->net_refcnt--; 2140 net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED; 2141 mutex_exit(&net->net_cnt_lock); 2142 mutex_exit(&net->net_tree_lock); 2143 2144 prev = net; 2145 } 2146 mutex_exit(&ntg->ntg_forest_lock); 2147 2148 for (net = harvest; net != NULL; net = next) { 2149 next = net->net_next; 2150 2151 mutex_destroy(&net->net_tree_lock); 2152 mutex_destroy(&net->net_cnt_lock); 2153 kmem_free(net, sizeof (*net)); 2154 } 2155 } 2156 2157 /* 2158 * This is the thread which decides when the harvesting 2159 * can proceed and when to kill it off for this zone. 2160 */ 2161 static void 2162 nfs4_ephemeral_harvester(nfs4_trigger_globals_t *ntg) 2163 { 2164 clock_t timeleft; 2165 zone_t *zone = curproc->p_zone; 2166 2167 for (;;) { 2168 timeleft = zone_status_timedwait(zone, lbolt + 2169 nfs4_trigger_thread_timer * hz, ZONE_IS_SHUTTING_DOWN); 2170 2171 /* 2172 * zone is exiting... 2173 */ 2174 if (timeleft != -1) { 2175 ASSERT(zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN); 2176 zthread_exit(); 2177 /* NOTREACHED */ 2178 } 2179 2180 /* 2181 * Only bother scanning if there is potential 2182 * work to be done. 2183 */ 2184 if (ntg->ntg_forest == NULL) 2185 continue; 2186 2187 /* 2188 * Now scan the list and get rid of everything which 2189 * is old. 2190 */ 2191 nfs4_ephemeral_harvest_forest(ntg, FALSE, TRUE); 2192 } 2193 2194 /* NOTREACHED */ 2195 } 2196 2197 /* 2198 * The zone specific glue needed to start the unmount harvester. 2199 * 2200 * Note that we want to avoid holding the mutex as long as possible, 2201 * hence the multiple checks. 2202 * 2203 * The caller should avoid us getting down here in the first 2204 * place. 2205 */ 2206 static void 2207 nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *ntg) 2208 { 2209 /* 2210 * It got started before we got here... 2211 */ 2212 if (ntg->ntg_thread_started) 2213 return; 2214 2215 mutex_enter(&nfs4_ephemeral_thread_lock); 2216 2217 if (ntg->ntg_thread_started) { 2218 mutex_exit(&nfs4_ephemeral_thread_lock); 2219 return; 2220 } 2221 2222 /* 2223 * Start the unmounter harvester thread for this zone. 2224 */ 2225 (void) zthread_create(NULL, 0, nfs4_ephemeral_harvester, 2226 ntg, 0, minclsyspri); 2227 2228 ntg->ntg_thread_started = TRUE; 2229 mutex_exit(&nfs4_ephemeral_thread_lock); 2230 } 2231 2232 /*ARGSUSED*/ 2233 static void * 2234 nfs4_ephemeral_zsd_create(zoneid_t zoneid) 2235 { 2236 nfs4_trigger_globals_t *ntg; 2237 2238 ntg = kmem_zalloc(sizeof (*ntg), KM_SLEEP); 2239 ntg->ntg_thread_started = FALSE; 2240 2241 /* 2242 * This is the default.... 2243 */ 2244 ntg->ntg_mount_to = nfs4_trigger_thread_timer; 2245 2246 mutex_init(&ntg->ntg_forest_lock, NULL, 2247 MUTEX_DEFAULT, NULL); 2248 2249 return (ntg); 2250 } 2251 2252 /* 2253 * Try a nice gentle walk down the forest and convince 2254 * all of the trees to gracefully give it up. 2255 */ 2256 /*ARGSUSED*/ 2257 static void 2258 nfs4_ephemeral_zsd_shutdown(zoneid_t zoneid, void *arg) 2259 { 2260 nfs4_trigger_globals_t *ntg = arg; 2261 2262 if (!ntg) 2263 return; 2264 2265 nfs4_ephemeral_harvest_forest(ntg, FALSE, FALSE); 2266 } 2267 2268 /* 2269 * Race along the forest and rip all of the trees out by 2270 * their rootballs! 2271 */ 2272 /*ARGSUSED*/ 2273 static void 2274 nfs4_ephemeral_zsd_destroy(zoneid_t zoneid, void *arg) 2275 { 2276 nfs4_trigger_globals_t *ntg = arg; 2277 2278 if (!ntg) 2279 return; 2280 2281 nfs4_ephemeral_harvest_forest(ntg, TRUE, FALSE); 2282 2283 mutex_destroy(&ntg->ntg_forest_lock); 2284 kmem_free(ntg, sizeof (*ntg)); 2285 } 2286 2287 /* 2288 * This is the zone independent cleanup needed for 2289 * emphemeral mount processing. 2290 */ 2291 void 2292 nfs4_ephemeral_fini(void) 2293 { 2294 (void) zone_key_delete(nfs4_ephemeral_key); 2295 mutex_destroy(&nfs4_ephemeral_thread_lock); 2296 } 2297 2298 /* 2299 * This is the zone independent initialization needed for 2300 * emphemeral mount processing. 2301 */ 2302 void 2303 nfs4_ephemeral_init(void) 2304 { 2305 mutex_init(&nfs4_ephemeral_thread_lock, NULL, MUTEX_DEFAULT, 2306 NULL); 2307 2308 zone_key_create(&nfs4_ephemeral_key, nfs4_ephemeral_zsd_create, 2309 nfs4_ephemeral_zsd_shutdown, nfs4_ephemeral_zsd_destroy); 2310 } 2311 2312 /* 2313 * nfssys() calls this function to set the per-zone 2314 * value of mount_to to drive when an ephemeral mount is 2315 * timed out. Each mount will grab a copy of this value 2316 * when mounted. 2317 */ 2318 void 2319 nfs4_ephemeral_set_mount_to(uint_t mount_to) 2320 { 2321 nfs4_trigger_globals_t *ntg; 2322 zone_t *zone = curproc->p_zone; 2323 2324 ntg = zone_getspecific(nfs4_ephemeral_key, zone); 2325 2326 ntg->ntg_mount_to = mount_to; 2327 } 2328 2329 /* 2330 * Walk the list of v4 mount options; if they are currently set in vfsp, 2331 * append them to a new comma-separated mount option string, and return it. 2332 * 2333 * Caller should free by calling nfs4_trigger_destroy_mntopts(). 2334 */ 2335 static char * 2336 nfs4_trigger_create_mntopts(vfs_t *vfsp) 2337 { 2338 uint_t i; 2339 char *mntopts; 2340 struct vfssw *vswp; 2341 mntopts_t *optproto; 2342 2343 mntopts = kmem_zalloc(MAX_MNTOPT_STR, KM_SLEEP); 2344 2345 /* get the list of applicable mount options for v4; locks *vswp */ 2346 vswp = vfs_getvfssw(MNTTYPE_NFS4); 2347 optproto = &vswp->vsw_optproto; 2348 2349 for (i = 0; i < optproto->mo_count; i++) { 2350 struct mntopt *mop = &optproto->mo_list[i]; 2351 2352 if (mop->mo_flags & MO_EMPTY) 2353 continue; 2354 2355 if (nfs4_trigger_add_mntopt(mntopts, mop->mo_name, vfsp)) { 2356 kmem_free(mntopts, MAX_MNTOPT_STR); 2357 vfs_unrefvfssw(vswp); 2358 return (NULL); 2359 } 2360 } 2361 2362 vfs_unrefvfssw(vswp); 2363 2364 /* 2365 * MNTOPT_XATTR is not in the v4 mount opt proto list, 2366 * and it may only be passed via MS_OPTIONSTR, so we 2367 * must handle it here. 2368 * 2369 * Ideally, it would be in the list, but NFS does not specify its 2370 * own opt proto list, it uses instead the default one. Since 2371 * not all filesystems support extended attrs, it would not be 2372 * appropriate to add it there. 2373 */ 2374 if (nfs4_trigger_add_mntopt(mntopts, MNTOPT_XATTR, vfsp) || 2375 nfs4_trigger_add_mntopt(mntopts, MNTOPT_NOXATTR, vfsp)) { 2376 kmem_free(mntopts, MAX_MNTOPT_STR); 2377 return (NULL); 2378 } 2379 2380 return (mntopts); 2381 } 2382 2383 static void 2384 nfs4_trigger_destroy_mntopts(char *mntopts) 2385 { 2386 if (mntopts) 2387 kmem_free(mntopts, MAX_MNTOPT_STR); 2388 } 2389 2390 /* 2391 * Check a single mount option (optname). Add to mntopts if it is set in VFS. 2392 */ 2393 static int 2394 nfs4_trigger_add_mntopt(char *mntopts, char *optname, vfs_t *vfsp) 2395 { 2396 if (mntopts == NULL || optname == NULL || vfsp == NULL) 2397 return (EINVAL); 2398 2399 if (vfs_optionisset(vfsp, optname, NULL)) { 2400 size_t mntoptslen = strlen(mntopts); 2401 size_t optnamelen = strlen(optname); 2402 2403 /* +1 for ',', +1 for NUL */ 2404 if (mntoptslen + optnamelen + 2 > MAX_MNTOPT_STR) 2405 return (EOVERFLOW); 2406 2407 /* first or subsequent mount option? */ 2408 if (*mntopts != '\0') 2409 (void) strcat(mntopts, ","); 2410 2411 (void) strcat(mntopts, optname); 2412 } 2413 2414 return (0); 2415 } 2416 2417 static enum clnt_stat 2418 nfs4_trigger_ping_server(servinfo4_t *svp, int nointr) 2419 { 2420 int retries, error; 2421 uint_t max_msgsize; 2422 enum clnt_stat status; 2423 CLIENT *cl; 2424 struct timeval timeout; 2425 2426 /* as per recov_newserver() */ 2427 max_msgsize = 0; 2428 retries = 1; 2429 timeout.tv_sec = 2; 2430 timeout.tv_usec = 0; 2431 2432 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, NFS_PROGRAM, 2433 NFS_V4, max_msgsize, retries, CRED(), &cl); 2434 if (error) 2435 return (RPC_FAILED); 2436 2437 if (nointr) 2438 cl->cl_nosignal = TRUE; 2439 status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, xdr_void, NULL, 2440 timeout); 2441 if (nointr) 2442 cl->cl_nosignal = FALSE; 2443 2444 AUTH_DESTROY(cl->cl_auth); 2445 CLNT_DESTROY(cl); 2446 2447 return (status); 2448 } 2449