1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/param.h>
30 #include <sys/errno.h>
31 #include <sys/vfs.h>
32 #include <sys/vnode.h>
33 #include <sys/uio.h>
34 #include <sys/pathname.h>
35 #include <sys/kmem.h>
36 #include <sys/cred.h>
37 #include <sys/statvfs.h>
38 #include <sys/fs/lofs_info.h>
39 #include <sys/fs/lofs_node.h>
40 #include <sys/mount.h>
41 #include <sys/mntent.h>
42 #include <sys/mkdev.h>
43 #include <sys/sysmacros.h>
44 #include <sys/systm.h>
45 #include <sys/cmn_err.h>
46 #include <sys/policy.h>
47 #include "fs/fs_subr.h"
48 
49 /*
50  * This is the loadable module wrapper.
51  */
52 #include <sys/modctl.h>
53 
54 static mntopts_t lofs_mntopts;
55 
56 static int lofsinit(int, char *);
57 
58 static vfsdef_t vfw = {
59 	VFSDEF_VERSION,
60 	"lofs",
61 	lofsinit,
62 	VSW_HASPROTO,
63 	&lofs_mntopts
64 };
65 
66 /*
67  * Stuff needed to support "zonedevfs" mode.
68  */
69 static major_t lofs_major;
70 static minor_t lofs_minor;
71 static kmutex_t lofs_minor_lock;
72 
73 /*
74  * LOFS mount options table
75  */
76 static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL };
77 static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL };
78 static char *zonedevfs_cancel[] = { MNTOPT_LOFS_NOZONEDEVFS, NULL };
79 static char *nozonedevfs_cancel[] = { MNTOPT_LOFS_ZONEDEVFS, NULL };
80 static char *sub_cancel[] = { MNTOPT_LOFS_NOSUB, NULL };
81 static char *nosub_cancel[] = { MNTOPT_LOFS_SUB, NULL };
82 
83 static mntopt_t mntopts[] = {
84 /*
85  *	option name		cancel option	default arg	flags
86  *		private data
87  */
88 	{ MNTOPT_XATTR,		xattr_cancel,	NULL,		0,
89 		(void *)0 },
90 	{ MNTOPT_NOXATTR,	noxattr_cancel,	NULL,		0,
91 		(void *)0 },
92 	{ MNTOPT_LOFS_ZONEDEVFS,	zonedevfs_cancel,	NULL,	0,
93 		(void *)0 },
94 	{ MNTOPT_LOFS_NOZONEDEVFS,	nozonedevfs_cancel,	NULL,	0,
95 		(void *)0 },
96 	{ MNTOPT_LOFS_SUB,	sub_cancel,	NULL,		0,
97 		(void *)0 },
98 	{ MNTOPT_LOFS_NOSUB,	nosub_cancel,	NULL,		0,
99 		(void *)0 },
100 };
101 
102 static mntopts_t lofs_mntopts = {
103 	sizeof (mntopts) / sizeof (mntopt_t),
104 	mntopts
105 };
106 
107 /*
108  * Module linkage information for the kernel.
109  */
110 
111 static struct modlfs modlfs = {
112 	&mod_fsops, "filesystem for lofs", &vfw
113 };
114 
115 static struct modlinkage modlinkage = {
116 	MODREV_1, (void *)&modlfs, NULL
117 };
118 
119 /*
120  * This is the module initialization routine.
121  */
122 int
123 _init()
124 {
125 	int status;
126 
127 	lofs_subrinit();
128 	status = mod_install(&modlinkage);
129 	if (status != 0) {
130 		/*
131 		 * Cleanup previously initialized work.
132 		 */
133 		lofs_subrfini();
134 	}
135 
136 	return (status);
137 }
138 
/*
 * Module unload entry point.
 *
 * Unloading lofs is refused for now by always answering EBUSY, because
 * memory would be leaked if the module were actually unloaded.
 */
int
_fini()
{
	return (EBUSY);
}
148 
149 int
150 _info(struct modinfo *modinfop)
151 {
152 	return (mod_info(&modlinkage, modinfop));
153 }
154 
155 
156 static int lofsfstype;
157 vfsops_t *lo_vfsops;
158 
159 /*
160  * lo mount vfsop
161  * Set up mount info record and attach it to vfs struct.
162  */
163 /*ARGSUSED*/
164 static int
165 lo_mount(struct vfs *vfsp,
166 	struct vnode *vp,
167 	struct mounta *uap,
168 	struct cred *cr)
169 {
170 	int error;
171 	struct vnode *srootvp = NULL;	/* the server's root */
172 	struct vnode *realrootvp;
173 	struct loinfo *li;
174 	int is_zonedevfs = 0;
175 	int nodev;
176 
177 	nodev = vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL);
178 
179 	if ((error = secpolicy_fs_mount(cr, vp, vfsp)) != 0)
180 		return (EPERM);
181 
182 	/*
183 	 * Loopback devices which get "nodevices" added can be done without
184 	 * "nodevices" set because we cannot import devices into a zone
185 	 * with loopback.  Note that we have all zone privileges when
186 	 * this happens; if not, we'd have gotten "nosuid".
187 	 */
188 	if (!nodev && vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
189 		vfs_setmntopt(vfsp, MNTOPT_DEVICES, NULL, VFS_NODISPLAY);
190 
191 	/*
192 	 * We must ensure that only the global zone applies the 'zonedevfs'
193 	 * option; we don't want non-global zones to be able to establish
194 	 * lofs mounts using the special dev_t we use to ensure that the
195 	 * contents of a zone's /dev cannot be victim to link(2) or rename(2).
196 	 * See below, where we set all of this up.
197 	 *
198 	 * Since this is more like a privilege check, we use crgetzoneid(cr)
199 	 * instead of getzoneid().
200 	 */
201 	is_zonedevfs = vfs_optionisset(vfsp, MNTOPT_LOFS_ZONEDEVFS, NULL);
202 	if (crgetzoneid(cr) != GLOBAL_ZONEID && is_zonedevfs)
203 		return (EPERM);
204 
205 	mutex_enter(&vp->v_lock);
206 	if (!(uap->flags & MS_OVERLAY) &&
207 		(vp->v_count != 1 || (vp->v_flag & VROOT))) {
208 		mutex_exit(&vp->v_lock);
209 		return (EBUSY);
210 	}
211 	mutex_exit(&vp->v_lock);
212 
213 	/*
214 	 * Find real root, and make vfs point to real vfs
215 	 */
216 	if (error = lookupname(uap->spec, (uap->flags & MS_SYSSPACE) ?
217 		UIO_SYSSPACE : UIO_USERSPACE, FOLLOW, NULLVPP,
218 	    &realrootvp))
219 		return (error);
220 
221 	/*
222 	 * realrootvp may be an AUTOFS node, in which case we
223 	 * perform a VOP_ACCESS() to trigger the mount of the
224 	 * intended filesystem, so we loopback mount the intended
225 	 * filesystem instead of the AUTOFS filesystem.
226 	 */
227 	(void) VOP_ACCESS(realrootvp, 0, 0, cr);
228 
229 	/*
230 	 * We're interested in the top most filesystem.
231 	 * This is specially important when uap->spec is a trigger
232 	 * AUTOFS node, since we're really interested in mounting the
233 	 * filesystem AUTOFS mounted as result of the VOP_ACCESS()
234 	 * call not the AUTOFS node itself.
235 	 */
236 	if (vn_mountedvfs(realrootvp) != NULL) {
237 		if (error = traverse(&realrootvp)) {
238 			VN_RELE(realrootvp);
239 			return (error);
240 		}
241 	}
242 
243 	/*
244 	 * Allocate a vfs info struct and attach it
245 	 */
246 	li = kmem_zalloc(sizeof (struct loinfo), KM_SLEEP);
247 	li->li_realvfs = realrootvp->v_vfsp;
248 	li->li_mountvfs = vfsp;
249 
250 	/*
251 	 * Set mount flags to be inherited by loopback vfs's
252 	 */
253 	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
254 		li->li_mflag |= VFS_RDONLY;
255 	}
256 	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
257 		li->li_mflag |= (VFS_NOSETUID|VFS_NODEVICES);
258 	}
259 	if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
260 		li->li_mflag |= VFS_NODEVICES;
261 	}
262 	if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
263 		li->li_mflag |= VFS_NOSETUID;
264 	}
265 	/*
266 	 * Permissive flags are added to the "deny" bitmap.
267 	 */
268 	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
269 		li->li_dflag |= VFS_XATTR;
270 	}
271 	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
272 		li->li_dflag |= VFS_NBMAND;
273 	}
274 
275 	/*
276 	 * Propagate inheritable mount flags from the real vfs.
277 	 */
278 	if ((li->li_realvfs->vfs_flag & VFS_RDONLY) &&
279 	    !vfs_optionisset(vfsp, MNTOPT_RO, NULL))
280 		vfs_setmntopt(vfsp, MNTOPT_RO, NULL,
281 		    VFS_NODISPLAY);
282 	if ((li->li_realvfs->vfs_flag & VFS_NOSETUID) &&
283 	    !vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
284 		vfs_setmntopt(vfsp, MNTOPT_NOSETUID, NULL,
285 		    VFS_NODISPLAY);
286 	if ((li->li_realvfs->vfs_flag & VFS_NODEVICES) &&
287 	    !vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
288 		vfs_setmntopt(vfsp, MNTOPT_NODEVICES, NULL,
289 		    VFS_NODISPLAY);
290 	/*
291 	 * Permissive flags such as VFS_XATTR, as opposed to restrictive flags
292 	 * such as VFS_RDONLY, are handled differently.  An explicit
293 	 * MNTOPT_NOXATTR should override the underlying filesystem's VFS_XATTR.
294 	 */
295 	if ((li->li_realvfs->vfs_flag & VFS_XATTR) &&
296 	    !vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL) &&
297 	    !vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
298 		vfs_setmntopt(vfsp, MNTOPT_XATTR, NULL,
299 		    VFS_NODISPLAY);
300 	if ((li->li_realvfs->vfs_flag & VFS_NBMAND) &&
301 	    !vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL) &&
302 	    !vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL))
303 		vfs_setmntopt(vfsp, MNTOPT_NBMAND, NULL,
304 		    VFS_NODISPLAY);
305 
306 	li->li_refct = 0;
307 	vfsp->vfs_data = (caddr_t)li;
308 	vfsp->vfs_bcount = 0;
309 	vfsp->vfs_fstype = lofsfstype;
310 	vfsp->vfs_bsize = li->li_realvfs->vfs_bsize;
311 
312 	/*
313 	 * Test to see if we need to be in "zone /dev" mode.  In zonedevfs
314 	 * mode, we pull a nasty trick; we make sure that the lofs dev_t does
315 	 * *not* reflect the underlying device, so that no renames or links
316 	 * can occur to or from the /dev hierarchy.
317 	 */
318 	if (is_zonedevfs) {
319 		dev_t dev;
320 
321 		mutex_enter(&lofs_minor_lock);
322 		do {
323 			lofs_minor = (lofs_minor + 1) & MAXMIN32;
324 			dev = makedevice(lofs_major, lofs_minor);
325 		} while (vfs_devismounted(dev));
326 		mutex_exit(&lofs_minor_lock);
327 
328 		vfsp->vfs_dev = dev;
329 		vfs_make_fsid(&vfsp->vfs_fsid, dev, lofsfstype);
330 
331 		li->li_flag |= LO_ZONEDEVFS;
332 	} else {
333 		vfsp->vfs_dev = li->li_realvfs->vfs_dev;
334 		vfsp->vfs_fsid.val[0] = li->li_realvfs->vfs_fsid.val[0];
335 		vfsp->vfs_fsid.val[1] = li->li_realvfs->vfs_fsid.val[1];
336 	}
337 
338 	if (vfs_optionisset(vfsp, MNTOPT_LOFS_NOSUB, NULL)) {
339 		li->li_flag |= LO_NOSUB;
340 	}
341 
342 	/*
343 	 * Setup the hashtable. If the root of this mount isn't a directory,
344 	 * there's no point in allocating a large hashtable. A table with one
345 	 * bucket is sufficient.
346 	 */
347 	if (realrootvp->v_type != VDIR)
348 		lsetup(li, 1);
349 	else
350 		lsetup(li, 0);
351 
352 	/*
353 	 * Make the root vnode
354 	 */
355 	srootvp = makelonode(realrootvp, li);
356 	srootvp->v_flag |= VROOT;
357 	li->li_rootvp = srootvp;
358 
359 #ifdef LODEBUG
360 	lo_dprint(4, "lo_mount: vfs %p realvfs %p root %p realroot %p li %p\n",
361 	    vfsp, li->li_realvfs, srootvp, realrootvp, li);
362 #endif
363 	return (0);
364 }
365 
366 /*
367  * Undo loopback mount
368  */
369 static int
370 lo_unmount(struct vfs *vfsp, int flag, struct cred *cr)
371 {
372 	struct loinfo *li;
373 
374 	if (secpolicy_fs_unmount(cr, vfsp) != 0)
375 		return (EPERM);
376 
377 	/*
378 	 * Forced unmount is not supported by this file system
379 	 * and thus, ENOTSUP, is being returned.
380 	 */
381 	if (flag & MS_FORCE)
382 		return (ENOTSUP);
383 
384 	li = vtoli(vfsp);
385 #ifdef LODEBUG
386 	lo_dprint(4, "lo_unmount(%p) li %p\n", vfsp, li);
387 #endif
388 	if (li->li_refct != 1 || li->li_rootvp->v_count != 1) {
389 #ifdef LODEBUG
390 		lo_dprint(4, "refct %d v_ct %d\n", li->li_refct,
391 		    li->li_rootvp->v_count);
392 #endif
393 		return (EBUSY);
394 	}
395 	VN_RELE(li->li_rootvp);
396 	return (0);
397 }
398 
399 /*
400  * Find root of lofs mount.
401  */
402 static int
403 lo_root(struct vfs *vfsp, struct vnode **vpp)
404 {
405 	*vpp = vtoli(vfsp)->li_rootvp;
406 #ifdef LODEBUG
407 	lo_dprint(4, "lo_root(0x%p) = %p\n", vfsp, *vpp);
408 #endif
409 	/*
410 	 * If the root of the filesystem is a special file, return the specvp
411 	 * version of the vnode. We don't save the specvp vnode in our
412 	 * hashtable since that's exclusively for lnodes.
413 	 */
414 	if (IS_DEVVP(*vpp)) {
415 		struct vnode *svp;
416 
417 		svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, kcred);
418 		if (svp == NULL)
419 			return (ENOSYS);
420 		*vpp = svp;
421 	} else {
422 		VN_HOLD(*vpp);
423 	}
424 
425 	return (0);
426 }
427 
428 /*
429  * Get file system statistics.
430  */
431 static int
432 lo_statvfs(register struct vfs *vfsp, struct statvfs64 *sbp)
433 {
434 	vnode_t *realrootvp;
435 
436 #ifdef LODEBUG
437 	lo_dprint(4, "lostatvfs %p\n", vfsp);
438 #endif
439 	/*
440 	 * Using realrootvp->v_vfsp (instead of the realvfsp that was
441 	 * cached) is necessary to make lofs work woth forced UFS unmounts.
442 	 * In the case of a forced unmount, UFS stores a set of dummy vfsops
443 	 * in all the (i)vnodes in the filesystem. The dummy ops simply
444 	 * returns back EIO.
445 	 */
446 	(void) lo_realvfs(vfsp, &realrootvp);
447 	if (realrootvp != NULL)
448 		return (VFS_STATVFS(realrootvp->v_vfsp, sbp));
449 	else
450 		return (EIO);
451 }
452 
/*
 * Sync vfsop.
 *
 * LOFS has no data or metadata of its own to flush; I/O pending on the
 * underlying filesystem is flushed when that filesystem is synched.
 * Always returns 0.
 */
/* ARGSUSED */
static int
lo_sync(struct vfs *vfsp, short flag, struct cred *cr)
{
#ifdef LODEBUG
	lo_dprint(4, "lo_sync: %p\n", vfsp);
#endif
	return (0);
}
468 
469 /*
470  * Obtain the vnode from the underlying filesystem.
471  */
472 static int
473 lo_vget(struct vfs *vfsp, struct vnode **vpp, struct fid *fidp)
474 {
475 	vnode_t *realrootvp;
476 
477 #ifdef LODEBUG
478 	lo_dprint(4, "lo_vget: %p\n", vfsp);
479 #endif
480 	(void) lo_realvfs(vfsp, &realrootvp);
481 	if (realrootvp != NULL)
482 		return (VFS_VGET(realrootvp->v_vfsp, vpp, fidp));
483 	else
484 		return (EIO);
485 }
486 
487 /*
488  * Free mount-specific data.
489  */
490 static void
491 lo_freevfs(struct vfs *vfsp)
492 {
493 	struct loinfo *li = vtoli(vfsp);
494 
495 	ldestroy(li);
496 	kmem_free(li, sizeof (struct loinfo));
497 }
498 
499 static int
500 lofsinit(int fstyp, char *name)
501 {
502 	static const fs_operation_def_t lo_vfsops_template[] = {
503 		VFSNAME_MOUNT, lo_mount,
504 		VFSNAME_UNMOUNT, lo_unmount,
505 		VFSNAME_ROOT, lo_root,
506 		VFSNAME_STATVFS, lo_statvfs,
507 		VFSNAME_SYNC, (fs_generic_func_p) lo_sync,
508 		VFSNAME_VGET, lo_vget,
509 		VFSNAME_FREEVFS, (fs_generic_func_p) lo_freevfs,
510 		NULL, NULL
511 	};
512 	int error;
513 
514 	error = vfs_setfsops(fstyp, lo_vfsops_template, &lo_vfsops);
515 	if (error != 0) {
516 		cmn_err(CE_WARN, "lofsinit: bad vfs ops template");
517 		return (error);
518 	}
519 
520 	error = vn_make_ops(name, lo_vnodeops_template, &lo_vnodeops);
521 	if (error != 0) {
522 		(void) vfs_freevfsops_by_type(fstyp);
523 		cmn_err(CE_WARN, "lofsinit: bad vnode ops template");
524 		return (error);
525 	}
526 
527 	lofsfstype = fstyp;
528 
529 	if ((lofs_major = getudev()) == (major_t)-1) {
530 		(void) vfs_freevfsops_by_type(fstyp);
531 		cmn_err(CE_WARN, "lofsinit: Can't get unique device number.");
532 		return (ENXIO);
533 	}
534 
535 	lofs_minor = 0;
536 	mutex_init(&lofs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
537 
538 	return (0);
539 }
540