1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/param.h>
28 #include <sys/sysmacros.h>
29 #include <sys/kmem.h>
30 #include <sys/time.h>
31 #include <sys/pathname.h>
32 #include <sys/vfs.h>
33 #include <sys/vfs_opreg.h>
34 #include <sys/vnode.h>
35 #include <sys/stat.h>
36 #include <sys/uio.h>
37 #include <sys/stat.h>
38 #include <sys/errno.h>
39 #include <sys/cmn_err.h>
40 #include <sys/cred.h>
41 #include <sys/statvfs.h>
42 #include <sys/mount.h>
43 #include <sys/debug.h>
44 #include <sys/systm.h>
45 #include <sys/mntent.h>
46 #include <fs/fs_subr.h>
47 #include <vm/page.h>
48 #include <vm/anon.h>
49 #include <sys/model.h>
50 #include <sys/policy.h>
51 
52 #include <sys/fs/swapnode.h>
53 #include <sys/fs/tmp.h>
54 #include <sys/fs/tmpnode.h>
55 
/*
 * File system type index for tmpfs.  Recorded by tmpfsinit() from its
 * fstype argument and stamped into each vfs by tmp_mount().
 */
static int tmpfsfstype;

/*
 * tmpfs vfs operations.
 *
 * tmpfsinit() is the init routine named in the vfsdef; the tmp_* routines
 * are installed as the vfs operations by tmpfsinit().
 */
static int tmpfsinit(int, char *);
static int tmp_mount(struct vfs *, struct vnode *,
	struct mounta *, struct cred *);
static int tmp_unmount(struct vfs *, int, struct cred *);
static int tmp_root(struct vfs *, struct vnode **);
static int tmp_statvfs(struct vfs *, struct statvfs64 *);
static int tmp_vget(struct vfs *, struct vnode **, struct fid *);
69 /*
70  * Loadable module wrapper
71  */
72 #include <sys/modctl.h>
73 
/* Tentative definition so vfw can reference the table before it is filled in. */
static mntopts_t tmpfs_proto_opttbl;

/*
 * File system definition referenced from the modlfs linkage: version,
 * file system name, init routine, VSW_* switch flags and the prototype
 * mount option table.
 */
static vfsdef_t vfw = {
	VFSDEF_VERSION,
	"tmpfs",
	tmpfsinit,
	VSW_HASPROTO|VSW_STATS,
	&tmpfs_proto_opttbl
};
83 
84 /*
85  * in-kernel mnttab options
86  */
87 static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL };
88 static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL };
89 
90 static mntopt_t tmpfs_options[] = {
91 	/* Option name		Cancel Opt	Arg	Flags		Data */
92 	{ MNTOPT_XATTR,		xattr_cancel,	NULL,	MO_DEFAULT,	NULL},
93 	{ MNTOPT_NOXATTR,	noxattr_cancel,	NULL,	NULL,		NULL},
94 	{ "size",		NULL,		"0",	MO_HASVALUE,	NULL}
95 };
96 
97 
98 static mntopts_t tmpfs_proto_opttbl = {
99 	sizeof (tmpfs_options) / sizeof (mntopt_t),
100 	tmpfs_options
101 };
102 
103 /*
104  * Module linkage information
105  */
/* File system module description: ops vector, description string, vfsdef. */
static struct modlfs modlfs = {
	&mod_fsops, "filesystem for tmpfs", &vfw
};

/* Linkage passed to mod_install(), mod_remove() and mod_info(). */
static struct modlinkage modlinkage = {
	MODREV_1, &modlfs, NULL
};
113 
114 int
115 _init()
116 {
117 	return (mod_install(&modlinkage));
118 }
119 
120 int
121 _fini()
122 {
123 	int error;
124 
125 	error = mod_remove(&modlinkage);
126 	if (error)
127 		return (error);
128 	/*
129 	 * Tear down the operations vectors
130 	 */
131 	(void) vfs_freevfsops_by_type(tmpfsfstype);
132 	vn_freevnodeops(tmp_vnodeops);
133 	return (0);
134 }
135 
136 int
137 _info(struct modinfo *modinfop)
138 {
139 	return (mod_info(&modlinkage, modinfop));
140 }
141 
142 /*
143  * The following are patchable variables limiting the amount of system
144  * resources tmpfs can use.
145  *
146  * tmpfs_maxkmem limits the amount of kernel kmem_alloc memory
 * tmpfs can use for its data structures (e.g. tmpnodes, directory entries).
148  * It is not determined by setting a hard limit but rather as a percentage of
149  * physical memory which is determined when tmpfs is first used in the system.
150  *
151  * tmpfs_minfree is the minimum amount of swap space that tmpfs leaves for
152  * the rest of the system.  In other words, if the amount of free swap space
153  * in the system (i.e. anoninfo.ani_free) drops below tmpfs_minfree, tmpfs
154  * anon allocations will fail.
155  *
156  * There is also a per mount limit on the amount of swap space
157  * (tmount.tm_anonmax) settable via a mount option.
158  */
size_t tmpfs_maxkmem = 0;	/* 0 = not patched; tmpfsinit() sets it */
size_t tmpfs_minfree = 0;	/* 0 = not patched; tmpfsinit() sets it */
size_t tmp_kmemspace;		/* bytes of kernel heap used by all tmpfs */

/*
 * Device numbers for tmpfs mounts.  tmpfsinit() obtains the major number
 * from getudev() (falling back to 0 on failure); tmp_mount() advances
 * tmpfs_minor under tmpfs_minor_lock to pick an unused device.
 */
static major_t tmpfs_major;
static minor_t tmpfs_minor;
static kmutex_t	tmpfs_minor_lock;
166 
167 /*
168  * initialize global tmpfs locks and such
169  * called when loading tmpfs module
170  */
171 static int
172 tmpfsinit(int fstype, char *name)
173 {
174 	static const fs_operation_def_t tmp_vfsops_template[] = {
175 		VFSNAME_MOUNT,		{ .vfs_mount = tmp_mount },
176 		VFSNAME_UNMOUNT,	{ .vfs_unmount = tmp_unmount },
177 		VFSNAME_ROOT,		{ .vfs_root = tmp_root },
178 		VFSNAME_STATVFS,	{ .vfs_statvfs = tmp_statvfs },
179 		VFSNAME_VGET,		{ .vfs_vget = tmp_vget },
180 		NULL,			NULL
181 	};
182 	int error;
183 	extern  void    tmpfs_hash_init();
184 
185 	tmpfs_hash_init();
186 	tmpfsfstype = fstype;
187 	ASSERT(tmpfsfstype != 0);
188 
189 	error = vfs_setfsops(fstype, tmp_vfsops_template, NULL);
190 	if (error != 0) {
191 		cmn_err(CE_WARN, "tmpfsinit: bad vfs ops template");
192 		return (error);
193 	}
194 
195 	error = vn_make_ops(name, tmp_vnodeops_template, &tmp_vnodeops);
196 	if (error != 0) {
197 		(void) vfs_freevfsops_by_type(fstype);
198 		cmn_err(CE_WARN, "tmpfsinit: bad vnode ops template");
199 		return (error);
200 	}
201 
202 	/*
203 	 * tmpfs_minfree doesn't need to be some function of configured
204 	 * swap space since it really is an absolute limit of swap space
205 	 * which still allows other processes to execute.
206 	 */
207 	if (tmpfs_minfree == 0) {
208 		/*
209 		 * Set if not patched
210 		 */
211 		tmpfs_minfree = btopr(TMPMINFREE);
212 	}
213 
214 	/*
215 	 * The maximum amount of space tmpfs can allocate is
216 	 * TMPMAXPROCKMEM percent of kernel memory
217 	 */
218 	if (tmpfs_maxkmem == 0)
219 		tmpfs_maxkmem = MAX(PAGESIZE, kmem_maxavail() / TMPMAXFRACKMEM);
220 
221 	if ((tmpfs_major = getudev()) == (major_t)-1) {
222 		cmn_err(CE_WARN, "tmpfsinit: Can't get unique device number.");
223 		tmpfs_major = 0;
224 	}
225 	mutex_init(&tmpfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
226 	return (0);
227 }
228 
/*
 * Mount a tmpfs file system on the directory vnode mvp.
 *
 * vfsp: vfs being mounted; filled in with tmpfs private data
 * mvp:  mount point vnode; must be a directory
 * uap:  mount arguments (flags and mount point path are used here)
 * cr:   credentials of the caller
 *
 * Returns 0 on success.  Fails with the secpolicy_fs_mount() error,
 * ENOTDIR if mvp is not a directory, EBUSY if the mount point is in use
 * and MS_OVERLAY was not given, EINVAL for a read-only mount, the
 * tmp_convnum() error for a bad "size" value, the pn_get() error, or
 * ENOMEM if the tmount cannot be allocated.
 */
static int
tmp_mount(
	struct vfs *vfsp,
	struct vnode *mvp,
	struct mounta *uap,
	struct cred *cr)
{
	struct tmount *tm = NULL;
	struct tmpnode *tp;
	struct pathname dpn;
	int error;
	pgcnt_t anonmax;
	struct vattr rattr;
	int got_attrs;		/* VOP_GETATTR() result; 0 means success */

	char *sizestr;

	if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0)
		return (error);

	if (mvp->v_type != VDIR)
		return (ENOTDIR);

	/*
	 * Without MS_OVERLAY, refuse a mount point that has other holds
	 * or is itself the root of a file system.
	 */
	mutex_enter(&mvp->v_lock);
	if ((uap->flags & MS_OVERLAY) == 0 &&
	    (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
		mutex_exit(&mvp->v_lock);
		return (EBUSY);
	}
	mutex_exit(&mvp->v_lock);

	/*
	 * Having the resource be anything but "swap" doesn't make sense.
	 */
	vfs_setresource(vfsp, "swap");

	/*
	 * now look for options we understand...
	 */

	/* tmpfs doesn't support read-only mounts */
	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
		error = EINVAL;
		goto out;
	}

	/*
	 * tm_anonmax is set according to the mount arguments
	 * if any.  Otherwise, it is set to a maximum value.
	 */
	if (vfs_optionisset(vfsp, "size", &sizestr)) {
		if ((error = tmp_convnum(sizestr, &anonmax)) != 0)
			goto out;
	} else {
		anonmax = ULONG_MAX;
	}

	/*
	 * Fetch the mount point path from kernel or user space, as
	 * indicated by MS_SYSSPACE.
	 */
	if (error = pn_get(uap->dir,
	    (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn))
		goto out;

	/* Nothing else is allocated yet, so only dpn needs undoing. */
	if ((tm = tmp_memalloc(sizeof (struct tmount), 0)) == NULL) {
		pn_free(&dpn);
		error = ENOMEM;
		goto out;
	}

	/*
	 * find an available minor device number for this mount
	 */
	mutex_enter(&tmpfs_minor_lock);
	do {
		tmpfs_minor = (tmpfs_minor + 1) & L_MAXMIN32;
		tm->tm_dev = makedevice(tmpfs_major, tmpfs_minor);
	} while (vfs_devismounted(tm->tm_dev));
	mutex_exit(&tmpfs_minor_lock);

	/*
	 * Set but don't bother entering the mutex
	 * (tmount not on mount list yet)
	 */
	mutex_init(&tm->tm_contents, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&tm->tm_renamelck, NULL, MUTEX_DEFAULT, NULL);

	tm->tm_vfsp = vfsp;
	tm->tm_anonmax = anonmax;

	vfsp->vfs_data = (caddr_t)tm;
	vfsp->vfs_fstype = tmpfsfstype;
	vfsp->vfs_dev = tm->tm_dev;
	vfsp->vfs_bsize = PAGESIZE;
	vfsp->vfs_flag |= VFS_NOTRUNC;
	vfs_make_fsid(&vfsp->vfs_fsid, tm->tm_dev, tmpfsfstype);
	/*
	 * Keep a private copy of the mount point path.  The TMP_MUSTHAVE
	 * allocations here and below are used unchecked -- presumably
	 * they cannot fail; confirm against tmp_memalloc().
	 */
	tm->tm_mntpath = tmp_memalloc(dpn.pn_pathlen + 1, TMP_MUSTHAVE);
	(void) strcpy(tm->tm_mntpath, dpn.pn_path);

	/*
	 * allocate and initialize root tmpnode structure
	 */
	bzero(&rattr, sizeof (struct vattr));
	rattr.va_mode = (mode_t)(S_IFDIR | 0777);	/* XXX modes */
	rattr.va_type = VDIR;
	rattr.va_rdev = 0;
	tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE);
	tmpnode_init(tm, tp, &rattr, cr);

	/*
	 * Get the mode, uid, and gid from the underlying mount point.
	 */
	rattr.va_mask = AT_MODE|AT_UID|AT_GID;	/* Hint to getattr */
	got_attrs = VOP_GETATTR(mvp, &rattr, 0, cr, NULL);

	rw_enter(&tp->tn_rwlock, RW_WRITER);
	TNTOV(tp)->v_flag |= VROOT;

	/*
	 * If the getattr succeeded, use its results.  Otherwise allow
	 * the previously set hardwired defaults to prevail.
	 */
	if (got_attrs == 0) {
		tp->tn_mode = rattr.va_mode;
		tp->tn_uid = rattr.va_uid;
		tp->tn_gid = rattr.va_gid;
	}

	/*
	 * initialize linked list of tmpnodes so that the back pointer of
	 * the root tmpnode always points to the last one on the list
	 * and the forward pointer of the last node is null
	 */
	tp->tn_back = tp;
	tp->tn_forw = NULL;
	tp->tn_nlink = 0;
	tm->tm_rootnode = tp;

	/* The root directory is its own parent. */
	tdirinit(tp, tp);

	rw_exit(&tp->tn_rwlock);

	pn_free(&dpn);
	error = 0;

out:
	/* Only a successful mount advertises the feature. */
	if (error == 0)
		vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS);

	return (error);
}
377 
/*
 * Unmount a tmpfs file system.
 *
 * vfsp: vfs being unmounted
 * flag: unmount flags; MS_FORCE is rejected
 * cr:   credentials of the caller
 *
 * Returns the secpolicy_fs_unmount() error if the caller is not
 * permitted, ENOTSUP for a forced unmount, and EBUSY if the root vnode
 * has more than one hold or any other node is held.  Otherwise all
 * directory entries are removed, every node is released, the mount path
 * and the tmount are freed, and 0 is returned.
 */
static int
tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr)
{
	struct tmount *tm = (struct tmount *)VFSTOTM(vfsp);
	struct tmpnode *tnp, *cancel;
	struct vnode	*vp;
	int error;

	if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0)
		return (error);

	/*
	 * forced unmount is not supported by this file system
	 * and thus, ENOTSUP, is being returned.
	 */
	if (flag & MS_FORCE)
		return (ENOTSUP);

	mutex_enter(&tm->tm_contents);

	/*
	 * If there are no open files, only the root node should have
	 * a reference count.
	 * With tm_contents held, nothing can be added or removed.
	 * There may be some dirty pages.  To prevent fsflush from
	 * disrupting the unmount, put a hold on each node while scanning.
	 * If we find a previously referenced node, undo the holds we have
	 * placed and fail EBUSY.
	 */
	tnp = tm->tm_rootnode;
	if (TNTOV(tnp)->v_count > 1) {
		mutex_exit(&tm->tm_contents);
		return (EBUSY);
	}

	for (tnp = tnp->tn_forw; tnp; tnp = tnp->tn_forw) {
		if ((vp = TNTOV(tnp))->v_count > 0) {
			/*
			 * Busy node: drop the holds taken on every node
			 * between the root and this one.
			 */
			cancel = tm->tm_rootnode->tn_forw;
			while (cancel != tnp) {
				vp = TNTOV(cancel);
				ASSERT(vp->v_count > 0);
				VN_RELE(vp);
				cancel = cancel->tn_forw;
			}
			mutex_exit(&tm->tm_contents);
			return (EBUSY);
		}
		VN_HOLD(vp);
	}

	/*
	 * We can drop the mutex now because no one can find this mount
	 */
	mutex_exit(&tm->tm_contents);

	/*
	 * Free all kmemalloc'd and anonalloc'd memory associated with
	 * this filesystem.  To do this, we go through the file list twice,
	 * once to remove all the directory entries, and then to remove
	 * all the files.  We do this because there is useful code in
	 * tmpnode_free which assumes that the directory entry has been
	 * removed before the file.
	 */
	/*
	 * Remove all directory entries
	 */
	for (tnp = tm->tm_rootnode; tnp; tnp = tnp->tn_forw) {
		rw_enter(&tnp->tn_rwlock, RW_WRITER);
		if (tnp->tn_type == VDIR)
			tdirtrunc(tnp);
		if (tnp->tn_vnode->v_flag & V_XATTRDIR) {
			/*
			 * Account for implicit attrdir reference.
			 */
			ASSERT(tnp->tn_nlink > 0);
			DECR_COUNT(&tnp->tn_nlink, &tnp->tn_tlock);
		}
		rw_exit(&tnp->tn_rwlock);
	}

	ASSERT(tm->tm_rootnode);

	/*
	 * All links are gone, v_count is keeping nodes in place.
	 * VN_RELE should make the node disappear, unless somebody
	 * is holding pages against it.  Nap and retry until it disappears.
	 *
	 * We re-acquire the lock to prevent others who have a HOLD on
	 * a tmpnode via its pages or anon slots from blowing it away
	 * (in tmp_inactive) while we're trying to get to it here. Once
	 * we have a HOLD on it we know it'll stick around.
	 *
	 */
	mutex_enter(&tm->tm_contents);
	/*
	 * Remove all the files (except the rootnode) backwards.
	 */
	while ((tnp = tm->tm_rootnode->tn_back) != tm->tm_rootnode) {
		mutex_exit(&tm->tm_contents);
		/*
		 * Inhibit tmp_inactive from touching attribute directory
		 * as all nodes will be released here.
		 * Note we handled the link count in pass 2 above.
		 */
		rw_enter(&tnp->tn_rwlock, RW_WRITER);
		tnp->tn_xattrdp = NULL;
		rw_exit(&tnp->tn_rwlock);
		vp = TNTOV(tnp);
		VN_RELE(vp);
		mutex_enter(&tm->tm_contents);
		/*
		 * It's still there after the RELE. Someone else like pageout
		 * has a hold on it so wait a bit and then try again - we know
		 * they'll give it up soon.
		 */
		if (tnp == tm->tm_rootnode->tn_back) {
			/* Re-take our hold so the next pass can release it. */
			VN_HOLD(vp);
			mutex_exit(&tm->tm_contents);
			delay(hz / 4);
			mutex_enter(&tm->tm_contents);
		}
	}
	mutex_exit(&tm->tm_contents);

	/* Only the root node is left; release it last. */
	tm->tm_rootnode->tn_xattrdp = NULL;
	VN_RELE(TNTOV(tm->tm_rootnode));

	ASSERT(tm->tm_mntpath);

	tmp_memfree(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1);

	ASSERT(tm->tm_anonmem == 0);

	mutex_destroy(&tm->tm_contents);
	mutex_destroy(&tm->tm_renamelck);
	tmp_memfree(tm, sizeof (struct tmount));

	return (0);
}
517 
518 /*
519  * return root tmpnode for given vnode
520  */
521 static int
522 tmp_root(struct vfs *vfsp, struct vnode **vpp)
523 {
524 	struct tmount *tm = (struct tmount *)VFSTOTM(vfsp);
525 	struct tmpnode *tp = tm->tm_rootnode;
526 	struct vnode *vp;
527 
528 	ASSERT(tp);
529 
530 	vp = TNTOV(tp);
531 	VN_HOLD(vp);
532 	*vpp = vp;
533 	return (0);
534 }
535 
536 static int
537 tmp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp)
538 {
539 	struct tmount	*tm = (struct tmount *)VFSTOTM(vfsp);
540 	ulong_t	blocks;
541 	dev32_t d32;
542 
543 	sbp->f_bsize = PAGESIZE;
544 	sbp->f_frsize = PAGESIZE;
545 
546 	/*
547 	 * Find the amount of available physical and memory swap
548 	 */
549 	mutex_enter(&anoninfo_lock);
550 	ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
551 	blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP;
552 	mutex_exit(&anoninfo_lock);
553 
554 	/*
555 	 * If tm_anonmax for this mount is less than the available swap space
556 	 * (minus the amount tmpfs can't use), use that instead
557 	 */
558 	if (blocks > tmpfs_minfree)
559 		sbp->f_bfree = MIN(blocks - tmpfs_minfree,
560 		    tm->tm_anonmax - tm->tm_anonmem);
561 	else
562 		sbp->f_bfree = 0;
563 
564 	sbp->f_bavail = sbp->f_bfree;
565 
566 	/*
567 	 * Total number of blocks is what's available plus what's been used
568 	 */
569 	sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree + tm->tm_anonmem);
570 
571 	/*
572 	 * The maximum number of files available is approximately the number
573 	 * of tmpnodes we can allocate from the remaining kernel memory
574 	 * available to tmpfs.  This is fairly inaccurate since it doesn't
575 	 * take into account the names stored in the directory entries.
576 	 */
577 	if (tmpfs_maxkmem > tmp_kmemspace)
578 		sbp->f_ffree = (tmpfs_maxkmem - tmp_kmemspace) /
579 		    (sizeof (struct tmpnode) + sizeof (struct tdirent));
580 	else
581 		sbp->f_ffree = 0;
582 
583 	sbp->f_files = tmpfs_maxkmem /
584 	    (sizeof (struct tmpnode) + sizeof (struct tdirent));
585 	sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree);
586 	(void) cmpldev(&d32, vfsp->vfs_dev);
587 	sbp->f_fsid = d32;
588 	(void) strcpy(sbp->f_basetype, vfssw[tmpfsfstype].vsw_name);
589 	(void) strncpy(sbp->f_fstr, tm->tm_mntpath, sizeof (sbp->f_fstr));
590 	/*
591 	 * ensure null termination
592 	 */
593 	sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0';
594 	sbp->f_flag = vf_to_stf(vfsp->vfs_flag);
595 	sbp->f_namemax = MAXNAMELEN - 1;
596 	return (0);
597 }
598 
599 static int
600 tmp_vget(struct vfs *vfsp, struct vnode **vpp, struct fid *fidp)
601 {
602 	struct tfid *tfid;
603 	struct tmount *tm = (struct tmount *)VFSTOTM(vfsp);
604 	struct tmpnode *tp = NULL;
605 
606 	tfid = (struct tfid *)fidp;
607 	*vpp = NULL;
608 
609 	mutex_enter(&tm->tm_contents);
610 	for (tp = tm->tm_rootnode; tp; tp = tp->tn_forw) {
611 		mutex_enter(&tp->tn_tlock);
612 		if (tp->tn_nodeid == tfid->tfid_ino) {
613 			/*
614 			 * If the gen numbers don't match we know the
615 			 * file won't be found since only one tmpnode
616 			 * can have this number at a time.
617 			 */
618 			if (tp->tn_gen != tfid->tfid_gen || tp->tn_nlink == 0) {
619 				mutex_exit(&tp->tn_tlock);
620 				mutex_exit(&tm->tm_contents);
621 				return (0);
622 			}
623 			*vpp = (struct vnode *)TNTOV(tp);
624 
625 			VN_HOLD(*vpp);
626 
627 			if ((tp->tn_mode & S_ISVTX) &&
628 			    !(tp->tn_mode & (S_IXUSR | S_IFDIR))) {
629 				mutex_enter(&(*vpp)->v_lock);
630 				(*vpp)->v_flag |= VISSWAP;
631 				mutex_exit(&(*vpp)->v_lock);
632 			}
633 			mutex_exit(&tp->tn_tlock);
634 			mutex_exit(&tm->tm_contents);
635 			return (0);
636 		}
637 		mutex_exit(&tp->tn_tlock);
638 	}
639 	mutex_exit(&tm->tm_contents);
640 	return (0);
641 }
642