1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * This is the device filesystem.
28  *
29  * It is a combination of a namer to drive autoconfiguration,
30  * plus the access methods for the device drivers of the system.
31  *
32  * The prototype is fairly dependent on specfs for the latter part
33  * of its implementation, though a final version would integrate the two.
34  */
35 #include <sys/types.h>
36 #include <sys/param.h>
37 #include <sys/sysmacros.h>
38 #include <sys/systm.h>
39 #include <sys/kmem.h>
40 #include <sys/time.h>
41 #include <sys/pathname.h>
42 #include <sys/vfs.h>
43 #include <sys/vfs_opreg.h>
44 #include <sys/vnode.h>
45 #include <sys/stat.h>
46 #include <sys/uio.h>
47 #include <sys/stat.h>
48 #include <sys/errno.h>
49 #include <sys/cmn_err.h>
50 #include <sys/cred.h>
51 #include <sys/statvfs.h>
52 #include <sys/mount.h>
53 #include <sys/debug.h>
54 #include <sys/modctl.h>
55 #include <fs/fs_subr.h>
56 #include <sys/fs/dv_node.h>
57 #include <sys/fs/snode.h>
58 #include <sys/sunndi.h>
59 #include <sys/policy.h>
60 #include <sys/sunmdi.h>
61 
/*
 * Forward declarations of the devfs vfs operations.  These are the
 * routines wired into the vfs operations template by devfsinit().
 */
static int devfs_mount(struct vfs *vfsp, struct vnode *mvp,
    struct mounta *uap, struct cred *cr);
static int devfs_unmount(struct vfs *vfsp, int flag, struct cred *cr);
static int devfs_root(struct vfs *vfsp, struct vnode **vpp);
static int devfs_statvfs(struct vfs *vfsp, struct statvfs64 *sbp);
static int devfs_mountroot(struct vfs *vfsp, enum whymountroot why);

/* filesystem type init routine referenced from devfs_vfssw */
static int devfsinit(int fstype, char *name);
73 
74 static vfsdef_t devfs_vfssw = {
75 	VFSDEF_VERSION,
76 	"devfs",	/* type name string */
77 	devfsinit,	/* init routine */
78 	0,		/* flags */
79 	NULL		/* mount options table prototype */
80 };
81 
82 static kmutex_t devfs_lock;	/* protects global data */
83 static int devfstype;		/* fstype */
84 static dev_t devfsdev;		/* the fictious 'device' we live on */
85 static struct devfs_data *devfs_mntinfo;	/* linked list of instances */
86 
87 /*
88  * Module linkage information
89  */
90 static struct modlfs modlfs = {
91 	&mod_fsops, "devices filesystem", &devfs_vfssw
92 };
93 
94 static struct modlinkage modlinkage = {
95 	MODREV_1, (void *)&modlfs, NULL
96 };
97 
98 int
_init(void)99 _init(void)
100 {
101 	int e;
102 
103 	mutex_init(&devfs_lock, "devfs lock", MUTEX_DEFAULT, NULL);
104 	dv_node_cache_init();
105 	if ((e = mod_install(&modlinkage)) != 0) {
106 		dv_node_cache_fini();
107 		mutex_destroy(&devfs_lock);
108 		return (e);
109 	}
110 	dcmn_err(("devfs loaded\n"));
111 	return (0);
112 }
113 
/*
 * Module unload entry point.  Unloading is always refused: this
 * unconditionally returns EBUSY.
 */
int
_fini(void)
{
	const int refused = EBUSY;

	return (refused);
}
119 
120 int
_info(struct modinfo * modinfop)121 _info(struct modinfo *modinfop)
122 {
123 	return (mod_info(&modlinkage, modinfop));
124 }
125 
126 /*ARGSUSED1*/
127 static int
devfsinit(int fstype,char * name)128 devfsinit(int fstype, char *name)
129 {
130 	static const fs_operation_def_t devfs_vfsops_template[] = {
131 		VFSNAME_MOUNT,		{ .vfs_mount = devfs_mount },
132 		VFSNAME_UNMOUNT,	{ .vfs_unmount = devfs_unmount },
133 		VFSNAME_ROOT,		{ .vfs_root = devfs_root },
134 		VFSNAME_STATVFS,	{ .vfs_statvfs = devfs_statvfs },
135 		VFSNAME_SYNC,		{ .vfs_sync = fs_sync },
136 		VFSNAME_MOUNTROOT,	{ .vfs_mountroot = devfs_mountroot },
137 		NULL,			NULL
138 	};
139 	int error;
140 	int dev;
141 	extern major_t getudev(void);	/* gack - what a function */
142 
143 	devfstype = fstype;
144 	/*
145 	 * Associate VFS ops vector with this fstype
146 	 */
147 	error = vfs_setfsops(fstype, devfs_vfsops_template, NULL);
148 	if (error != 0) {
149 		cmn_err(CE_WARN, "devfsinit: bad vfs ops template");
150 		return (error);
151 	}
152 
153 	error = vn_make_ops("dev fs", dv_vnodeops_template, &dv_vnodeops);
154 	if (error != 0) {
155 		(void) vfs_freevfsops_by_type(fstype);
156 		cmn_err(CE_WARN, "devfsinit: bad vnode ops template");
157 		return (error);
158 	}
159 
160 	/*
161 	 * Invent a dev_t (sigh).
162 	 */
163 	if ((dev = getudev()) == DDI_MAJOR_T_NONE) {
164 		cmn_err(CE_NOTE, "%s: can't get unique dev", devfs_vfssw.name);
165 		dev = 0;
166 	}
167 	devfsdev = makedevice(dev, 0);
168 
169 	return (0);
170 }
171 
172 /*
173  * The name of the mount point and the name of the attribute
174  * filesystem are passed down from userland for now.
175  */
176 static int
devfs_mount(struct vfs * vfsp,struct vnode * mvp,struct mounta * uap,struct cred * cr)177 devfs_mount(struct vfs *vfsp, struct vnode *mvp, struct mounta *uap,
178     struct cred *cr)
179 {
180 	struct devfs_data *devfs_data;
181 	struct vnode *avp;
182 	struct dv_node *dv;
183 	struct vattr va;
184 
185 	dcmn_err(("devfs_mount\n"));
186 
187 	if (secpolicy_fs_mount(cr, mvp, vfsp) != 0)
188 		return (EPERM);
189 
190 	/*
191 	 * check that the mount point is sane
192 	 */
193 	if (mvp->v_type != VDIR)
194 		return (ENOTDIR);
195 
196 	ASSERT(uap->flags & MS_SYSSPACE);
197 	/*
198 	 * Devfs can only be mounted from kernel during boot.
199 	 * avp is the existing /devices, the same as the mount point.
200 	 */
201 	avp = mvp;
202 
203 	/*
204 	 * Create and initialize the vfs-private data.
205 	 * This includes a hand-crafted root vnode (we build
206 	 * this here mostly so that traverse() doesn't sleep
207 	 * in VFS_ROOT()).
208 	 */
209 	mutex_enter(&devfs_lock);
210 	ASSERT(devfs_mntinfo == NULL);
211 	dv = dv_mkroot(vfsp, devfsdev);
212 	dv->dv_attrvp = avp;		/* attribute root vp */
213 
214 	ASSERT(dv == dv->dv_dotdot);
215 
216 	devfs_data = kmem_zalloc(sizeof (struct devfs_data), KM_SLEEP);
217 	devfs_data->devfs_vfsp = vfsp;
218 	devfs_data->devfs_root = dv;
219 
220 	vfsp->vfs_data = (caddr_t)devfs_data;
221 	vfsp->vfs_fstype = devfstype;
222 	vfsp->vfs_dev = devfsdev;
223 	vfsp->vfs_bsize = DEV_BSIZE;
224 	vfsp->vfs_mtime = ddi_get_time();
225 	vfs_make_fsid(&vfsp->vfs_fsid, vfsp->vfs_dev, devfstype);
226 
227 	/* We're there. */
228 	devfs_mntinfo = devfs_data;
229 	mutex_exit(&devfs_lock);
230 
231 	va.va_mask = AT_ATIME|AT_MTIME;
232 	gethrestime(&va.va_atime);
233 	gethrestime(&va.va_mtime);
234 	(void) VOP_SETATTR(DVTOV(dv), &va, 0, cr, NULL);
235 	return (0);
236 }
237 
238 
/*
 * Unmount request handler.
 *
 * We never unmount devfs in a real production system, so every
 * request is refused with EBUSY; none of the arguments are examined.
 */
/*ARGSUSED*/
static int
devfs_unmount(struct vfs *vfsp, int flag, struct cred *cr)
{
	const int refused = EBUSY;

	return (refused);
}
248 
249 /*
250  * return root vnode for given vfs
251  */
252 static int
devfs_root(struct vfs * vfsp,struct vnode ** vpp)253 devfs_root(struct vfs *vfsp, struct vnode **vpp)
254 {
255 	dcmn_err(("devfs_root\n"));
256 	*vpp = DVTOV(VFSTODVFS(vfsp)->devfs_root);
257 	VN_HOLD(*vpp);
258 	return (0);
259 }
260 
261 /*
262  * return 'generic superblock' information to userland.
263  *
264  * not much that we can usefully admit to here
265  */
266 static int
devfs_statvfs(struct vfs * vfsp,struct statvfs64 * sbp)267 devfs_statvfs(struct vfs *vfsp, struct statvfs64 *sbp)
268 {
269 	extern kmem_cache_t *dv_node_cache;
270 
271 	dev32_t d32;
272 
273 	dcmn_err(("devfs_statvfs\n"));
274 	bzero(sbp, sizeof (*sbp));
275 	sbp->f_frsize = sbp->f_bsize = vfsp->vfs_bsize;
276 	/*
277 	 * We could compute the number of devfsnodes here .. but since
278 	 * it's dynamic anyway, it's not clear how useful this is.
279 	 */
280 	sbp->f_files = kmem_cache_stat(dv_node_cache, "alloc");
281 
282 	/* no illusions that free/avail files is relevant to devfs */
283 	sbp->f_ffree = 0;
284 	sbp->f_favail = 0;
285 
286 	/* no illusions that blocks are relevant to devfs */
287 	sbp->f_bfree = 0;
288 	sbp->f_bavail = 0;
289 	sbp->f_blocks = 0;
290 
291 	(void) cmpldev(&d32, vfsp->vfs_dev);
292 	sbp->f_fsid = d32;
293 	(void) strcpy(sbp->f_basetype, vfssw[devfstype].vsw_name);
294 	sbp->f_flag = vf_to_stf(vfsp->vfs_flag);
295 	sbp->f_namemax = MAXNAMELEN - 1;
296 	(void) strcpy(sbp->f_fstr, "devices");
297 
298 	return (0);
299 }
300 
301 /*
302  * devfs always mount after root is mounted, so this should never
303  * be invoked.
304  */
305 /*ARGSUSED*/
306 static int
devfs_mountroot(struct vfs * vfsp,enum whymountroot why)307 devfs_mountroot(struct vfs *vfsp, enum whymountroot why)
308 {
309 	dcmn_err(("devfs_mountroot\n"));
310 
311 	return (EINVAL);
312 }
313 
314 struct dv_node *
devfs_dip_to_dvnode(dev_info_t * dip)315 devfs_dip_to_dvnode(dev_info_t *dip)
316 {
317 	char *dirpath;
318 	struct vnode *dirvp;
319 
320 	ASSERT(dip != NULL);
321 
322 	/* no-op if devfs not mounted yet */
323 	if (devfs_mntinfo == NULL)
324 		return (NULL);
325 
326 	/*
327 	 * The lookupname below only looks up cached dv_nodes
328 	 * because devfs_clean_key is set in thread specific data.
329 	 */
330 	dirpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
331 	(void) ddi_pathname(dip, dirpath);
332 	if (devfs_lookupname(dirpath, NULLVPP, &dirvp)) {
333 		dcmn_err(("directory %s not found\n", dirpath));
334 		kmem_free(dirpath, MAXPATHLEN);
335 		return (NULL);
336 	}
337 
338 	kmem_free(dirpath, MAXPATHLEN);
339 	return (VTODV(dirvp));
340 }
341 
342 /*
343  * If DV_CLEAN_FORCE devfs_clean is issued with a dip that is not the root
344  * and not a vHCI we also need to clean any vHCI branches because they
345  * may contain pHCI nodes. A detach_node() of a pHCI will fail if its
346  * mdi_devi_offline() fails, and the mdi_devi_offline() of the last
347  * pHCI will fail unless an ndi_devi_offline() of the Client nodes under
348  * the vHCI is successful - which requires a clean vHCI branch to removed
349  * the devi_refs associated with devfs vnodes.
350  */
351 static int
devfs_clean_vhci(dev_info_t * dip,void * args)352 devfs_clean_vhci(dev_info_t *dip, void *args)
353 {
354 	struct dv_node	*dvp;
355 	uint_t		flags = (uint_t)(uintptr_t)args;
356 
357 	(void) tsd_set(devfs_clean_key, (void *)1);
358 	dvp = devfs_dip_to_dvnode(dip);
359 	if (dvp) {
360 		(void) dv_cleandir(dvp, NULL, flags);
361 		VN_RELE(DVTOV(dvp));
362 	}
363 	(void) tsd_set(devfs_clean_key, NULL);
364 	return (DDI_WALK_CONTINUE);
365 }
366 
367 /*
368  * devfs_clean()
369  *
370  * Destroy unreferenced dv_node's and detach devices.
371  *
372  * devfs_clean will try its best to clean up unused nodes. It is
373  * no longer valid to assume that just because devfs_clean fails,
374  * the device is not removable. This is because device contracts
375  * can result in userland processes releasing a device during the
376  * device offline process in the kernel. Thus it is no longer
377  * correct to fail an offline just because devfs_clean finds
378  * referenced dv_nodes. To enforce this, devfs_clean() always
379  * returns success i.e. 0.
380  *
381  * devfs_clean() may return before removing all possible nodes if
382  * we cannot acquire locks in areas of the code where potential for
383  * deadlock exists (see comments in dv_find() and dv_cleandir() for
384  * examples of this).
385  *
386  * devfs caches unreferenced dv_node to speed by the performance
387  * of ls, find, etc. devfs_clean() is invoked to cleanup cached
388  * dv_nodes to reclaim memory as well as to facilitate device
389  * removal (dv_node reference devinfo nodes, which prevents driver
390  * detach).
391  *
392  * If a shell parks in a /devices directory, the dv_node will be
393  * held, preventing the corresponding device to be detached.
394  * This would be a denial of service against DR. To prevent this,
395  * DR code calls devfs_clean() with the DV_CLEAN_FORCE flag.
396  * The dv_cleandir() implementation does the right thing to ensure
397  * successful DR.
398  */
399 int
devfs_clean(dev_info_t * dip,char * devnm,uint_t flags)400 devfs_clean(dev_info_t *dip, char *devnm, uint_t flags)
401 {
402 	struct dv_node		*dvp;
403 
404 	dcmn_err(("devfs_unconfigure: dip = 0x%p, flags = 0x%x",
405 	    (void *)dip, flags));
406 
407 	/* avoid recursion back into the device tree */
408 	(void) tsd_set(devfs_clean_key, (void *)1);
409 	dvp = devfs_dip_to_dvnode(dip);
410 	if (dvp == NULL) {
411 		(void) tsd_set(devfs_clean_key, NULL);
412 		return (0);
413 	}
414 
415 	(void) dv_cleandir(dvp, devnm, flags);
416 	(void) tsd_set(devfs_clean_key, NULL);
417 	VN_RELE(DVTOV(dvp));
418 
419 	/*
420 	 * If we are doing a DV_CLEAN_FORCE, and we did not start at the
421 	 * root, and we did not start at a vHCI node then clean vHCI
422 	 * branches too.  Failure to clean vHCI branch does not cause EBUSY.
423 	 *
424 	 * Also, to accommodate nexus callers that clean 'self' to DR 'child'
425 	 * (like pcihp) we clean vHCIs even when dv_cleandir() of dip branch
426 	 * above fails - this prevents a busy DR 'child' sibling from causing
427 	 * the DR of 'child' to fail because a vHCI branch was not cleaned.
428 	 */
429 	if ((flags & DV_CLEAN_FORCE) && (dip != ddi_root_node()) &&
430 	    (mdi_component_is_vhci(dip, NULL) != MDI_SUCCESS)) {
431 		/*
432 		 * NOTE: for backport the following is recommended
433 		 * 	(void) devfs_clean_vhci(scsi_vhci_dip,
434 		 *	    (void *)(uintptr_t)flags);
435 		 */
436 		mdi_walk_vhcis(devfs_clean_vhci, (void *)(uintptr_t)flags);
437 	}
438 
439 	return (0);
440 }
441 
442 /*
443  * lookup a devfs relative pathname, returning held vnodes for the final
444  * component and the containing directory (if requested).
445  *
446  * NOTE: We can't use lookupname because this would use the current
447  *	processes credentials (CRED) in the call lookuppnvp instead
448  *	of kcred.  It also does not give you the flexibility so
449  * 	specify the directory to start the resolution in (devicesdir).
450  */
451 int
devfs_lookupname(char * pathname,vnode_t ** dirvpp,vnode_t ** compvpp)452 devfs_lookupname(
453 	char	*pathname,		/* user pathname */
454 	vnode_t **dirvpp,		/* ret for ptr to parent dir vnode */
455 	vnode_t **compvpp)		/* ret for ptr to component vnode */
456 {
457 	struct pathname	pn;
458 	int		error;
459 
460 	ASSERT(devicesdir);		/* devfs must be initialized */
461 	ASSERT(pathname);		/* must have some path */
462 
463 	if (error = pn_get(pathname, UIO_SYSSPACE, &pn))
464 		return (error);
465 
466 	/* make the path relative to /devices. */
467 	pn_skipslash(&pn);
468 	if (pn_pathleft(&pn) == 0) {
469 		/* all we had was "\0" or "/" (which skipslash skiped) */
470 		if (dirvpp)
471 			*dirvpp = NULL;
472 		if (compvpp) {
473 			VN_HOLD(devicesdir);
474 			*compvpp = devicesdir;
475 		}
476 	} else {
477 		/*
478 		 * Use devfs lookup to resolve pathname to the vnode for
479 		 * the device via relative lookup in devfs. Extra holds for
480 		 * using devicesdir as directory we are searching and for
481 		 * being our root without being == rootdir.
482 		 */
483 		VN_HOLD(devicesdir);
484 		VN_HOLD(devicesdir);
485 		error = lookuppnvp(&pn, NULL, FOLLOW, dirvpp, compvpp,
486 		    devicesdir, devicesdir, kcred);
487 	}
488 	pn_free(&pn);
489 
490 	return (error);
491 }
492 
493 /*
494  * Given a devfs path (without the /devices prefix), walk
495  * the dv_node sub-tree rooted at the path.
496  */
497 int
devfs_walk(char * path,void (* callback)(struct dv_node *,void *),void * arg)498 devfs_walk(
499 	char		*path,
500 	void		(*callback)(struct dv_node *, void *),
501 	void		*arg)
502 {
503 	char *dirpath, *devnm;
504 	struct vnode	*dirvp;
505 
506 	ASSERT(path && callback);
507 
508 	if (*path != '/' || devfs_mntinfo == NULL)
509 		return (ENXIO);
510 
511 	dcmn_err(("devfs_walk: path = %s", path));
512 
513 	dirpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
514 
515 	(void) snprintf(dirpath, MAXPATHLEN, "/devices%s", path);
516 
517 	devnm = strrchr(dirpath, '/');
518 
519 	ASSERT(devnm);
520 
521 	*devnm++ = '\0';
522 
523 	if (lookupname(dirpath, UIO_SYSSPACE, 0, NULL, &dirvp)) {
524 		dcmn_err(("directory %s not found\n", dirpath));
525 		kmem_free(dirpath, MAXPATHLEN);
526 		return (ENXIO);
527 	}
528 
529 	/*
530 	 * if path == "/", visit the root dv_node
531 	 */
532 	if (*devnm == '\0') {
533 		callback(VTODV(dirvp), arg);
534 		devnm = NULL;
535 	}
536 
537 	dv_walk(VTODV(dirvp), devnm, callback, arg);
538 
539 	VN_RELE(dirvp);
540 
541 	kmem_free(dirpath, MAXPATHLEN);
542 
543 	return (0);
544 }
545 
546 int
devfs_devpolicy(vnode_t * vp,devplcy_t ** dpp)547 devfs_devpolicy(vnode_t *vp, devplcy_t **dpp)
548 {
549 	struct vnode *rvp;
550 	struct dv_node *dvp;
551 	int rval = -1;
552 
553 	/* fail if devfs not mounted yet */
554 	if (devfs_mntinfo == NULL)
555 		return (rval);
556 
557 	if (VOP_REALVP(vp, &rvp, NULL) == 0 && vn_matchops(rvp, dv_vnodeops)) {
558 		dvp = VTODV(rvp);
559 		rw_enter(&dvp->dv_contents, RW_READER);
560 		if (dvp->dv_priv) {
561 			dphold(dvp->dv_priv);
562 			*dpp = dvp->dv_priv;
563 			rval = 0;
564 		}
565 		rw_exit(&dvp->dv_contents);
566 	}
567 	return (rval);
568 }
569