1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25#include <sys/file.h>
26#include <sys/stat.h>
27#include <sys/atomic.h>
28#include <sys/mntio.h>
29#include <sys/mnttab.h>
30#include <sys/mount.h>
31#include <sys/sunddi.h>
32#include <sys/sysmacros.h>
33#include <sys/systm.h>
34#include <sys/vfs.h>
35#include <sys/vfs_opreg.h>
36#include <sys/fs/mntdata.h>
37#include <fs/fs_subr.h>
38#include <sys/vmsystm.h>
39#include <vm/seg_vn.h>
40#include <sys/time.h>
41#include <sys/ksynch.h>
42#include <sys/sdt.h>
43
/*
 * NOTE(review): presumably the inode number reported for the mntfs root;
 * it is not referenced in this part of the file -- confirm against its users.
 */
#define	MNTROOTINO	2

/*
 * Forward declaration. mntopen() calls this to obtain a new mntnode_t for
 * each open of the file.
 */
static mntnode_t *mntgetnode(vnode_t *);

/*
 * NOTE(review): presumably the vnode operations vector for mntfs, initialised
 * elsewhere -- confirm.
 */
vnodeops_t *mntvnodeops;
/* Called by mntread() once a read has been serviced, to advertise the read. */
extern void vfs_mnttab_readop(void);
50
51/*
52 * Design of kernel mnttab accounting.
53 *
54 * mntfs provides two methods of reading the in-kernel mnttab, i.e. the state of
55 * the mounted resources: the read-only file /etc/mnttab, and a collection of
56 * ioctl() commands. Most of these interfaces are public and are described in
57 * mnttab(4). Three private ioctl() commands, MNTIOC_GETMNTENT,
58 * MNTIOC_GETEXTMNTENT and MNTIOC_GETMNTANY, provide for the getmntent(3C)
59 * family of functions, allowing them to support white space in mount names.
60 *
61 * A significant feature of mntfs is that it provides a file descriptor with a
62 * snapshot once it begins to consume mnttab data. Thus, as the process
63 * continues to consume data, its view of the in-kernel mnttab does not change
64 * even if resources are mounted or unmounted. The intent is to ensure that
65 * processes are guaranteed to read self-consistent data even as the system
66 * changes.
67 *
68 * The snapshot is implemented by a "database", unique to each zone, that
69 * comprises a linked list of mntelem_ts. The database is identified by
70 * zone_mntfs_db and is protected by zone_mntfs_db_lock. Each element contains
71 * the text entry in /etc/mnttab for a mounted resource, i.e. a vfs_t, and is
72 * marked with its time of "birth", i.e. creation. An element is "killed", and
73 * marked with its time of death, when it is found to be out of date, e.g. when
74 * the corresponding resource has been unmounted.
75 *
76 * When a process performs the first read() or ioctl() for a file descriptor for
77 * /etc/mnttab, the database is updated by a call to mntfs_snapshot() to ensure
78 * that an element exists for each currently mounted resource. Following this,
79 * the current time is written into a snapshot structure, a mntsnap_t, embedded
80 * in the descriptor's mntnode_t.
81 *
82 * mntfs is able to enumerate the /etc/mnttab entries corresponding to a
83 * particular file descriptor by searching the database for entries that were
84 * born before the appropriate snapshot and that either are still alive or died
85 * after the snapshot was created. Consumers use the iterator function
86 * mntfs_get_next_elem() to identify the next suitable element in the database.
87 *
88 * Each snapshot has a hold on its corresponding database elements, effected by
89 * a per-element reference count. At last close(), a snapshot is destroyed in
90 * mntfs_freesnap() by releasing all of its holds; an element is destroyed if
91 * its reference count becomes zero. Therefore the database never exists unless
92 * there is at least one active consumer of /etc/mnttab.
93 *
94 * getmntent(3C) et al. "do not open, close or rewind the file." This implies
95 * that getmntent() and read() must be able to operate without interaction on
96 * the same file descriptor; this is accomplished by the use of separate
97 * mntsnap_ts for both read() and ioctl().
98 *
99 * mntfs observes the following lock-ordering:
100 *
101 *	mnp->mnt_contents -> vfslist -> zonep->zone_mntfs_db_lock
102 *
103 * NOTE: The following variable enables the generation of the "dev=xxx"
104 * in the option string for a mounted file system.  Really this should
105 * be gotten rid of altogether, but for the sake of backwards compatibility
106 * we had to leave it in.  It is defined as a 32-bit device number.  This
107 * means that when 64-bit device numbers are in use, if either the major or
108 * minor part of the device number will not fit in a 16 bit quantity, the
109 * "dev=" will be set to NODEV (0x7fffffff).  See PSARC 1999/566 and
110 * 1999/131 for details.  The cmpldev() function used to generate the 32-bit
111 * device number handles this check and assigns the proper value.
112 */
int mntfs_enabledev = 1;	/* enable old "dev=xxx" option */

/*
 * Fills in the supplied timespec_t. It is used in this file to stamp the
 * birth and death of database elements and the creation of snapshots.
 */
extern void vfs_mono_time(timespec_t *);
/*
 * Results of mntfs_newest(): the first argument is newer, the second argument
 * is newer, or the two time stamps are identical.
 */
enum { MNTFS_FIRST, MNTFS_SECOND, MNTFS_NEITHER };
117
/*
 * Determine whether a field within a line from /etc/mnttab contains actual
 * content or simply the marker string "-". This never applies to the time,
 * therefore the delimiter must be a tab.
 *
 * x must point at the first character of the field. The macro evaluates to
 * non-zero for a real field and to zero for the two-character sequence "-\t".
 * Note that x may be evaluated more than once, so it should be free of side
 * effects.
 */
#define	MNTFS_REAL_FIELD(x)	(*(x) != '-' || *((x) + 1) != '\t')
124
125static int
126mntfs_devsize(struct vfs *vfsp)
127{
128	dev32_t odev;
129
130	(void) cmpldev(&odev, vfsp->vfs_dev);
131	return (snprintf(NULL, 0, "dev=%x", odev));
132}
133
134static int
135mntfs_devprint(struct vfs *vfsp, char *buf)
136{
137	dev32_t odev;
138
139	(void) cmpldev(&odev, vfsp->vfs_dev);
140	return (snprintf(buf, MAX_MNTOPT_STR, "dev=%x", odev));
141}
142
143/* Identify which, if either, of two supplied timespec structs is newer. */
144static int
145mntfs_newest(timespec_t *a, timespec_t *b)
146{
147	if (a->tv_sec == b->tv_sec &&
148	    a->tv_nsec == b->tv_nsec) {
149		return (MNTFS_NEITHER);
150	} else if (b->tv_sec > a->tv_sec ||
151	    (b->tv_sec == a->tv_sec &&
152	    b->tv_nsec > a->tv_nsec)) {
153		return (MNTFS_SECOND);
154	} else {
155		return (MNTFS_FIRST);
156	}
157}
158
159static int
160mntfs_optsize(struct vfs *vfsp)
161{
162	int i, size = 0;
163	mntopt_t *mop;
164
165	for (i = 0; i < vfsp->vfs_mntopts.mo_count; i++) {
166		mop = &vfsp->vfs_mntopts.mo_list[i];
167		if (mop->mo_flags & MO_NODISPLAY)
168			continue;
169		if (mop->mo_flags & MO_SET) {
170			if (size)
171				size++; /* space for comma */
172			size += strlen(mop->mo_name);
173			/*
174			 * count option value if there is one
175			 */
176			if (mop->mo_arg != NULL) {
177				size += strlen(mop->mo_arg) + 1;
178			}
179		}
180	}
181	if (vfsp->vfs_zone != NULL && vfsp->vfs_zone != global_zone) {
182		/*
183		 * Add space for "zone=<zone_name>" if required.
184		 */
185		if (size)
186			size++;	/* space for comma */
187		size += sizeof ("zone=") - 1;
188		size += strlen(vfsp->vfs_zone->zone_name);
189	}
190	if (mntfs_enabledev) {
191		if (size != 0)
192			size++; /* space for comma */
193		size += mntfs_devsize(vfsp);
194	}
195	if (size == 0)
196		size = strlen("-");
197	return (size);
198}
199
200static int
201mntfs_optprint(struct vfs *vfsp, char *buf)
202{
203	int i, optinbuf = 0;
204	mntopt_t *mop;
205	char *origbuf = buf;
206
207	for (i = 0; i < vfsp->vfs_mntopts.mo_count; i++) {
208		mop = &vfsp->vfs_mntopts.mo_list[i];
209		if (mop->mo_flags & MO_NODISPLAY)
210			continue;
211		if (mop->mo_flags & MO_SET) {
212			if (optinbuf)
213				*buf++ = ',';
214			else
215				optinbuf = 1;
216			buf += snprintf(buf, MAX_MNTOPT_STR,
217			    "%s", mop->mo_name);
218			/*
219			 * print option value if there is one
220			 */
221			if (mop->mo_arg != NULL) {
222				buf += snprintf(buf, MAX_MNTOPT_STR, "=%s",
223				    mop->mo_arg);
224			}
225		}
226	}
227	if (vfsp->vfs_zone != NULL && vfsp->vfs_zone != global_zone) {
228		if (optinbuf)
229			*buf++ = ',';
230		else
231			optinbuf = 1;
232		buf += snprintf(buf, MAX_MNTOPT_STR, "zone=%s",
233		    vfsp->vfs_zone->zone_name);
234	}
235	if (mntfs_enabledev) {
236		if (optinbuf++)
237			*buf++ = ',';
238		buf += mntfs_devprint(vfsp, buf);
239	}
240	if (!optinbuf) {
241		buf += snprintf(buf, MAX_MNTOPT_STR, "-");
242	}
243	return (buf - origbuf);
244}
245
246void
247mntfs_populate_text(vfs_t *vfsp, zone_t *zonep, mntelem_t *elemp)
248{
249	struct extmnttab *tabp = &elemp->mnte_tab;
250	const char *resource, *mntpt;
251	char *cp = elemp->mnte_text;
252	mntpt = refstr_value(vfsp->vfs_mntpt);
253	resource = refstr_value(vfsp->vfs_resource);
254
255	tabp->mnt_special = 0;
256	if (resource != NULL && resource[0] != '\0') {
257		if (resource[0] != '/') {
258			cp += snprintf(cp, MAXPATHLEN, "%s\t", resource);
259		} else if (!ZONE_PATH_VISIBLE(resource, zonep)) {
260			/*
261			 * Use the mount point as the resource.
262			 */
263			cp += snprintf(cp, MAXPATHLEN, "%s\t",
264			    ZONE_PATH_TRANSLATE(mntpt, zonep));
265		} else {
266			cp += snprintf(cp, MAXPATHLEN, "%s\t",
267			    ZONE_PATH_TRANSLATE(resource, zonep));
268		}
269	} else {
270		cp += snprintf(cp, MAXPATHLEN, "-\t");
271	}
272
273	tabp->mnt_mountp = (char *)(cp - elemp->mnte_text);
274	if (mntpt != NULL && mntpt[0] != '\0') {
275		/*
276		 * We know the mount point is visible from within the zone,
277		 * otherwise it wouldn't be on the zone's vfs list.
278		 */
279		cp += snprintf(cp, MAXPATHLEN, "%s\t",
280		    ZONE_PATH_TRANSLATE(mntpt, zonep));
281	} else {
282		cp += snprintf(cp, MAXPATHLEN, "-\t");
283	}
284
285	tabp->mnt_fstype = (char *)(cp - elemp->mnte_text);
286	cp += snprintf(cp, MAXPATHLEN, "%s\t",
287	    vfssw[vfsp->vfs_fstype].vsw_name);
288
289	tabp->mnt_mntopts = (char *)(cp - elemp->mnte_text);
290	cp += mntfs_optprint(vfsp, cp);
291	*cp++ = '\t';
292
293	tabp->mnt_time = (char *)(cp - elemp->mnte_text);
294	cp += snprintf(cp, MAX_MNTOPT_STR, "%ld", vfsp->vfs_mtime);
295	*cp++ = '\n'; /* over-write snprintf's trailing null-byte */
296
297	tabp->mnt_major = getmajor(vfsp->vfs_dev);
298	tabp->mnt_minor = getminor(vfsp->vfs_dev);
299
300	elemp->mnte_text_size = cp - elemp->mnte_text;
301	elemp->mnte_vfs_ctime = vfsp->vfs_hrctime;
302	elemp->mnte_hidden = vfsp->vfs_flag & VFS_NOMNTTAB;
303}
304
305/* Determine the length of the /etc/mnttab entry for this vfs_t. */
306static size_t
307mntfs_text_len(vfs_t *vfsp, zone_t *zone)
308{
309	size_t size = 0;
310	const char *resource, *mntpt;
311	size_t mntsize;
312
313	mntpt = refstr_value(vfsp->vfs_mntpt);
314	if (mntpt != NULL && mntpt[0] != '\0') {
315		mntsize = strlen(ZONE_PATH_TRANSLATE(mntpt, zone)) + 1;
316	} else {
317		mntsize = 2;	/* "-\t" */
318	}
319	size += mntsize;
320
321	resource = refstr_value(vfsp->vfs_resource);
322	if (resource != NULL && resource[0] != '\0') {
323		if (resource[0] != '/') {
324			size += strlen(resource) + 1;
325		} else if (!ZONE_PATH_VISIBLE(resource, zone)) {
326			/*
327			 * Same as the zone's view of the mount point.
328			 */
329			size += mntsize;
330		} else {
331			size += strlen(ZONE_PATH_TRANSLATE(resource, zone)) + 1;
332		}
333	} else {
334		size += 2;	/* "-\t" */
335	}
336	size += strlen(vfssw[vfsp->vfs_fstype].vsw_name) + 1;
337	size += mntfs_optsize(vfsp);
338	size += snprintf(NULL, 0, "\t%ld\n", vfsp->vfs_mtime);
339	return (size);
340}
341
342/* Destroy the resources associated with a snapshot element. */
343static void
344mntfs_destroy_elem(mntelem_t *elemp)
345{
346	kmem_free(elemp->mnte_text, elemp->mnte_text_size);
347	kmem_free(elemp, sizeof (mntelem_t));
348}
349
350/*
351 * Return 1 if the given snapshot is in the range of the given element; return
352 * 0 otherwise.
353 */
354static int
355mntfs_elem_in_range(mntsnap_t *snapp, mntelem_t *elemp)
356{
357	timespec_t	*stimep = &snapp->mnts_time;
358	timespec_t	*btimep = &elemp->mnte_birth;
359	timespec_t	*dtimep = &elemp->mnte_death;
360
361	/*
362	 * If a snapshot is in range of an element then the snapshot must have
363	 * been created after the birth of the element, and either the element
364	 * is still alive or it died after the snapshot was created.
365	 */
366	if (mntfs_newest(btimep, stimep) == MNTFS_SECOND &&
367	    (MNTFS_ELEM_IS_ALIVE(elemp) ||
368	    mntfs_newest(stimep, dtimep) == MNTFS_SECOND))
369		return (1);
370	else
371		return (0);
372}
373
374/*
375 * Return the next valid database element, after the one provided, for a given
376 * snapshot; return NULL if none exists. The caller must hold the zone's
377 * database lock as a reader before calling this function.
378 */
379static mntelem_t *
380mntfs_get_next_elem(mntsnap_t *snapp, mntelem_t *elemp)
381{
382	int show_hidden = snapp->mnts_flags & MNTS_SHOWHIDDEN;
383
384	do {
385		elemp = elemp->mnte_next;
386	} while (elemp &&
387	    (!mntfs_elem_in_range(snapp, elemp) ||
388	    (!show_hidden && elemp->mnte_hidden)));
389	return (elemp);
390}
391
392/*
393 * This function frees the resources associated with a mntsnap_t. It walks
394 * through the database, decrementing the reference count of any element that
395 * satisfies the snapshot. If the reference count of an element becomes zero
396 * then it is removed from the database.
397 */
398static void
399mntfs_freesnap(mntnode_t *mnp, mntsnap_t *snapp)
400{
401	zone_t *zonep = MTOD(mnp)->mnt_zone_ref.zref_zone;
402	krwlock_t *dblockp = &zonep->zone_mntfs_db_lock;
403	mntelem_t **elempp = &zonep->zone_mntfs_db;
404	mntelem_t *elemp;
405	int show_hidden = snapp->mnts_flags & MNTS_SHOWHIDDEN;
406	size_t number_decremented = 0;
407
408	ASSERT(RW_WRITE_HELD(&mnp->mnt_contents));
409
410	/* Ignore an uninitialised snapshot. */
411	if (snapp->mnts_nmnts == 0)
412		return;
413
414	/* Drop the holds on any matching database elements. */
415	rw_enter(dblockp, RW_WRITER);
416	while ((elemp = *elempp) != NULL) {
417		if (mntfs_elem_in_range(snapp, elemp) &&
418		    (!elemp->mnte_hidden || show_hidden) &&
419		    ++number_decremented && --elemp->mnte_refcnt == 0) {
420			if ((*elempp = elemp->mnte_next) != NULL)
421				(*elempp)->mnte_prev = elemp->mnte_prev;
422			mntfs_destroy_elem(elemp);
423		} else {
424			elempp = &elemp->mnte_next;
425		}
426	}
427	rw_exit(dblockp);
428	ASSERT(number_decremented == snapp->mnts_nmnts);
429
430	/* Clear the snapshot data. */
431	bzero(snapp, sizeof (mntsnap_t));
432}
433
434/* Insert the new database element newp after the existing element prevp. */
435static void
436mntfs_insert_after(mntelem_t *newp, mntelem_t *prevp)
437{
438	newp->mnte_prev = prevp;
439	newp->mnte_next = prevp->mnte_next;
440	prevp->mnte_next = newp;
441	if (newp->mnte_next != NULL)
442		newp->mnte_next->mnte_prev = newp;
443}
444
445/* Create and return a copy of a given database element. */
446static mntelem_t *
447mntfs_copy(mntelem_t *origp)
448{
449	mntelem_t *copyp;
450
451	copyp = kmem_zalloc(sizeof (mntelem_t), KM_SLEEP);
452	copyp->mnte_vfs_ctime = origp->mnte_vfs_ctime;
453	copyp->mnte_text_size = origp->mnte_text_size;
454	copyp->mnte_text = kmem_alloc(copyp->mnte_text_size, KM_SLEEP);
455	bcopy(origp->mnte_text, copyp->mnte_text, copyp->mnte_text_size);
456	copyp->mnte_tab = origp->mnte_tab;
457	copyp->mnte_hidden = origp->mnte_hidden;
458
459	return (copyp);
460}
461
462/*
463 * Compare two database elements and determine whether or not the vfs_t payload
464 * data of each are the same. Return 1 if so and 0 otherwise.
465 */
466static int
467mntfs_is_same_element(mntelem_t *a, mntelem_t *b)
468{
469	if (a->mnte_hidden == b->mnte_hidden &&
470	    a->mnte_text_size == b->mnte_text_size &&
471	    bcmp(a->mnte_text, b->mnte_text, a->mnte_text_size) == 0 &&
472	    bcmp(&a->mnte_tab, &b->mnte_tab, sizeof (struct extmnttab)) == 0)
473		return (1);
474	else
475		return (0);
476}
477
/*
 * mntfs_snapshot() updates the database, creating it if necessary, so that it
 * accurately reflects the state of the in-kernel mnttab. It also increments
 * the reference count on all database elements that correspond to currently-
 * mounted resources. Finally, it initialises the appropriate snapshot
 * structure.
 *
 * Each vfs_t is given a high-resolution time stamp, for the benefit of mntfs,
 * when it is inserted into the in-kernel mnttab. This time stamp is copied into
 * the corresponding database element when it is created, allowing the element
 * and the vfs_t to be identified as a pair. It is possible that some file
 * systems may make unadvertised changes to, for example, a resource's mount
 * options. Therefore, in order to determine whether a database element is an
 * up-to-date representation of a given vfs_t, it is compared with a temporary
 * element generated for this purpose. Although less efficient, this is safer
 * than implementing an mtime for a vfs_t.
 *
 * Some mounted resources are marked as "hidden" with a VFS_NOMNTTAB flag. These
 * are considered invisible unless the user has already set the MNT_SHOWHIDDEN
 * flag in the vnode using the MNTIOC_SHOWHIDDEN ioctl.
 *
 * Arguments:
 *	mnp	the mntnode_t that owns the snapshot; the caller must hold
 *		mnp->mnt_contents as a writer.
 *	snapp	the snapshot to initialise. If it is already in use (its
 *		mnts_nmnts is non-zero) it is either merely rewound or else
 *		freed and rebuilt, as described in the body.
 *
 * The vfs list is read-locked for the duration of the function, and the zone's
 * database lock is held as a writer while the database is walked and updated.
 * As a side effect, the size and mtime fields of the mntdata_t are refreshed.
 */
static void
mntfs_snapshot(mntnode_t *mnp, mntsnap_t *snapp)
{
	mntdata_t	*mnd = MTOD(mnp);
	zone_t		*zonep = mnd->mnt_zone_ref.zref_zone;
	int		is_global_zone = (zonep == global_zone);
	int		show_hidden = mnp->mnt_flags & MNT_SHOWHIDDEN;
	vfs_t		*vfsp, *firstvfsp, *lastvfsp;
	vfs_t		dummyvfs;
	vfs_t		*dummyvfsp = NULL;
	krwlock_t	*dblockp = &zonep->zone_mntfs_db_lock;
	mntelem_t	**headpp = &zonep->zone_mntfs_db;
	mntelem_t	*elemp;
	mntelem_t	*prevp = NULL;
	int		order;
	mntelem_t	*tempelemp;
	mntelem_t	*newp;
	mntelem_t	*firstp = NULL;
	size_t		nmnts = 0;
	size_t		total_text_size = 0;
	size_t		normal_text_size = 0;
	int		insert_before;
	timespec_t	last_mtime;
	size_t		entry_length, new_entry_length;


	ASSERT(RW_WRITE_HELD(&mnp->mnt_contents));
	vfs_list_read_lock();
	vfs_mnttab_modtime(&last_mtime);

	/*
	 * If this snapshot already exists then we must have been asked to
	 * rewind the file, i.e. discard the snapshot and create a new one in
	 * its place. In this case we first see if the in-kernel mnttab has
	 * advertised a change; if not then we simply reinitialise the metadata.
	 */
	if (snapp->mnts_nmnts) {
		if (mntfs_newest(&last_mtime, &snapp->mnts_last_mtime) ==
		    MNTFS_NEITHER) {
			/*
			 * An unchanged mtime is no guarantee that the
			 * in-kernel mnttab is unchanged; for example, a
			 * concurrent remount may be between calls to
			 * vfs_setmntopt_nolock() and vfs_mnttab_modtimeupd().
			 * It follows that the database may have changed, and
			 * in particular that some elements in this snapshot
			 * may have been killed by another call to
			 * mntfs_snapshot(). It is therefore not merely
			 * unnecessary to update the snapshot's time but in
			 * fact dangerous; it needs to be left alone.
			 */
			snapp->mnts_next = snapp->mnts_first;
			snapp->mnts_flags &= ~MNTS_REWIND;
			snapp->mnts_foffset = snapp->mnts_ieoffset = 0;
			vfs_list_unlock();
			return;
		} else {
			mntfs_freesnap(mnp, snapp);
		}
	}

	/*
	 * Create a temporary database element. For each vfs_t, the temporary
	 * element will be populated with the corresponding text. If the vfs_t
	 * does not have a corresponding element within the database, or if
	 * there is such an element but it is stale, a copy of the temporary
	 * element is inserted into the database at the appropriate location.
	 *
	 * The text buffer starts at MNT_LINE_MAX bytes and is grown below
	 * whenever an entry turns out to be longer; entry_length always holds
	 * the size of the current allocation.
	 */
	tempelemp = kmem_alloc(sizeof (mntelem_t), KM_SLEEP);
	entry_length = MNT_LINE_MAX;
	tempelemp->mnte_text = kmem_alloc(entry_length, KM_SLEEP);

	/* Find the first and last vfs_t for the given zone. */
	if (is_global_zone) {
		firstvfsp = rootvfs;
		lastvfsp = firstvfsp->vfs_prev;
	} else {
		firstvfsp = zonep->zone_vfslist;
		/*
		 * If there isn't already a vfs_t for root then we create a
		 * dummy which will be used as the head of the list (which will
		 * therefore no longer be circular).
		 */
		if (firstvfsp == NULL ||
		    strcmp(refstr_value(firstvfsp->vfs_mntpt),
		    zonep->zone_rootpath) != 0) {
			/*
			 * The zone's vfs_ts will have mount points relative to
			 * the zone's root path. The vfs_t for the zone's
			 * root file system would therefore have a mount point
			 * equal to the zone's root path. Since the zone's root
			 * path isn't a mount point, we copy the vfs_t of the
			 * zone's root vnode, and provide it with a fake mount
			 * and resource. However, if the zone's root is a
			 * zfs dataset, use the dataset name as the resource.
			 *
			 * Note that by cloning another vfs_t we also acquire
			 * its high-resolution ctime. This might appear to
			 * violate the requirement that the ctimes in the list
			 * of vfs_ts are unique and monotonically increasing;
			 * this is not the case. The dummy vfs_t appears in only
			 * a non-global zone's vfs_t list, where the cloned
			 * vfs_t would not ordinarily be visible; the ctimes are
			 * therefore unique. The zone's root path must be
			 * available before the zone boots, and so its root
			 * vnode's vfs_t's ctime must be lower than those of any
			 * resources subsequently mounted by the zone. The
			 * ctimes are therefore monotonically increasing.
			 */
			dummyvfs = *zonep->zone_rootvp->v_vfsp;
			dummyvfs.vfs_mntpt = refstr_alloc(zonep->zone_rootpath);
			if (strcmp(vfssw[dummyvfs.vfs_fstype].vsw_name, "zfs")
			    != 0)
				dummyvfs.vfs_resource = dummyvfs.vfs_mntpt;
			dummyvfsp = &dummyvfs;
			if (firstvfsp == NULL) {
				lastvfsp = dummyvfsp;
			} else {
				lastvfsp = firstvfsp->vfs_zone_prev;
				dummyvfsp->vfs_zone_next = firstvfsp;
			}
			firstvfsp = dummyvfsp;
		} else {
			lastvfsp = firstvfsp->vfs_zone_prev;
		}
	}

	/*
	 * Now walk through all the vfs_ts for this zone. For each one, find the
	 * corresponding database element, creating it first if necessary, and
	 * increment its reference count.
	 *
	 * The vfs_t list and the database are walked in step: elemp and prevp
	 * persist across iterations of the outer loop, so each database
	 * element is visited at most once.
	 */
	rw_enter(dblockp, RW_WRITER);
	elemp = zonep->zone_mntfs_db;
	/* CSTYLED */
	for (vfsp = firstvfsp;;
	    vfsp = is_global_zone ? vfsp->vfs_next : vfsp->vfs_zone_next) {
		DTRACE_PROBE1(new__vfs, vfs_t *, vfsp);
		/* Consider only visible entries. */
		if ((vfsp->vfs_flag & VFS_NOMNTTAB) == 0 || show_hidden) {
			/*
			 * Walk through the existing database looking for either
			 * an element that matches the current vfs_t, or for the
			 * correct place in which to insert a new element.
			 */
			insert_before = 0;
			for (; elemp; prevp = elemp, elemp = elemp->mnte_next) {
				DTRACE_PROBE1(considering__elem, mntelem_t *,
				    elemp);

				/* Compare the vfs_t with the element. */
				order = mntfs_newest(&elemp->mnte_vfs_ctime,
				    &vfsp->vfs_hrctime);

				/*
				 * If we encounter a database element newer than
				 * this vfs_t then we've stepped over a gap
				 * where the element for this vfs_t must be
				 * inserted.
				 */
				if (order == MNTFS_FIRST) {
					insert_before = 1;
					break;
				}

				/* Dead elements no longer interest us. */
				if (MNTFS_ELEM_IS_DEAD(elemp))
					continue;

				/*
				 * If the time stamps are the same then the
				 * element is a potential match for the vfs_t,
				 * although it may later prove to be stale.
				 */
				if (order == MNTFS_NEITHER)
					break;

				/*
				 * This element must be older than the vfs_t.
				 * It must, therefore, correspond to a vfs_t
				 * that has been unmounted. Since the element is
				 * still alive, we kill it if it is visible.
				 */
				if (!elemp->mnte_hidden || show_hidden)
					vfs_mono_time(&elemp->mnte_death);
			}
			DTRACE_PROBE2(possible__match, vfs_t *, vfsp,
			    mntelem_t *, elemp);

			/*
			 * Create a new database element if required. First
			 * make sure that the temporary element's text buffer
			 * is large enough for this vfs_t's entry.
			 */
			new_entry_length = mntfs_text_len(vfsp, zonep);
			if (new_entry_length > entry_length) {
				kmem_free(tempelemp->mnte_text, entry_length);
				tempelemp->mnte_text =
				    kmem_alloc(new_entry_length, KM_SLEEP);
				entry_length = new_entry_length;
			}
			mntfs_populate_text(vfsp, zonep, tempelemp);
			ASSERT(tempelemp->mnte_text_size == new_entry_length);
			if (elemp == NULL) {
				/*
				 * We ran off the end of the database. Insert a
				 * new element at the end.
				 */
				newp = mntfs_copy(tempelemp);
				vfs_mono_time(&newp->mnte_birth);
				if (prevp) {
					mntfs_insert_after(newp, prevp);
				} else {
					newp->mnte_next = NULL;
					newp->mnte_prev = NULL;
					ASSERT(*headpp == NULL);
					*headpp = newp;
				}
				elemp = newp;
			} else if (insert_before) {
				/*
				 * Insert a new element before the current one.
				 */
				newp = mntfs_copy(tempelemp);
				vfs_mono_time(&newp->mnte_birth);
				if (prevp) {
					mntfs_insert_after(newp, prevp);
				} else {
					newp->mnte_next = elemp;
					newp->mnte_prev = NULL;
					elemp->mnte_prev = newp;
					ASSERT(*headpp == elemp);
					*headpp = newp;
				}
				elemp = newp;
			} else if (!mntfs_is_same_element(elemp, tempelemp)) {
				/*
				 * The element corresponds to the vfs_t, but the
				 * vfs_t has changed; it must have been
				 * remounted. Kill the old element and insert a
				 * new one after it.
				 */
				vfs_mono_time(&elemp->mnte_death);
				newp = mntfs_copy(tempelemp);
				vfs_mono_time(&newp->mnte_birth);
				mntfs_insert_after(newp, elemp);
				elemp = newp;
			}

			/* We've found the corresponding element. Hold it. */
			DTRACE_PROBE1(incrementing, mntelem_t *, elemp);
			elemp->mnte_refcnt++;

			/*
			 * Update the parameters used to initialise the
			 * snapshot.
			 */
			nmnts++;
			total_text_size += elemp->mnte_text_size;
			if (!elemp->mnte_hidden)
				normal_text_size += elemp->mnte_text_size;
			if (!firstp)
				firstp = elemp;

			/*
			 * Resume the database walk, for the next vfs_t, at the
			 * element following the one just held.
			 */
			prevp = elemp;
			elemp = elemp->mnte_next;
		}

		/*
		 * The loop has no condition of its own; stop once the last
		 * vfs_t for the zone has been dealt with.
		 */
		if (vfsp == lastvfsp)
			break;
	}

	/*
	 * Any remaining visible database elements that are still alive must be
	 * killed now, because their corresponding vfs_ts must have been
	 * unmounted.
	 */
	for (; elemp; elemp = elemp->mnte_next) {
		if (MNTFS_ELEM_IS_ALIVE(elemp) &&
		    (!elemp->mnte_hidden || show_hidden))
			vfs_mono_time(&elemp->mnte_death);
	}

	/* Initialise the snapshot. */
	vfs_mono_time(&snapp->mnts_time);
	snapp->mnts_last_mtime = last_mtime;
	snapp->mnts_first = snapp->mnts_next = firstp;
	snapp->mnts_flags = show_hidden ? MNTS_SHOWHIDDEN : 0;
	snapp->mnts_nmnts = nmnts;
	snapp->mnts_text_size = total_text_size;
	snapp->mnts_foffset = snapp->mnts_ieoffset = 0;

	/*
	 * Record /etc/mnttab's current size and mtime for possible future use
	 * by mntgetattr().
	 */
	mnd->mnt_size = normal_text_size;
	mnd->mnt_mtime = last_mtime;
	if (show_hidden) {
		mnd->mnt_hidden_size = total_text_size;
		mnd->mnt_hidden_mtime = last_mtime;
	}

	/*
	 * Clean up. The locks are dropped in the reverse of the order in which
	 * they were taken, and the mount point string allocated for the dummy
	 * vfs_t, if one was needed, is released.
	 */
	rw_exit(dblockp);
	vfs_list_unlock();
	if (dummyvfsp != NULL)
		refstr_rele(dummyvfsp->vfs_mntpt);
	kmem_free(tempelemp->mnte_text, entry_length);
	kmem_free(tempelemp, sizeof (mntelem_t));
}
806
807/*
808 * Public function to convert vfs_mntopts into a string.
809 * A buffer of sufficient size is allocated, which is returned via bufp,
810 * and whose length is returned via lenp.
811 */
812void
813mntfs_getmntopts(struct vfs *vfsp, char **bufp, size_t *lenp)
814{
815	size_t len;
816	char *buf;
817
818	vfs_list_read_lock();
819
820	len = mntfs_optsize(vfsp) + 1;
821	buf = kmem_alloc(len, KM_NOSLEEP);
822	if (buf == NULL) {
823		*bufp = NULL;
824		vfs_list_unlock();
825		return;
826	}
827	buf[len - 1] = '\0';
828	(void) mntfs_optprint(vfsp, buf);
829	ASSERT(buf[len - 1] == '\0');
830
831	vfs_list_unlock();
832	*bufp = buf;
833	*lenp = len;
834}
835
836/* ARGSUSED */
837static int
838mntopen(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
839{
840	vnode_t *vp = *vpp;
841	mntnode_t *nmnp;
842
843	/*
844	 * Not allowed to open for writing, return error.
845	 */
846	if (flag & FWRITE)
847		return (EPERM);
848	/*
849	 * Create a new mnt/vnode for each open, this will give us a handle to
850	 * hang the snapshot on.
851	 */
852	nmnp = mntgetnode(vp);
853
854	*vpp = MTOV(nmnp);
855	atomic_inc_32(&MTOD(nmnp)->mnt_nopen);
856	VN_RELE(vp);
857	return (0);
858}
859
860/* ARGSUSED */
861static int
862mntclose(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
863	caller_context_t *ct)
864{
865	mntnode_t *mnp = VTOM(vp);
866
867	/* Clean up any locks or shares held by the current process */
868	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
869	cleanshares(vp, ttoproc(curthread)->p_pid);
870
871	if (count > 1)
872		return (0);
873	if (vp->v_count == 1) {
874		rw_enter(&mnp->mnt_contents, RW_WRITER);
875		mntfs_freesnap(mnp, &mnp->mnt_read);
876		mntfs_freesnap(mnp, &mnp->mnt_ioctl);
877		rw_exit(&mnp->mnt_contents);
878		atomic_dec_32(&MTOD(mnp)->mnt_nopen);
879	}
880	return (0);
881}
882
883/* ARGSUSED */
884static int
885mntread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred, caller_context_t *ct)
886{
887	mntnode_t *mnp = VTOM(vp);
888	zone_t *zonep = MTOD(mnp)->mnt_zone_ref.zref_zone;
889	mntsnap_t *snapp = &mnp->mnt_read;
890	off_t off = uio->uio_offset;
891	size_t len = uio->uio_resid;
892	char *bufferp;
893	size_t available, copylen;
894	size_t written = 0;
895	mntelem_t *elemp;
896	krwlock_t *dblockp = &zonep->zone_mntfs_db_lock;
897	int error = 0;
898	off_t	ieoffset;
899
900	rw_enter(&mnp->mnt_contents, RW_WRITER);
901	if (snapp->mnts_nmnts == 0 || (off == (off_t)0))
902		mntfs_snapshot(mnp, snapp);
903
904	if ((size_t)(off + len) > snapp->mnts_text_size)
905		len = snapp->mnts_text_size - off;
906
907	if (off < 0 || len > snapp->mnts_text_size) {
908		rw_exit(&mnp->mnt_contents);
909		return (EFAULT);
910	}
911
912	if (len == 0) {
913		rw_exit(&mnp->mnt_contents);
914		return (0);
915	}
916
917	/*
918	 * For the file offset provided, locate the corresponding database
919	 * element and calculate the corresponding offset within its text. If
920	 * the file offset is the same as that reached during the last read(2)
921	 * then use the saved element and intra-element offset.
922	 */
923	rw_enter(dblockp, RW_READER);
924	if (off == 0 || (off == snapp->mnts_foffset)) {
925		elemp = snapp->mnts_next;
926		ieoffset = snapp->mnts_ieoffset;
927	} else {
928		off_t total_off;
929		/*
930		 * Find the element corresponding to the requested file offset
931		 * by walking through the database and summing the text sizes
932		 * of the individual elements. If the requested file offset is
933		 * greater than that reached on the last visit then we can start
934		 * at the last seen element; otherwise, we have to start at the
935		 * beginning.
936		 */
937		if (off > snapp->mnts_foffset) {
938			elemp = snapp->mnts_next;
939			total_off = snapp->mnts_foffset - snapp->mnts_ieoffset;
940		} else {
941			elemp = snapp->mnts_first;
942			total_off = 0;
943		}
944		while (off > total_off + elemp->mnte_text_size) {
945			total_off += elemp->mnte_text_size;
946			elemp = mntfs_get_next_elem(snapp, elemp);
947			ASSERT(elemp != NULL);
948		}
949		/* Calculate the intra-element offset. */
950		if (off > total_off)
951			ieoffset = off - total_off;
952		else
953			ieoffset = 0;
954	}
955
956	/*
957	 * Create a buffer and populate it with the text from successive
958	 * database elements until it is full.
959	 */
960	bufferp = kmem_alloc(len, KM_SLEEP);
961	while (written < len) {
962		available = elemp->mnte_text_size - ieoffset;
963		copylen = MIN(len - written, available);
964		bcopy(elemp->mnte_text + ieoffset, bufferp + written, copylen);
965		written += copylen;
966		if (copylen == available) {
967			elemp = mntfs_get_next_elem(snapp, elemp);
968			ASSERT(elemp != NULL || written == len);
969			ieoffset = 0;
970		} else {
971			ieoffset += copylen;
972		}
973	}
974	rw_exit(dblockp);
975
976	/*
977	 * Write the populated buffer, update the snapshot's state if
978	 * successful and then advertise our read.
979	 */
980	error = uiomove(bufferp, len, UIO_READ, uio);
981	if (error == 0) {
982		snapp->mnts_next = elemp;
983		snapp->mnts_foffset = off + len;
984		snapp->mnts_ieoffset = ieoffset;
985	}
986	vfs_mnttab_readop();
987	rw_exit(&mnp->mnt_contents);
988
989	/* Clean up. */
990	kmem_free(bufferp, len);
991	return (error);
992}
993
994static int
995mntgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
996	caller_context_t *ct)
997{
998	int mask = vap->va_mask;
999	int error;
1000	mntnode_t *mnp = VTOM(vp);
1001	timespec_t mtime, old_mtime;
1002	size_t size, old_size;
1003	mntdata_t *mntdata = MTOD(VTOM(vp));
1004	mntsnap_t *rsnapp, *isnapp;
1005	extern timespec_t vfs_mnttab_ctime;
1006
1007
1008	/* AT_MODE, AT_UID and AT_GID are derived from the underlying file. */
1009	if (mask & AT_MODE|AT_UID|AT_GID) {
1010		if (error = VOP_GETATTR(mnp->mnt_mountvp, vap, flags, cr, ct))
1011			return (error);
1012	}
1013
1014	/*
1015	 * There are some minor subtleties in the determination of
1016	 * /etc/mnttab's size and mtime. We wish to avoid any condition in
1017	 * which, in the vicinity of a change to the in-kernel mnttab, we
1018	 * return an old value for one but a new value for the other. We cannot
1019	 * simply hold vfslist for the entire calculation because we might need
1020	 * to call mntfs_snapshot(), which calls vfs_list_read_lock().
1021	 */
1022	if (mask & AT_SIZE|AT_NBLOCKS) {
1023		rw_enter(&mnp->mnt_contents, RW_WRITER);
1024
1025		vfs_list_read_lock();
1026		vfs_mnttab_modtime(&mtime);
1027		if (mnp->mnt_flags & MNT_SHOWHIDDEN) {
1028			old_mtime = mntdata->mnt_hidden_mtime;
1029			old_size = mntdata->mnt_hidden_size;
1030		} else {
1031			old_mtime = mntdata->mnt_mtime;
1032			old_size = mntdata->mnt_size;
1033		}
1034		vfs_list_unlock();
1035
1036		rsnapp = &mnp->mnt_read;
1037		isnapp = &mnp->mnt_ioctl;
1038		if (rsnapp->mnts_nmnts || isnapp->mnts_nmnts) {
1039			/*
1040			 * The mntnode already has at least one snapshot from
1041			 * which to take the size; the user will understand from
1042			 * mnttab(4) that the current size of the in-kernel
1043			 * mnttab is irrelevant.
1044			 */
1045			size = rsnapp->mnts_nmnts ? rsnapp->mnts_text_size :
1046			    isnapp->mnts_text_size;
1047		} else if (mntfs_newest(&mtime, &old_mtime) == MNTFS_NEITHER) {
1048			/*
1049			 * There is no existing valid snapshot but the in-kernel
1050			 * mnttab has not changed since the time that the last
1051			 * one was generated. Use the old file size; note that
1052			 * it is guaranteed to be consistent with mtime, which
1053			 * may be returned to the user later.
1054			 */
1055			size = old_size;
1056		} else {
1057			/*
1058			 * There is no snapshot and the in-kernel mnttab has
1059			 * changed since the last one was created. We generate a
1060			 * new snapshot which we use for not only the size but
1061			 * also the mtime, thereby ensuring that the two are
1062			 * consistent.
1063			 */
1064			mntfs_snapshot(mnp, rsnapp);
1065			size = rsnapp->mnts_text_size;
1066			mtime = rsnapp->mnts_last_mtime;
1067			mntfs_freesnap(mnp, rsnapp);
1068		}
1069
1070		rw_exit(&mnp->mnt_contents);
1071	} else if (mask & AT_ATIME|AT_MTIME) {
1072		vfs_list_read_lock();
1073		vfs_mnttab_modtime(&mtime);
1074		vfs_list_unlock();
1075	}
1076
1077	/* Always look like a regular file. */
1078	if (mask & AT_TYPE)
1079		vap->va_type = VREG;
1080	/* Mode should basically be read only. */
1081	if (mask & AT_MODE)
1082		vap->va_mode &= 07444;
1083	if (mask & AT_FSID)
1084		vap->va_fsid = vp->v_vfsp->vfs_dev;
1085	/* Nodeid is always ROOTINO. */
1086	if (mask & AT_NODEID)
1087		vap->va_nodeid = (ino64_t)MNTROOTINO;
1088	/*
1089	 * Set nlink to the number of open vnodes for mnttab info
1090	 * plus one for existing.
1091	 */
1092	if (mask & AT_NLINK)
1093		vap->va_nlink = mntdata->mnt_nopen + 1;
1094	if (mask & AT_SIZE)
1095		vap->va_size = size;
1096	if (mask & AT_ATIME)
1097		vap->va_atime = mtime;
1098	if (mask & AT_MTIME)
1099		vap->va_mtime = mtime;
1100	if (mask & AT_CTIME)
1101		vap->va_ctime = vfs_mnttab_ctime;
1102	if (mask & AT_RDEV)
1103		vap->va_rdev = 0;
1104	if (mask & AT_BLKSIZE)
1105		vap->va_blksize = DEV_BSIZE;
1106	if (mask & AT_NBLOCKS)
1107		vap->va_nblocks = btod(size);
1108	if (mask & AT_SEQ)
1109		vap->va_seq = 0;
1110
1111	return (0);
1112}
1113
1114static int
1115mntaccess(vnode_t *vp, int mode, int flags, cred_t *cr,
1116	caller_context_t *ct)
1117{
1118	mntnode_t *mnp = VTOM(vp);
1119
1120	if (mode & (VWRITE|VEXEC))
1121		return (EROFS);
1122
1123	/*
1124	 * Do access check on the underlying directory vnode.
1125	 */
1126	return (VOP_ACCESS(mnp->mnt_mountvp, mode, flags, cr, ct));
1127}
1128
1129
1130/*
1131 * New /mntfs vnode required; allocate it and fill in most of the fields.
1132 */
1133static mntnode_t *
1134mntgetnode(vnode_t *dp)
1135{
1136	mntnode_t *mnp;
1137	vnode_t *vp;
1138
1139	mnp = kmem_zalloc(sizeof (mntnode_t), KM_SLEEP);
1140	mnp->mnt_vnode = vn_alloc(KM_SLEEP);
1141	mnp->mnt_mountvp = VTOM(dp)->mnt_mountvp;
1142	rw_init(&mnp->mnt_contents, NULL, RW_DEFAULT, NULL);
1143	vp = MTOV(mnp);
1144	vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT;
1145	vn_setops(vp, mntvnodeops);
1146	vp->v_vfsp = dp->v_vfsp;
1147	vp->v_type = VREG;
1148	vp->v_data = (caddr_t)mnp;
1149
1150	return (mnp);
1151}
1152
1153/*
1154 * Free the storage obtained from mntgetnode().
1155 */
1156static void
1157mntfreenode(mntnode_t *mnp)
1158{
1159	vnode_t *vp = MTOV(mnp);
1160
1161	rw_destroy(&mnp->mnt_contents);
1162	vn_invalid(vp);
1163	vn_free(vp);
1164	kmem_free(mnp, sizeof (*mnp));
1165}
1166
1167
1168/* ARGSUSED */
1169static int
1170mntfsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
1171{
1172	return (0);
1173}
1174
1175/* ARGSUSED */
1176static void
1177mntinactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
1178{
1179	mntnode_t *mnp = VTOM(vp);
1180
1181	mntfreenode(mnp);
1182}
1183
1184/*
1185 * lseek(2) is supported only to rewind the file by resetmnttab(3C). Rewinding
1186 * has a special meaning for /etc/mnttab: it forces mntfs to refresh the
1187 * snapshot at the next ioctl().
1188 *
1189 * mnttab(4) explains that "the snapshot...is taken any time a read(2) is
1190 * performed at offset 0". We therefore ignore the read snapshot here.
1191 */
1192/* ARGSUSED */
1193static int
1194mntseek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
1195{
1196	mntnode_t *mnp = VTOM(vp);
1197
1198	if (*noffp == 0) {
1199		rw_enter(&mnp->mnt_contents, RW_WRITER);
1200		mnp->mnt_ioctl.mnts_flags |= MNTS_REWIND;
1201		rw_exit(&mnp->mnt_contents);
1202	}
1203
1204	return (0);
1205}
1206
1207/*
1208 * Return the answer requested to poll().
1209 * POLLRDBAND will return when the mtime of the mnttab
1210 * information is newer than the latest one read for this open.
1211 */
1212/* ARGSUSED */
1213static int
1214mntpoll(vnode_t *vp, short ev, int any, short *revp, pollhead_t **phpp,
1215	caller_context_t *ct)
1216{
1217	mntnode_t *mnp = VTOM(vp);
1218	mntsnap_t *snapp;
1219
1220	rw_enter(&mnp->mnt_contents, RW_READER);
1221	if (mntfs_newest(&mnp->mnt_ioctl.mnts_last_mtime,
1222	    &mnp->mnt_read.mnts_last_mtime) == MNTFS_FIRST)
1223		snapp = &mnp->mnt_ioctl;
1224	else
1225		snapp = &mnp->mnt_read;
1226
1227	*revp = 0;
1228	*phpp = (pollhead_t *)NULL;
1229	if (ev & POLLIN)
1230		*revp |= POLLIN;
1231
1232	if (ev & POLLRDNORM)
1233		*revp |= POLLRDNORM;
1234
1235	if (ev & POLLRDBAND) {
1236		vfs_mnttab_poll(&snapp->mnts_last_mtime, phpp);
1237		if (*phpp == (pollhead_t *)NULL)
1238			*revp |= POLLRDBAND;
1239	}
1240	rw_exit(&mnp->mnt_contents);
1241
1242	if (*revp || *phpp != NULL || any) {
1243		return (0);
1244	}
1245	/*
1246	 * If someone is polling an unsupported poll events (e.g.
1247	 * POLLOUT, POLLPRI, etc.), just return POLLERR revents.
1248	 * That way we will ensure that we don't return a 0
1249	 * revents with a NULL pollhead pointer.
1250	 */
1251	*revp = POLLERR;
1252	return (0);
1253}
1254
1255/*
1256 * mntfs_same_word() returns 1 if two words are the same in the context of
1257 * MNTIOC_GETMNTANY and 0 otherwise.
1258 *
1259 * worda is a memory address that lies somewhere in the buffer bufa; it cannot
1260 * be NULL since this is used to indicate to getmntany(3C) that the user does
1261 * not wish to match a particular field. The text to which worda points is
1262 * supplied by the user; if it is not null-terminated then it cannot match.
1263 *
1264 * Buffer bufb contains a line from /etc/mnttab, in which the fields are
1265 * delimited by tab or new-line characters. offb is the offset of the second
1266 * word within this buffer.
1267 *
1268 * mntfs_same_word() returns 1 if the words are the same and 0 otherwise.
1269 */
1270int
1271mntfs_same_word(char *worda, char *bufa, size_t sizea, off_t offb, char *bufb,
1272    size_t sizeb)
1273{
1274	char *wordb = bufb + offb;
1275	int bytes_remaining;
1276
1277	ASSERT(worda != NULL);
1278
1279	bytes_remaining = MIN(((bufa + sizea) - worda),
1280	    ((bufb + sizeb) - wordb));
1281	while (bytes_remaining && *worda == *wordb) {
1282		worda++;
1283		wordb++;
1284		bytes_remaining--;
1285	}
1286	if (bytes_remaining &&
1287	    *worda == '\0' && (*wordb == '\t' || *wordb == '\n'))
1288		return (1);
1289	else
1290		return (0);
1291}
1292
1293/*
1294 * mntfs_special_info_string() returns which, if either, of VBLK or VCHR
1295 * corresponds to a supplied path. If the path is a special device then the
1296 * function optionally sets the major and minor numbers.
1297 */
1298vtype_t
1299mntfs_special_info_string(char *path, uint_t *major, uint_t *minor, cred_t *cr)
1300{
1301	vattr_t vattr;
1302	vnode_t *vp;
1303	vtype_t type;
1304	int error;
1305
1306	if (path == NULL || *path != '/' ||
1307	    lookupnameat(path + 1, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp, rootdir))
1308		return (0);
1309
1310	vattr.va_mask = AT_TYPE | AT_RDEV;
1311	error = VOP_GETATTR(vp, &vattr, ATTR_REAL, cr, NULL);
1312	VN_RELE(vp);
1313
1314	if (error == 0 && ((type = vattr.va_type) == VBLK || type == VCHR)) {
1315		if (major && minor) {
1316			*major = getmajor(vattr.va_rdev);
1317			*minor = getminor(vattr.va_rdev);
1318		}
1319		return (type);
1320	} else {
1321		return (0);
1322	}
1323}
1324
1325/*
1326 * mntfs_special_info_element() extracts the name of the mounted resource
1327 * for a given element and copies it into a null-terminated string, which it
1328 * then passes to mntfs_special_info_string().
1329 */
1330vtype_t
1331mntfs_special_info_element(mntelem_t *elemp, cred_t *cr)
1332{
1333	char *newpath;
1334	vtype_t type;
1335
1336	newpath = kmem_alloc(elemp->mnte_text_size, KM_SLEEP);
1337	bcopy(elemp->mnte_text, newpath, (off_t)(elemp->mnte_tab.mnt_mountp));
1338	*(newpath + (off_t)elemp->mnte_tab.mnt_mountp - 1) = '\0';
1339	type = mntfs_special_info_string(newpath, NULL, NULL, cr);
1340	kmem_free(newpath, elemp->mnte_text_size);
1341
1342	return (type);
1343}
1344
1345/*
1346 * Convert an address that points to a byte within a user buffer into an
1347 * address that points to the corresponding offset within a kernel buffer. If
1348 * the user address is NULL then make no conversion. If the address does not
1349 * lie within the buffer then reset it to NULL.
1350 */
1351char *
1352mntfs_import_addr(char *uaddr, char *ubufp, char *kbufp, size_t bufsize)
1353{
1354	if (uaddr < ubufp || uaddr >= ubufp + bufsize)
1355		return (NULL);
1356	else
1357		return (kbufp + (uaddr - ubufp));
1358}
1359
1360/*
1361 * These 32-bit versions are to support STRUCT_DECL(9F) etc. in
1362 * mntfs_copyout_element() and mntioctl().
1363 */
1364#ifdef _SYSCALL32_IMPL
1365typedef struct extmnttab32 {
1366	uint32_t	mnt_special;
1367	uint32_t	mnt_mountp;
1368	uint32_t	mnt_fstype;
1369	uint32_t	mnt_mntopts;
1370	uint32_t	mnt_time;
1371	uint_t		mnt_major;
1372	uint_t		mnt_minor;
1373} extmnttab32_t;
1374
1375typedef struct mnttab32 {
1376	uint32_t	mnt_special;
1377	uint32_t	mnt_mountp;
1378	uint32_t	mnt_fstype;
1379	uint32_t	mnt_mntopts;
1380	uint32_t	mnt_time;
1381} mnttab32_t;
1382
1383struct mntentbuf32 {
1384	uint32_t	mbuf_emp;
1385	uint_t		mbuf_bufsize;
1386	uint32_t	mbuf_buf;
1387};
1388#endif
1389
1390/*
1391 * mntfs_copyout_element() is common code for the MNTIOC_GETMNTENT,
1392 * MNTIOC_GETEXTMNTENT and MNTIOC_GETMNTANY ioctls. Having identifed the
1393 * database element desired by the user, this function copies out the text and
1394 * the pointers to the relevant userland addresses. It returns 0 on success
1395 * and non-zero otherwise.
1396 */
1397int
1398mntfs_copyout_elem(mntelem_t *elemp, struct extmnttab *uemp,
1399    char *ubufp, int cmd, int datamodel)
1400{
1401		STRUCT_DECL(extmnttab, ktab);
1402		char *dbbufp = elemp->mnte_text;
1403		size_t dbbufsize = elemp->mnte_text_size;
1404		struct extmnttab *dbtabp = &elemp->mnte_tab;
1405		size_t ssize;
1406		char *kbufp;
1407		int error = 0;
1408
1409
1410		/*
1411		 * We create a struct extmnttab within the kernel of the size
1412		 * determined by the user's data model. We then populate its
1413		 * fields by combining the start address of the text buffer
1414		 * supplied by the user, ubufp, with the offsets stored for
1415		 * this database element within dbtabp, a pointer to a struct
1416		 * extmnttab.
1417		 *
1418		 * Note that if the corresponding field is "-" this signifies
1419		 * no real content, and we set the address to NULL. This does
1420		 * not apply to mnt_time.
1421		 */
1422		STRUCT_INIT(ktab, datamodel);
1423		STRUCT_FSETP(ktab, mnt_special,
1424		    MNTFS_REAL_FIELD(dbbufp) ? ubufp : NULL);
1425		STRUCT_FSETP(ktab, mnt_mountp,
1426		    MNTFS_REAL_FIELD(dbbufp + (off_t)dbtabp->mnt_mountp) ?
1427		    ubufp + (off_t)dbtabp->mnt_mountp : NULL);
1428		STRUCT_FSETP(ktab, mnt_fstype,
1429		    MNTFS_REAL_FIELD(dbbufp + (off_t)dbtabp->mnt_fstype) ?
1430		    ubufp + (off_t)dbtabp->mnt_fstype : NULL);
1431		STRUCT_FSETP(ktab, mnt_mntopts,
1432		    MNTFS_REAL_FIELD(dbbufp + (off_t)dbtabp->mnt_mntopts) ?
1433		    ubufp + (off_t)dbtabp->mnt_mntopts : NULL);
1434		STRUCT_FSETP(ktab, mnt_time,
1435		    ubufp + (off_t)dbtabp->mnt_time);
1436		if (cmd == MNTIOC_GETEXTMNTENT) {
1437			STRUCT_FSETP(ktab, mnt_major, dbtabp->mnt_major);
1438			STRUCT_FSETP(ktab, mnt_minor, dbtabp->mnt_minor);
1439			ssize = SIZEOF_STRUCT(extmnttab, datamodel);
1440		} else {
1441			ssize = SIZEOF_STRUCT(mnttab, datamodel);
1442		}
1443		if (copyout(STRUCT_BUF(ktab), uemp, ssize))
1444			return (EFAULT);
1445
1446		/*
1447		 * We create a text buffer in the kernel into which we copy the
1448		 * /etc/mnttab entry for this element. We change the tab and
1449		 * new-line delimiters to null bytes before copying out the
1450		 * buffer.
1451		 */
1452		kbufp = kmem_alloc(dbbufsize, KM_SLEEP);
1453		bcopy(elemp->mnte_text, kbufp, dbbufsize);
1454		*(kbufp + (off_t)dbtabp->mnt_mountp - 1) =
1455		    *(kbufp + (off_t)dbtabp->mnt_fstype - 1) =
1456		    *(kbufp + (off_t)dbtabp->mnt_mntopts - 1) =
1457		    *(kbufp + (off_t)dbtabp->mnt_time - 1) =
1458		    *(kbufp + dbbufsize - 1) = '\0';
1459		if (copyout(kbufp, ubufp, dbbufsize))
1460			error = EFAULT;
1461
1462		kmem_free(kbufp, dbbufsize);
1463		return (error);
1464}
1465
1466/* ARGSUSED */
1467static int
1468mntioctl(struct vnode *vp, int cmd, intptr_t arg, int flag, cred_t *cr,
1469    int *rvalp, caller_context_t *ct)
1470{
1471	uint_t *up = (uint_t *)arg;
1472	mntnode_t *mnp = VTOM(vp);
1473	mntsnap_t *snapp = &mnp->mnt_ioctl;
1474	int error = 0;
1475	zone_t *zonep = MTOD(mnp)->mnt_zone_ref.zref_zone;
1476	krwlock_t *dblockp = &zonep->zone_mntfs_db_lock;
1477	model_t datamodel = flag & DATAMODEL_MASK;
1478
1479	switch (cmd) {
1480
1481	case MNTIOC_NMNTS:  		/* get no. of mounted resources */
1482	{
1483		rw_enter(&mnp->mnt_contents, RW_READER);
1484		if (snapp->mnts_nmnts == 0 ||
1485		    (snapp->mnts_flags & MNTS_REWIND)) {
1486			if (!rw_tryupgrade(&mnp->mnt_contents)) {
1487				rw_exit(&mnp->mnt_contents);
1488				rw_enter(&mnp->mnt_contents, RW_WRITER);
1489			}
1490			if (snapp->mnts_nmnts == 0 ||
1491			    (snapp->mnts_flags & MNTS_REWIND))
1492				mntfs_snapshot(mnp, snapp);
1493		}
1494		rw_exit(&mnp->mnt_contents);
1495
1496		if (suword32(up, snapp->mnts_nmnts) != 0)
1497			error = EFAULT;
1498		break;
1499	}
1500
1501	case MNTIOC_GETDEVLIST:  	/* get mounted device major/minor nos */
1502	{
1503		size_t len;
1504		uint_t *devlist;
1505		mntelem_t *elemp;
1506		int i = 0;
1507
1508		rw_enter(&mnp->mnt_contents, RW_READER);
1509		if (snapp->mnts_nmnts == 0 ||
1510		    (snapp->mnts_flags & MNTS_REWIND)) {
1511			if (!rw_tryupgrade(&mnp->mnt_contents)) {
1512				rw_exit(&mnp->mnt_contents);
1513				rw_enter(&mnp->mnt_contents, RW_WRITER);
1514			}
1515			if (snapp->mnts_nmnts == 0 ||
1516			    (snapp->mnts_flags & MNTS_REWIND))
1517				mntfs_snapshot(mnp, snapp);
1518			rw_downgrade(&mnp->mnt_contents);
1519		}
1520
1521		/* Create a local buffer to hold the device numbers. */
1522		len = 2 * snapp->mnts_nmnts * sizeof (uint_t);
1523		devlist = kmem_alloc(len, KM_SLEEP);
1524
1525		/*
1526		 * Walk the database elements for this snapshot and add their
1527		 * major and minor numbers.
1528		 */
1529		rw_enter(dblockp, RW_READER);
1530		for (elemp = snapp->mnts_first; elemp;
1531		    elemp = mntfs_get_next_elem(snapp, elemp)) {
1532				devlist[2 * i] = elemp->mnte_tab.mnt_major;
1533				devlist[2 * i + 1] = elemp->mnte_tab.mnt_minor;
1534				i++;
1535		}
1536		rw_exit(dblockp);
1537		ASSERT(i == snapp->mnts_nmnts);
1538		rw_exit(&mnp->mnt_contents);
1539
1540		error = xcopyout(devlist, up, len);
1541		kmem_free(devlist, len);
1542		break;
1543	}
1544
1545	case MNTIOC_SETTAG:		/* set tag on mounted file system */
1546	case MNTIOC_CLRTAG:		/* clear tag on mounted file system */
1547	{
1548		struct mnttagdesc *dp = (struct mnttagdesc *)arg;
1549		STRUCT_DECL(mnttagdesc, tagdesc);
1550		char *cptr;
1551		uint32_t major, minor;
1552		char tagbuf[MAX_MNTOPT_TAG];
1553		char *pbuf;
1554		size_t len;
1555		uint_t start = 0;
1556		mntdata_t *mntdata = MTOD(mnp);
1557		zone_t *zone = mntdata->mnt_zone_ref.zref_zone;
1558
1559		STRUCT_INIT(tagdesc, flag & DATAMODEL_MASK);
1560		if (copyin(dp, STRUCT_BUF(tagdesc), STRUCT_SIZE(tagdesc))) {
1561			error = EFAULT;
1562			break;
1563		}
1564		pbuf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1565		if (zone != global_zone) {
1566			(void) strcpy(pbuf, zone->zone_rootpath);
1567			/* truncate "/" and nul */
1568			start = zone->zone_rootpathlen - 2;
1569			ASSERT(pbuf[start] == '/');
1570		}
1571		cptr = STRUCT_FGETP(tagdesc, mtd_mntpt);
1572		error = copyinstr(cptr, pbuf + start, MAXPATHLEN - start, &len);
1573		if (error) {
1574			kmem_free(pbuf, MAXPATHLEN);
1575			break;
1576		}
1577		if (start != 0 && pbuf[start] != '/') {
1578			kmem_free(pbuf, MAXPATHLEN);
1579			error = EINVAL;
1580			break;
1581		}
1582		cptr = STRUCT_FGETP(tagdesc, mtd_tag);
1583		if ((error = copyinstr(cptr, tagbuf, MAX_MNTOPT_TAG, &len))) {
1584			kmem_free(pbuf, MAXPATHLEN);
1585			break;
1586		}
1587		major = STRUCT_FGET(tagdesc, mtd_major);
1588		minor = STRUCT_FGET(tagdesc, mtd_minor);
1589		if (cmd == MNTIOC_SETTAG)
1590			error = vfs_settag(major, minor, pbuf, tagbuf, cr);
1591		else
1592			error = vfs_clrtag(major, minor, pbuf, tagbuf, cr);
1593		kmem_free(pbuf, MAXPATHLEN);
1594		break;
1595	}
1596
1597	case MNTIOC_SHOWHIDDEN:
1598	{
1599		rw_enter(&mnp->mnt_contents, RW_WRITER);
1600		mnp->mnt_flags |= MNT_SHOWHIDDEN;
1601		rw_exit(&mnp->mnt_contents);
1602		break;
1603	}
1604
1605	case MNTIOC_GETMNTANY:
1606	{
1607		STRUCT_DECL(mntentbuf, embuf);	/* Our copy of user's embuf */
1608		STRUCT_DECL(extmnttab, ktab);	/* Out copy of user's emp */
1609		struct extmnttab *uemp;		/* uaddr of user's emp */
1610		char *ubufp;			/* uaddr of user's text buf */
1611		size_t ubufsize;		/* size of the above */
1612		struct extmnttab preftab;	/* our version of user's emp */
1613		char *prefbuf;			/* our copy of user's text */
1614		mntelem_t *elemp;		/* a database element */
1615		struct extmnttab *dbtabp;	/* element's extmnttab */
1616		char *dbbufp;			/* element's text buf */
1617		size_t dbbufsize;		/* size of the above */
1618		vtype_t type;			/* type, if any, of special */
1619
1620
1621		/*
1622		 * embuf is a struct embuf within the kernel. We copy into it
1623		 * the struct embuf supplied by the user.
1624		 */
1625		STRUCT_INIT(embuf, datamodel);
1626		if (copyin((void *) arg, STRUCT_BUF(embuf),
1627		    STRUCT_SIZE(embuf))) {
1628			error = EFAULT;
1629			break;
1630		}
1631		uemp = STRUCT_FGETP(embuf, mbuf_emp);
1632		ubufp = STRUCT_FGETP(embuf, mbuf_buf);
1633		ubufsize = STRUCT_FGET(embuf, mbuf_bufsize);
1634
1635		/*
1636		 * Check that the text buffer offered by the user is the
1637		 * agreed size.
1638		 */
1639		if (ubufsize != MNT_LINE_MAX) {
1640			error = EINVAL;
1641			break;
1642		}
1643
1644		/* Copy the user-supplied entry into a local buffer. */
1645		prefbuf = kmem_alloc(MNT_LINE_MAX, KM_SLEEP);
1646		if (copyin(ubufp, prefbuf, MNT_LINE_MAX)) {
1647			kmem_free(prefbuf, MNT_LINE_MAX);
1648			error = EFAULT;
1649			break;
1650		}
1651
1652		/* Ensure that any string within it is null-terminated. */
1653		*(prefbuf + MNT_LINE_MAX - 1) = 0;
1654
1655		/* Copy in the user-supplied mpref */
1656		STRUCT_INIT(ktab, datamodel);
1657		if (copyin(uemp, STRUCT_BUF(ktab),
1658		    SIZEOF_STRUCT(mnttab, datamodel))) {
1659			kmem_free(prefbuf, MNT_LINE_MAX);
1660			error = EFAULT;
1661			break;
1662		}
1663
1664		/*
1665		 * Copy the members of the user's pref struct into a local
1666		 * struct. The pointers need to be offset and verified to
1667		 * ensure that they lie within the bounds of the buffer.
1668		 */
1669		preftab.mnt_special = mntfs_import_addr(STRUCT_FGETP(ktab,
1670		    mnt_special), ubufp, prefbuf, MNT_LINE_MAX);
1671		preftab.mnt_mountp = mntfs_import_addr(STRUCT_FGETP(ktab,
1672		    mnt_mountp), ubufp, prefbuf, MNT_LINE_MAX);
1673		preftab.mnt_fstype = mntfs_import_addr(STRUCT_FGETP(ktab,
1674		    mnt_fstype), ubufp, prefbuf, MNT_LINE_MAX);
1675		preftab.mnt_mntopts = mntfs_import_addr(STRUCT_FGETP(ktab,
1676		    mnt_mntopts), ubufp, prefbuf, MNT_LINE_MAX);
1677		preftab.mnt_time = mntfs_import_addr(STRUCT_FGETP(ktab,
1678		    mnt_time), ubufp, prefbuf, MNT_LINE_MAX);
1679
1680		/*
1681		 * If the user specifies a mounted resource that is a special
1682		 * device then we capture its mode and major and minor numbers;
1683		 * cf. the block comment below.
1684		 */
1685		type = mntfs_special_info_string(preftab.mnt_special,
1686		    &preftab.mnt_major, &preftab.mnt_minor, cr);
1687
1688		rw_enter(&mnp->mnt_contents, RW_WRITER);
1689		if (snapp->mnts_nmnts == 0 ||
1690		    (snapp->mnts_flags & MNTS_REWIND))
1691			mntfs_snapshot(mnp, snapp);
1692
1693		/*
1694		 * This is the core functionality that implements getmntany().
1695		 * We walk through the mntfs database until we find an element
1696		 * matching the user's preferences that are contained in
1697		 * preftab. Typically, this means checking that the text
1698		 * matches. However, the mounted resource is special: if the
1699		 * user is looking for a special device then we must find a
1700		 * database element with the same major and minor numbers and
1701		 * the same type, i.e. VBLK or VCHR. The type is not recorded
1702		 * in the element because it cannot be inferred from the vfs_t.
1703		 * We therefore check the type of suitable candidates via
1704		 * mntfs_special_info_element(); since this calls into the
1705		 * underlying file system we make sure to drop the database lock
1706		 * first.
1707		 */
1708		elemp = snapp->mnts_next;
1709		rw_enter(dblockp, RW_READER);
1710		for (;;) {
1711			for (; elemp; elemp = mntfs_get_next_elem(snapp,
1712			    elemp)) {
1713				dbtabp = &elemp->mnte_tab;
1714				dbbufp = elemp->mnte_text;
1715				dbbufsize = elemp->mnte_text_size;
1716
1717				if (((type &&
1718				    dbtabp->mnt_major == preftab.mnt_major &&
1719				    dbtabp->mnt_minor == preftab.mnt_minor &&
1720				    MNTFS_REAL_FIELD(dbbufp)) ||
1721				    (!type && (!preftab.mnt_special ||
1722				    mntfs_same_word(preftab.mnt_special,
1723				    prefbuf, MNT_LINE_MAX, (off_t)0, dbbufp,
1724				    dbbufsize)))) &&
1725
1726				    (!preftab.mnt_mountp || mntfs_same_word(
1727				    preftab.mnt_mountp, prefbuf, MNT_LINE_MAX,
1728				    (off_t)dbtabp->mnt_mountp, dbbufp,
1729				    dbbufsize)) &&
1730
1731				    (!preftab.mnt_fstype || mntfs_same_word(
1732				    preftab.mnt_fstype, prefbuf, MNT_LINE_MAX,
1733				    (off_t)dbtabp->mnt_fstype, dbbufp,
1734				    dbbufsize)) &&
1735
1736				    (!preftab.mnt_mntopts || mntfs_same_word(
1737				    preftab.mnt_mntopts, prefbuf, MNT_LINE_MAX,
1738				    (off_t)dbtabp->mnt_mntopts, dbbufp,
1739				    dbbufsize)) &&
1740
1741				    (!preftab.mnt_time || mntfs_same_word(
1742				    preftab.mnt_time, prefbuf, MNT_LINE_MAX,
1743				    (off_t)dbtabp->mnt_time, dbbufp,
1744				    dbbufsize)))
1745					break;
1746			}
1747			rw_exit(dblockp);
1748
1749			if (elemp == NULL || type == 0 ||
1750			    type == mntfs_special_info_element(elemp, cr))
1751				break;
1752
1753			rw_enter(dblockp, RW_READER);
1754			elemp = mntfs_get_next_elem(snapp, elemp);
1755		}
1756
1757		kmem_free(prefbuf, MNT_LINE_MAX);
1758
1759		/* If we failed to find a match then return EOF. */
1760		if (elemp == NULL) {
1761			rw_exit(&mnp->mnt_contents);
1762			*rvalp = MNTFS_EOF;
1763			break;
1764		}
1765
1766		/*
1767		 * Check that the text buffer offered by the user will be large
1768		 * enough to accommodate the text for this entry.
1769		 */
1770		if (elemp->mnte_text_size > MNT_LINE_MAX) {
1771			rw_exit(&mnp->mnt_contents);
1772			*rvalp = MNTFS_TOOLONG;
1773			break;
1774		}
1775
1776		/*
1777		 * Populate the user's struct mnttab and text buffer using the
1778		 * element's contents.
1779		 */
1780		if (mntfs_copyout_elem(elemp, uemp, ubufp, cmd, datamodel)) {
1781			error = EFAULT;
1782		} else {
1783			rw_enter(dblockp, RW_READER);
1784			elemp = mntfs_get_next_elem(snapp, elemp);
1785			rw_exit(dblockp);
1786			snapp->mnts_next = elemp;
1787		}
1788		rw_exit(&mnp->mnt_contents);
1789		break;
1790	}
1791
1792	case MNTIOC_GETMNTENT:
1793	case MNTIOC_GETEXTMNTENT:
1794	{
1795		STRUCT_DECL(mntentbuf, embuf);	/* Our copy of user's embuf */
1796		struct extmnttab *uemp;		/* uaddr of user's emp */
1797		char *ubufp;			/* uaddr of user's text buf */
1798		size_t ubufsize;		/* size of the above */
1799		mntelem_t *elemp;		/* a database element */
1800
1801
1802		rw_enter(&mnp->mnt_contents, RW_WRITER);
1803		if (snapp->mnts_nmnts == 0 ||
1804		    (snapp->mnts_flags & MNTS_REWIND))
1805			mntfs_snapshot(mnp, snapp);
1806		if ((elemp = snapp->mnts_next) == NULL) {
1807			rw_exit(&mnp->mnt_contents);
1808			*rvalp = MNTFS_EOF;
1809			break;
1810		}
1811
1812		/*
1813		 * embuf is a struct embuf within the kernel. We copy into it
1814		 * the struct embuf supplied by the user.
1815		 */
1816		STRUCT_INIT(embuf, datamodel);
1817		if (copyin((void *) arg, STRUCT_BUF(embuf),
1818		    STRUCT_SIZE(embuf))) {
1819			rw_exit(&mnp->mnt_contents);
1820			error = EFAULT;
1821			break;
1822		}
1823		uemp = STRUCT_FGETP(embuf, mbuf_emp);
1824		ubufp = STRUCT_FGETP(embuf, mbuf_buf);
1825		ubufsize = STRUCT_FGET(embuf, mbuf_bufsize);
1826
1827		/*
1828		 * Check that the text buffer offered by the user will be large
1829		 * enough to accommodate the text for this entry.
1830		 */
1831		if (elemp->mnte_text_size > ubufsize) {
1832			rw_exit(&mnp->mnt_contents);
1833			*rvalp = MNTFS_TOOLONG;
1834			break;
1835		}
1836
1837		/*
1838		 * Populate the user's struct mnttab and text buffer using the
1839		 * element's contents.
1840		 */
1841		if (mntfs_copyout_elem(elemp, uemp, ubufp, cmd, datamodel)) {
1842			error = EFAULT;
1843		} else {
1844			rw_enter(dblockp, RW_READER);
1845			elemp = mntfs_get_next_elem(snapp, elemp);
1846			rw_exit(dblockp);
1847			snapp->mnts_next = elemp;
1848		}
1849		rw_exit(&mnp->mnt_contents);
1850		break;
1851	}
1852
1853	default:
1854		error = EINVAL;
1855		break;
1856	}
1857
1858	return (error);
1859}
1860
1861/*
1862 * mntfs provides a new vnode for each open(2). Two vnodes will represent the
1863 * same instance of /etc/mnttab if they share the same (zone-specific) vfs.
1864 */
1865/* ARGSUSED */
1866int
1867mntcmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct)
1868{
1869	return (vp1 != NULL && vp2 != NULL && vp1->v_vfsp == vp2->v_vfsp);
1870}
1871
1872/*
1873 * /mntfs vnode operations vector
1874 */
1875const fs_operation_def_t mnt_vnodeops_template[] = {
1876	VOPNAME_OPEN,		{ .vop_open = mntopen },
1877	VOPNAME_CLOSE,		{ .vop_close = mntclose },
1878	VOPNAME_READ,		{ .vop_read = mntread },
1879	VOPNAME_IOCTL,		{ .vop_ioctl = mntioctl },
1880	VOPNAME_GETATTR,	{ .vop_getattr = mntgetattr },
1881	VOPNAME_ACCESS,		{ .vop_access = mntaccess },
1882	VOPNAME_FSYNC,		{ .vop_fsync = mntfsync },
1883	VOPNAME_INACTIVE,	{ .vop_inactive = mntinactive },
1884	VOPNAME_SEEK,		{ .vop_seek = mntseek },
1885	VOPNAME_POLL,		{ .vop_poll = mntpoll },
1886	VOPNAME_CMP,		{ .vop_cmp = mntcmp },
1887	VOPNAME_DISPOSE,	{ .error = fs_error },
1888	VOPNAME_SHRLOCK,	{ .error = fs_error },
1889	NULL,			NULL
1890};
1891