1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24 * Copyright (c) 2014 Integros [integros.com]
25 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
26 * Copyright 2019 Joyent, Inc.
27 */
28
29/* Portions Copyright 2010 Robert Milkowski */
30
31#include <sys/types.h>
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/sysmacros.h>
35#include <sys/kmem.h>
36#include <sys/pathname.h>
37#include <sys/vnode.h>
38#include <sys/vfs.h>
39#include <sys/vfs_opreg.h>
40#include <sys/mntent.h>
41#include <sys/mount.h>
42#include <sys/cmn_err.h>
43#include "fs/fs_subr.h"
44#include <sys/zfs_znode.h>
45#include <sys/zfs_dir.h>
46#include <sys/zil.h>
47#include <sys/fs/zfs.h>
48#include <sys/dmu.h>
49#include <sys/dsl_prop.h>
50#include <sys/dsl_dataset.h>
51#include <sys/dsl_deleg.h>
52#include <sys/spa.h>
53#include <sys/zap.h>
54#include <sys/sa.h>
55#include <sys/sa_impl.h>
56#include <sys/varargs.h>
57#include <sys/policy.h>
58#include <sys/atomic.h>
59#include <sys/mkdev.h>
60#include <sys/modctl.h>
61#include <sys/refstr.h>
62#include <sys/zfs_ioctl.h>
63#include <sys/zfs_ctldir.h>
64#include <sys/zfs_fuid.h>
65#include <sys/bootconf.h>
66#include <sys/sunddi.h>
67#include <sys/dnlc.h>
68#include <sys/dmu_objset.h>
69#include <sys/spa_boot.h>
70#include "zfs_comutil.h"
71
72int zfsfstype;
73vfsops_t *zfs_vfsops = NULL;
74static major_t zfs_major;
75static minor_t zfs_minor;
76static kmutex_t	zfs_dev_mtx;
77
78extern int sys_shutdown;
79
80static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr);
81static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr);
82static int zfs_mountroot(vfs_t *vfsp, enum whymountroot);
83static int zfs_root(vfs_t *vfsp, vnode_t **vpp);
84static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp);
85static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp);
86static void zfs_freevfs(vfs_t *vfsp);
87
88static const fs_operation_def_t zfs_vfsops_template[] = {
89	VFSNAME_MOUNT,		{ .vfs_mount = zfs_mount },
90	VFSNAME_MOUNTROOT,	{ .vfs_mountroot = zfs_mountroot },
91	VFSNAME_UNMOUNT,	{ .vfs_unmount = zfs_umount },
92	VFSNAME_ROOT,		{ .vfs_root = zfs_root },
93	VFSNAME_STATVFS,	{ .vfs_statvfs = zfs_statvfs },
94	VFSNAME_SYNC,		{ .vfs_sync = zfs_sync },
95	VFSNAME_VGET,		{ .vfs_vget = zfs_vget },
96	VFSNAME_FREEVFS,	{ .vfs_freevfs = zfs_freevfs },
97	NULL,			NULL
98};
99
/*
 * We need to keep a count of active filesystems.
 * This is necessary to prevent our module
 * from being unloaded after a "umount -f".
 */
105static uint32_t	zfs_active_fs_count = 0;
106
107static char *noatime_cancel[] = { MNTOPT_ATIME, NULL };
108static char *atime_cancel[] = { MNTOPT_NOATIME, NULL };
109static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL };
110static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL };
111
112/*
113 * MO_DEFAULT is not used since the default value is determined
114 * by the equivalent property.
115 */
116static mntopt_t mntopts[] = {
117	{ MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, NULL },
118	{ MNTOPT_XATTR, xattr_cancel, NULL, 0, NULL },
119	{ MNTOPT_NOATIME, noatime_cancel, NULL, 0, NULL },
120	{ MNTOPT_ATIME, atime_cancel, NULL, 0, NULL }
121};
122
123static mntopts_t zfs_mntopts = {
124	sizeof (mntopts) / sizeof (mntopt_t),
125	mntopts
126};
127
128/*ARGSUSED*/
129int
130zfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
131{
132	/*
133	 * Data integrity is job one.  We don't want a compromised kernel
134	 * writing to the storage pool, so we never sync during panic.
135	 */
136	if (panicstr)
137		return (0);
138
139	/*
140	 * SYNC_ATTR is used by fsflush() to force old filesystems like UFS
141	 * to sync metadata, which they would otherwise cache indefinitely.
142	 * Semantically, the only requirement is that the sync be initiated.
143	 * The DMU syncs out txgs frequently, so there's nothing to do.
144	 */
145	if (flag & SYNC_ATTR)
146		return (0);
147
148	if (vfsp != NULL) {
149		/*
150		 * Sync a specific filesystem.
151		 */
152		zfsvfs_t *zfsvfs = vfsp->vfs_data;
153		dsl_pool_t *dp;
154
155		ZFS_ENTER(zfsvfs);
156		dp = dmu_objset_pool(zfsvfs->z_os);
157
158		/*
159		 * If the system is shutting down, then skip any
160		 * filesystems which may exist on a suspended pool.
161		 */
162		if (sys_shutdown && spa_suspended(dp->dp_spa)) {
163			ZFS_EXIT(zfsvfs);
164			return (0);
165		}
166
167		if (zfsvfs->z_log != NULL)
168			zil_commit(zfsvfs->z_log, 0);
169
170		ZFS_EXIT(zfsvfs);
171	} else {
172		/*
173		 * Sync all ZFS filesystems.  This is what happens when you
174		 * run sync(1M).  Unlike other filesystems, ZFS honors the
175		 * request by waiting for all pools to commit all dirty data.
176		 */
177		spa_sync_allpools();
178	}
179
180	return (0);
181}
182
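/*
 * Allocate a unique device number for a new mount.  We cycle through
 * minor numbers under zfs_dev_mtx, skipping any that are already
 * mounted, and switch to a freshly getudev()'ed major number once the
 * current major's minor space is exhausted.
 */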
183static int
184zfs_create_unique_device(dev_t *dev)
185{
186	major_t new_major;
187
188	do {
189		ASSERT3U(zfs_minor, <=, MAXMIN32);
190		minor_t start = zfs_minor;
191		do {
192			mutex_enter(&zfs_dev_mtx);
193			if (zfs_minor >= MAXMIN32) {
194				/*
195				 * If we're still using the real major
196				 * keep out of /dev/zfs and /dev/zvol minor
197				 * number space.  If we're using a getudev()'ed
198				 * major number, we can use all of its minors.
199				 */
200				if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
201					zfs_minor = ZFS_MIN_MINOR;
202				else
203					zfs_minor = 0;
204			} else {
205				zfs_minor++;
206			}
207			*dev = makedevice(zfs_major, zfs_minor);
208			mutex_exit(&zfs_dev_mtx);
209		} while (vfs_devismounted(*dev) && zfs_minor != start);
210		if (zfs_minor == start) {
211			/*
212			 * We are using all ~262,000 minor numbers for the
213			 * current major number.  Create a new major number.
214			 */
215			if ((new_major = getudev()) == (major_t)-1) {
216				cmn_err(CE_WARN,
217				    "zfs_mount: Can't get unique major "
218				    "device number.");
219				return (-1);
220			}
221			mutex_enter(&zfs_dev_mtx);
222			zfs_major = new_major;
223			zfs_minor = 0;
224
225			mutex_exit(&zfs_dev_mtx);
226		} else {
227			break;
228		}
229		/* CONSTANTCONDITION */
230	} while (1);
231
232	return (0);
233}
234
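/*
 * The *_changed_cb() callbacks below are invoked when the corresponding
 * dataset property changes (see zfs_register_callbacks()) and keep the
 * cached zfsvfs state and VFS mount options in sync.
 */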
235static void
236atime_changed_cb(void *arg, uint64_t newval)
237{
238	zfsvfs_t *zfsvfs = arg;
239
240	if (newval == TRUE) {
241		zfsvfs->z_atime = TRUE;
242		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
243		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
244	} else {
245		zfsvfs->z_atime = FALSE;
246		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
247		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
248	}
249}
250
251static void
252xattr_changed_cb(void *arg, uint64_t newval)
253{
254	zfsvfs_t *zfsvfs = arg;
255
256	if (newval == TRUE) {
257		/* XXX locking on vfs_flag? */
258		zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
259		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
260		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
261	} else {
262		/* XXX locking on vfs_flag? */
263		zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
264		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
265		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
266	}
267}
268
269static void
270blksz_changed_cb(void *arg, uint64_t newval)
271{
272	zfsvfs_t *zfsvfs = arg;
273	ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
274	ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
275	ASSERT(ISP2(newval));
276
277	zfsvfs->z_max_blksz = newval;
278	zfsvfs->z_vfs->vfs_bsize = newval;
279}
280
281static void
282readonly_changed_cb(void *arg, uint64_t newval)
283{
284	zfsvfs_t *zfsvfs = arg;
285
286	if (newval) {
287		/* XXX locking on vfs_flag? */
288		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
289		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
290		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
291	} else {
292		/* XXX locking on vfs_flag? */
293		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
294		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
295		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
296	}
297}
298
299static void
300devices_changed_cb(void *arg, uint64_t newval)
301{
302	zfsvfs_t *zfsvfs = arg;
303
304	if (newval == FALSE) {
305		zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES;
306		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES);
307		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0);
308	} else {
309		zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES;
310		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES);
311		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0);
312	}
313}
314
315static void
316setuid_changed_cb(void *arg, uint64_t newval)
317{
318	zfsvfs_t *zfsvfs = arg;
319
320	if (newval == FALSE) {
321		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
322		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
323		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
324	} else {
325		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
326		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
327		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
328	}
329}
330
331static void
332exec_changed_cb(void *arg, uint64_t newval)
333{
334	zfsvfs_t *zfsvfs = arg;
335
336	if (newval == FALSE) {
337		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
338		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
339		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
340	} else {
341		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
342		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
343		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
344	}
345}
346
/*
 * The nbmand mount option can only be changed at mount time.
 * We can't allow it to be toggled on a live file system, or CIFS
 * clients may see incorrect behavior.
 *
 * This property isn't registered via dsl_prop_register(), but this
 * callback will be called when a file system is first mounted.
 */
355static void
356nbmand_changed_cb(void *arg, uint64_t newval)
357{
358	zfsvfs_t *zfsvfs = arg;
359	if (newval == FALSE) {
360		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
361		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
362	} else {
363		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
364		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
365	}
366}
367
368static void
369snapdir_changed_cb(void *arg, uint64_t newval)
370{
371	zfsvfs_t *zfsvfs = arg;
372
373	zfsvfs->z_show_ctldir = newval;
374}
375
376static void
377vscan_changed_cb(void *arg, uint64_t newval)
378{
379	zfsvfs_t *zfsvfs = arg;
380
381	zfsvfs->z_vscan = newval;
382}
383
384static void
385acl_mode_changed_cb(void *arg, uint64_t newval)
386{
387	zfsvfs_t *zfsvfs = arg;
388
389	zfsvfs->z_acl_mode = newval;
390}
391
392static void
393acl_inherit_changed_cb(void *arg, uint64_t newval)
394{
395	zfsvfs_t *zfsvfs = arg;
396
397	zfsvfs->z_acl_inherit = newval;
398}
399
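/*
 * Register property callbacks for this dataset and then re-apply any
 * temporary mount-option overrides, since registering the callbacks
 * resets the options to the property values.
 */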
400static int
401zfs_register_callbacks(vfs_t *vfsp)
402{
403	struct dsl_dataset *ds = NULL;
404	objset_t *os = NULL;
405	zfsvfs_t *zfsvfs = NULL;
406	uint64_t nbmand;
407	boolean_t readonly = B_FALSE;
408	boolean_t do_readonly = B_FALSE;
409	boolean_t setuid = B_FALSE;
410	boolean_t do_setuid = B_FALSE;
411	boolean_t exec = B_FALSE;
412	boolean_t do_exec = B_FALSE;
413	boolean_t devices = B_FALSE;
414	boolean_t do_devices = B_FALSE;
415	boolean_t xattr = B_FALSE;
416	boolean_t do_xattr = B_FALSE;
417	boolean_t atime = B_FALSE;
418	boolean_t do_atime = B_FALSE;
419	int error = 0;
420
421	ASSERT(vfsp);
422	zfsvfs = vfsp->vfs_data;
423	ASSERT(zfsvfs);
424	os = zfsvfs->z_os;
425
426	/*
427	 * The act of registering our callbacks will destroy any mount
428	 * options we may have.  In order to enable temporary overrides
429	 * of mount options, we stash away the current values and
430	 * restore them after we register the callbacks.
431	 */
432	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
433	    !spa_writeable(dmu_objset_spa(os))) {
434		readonly = B_TRUE;
435		do_readonly = B_TRUE;
436	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
437		readonly = B_FALSE;
438		do_readonly = B_TRUE;
439	}
440	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
441		devices = B_FALSE;
442		setuid = B_FALSE;
443		do_devices = B_TRUE;
444		do_setuid = B_TRUE;
445	} else {
446		if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
447			devices = B_FALSE;
448			do_devices = B_TRUE;
449		} else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) {
450			devices = B_TRUE;
451			do_devices = B_TRUE;
452		}
453
454		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
455			setuid = B_FALSE;
456			do_setuid = B_TRUE;
457		} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
458			setuid = B_TRUE;
459			do_setuid = B_TRUE;
460		}
461	}
462	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
463		exec = B_FALSE;
464		do_exec = B_TRUE;
465	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
466		exec = B_TRUE;
467		do_exec = B_TRUE;
468	}
469	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
470		xattr = B_FALSE;
471		do_xattr = B_TRUE;
472	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
473		xattr = B_TRUE;
474		do_xattr = B_TRUE;
475	}
476	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
477		atime = B_FALSE;
478		do_atime = B_TRUE;
479	} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
480		atime = B_TRUE;
481		do_atime = B_TRUE;
482	}
483
	/*
	 * nbmand is a special property.  It can only be changed at
	 * mount time, so if neither mount option was specified we
	 * fall back to the dataset's current property value.
	 */
491	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
492		nbmand = B_FALSE;
493	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
494		nbmand = B_TRUE;
495	} else {
496		char osname[ZFS_MAX_DATASET_NAME_LEN];
497
498		dmu_objset_name(os, osname);
499		if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand,
500		    NULL)) {
501			return (error);
502		}
503	}
504
505	/*
506	 * Register property callbacks.
507	 *
508	 * It would probably be fine to just check for i/o error from
509	 * the first prop_register(), but I guess I like to go
510	 * overboard...
511	 */
512	ds = dmu_objset_ds(os);
513	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
514	error = dsl_prop_register(ds,
515	    zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
516	error = error ? error : dsl_prop_register(ds,
517	    zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
518	error = error ? error : dsl_prop_register(ds,
519	    zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
520	error = error ? error : dsl_prop_register(ds,
521	    zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
522	error = error ? error : dsl_prop_register(ds,
523	    zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs);
524	error = error ? error : dsl_prop_register(ds,
525	    zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
526	error = error ? error : dsl_prop_register(ds,
527	    zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
528	error = error ? error : dsl_prop_register(ds,
529	    zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
530	error = error ? error : dsl_prop_register(ds,
531	    zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
532	error = error ? error : dsl_prop_register(ds,
533	    zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
534	    zfsvfs);
535	error = error ? error : dsl_prop_register(ds,
536	    zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
537	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
538	if (error)
539		goto unregister;
540
541	/*
542	 * Invoke our callbacks to restore temporary mount options.
543	 */
544	if (do_readonly)
545		readonly_changed_cb(zfsvfs, readonly);
546	if (do_setuid)
547		setuid_changed_cb(zfsvfs, setuid);
548	if (do_exec)
549		exec_changed_cb(zfsvfs, exec);
550	if (do_devices)
551		devices_changed_cb(zfsvfs, devices);
552	if (do_xattr)
553		xattr_changed_cb(zfsvfs, xattr);
554	if (do_atime)
555		atime_changed_cb(zfsvfs, atime);
556
557	nbmand_changed_cb(zfsvfs, nbmand);
558
559	return (0);
560
561unregister:
562	dsl_prop_unregister_all(ds, zfsvfs);
563	return (error);
564}
565
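/*
 * DMU callback that derives the user, group, and project IDs to charge
 * for an object from its bonus buffer (either a legacy znode_phys_t or
 * an SA header), handling byte-swapped SA data from other-endian pools.
 */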
566static int
567zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
568    uint64_t *userp, uint64_t *groupp, uint64_t *projectp)
569{
570	sa_hdr_phys_t sa;
571	sa_hdr_phys_t *sap = data;
572	uint64_t flags;
573	int hdrsize;
574	boolean_t swap = B_FALSE;
575
576	/*
577	 * Is it a valid type of object to track?
578	 */
579	if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
580		return (SET_ERROR(ENOENT));
581
	/*
	 * If we have a NULL data pointer, assume the IDs aren't
	 * changing and return EEXIST to tell the DMU to use the
	 * same IDs.
	 */
588	if (data == NULL)
589		return (SET_ERROR(EEXIST));
590
591	if (bonustype == DMU_OT_ZNODE) {
592		znode_phys_t *znp = data;
593		*userp = znp->zp_uid;
594		*groupp = znp->zp_gid;
595		*projectp = ZFS_DEFAULT_PROJID;
596		return (0);
597	}
598
599	if (sap->sa_magic == 0) {
600		/*
601		 * This should only happen for newly created files
602		 * that haven't had the znode data filled in yet.
603		 */
604		*userp = 0;
605		*groupp = 0;
606		*projectp = ZFS_DEFAULT_PROJID;
607		return (0);
608	}
609
610	sa = *sap;
611	if (sa.sa_magic == BSWAP_32(SA_MAGIC)) {
612		sa.sa_magic = SA_MAGIC;
613		sa.sa_layout_info = BSWAP_16(sa.sa_layout_info);
614		swap = B_TRUE;
615	} else {
616		VERIFY3U(sa.sa_magic, ==, SA_MAGIC);
617	}
618
619	hdrsize = sa_hdrsize(&sa);
620	VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t));
621
622	*userp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_UID_OFFSET));
623	*groupp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_GID_OFFSET));
624	flags = *((uint64_t *)((uintptr_t)data + hdrsize + SA_FLAGS_OFFSET));
625	if (swap)
626		flags = BSWAP_64(flags);
627
628	if (flags & ZFS_PROJID)
629		*projectp = *((uint64_t *)((uintptr_t)data + hdrsize +
630		    SA_PROJID_OFFSET));
631	else
632		*projectp = ZFS_DEFAULT_PROJID;
633
634	if (swap) {
635		*userp = BSWAP_64(*userp);
636		*groupp = BSWAP_64(*groupp);
637		*projectp = BSWAP_64(*projectp);
638	}
639	return (0);
640}
641
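/*
 * Split a FUID string into its domain (looked up by index) and RID.
 */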
642static void
643fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
644    char *domainbuf, int buflen, uid_t *ridp)
645{
646	uint64_t fuid;
647	const char *domain;
648
649	fuid = zfs_strtonum(fuidstr, NULL);
650
651	domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));
652	if (domain)
653		(void) strlcpy(domainbuf, domain, buflen);
654	else
655		domainbuf[0] = '\0';
656	*ridp = FUID_RID(fuid);
657}
658
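/*
 * Map a userquota property to the ZAP object holding its entries, or
 * ZFS_NO_OBJECT if this filesystem has no such object.
 */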
659static uint64_t
660zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
661{
662	switch (type) {
663	case ZFS_PROP_USERUSED:
664	case ZFS_PROP_USEROBJUSED:
665		return (DMU_USERUSED_OBJECT);
666	case ZFS_PROP_GROUPUSED:
667	case ZFS_PROP_GROUPOBJUSED:
668		return (DMU_GROUPUSED_OBJECT);
669	case ZFS_PROP_PROJECTUSED:
670	case ZFS_PROP_PROJECTOBJUSED:
671		return (DMU_PROJECTUSED_OBJECT);
672	case ZFS_PROP_USERQUOTA:
673		return (zfsvfs->z_userquota_obj);
674	case ZFS_PROP_GROUPQUOTA:
675		return (zfsvfs->z_groupquota_obj);
676	case ZFS_PROP_USEROBJQUOTA:
677		return (zfsvfs->z_userobjquota_obj);
678	case ZFS_PROP_GROUPOBJQUOTA:
679		return (zfsvfs->z_groupobjquota_obj);
680	case ZFS_PROP_PROJECTQUOTA:
681		return (zfsvfs->z_projectquota_obj);
682	case ZFS_PROP_PROJECTOBJQUOTA:
683		return (zfsvfs->z_projectobjquota_obj);
684	default:
685		return (ZFS_NO_OBJECT);
686	}
687}
688
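/*
 * Walk the accounting or quota ZAP object for the given property and
 * fill vbuf with zfs_useracct_t entries, starting at *cookiep and
 * updating *cookiep and *bufsizep so the caller can resume.
 */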
689int
690zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
691    uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
692{
693	int error;
694	zap_cursor_t zc;
695	zap_attribute_t za;
696	zfs_useracct_t *buf = vbuf;
697	uint64_t obj;
698	int offset = 0;
699
700	if (!dmu_objset_userspace_present(zfsvfs->z_os))
701		return (SET_ERROR(ENOTSUP));
702
703	if ((type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTUSED ||
704	    type == ZFS_PROP_PROJECTOBJQUOTA ||
705	    type == ZFS_PROP_PROJECTOBJUSED) &&
706	    !dmu_objset_projectquota_present(zfsvfs->z_os))
707		return (SET_ERROR(ENOTSUP));
708
709	if ((type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED ||
710	    type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA ||
711	    type == ZFS_PROP_PROJECTOBJUSED ||
712	    type == ZFS_PROP_PROJECTOBJQUOTA) &&
713	    !dmu_objset_userobjspace_present(zfsvfs->z_os))
714		return (SET_ERROR(ENOTSUP));
715
716	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
717	if (obj == ZFS_NO_OBJECT) {
718		*bufsizep = 0;
719		return (0);
720	}
721
722	if (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED ||
723	    type == ZFS_PROP_PROJECTOBJUSED)
724		offset = DMU_OBJACCT_PREFIX_LEN;
725
726	for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep);
727	    (error = zap_cursor_retrieve(&zc, &za)) == 0;
728	    zap_cursor_advance(&zc)) {
729		if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) >
730		    *bufsizep)
731			break;
732
		/*
		 * Skip object-quota entries (ZAP names with the
		 * DMU_OBJACCT_PREFIX prefix) when dealing with block
		 * quotas, and vice versa.
		 */
737		if ((offset > 0) != (strncmp(za.za_name, DMU_OBJACCT_PREFIX,
738		    DMU_OBJACCT_PREFIX_LEN) == 0))
739			continue;
740
741		fuidstr_to_sid(zfsvfs, za.za_name + offset,
742		    buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid);
743
744		buf->zu_space = za.za_first_integer;
745		buf++;
746	}
747	if (error == ENOENT)
748		error = 0;
749
750	ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep);
751	*bufsizep = (uintptr_t)buf - (uintptr_t)vbuf;
752	*cookiep = zap_cursor_serialize(&zc);
753	zap_cursor_fini(&zc);
754	return (error);
755}
756
/*
 * buf must be big enough (e.g., 32 bytes).
 */
760static int
761id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
762    char *buf, boolean_t addok)
763{
764	uint64_t fuid;
765	int domainid = 0;
766
767	if (domain && domain[0]) {
768		domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
769		if (domainid == -1)
770			return (SET_ERROR(ENOENT));
771	}
772	fuid = FUID_ENCODE(domainid, rid);
773	(void) sprintf(buf, "%llx", (longlong_t)fuid);
774	return (0);
775}
776
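/*
 * Look up the usage or quota value for a single (domain, rid) identity;
 * a missing entry yields success with *valp left at zero.
 */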
777int
778zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
779    const char *domain, uint64_t rid, uint64_t *valp)
780{
781	char buf[20 + DMU_OBJACCT_PREFIX_LEN];
782	int offset = 0;
783	int err;
784	uint64_t obj;
785
786	*valp = 0;
787
788	if (!dmu_objset_userspace_present(zfsvfs->z_os))
789		return (SET_ERROR(ENOTSUP));
790
791	if ((type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED ||
792	    type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA ||
793	    type == ZFS_PROP_PROJECTOBJUSED ||
794	    type == ZFS_PROP_PROJECTOBJQUOTA) &&
795	    !dmu_objset_userobjspace_present(zfsvfs->z_os))
796		return (SET_ERROR(ENOTSUP));
797
798	if (type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTUSED ||
799	    type == ZFS_PROP_PROJECTOBJQUOTA ||
800	    type == ZFS_PROP_PROJECTOBJUSED) {
801		if (!dmu_objset_projectquota_present(zfsvfs->z_os))
802			return (SET_ERROR(ENOTSUP));
803		if (!zpl_is_valid_projid(rid))
804			return (SET_ERROR(EINVAL));
805	}
806
807	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
808	if (obj == ZFS_NO_OBJECT)
809		return (0);
810
811	if (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED ||
812	    type == ZFS_PROP_PROJECTOBJUSED) {
813		strncpy(buf, DMU_OBJACCT_PREFIX, DMU_OBJACCT_PREFIX_LEN);
814		offset = DMU_OBJACCT_PREFIX_LEN;
815	}
816
817	err = id_to_fuidstr(zfsvfs, domain, rid, buf + offset, B_FALSE);
818	if (err)
819		return (err);
820
821	err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp);
822	if (err == ENOENT)
823		err = 0;
824	return (err);
825}
826
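/*
 * Set a user/group/project quota, creating the quota ZAP object on
 * first use; a quota of zero removes the entry.
 */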
827int
828zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
829    const char *domain, uint64_t rid, uint64_t quota)
830{
831	char buf[32];
832	int err;
833	dmu_tx_t *tx;
834	uint64_t *objp;
835	boolean_t fuid_dirtied;
836
837	if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
838		return (SET_ERROR(ENOTSUP));
839
840	switch (type) {
841	case ZFS_PROP_USERQUOTA:
842		objp = &zfsvfs->z_userquota_obj;
843		break;
844	case ZFS_PROP_GROUPQUOTA:
845		objp = &zfsvfs->z_groupquota_obj;
846		break;
847	case ZFS_PROP_USEROBJQUOTA:
848		objp = &zfsvfs->z_userobjquota_obj;
849		break;
850	case ZFS_PROP_GROUPOBJQUOTA:
851		objp = &zfsvfs->z_groupobjquota_obj;
852		break;
853	case ZFS_PROP_PROJECTQUOTA:
854		if (!dmu_objset_projectquota_enabled(zfsvfs->z_os))
855			return (SET_ERROR(ENOTSUP));
856		if (!zpl_is_valid_projid(rid))
857			return (SET_ERROR(EINVAL));
858
859		objp = &zfsvfs->z_projectquota_obj;
860		break;
861	case ZFS_PROP_PROJECTOBJQUOTA:
862		if (!dmu_objset_projectquota_enabled(zfsvfs->z_os))
863			return (SET_ERROR(ENOTSUP));
864		if (!zpl_is_valid_projid(rid))
865			return (SET_ERROR(EINVAL));
866
867		objp = &zfsvfs->z_projectobjquota_obj;
868		break;
869	default:
870		return (SET_ERROR(EINVAL));
871	}
872
873	err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE);
874	if (err)
875		return (err);
876	fuid_dirtied = zfsvfs->z_fuid_dirty;
877
878	tx = dmu_tx_create(zfsvfs->z_os);
879	dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
880	if (*objp == 0) {
881		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
882		    zfs_userquota_prop_prefixes[type]);
883	}
884	if (fuid_dirtied)
885		zfs_fuid_txhold(zfsvfs, tx);
886	err = dmu_tx_assign(tx, TXG_WAIT);
887	if (err) {
888		dmu_tx_abort(tx);
889		return (err);
890	}
891
892	mutex_enter(&zfsvfs->z_lock);
893	if (*objp == 0) {
894		*objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
895		    DMU_OT_NONE, 0, tx);
896		VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
897		    zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
898	}
899	mutex_exit(&zfsvfs->z_lock);
900
901	if (quota == 0) {
902		err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
903		if (err == ENOENT)
904			err = 0;
905	} else {
906		err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
907	}
908	ASSERT(err == 0);
909	if (fuid_dirtied)
910		zfs_fuid_sync(zfsvfs, tx);
911	dmu_tx_commit(tx);
912	return (err);
913}
914
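/*
 * Return B_TRUE if the given ID is at or over its object-count quota
 * in the indicated accounting object (user, group, or project).
 */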
915boolean_t
916zfs_id_overobjquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id)
917{
918	char buf[20 + DMU_OBJACCT_PREFIX_LEN];
919	uint64_t used, quota, quotaobj;
920	int err;
921
922	if (!dmu_objset_userobjspace_present(zfsvfs->z_os)) {
923		if (dmu_objset_userobjspace_upgradable(zfsvfs->z_os))
924			dmu_objset_id_quota_upgrade(zfsvfs->z_os);
925		return (B_FALSE);
926	}
927
928	if (usedobj == DMU_PROJECTUSED_OBJECT) {
929		if (!dmu_objset_projectquota_present(zfsvfs->z_os)) {
930			if (dmu_objset_projectquota_upgradable(zfsvfs->z_os)) {
931				dsl_pool_config_enter(
932				    dmu_objset_pool(zfsvfs->z_os), FTAG);
933				dmu_objset_id_quota_upgrade(zfsvfs->z_os);
934				dsl_pool_config_exit(
935				    dmu_objset_pool(zfsvfs->z_os), FTAG);
936			}
937			return (B_FALSE);
938		}
939		quotaobj = zfsvfs->z_projectobjquota_obj;
940	} else if (usedobj == DMU_USERUSED_OBJECT) {
941		quotaobj = zfsvfs->z_userobjquota_obj;
942	} else if (usedobj == DMU_GROUPUSED_OBJECT) {
943		quotaobj = zfsvfs->z_groupobjquota_obj;
944	} else {
945		return (B_FALSE);
946	}
947	if (quotaobj == 0 || zfsvfs->z_replay)
948		return (B_FALSE);
949
950	(void) sprintf(buf, "%llx", (longlong_t)id);
951	err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
952	if (err != 0)
953		return (B_FALSE);
954
955	(void) sprintf(buf, DMU_OBJACCT_PREFIX "%llx", (longlong_t)id);
956	err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
957	if (err != 0)
958		return (B_FALSE);
959	return (used >= quota);
960}
961
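/*
 * Return B_TRUE if the given ID is at or over its block (space) quota
 * in the indicated accounting object.
 */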
962boolean_t
963zfs_id_overblockquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id)
964{
965	char buf[20];
966	uint64_t used, quota, quotaobj;
967	int err;
968
969	if (usedobj == DMU_PROJECTUSED_OBJECT) {
970		if (!dmu_objset_projectquota_present(zfsvfs->z_os)) {
971			if (dmu_objset_projectquota_upgradable(zfsvfs->z_os)) {
972				dsl_pool_config_enter(
973				    dmu_objset_pool(zfsvfs->z_os), FTAG);
974				dmu_objset_id_quota_upgrade(zfsvfs->z_os);
975				dsl_pool_config_exit(
976				    dmu_objset_pool(zfsvfs->z_os), FTAG);
977			}
978			return (B_FALSE);
979		}
980		quotaobj = zfsvfs->z_projectquota_obj;
981	} else if (usedobj == DMU_USERUSED_OBJECT) {
982		quotaobj = zfsvfs->z_userquota_obj;
983	} else if (usedobj == DMU_GROUPUSED_OBJECT) {
984		quotaobj = zfsvfs->z_groupquota_obj;
985	} else {
986		return (B_FALSE);
987	}
988	if (quotaobj == 0 || zfsvfs->z_replay)
989		return (B_FALSE);
990
991	(void) sprintf(buf, "%llx", (longlong_t)id);
992	err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
993	if (err != 0)
994		return (B_FALSE);
995
996	err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
997	if (err != 0)
998		return (B_FALSE);
999	return (used >= quota);
1000}
1001
1002boolean_t
1003zfs_id_overquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id)
1004{
1005	return (zfs_id_overblockquota(zfsvfs, usedobj, id) ||
1006	    zfs_id_overobjquota(zfsvfs, usedobj, id));
1007}
1008
1009/*
1010 * Associate this zfsvfs with the given objset, which must be owned.
1011 * This will cache a bunch of on-disk state from the objset in the
1012 * zfsvfs.
1013 */
1014static int
1015zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
1016{
1017	int error;
1018	uint64_t val;
1019
1020	zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
1021	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
1022	zfsvfs->z_os = os;
1023
1024	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
1025	if (error != 0)
1026		return (error);
1027	if (zfsvfs->z_version >
1028	    zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
		(void) printf("Can't mount a version %lld file system "
		    "on a version %lld pool.  Pool must be upgraded to mount "
		    "this file system.\n", (u_longlong_t)zfsvfs->z_version,
		    (u_longlong_t)spa_version(dmu_objset_spa(os)));
1033		return (SET_ERROR(ENOTSUP));
1034	}
1035	error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
1036	if (error != 0)
1037		return (error);
1038	zfsvfs->z_norm = (int)val;
1039
1040	error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
1041	if (error != 0)
1042		return (error);
1043	zfsvfs->z_utf8 = (val != 0);
1044
1045	error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
1046	if (error != 0)
1047		return (error);
1048	zfsvfs->z_case = (uint_t)val;
1049
1050	/*
1051	 * Fold case on file systems that are always or sometimes case
1052	 * insensitive.
1053	 */
1054	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
1055	    zfsvfs->z_case == ZFS_CASE_MIXED)
1056		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
1057
1058	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
1059	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
1060
1061	uint64_t sa_obj = 0;
1062	if (zfsvfs->z_use_sa) {
1063		/* should either have both of these objects or none */
1064		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
1065		    &sa_obj);
1066		if (error != 0)
1067			return (error);
1068	}
1069
1070	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
1071	    &zfsvfs->z_attr_table);
1072	if (error != 0)
1073		return (error);
1074
1075	if (zfsvfs->z_version >= ZPL_VERSION_SA)
1076		sa_register_update_callback(os, zfs_sa_upgrade);
1077
1078	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
1079	    &zfsvfs->z_root);
1080	if (error != 0)
1081		return (error);
1082	ASSERT(zfsvfs->z_root != 0);
1083
1084	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
1085	    &zfsvfs->z_unlinkedobj);
1086	if (error != 0)
1087		return (error);
1088
1089	error = zap_lookup(os, MASTER_NODE_OBJ,
1090	    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
1091	    8, 1, &zfsvfs->z_userquota_obj);
1092	if (error == ENOENT)
1093		zfsvfs->z_userquota_obj = 0;
1094	else if (error != 0)
1095		return (error);
1096
1097	error = zap_lookup(os, MASTER_NODE_OBJ,
1098	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
1099	    8, 1, &zfsvfs->z_groupquota_obj);
1100	if (error == ENOENT)
1101		zfsvfs->z_groupquota_obj = 0;
1102	else if (error != 0)
1103		return (error);
1104
1105	error = zap_lookup(os, MASTER_NODE_OBJ,
1106	    zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA],
1107	    8, 1, &zfsvfs->z_projectquota_obj);
1108	if (error == ENOENT)
1109		zfsvfs->z_projectquota_obj = 0;
1110	else if (error != 0)
1111		return (error);
1112
1113	error = zap_lookup(os, MASTER_NODE_OBJ,
1114	    zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA],
1115	    8, 1, &zfsvfs->z_userobjquota_obj);
1116	if (error == ENOENT)
1117		zfsvfs->z_userobjquota_obj = 0;
1118	else if (error != 0)
1119		return (error);
1120
1121	error = zap_lookup(os, MASTER_NODE_OBJ,
1122	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA],
1123	    8, 1, &zfsvfs->z_groupobjquota_obj);
1124	if (error == ENOENT)
1125		zfsvfs->z_groupobjquota_obj = 0;
1126	else if (error != 0)
1127		return (error);
1128
1129	error = zap_lookup(os, MASTER_NODE_OBJ,
1130	    zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA],
1131	    8, 1, &zfsvfs->z_projectobjquota_obj);
1132	if (error == ENOENT)
1133		zfsvfs->z_projectobjquota_obj = 0;
1134	else if (error != 0)
1135		return (error);
1136
1137	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
1138	    &zfsvfs->z_fuid_obj);
1139	if (error == ENOENT)
1140		zfsvfs->z_fuid_obj = 0;
1141	else if (error != 0)
1142		return (error);
1143
1144	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
1145	    &zfsvfs->z_shares_dir);
1146	if (error == ENOENT)
1147		zfsvfs->z_shares_dir = 0;
1148	else if (error != 0)
1149		return (error);
1150
1151	return (0);
1152}
1153
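/*
 * Own the named objset (read-only for snapshots or read-only mounts)
 * and build a zfsvfs_t around it; on failure the objset is disowned
 * and the partially constructed zfsvfs is freed.
 */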
1154int
1155zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp)
1156{
1157	objset_t *os;
1158	zfsvfs_t *zfsvfs;
1159	int error;
1160	boolean_t ro = (readonly || (strchr(osname, '@') != NULL));
1161
1162	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
1163
1164	error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs, &os);
1165	if (error != 0) {
1166		kmem_free(zfsvfs, sizeof (zfsvfs_t));
1167		return (error);
1168	}
1169
1170	error = zfsvfs_create_impl(zfvp, zfsvfs, os);
1171	if (error != 0) {
1172		dmu_objset_disown(os, B_TRUE, zfsvfs);
1173	}
1174	return (error);
1175}
1176
1177
1178int
1179zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
1180{
1181	int error;
1182
1183	zfsvfs->z_vfs = NULL;
1184	zfsvfs->z_parent = zfsvfs;
1185
1186	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1187	mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
1188	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
1189	    offsetof(znode_t, z_link_node));
1190	rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
1191	rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
1192	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
1193	for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1194		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
1195
1196	error = zfsvfs_init(zfsvfs, os);
1197	if (error != 0) {
1198		*zfvp = NULL;
1199		kmem_free(zfsvfs, sizeof (zfsvfs_t));
1200		return (error);
1201	}
1202
1203	*zfvp = zfsvfs;
1204	return (0);
1205}
1206
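/*
 * Finish wiring up a zfsvfs once its vfs_t is set: register property
 * callbacks, open the ZIL, and, when mounting, drain the unlinked set
 * and replay the intent log as needed.
 */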
1207static int
1208zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
1209{
1210	int error;
1211
1212	error = zfs_register_callbacks(zfsvfs->z_vfs);
1213	if (error)
1214		return (error);
1215
1216	zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
1217
	/*
	 * If we are not mounting (i.e., online recv), then we don't
	 * have to worry about replaying the log, since we blocked all
	 * operations out when we closed the ZIL.
	 */
1223	if (mounting) {
1224		boolean_t readonly;
1225
1226		/*
1227		 * During replay we remove the read only flag to
1228		 * allow replays to succeed.
1229		 */
1230		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
1231		if (readonly != 0)
1232			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
1233		else
1234			zfs_unlinked_drain(zfsvfs);
1235
1236		/*
1237		 * Parse and replay the intent log.
1238		 *
1239		 * Because of ziltest, this must be done after
1240		 * zfs_unlinked_drain().  (Further note: ziltest
1241		 * doesn't use readonly mounts, where
1242		 * zfs_unlinked_drain() isn't called.)  This is because
1243		 * ziltest causes spa_sync() to think it's committed,
1244		 * but actually it is not, so the intent log contains
1245		 * many txg's worth of changes.
1246		 *
1247		 * In particular, if object N is in the unlinked set in
1248		 * the last txg to actually sync, then it could be
1249		 * actually freed in a later txg and then reallocated
1250		 * in a yet later txg.  This would write a "create
1251		 * object N" record to the intent log.  Normally, this
1252		 * would be fine because the spa_sync() would have
1253		 * written out the fact that object N is free, before
1254		 * we could write the "create object N" intent log
1255		 * record.
1256		 *
1257		 * But when we are in ziltest mode, we advance the "open
1258		 * txg" without actually spa_sync()-ing the changes to
1259		 * disk.  So we would see that object N is still
1260		 * allocated and in the unlinked set, and there is an
1261		 * intent log record saying to allocate it.
1262		 */
1263		if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
1264			if (zil_replay_disable) {
1265				zil_destroy(zfsvfs->z_log, B_FALSE);
1266			} else {
1267				zfsvfs->z_replay = B_TRUE;
1268				zil_replay(zfsvfs->z_os, zfsvfs,
1269				    zfs_replay_vector);
1270				zfsvfs->z_replay = B_FALSE;
1271			}
1272		}
1273
1274		/* restore readonly bit */
1275		if (readonly != 0)
1276			zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
1277	}
1278
1279	/*
1280	 * Set the objset user_ptr to track its zfsvfs.
1281	 */
1282	mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
1283	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
1284	mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
1285
1286	return (0);
1287}
1288
1289void
1290zfsvfs_free(zfsvfs_t *zfsvfs)
1291{
1292	int i;
1293	extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */
1294
1295	/*
1296	 * This is a barrier to prevent the filesystem from going away in
1297	 * zfs_znode_move() until we can safely ensure that the filesystem is
1298	 * not unmounted. We consider the filesystem valid before the barrier
1299	 * and invalid after the barrier.
1300	 */
1301	rw_enter(&zfsvfs_lock, RW_READER);
1302	rw_exit(&zfsvfs_lock);
1303
1304	zfs_fuid_destroy(zfsvfs);
1305
1306	mutex_destroy(&zfsvfs->z_znodes_lock);
1307	mutex_destroy(&zfsvfs->z_lock);
1308	list_destroy(&zfsvfs->z_all_znodes);
1309	rrm_destroy(&zfsvfs->z_teardown_lock);
1310	rw_destroy(&zfsvfs->z_teardown_inactive_lock);
1311	rw_destroy(&zfsvfs->z_fuid_lock);
1312	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1313		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
1314	kmem_free(zfsvfs, sizeof (zfsvfs_t));
1315}
1316
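/*
 * Refresh the cached FUID/SA flags and advertise or clear the
 * corresponding VFS features on the mount.
 */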
1317static void
1318zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
1319{
1320	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
1321	if (zfsvfs->z_vfs) {
1322		if (zfsvfs->z_use_fuids) {
1323			vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
1324			vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
1325			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
1326			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
1327			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
1328			vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
1329		} else {
1330			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
1331			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
1332			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
1333			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
1334			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
1335			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
1336		}
1337	}
1338	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
1339}
1340
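/*
 * Common mount path: create the zfsvfs for the named dataset, assign a
 * unique device number, fill in the generic vfs_t fields and features,
 * and either set up a snapshot mount or run full zfsvfs_setup().
 */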
1341static int
1342zfs_domount(vfs_t *vfsp, char *osname)
1343{
1344	dev_t mount_dev;
1345	uint64_t recordsize, fsid_guid;
1346	int error = 0;
1347	zfsvfs_t *zfsvfs;
1348	boolean_t readonly = vfsp->vfs_flag & VFS_RDONLY ? B_TRUE : B_FALSE;
1349
1350	ASSERT(vfsp);
1351	ASSERT(osname);
1352
1353	error = zfsvfs_create(osname, readonly, &zfsvfs);
1354	if (error)
1355		return (error);
1356	zfsvfs->z_vfs = vfsp;
1357
1358	/* Initialize the generic filesystem structure. */
1359	vfsp->vfs_bcount = 0;
1360	vfsp->vfs_data = NULL;
1361
1362	if (zfs_create_unique_device(&mount_dev) == -1) {
1363		error = SET_ERROR(ENODEV);
1364		goto out;
1365	}
1366	ASSERT(vfs_devismounted(mount_dev) == 0);
1367
1368	if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
1369	    NULL))
1370		goto out;
1371
1372	vfsp->vfs_dev = mount_dev;
1373	vfsp->vfs_fstype = zfsfstype;
1374	vfsp->vfs_bsize = recordsize;
1375	vfsp->vfs_flag |= VFS_NOTRUNC;
1376	vfsp->vfs_data = zfsvfs;
1377
1378	/*
1379	 * The fsid is 64 bits, composed of an 8-bit fs type, which
1380	 * separates our fsid from any other filesystem types, and a
1381	 * 56-bit objset unique ID.  The objset unique ID is unique to
1382	 * all objsets open on this system, provided by unique_create().
1383	 * The 8-bit fs type must be put in the low bits of fsid[1]
1384	 * because that's where other Solaris filesystems put it.
1385	 */
1386	fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
1387	ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
1388	vfsp->vfs_fsid.val[0] = fsid_guid;
1389	vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
1390	    zfsfstype & 0xFF;
1391
1392	/*
1393	 * Set features for file system.
1394	 */
1395	zfs_set_fuid_feature(zfsvfs);
1396	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
1397		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
1398		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
1399		vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
1400	} else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
1401		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
1402		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
1403	}
1404	vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);
1405
1406	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
1407		uint64_t pval;
1408
1409		atime_changed_cb(zfsvfs, B_FALSE);
1410		readonly_changed_cb(zfsvfs, B_TRUE);
1411		if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
1412			goto out;
1413		xattr_changed_cb(zfsvfs, pval);
1414		zfsvfs->z_issnap = B_TRUE;
1415		zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
1416
1417		mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
1418		dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
1419		mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
1420	} else {
1421		error = zfsvfs_setup(zfsvfs, B_TRUE);
1422	}
1423
1424	if (!zfsvfs->z_issnap)
1425		zfsctl_create(zfsvfs);
1426out:
1427	if (error) {
1428		dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
1429		zfsvfs_free(zfsvfs);
1430	} else {
1431		atomic_inc_32(&zfs_active_fs_count);
1432	}
1433
1434	return (error);
1435}
1436
1437void
1438zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
1439{
1440	objset_t *os = zfsvfs->z_os;
1441
1442	if (!dmu_objset_is_snapshot(os))
1443		dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
1444}
1445
1446/*
1447 * Convert a decimal digit string to a uint64_t integer.
1448 */
1449static int
1450str_to_uint64(char *str, uint64_t *objnum)
1451{
1452	uint64_t num = 0;
1453
1454	while (*str) {
1455		if (*str < '0' || *str > '9')
1456			return (SET_ERROR(EINVAL));
1457
1458		num = num*10 + *str++ - '0';
1459	}
1460
1461	*objnum = num;
1462	return (0);
1463}
1464
/*
 * The boot path passed from the boot loader is in the form of
 * "rootpool-name/root-filesystem-object-number".  Convert this
 * string to a dataset name: "rootpool-name/root-filesystem-name".
 */
1470static int
1471zfs_parse_bootfs(char *bpath, char *outpath)
1472{
1473	char *slashp;
1474	uint64_t objnum;
1475	int error;
1476
1477	if (*bpath == 0 || *bpath == '/')
1478		return (SET_ERROR(EINVAL));
1479
1480	(void) strcpy(outpath, bpath);
1481
1482	slashp = strchr(bpath, '/');
1483
1484	/* if no '/', just return the pool name */
1485	if (slashp == NULL) {
1486		return (0);
1487	}
1488
1489	/* if not a number, just return the root dataset name */
1490	if (str_to_uint64(slashp+1, &objnum)) {
1491		return (0);
1492	}
1493
1494	*slashp = '\0';
1495	error = dsl_dsobj_to_dsname(bpath, objnum, outpath);
1496	*slashp = '/';
1497
1498	return (error);
1499}
1500
1501/*
1502 * Check that the hex label string is appropriate for the dataset being
1503 * mounted into the global_zone proper.
1504 *
1505 * Return an error if the hex label string is not default or
1506 * admin_low/admin_high.  For admin_low labels, the corresponding
1507 * dataset must be readonly.
1508 */
1509int
1510zfs_check_global_label(const char *dsname, const char *hexsl)
1511{
1512	if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
1513		return (0);
1514	if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
1515		return (0);
1516	if (strcasecmp(hexsl, ADMIN_LOW) == 0) {
1517		/* must be readonly */
1518		uint64_t rdonly;
1519
1520		if (dsl_prop_get_integer(dsname,
1521		    zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
1522			return (SET_ERROR(EACCES));
1523		return (rdonly ? 0 : EACCES);
1524	}
1525	return (SET_ERROR(EACCES));
1526}
1527
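/*
 * Clamp the statvfs block and file counts to the project quota and
 * usage that apply to the given znode, if any.
 */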
1528static int
1529zfs_statfs_project(zfsvfs_t *zfsvfs, znode_t *zp, struct statvfs64 *statp,
1530    uint32_t bshift)
1531{
1532	char buf[20 + DMU_OBJACCT_PREFIX_LEN];
1533	uint64_t offset = DMU_OBJACCT_PREFIX_LEN;
1534	uint64_t quota;
1535	uint64_t used;
1536	int err;
1537
1538	strlcpy(buf, DMU_OBJACCT_PREFIX, DMU_OBJACCT_PREFIX_LEN + 1);
1539	err = id_to_fuidstr(zfsvfs, NULL, zp->z_projid, buf + offset, B_FALSE);
1540	if (err)
1541		return (err);
1542
1543	if (zfsvfs->z_projectquota_obj == 0)
1544		goto objs;
1545
1546	err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectquota_obj,
1547	    buf + offset, 8, 1, &quota);
1548	if (err == ENOENT)
1549		goto objs;
1550	else if (err)
1551		return (err);
1552
1553	err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT,
1554	    buf + offset, 8, 1, &used);
1555	if (unlikely(err == ENOENT)) {
1556		uint32_t blksize;
1557		u_longlong_t nblocks;
1558
		/*
		 * Quota accounting is async, so a race is possible here.
		 * There is at least one object with the given project ID.
		 */
1563		sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
1564		if (unlikely(zp->z_blksz == 0))
1565			blksize = zfsvfs->z_max_blksz;
1566
1567		used = blksize * nblocks;
1568	} else if (err) {
1569		return (err);
1570	}
1571
1572	statp->f_blocks = quota >> bshift;
1573	statp->f_bfree = (quota > used) ? ((quota - used) >> bshift) : 0;
1574	statp->f_bavail = statp->f_bfree;
1575
1576objs:
1577	if (zfsvfs->z_projectobjquota_obj == 0)
1578		return (0);
1579
1580	err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectobjquota_obj,
1581	    buf + offset, 8, 1, &quota);
1582	if (err == ENOENT)
1583		return (0);
1584	else if (err)
1585		return (err);
1586
1587	err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT,
1588	    buf, 8, 1, &used);
1589	if (unlikely(err == ENOENT)) {
		/*
		 * Quota accounting is async, so a race is possible here.
		 * There is at least one object with the given project ID.
		 */
1594		used = 1;
1595	} else if (err) {
1596		return (err);
1597	}
1598
1599	statp->f_files = quota;
1600	statp->f_ffree = (quota > used) ? (quota - used) : 0;
1601
1602	return (0);
1603}
1604
/*
 * Determine whether the mount is allowed according to MAC check,
 * by comparing (where appropriate) the label of the dataset against
 * the label of the zone being mounted into.  If the dataset has
 * no label, create one.
 *
 * Returns 0 if access is allowed, an error otherwise (e.g. EACCES).
 */
1613static int
1614zfs_mount_label_policy(vfs_t *vfsp, char *osname)
1615{
1616	int		error, retv;
1617	zone_t		*mntzone = NULL;
1618	ts_label_t	*mnt_tsl;
1619	bslabel_t	*mnt_sl;
1620	bslabel_t	ds_sl;
1621	char		ds_hexsl[MAXNAMELEN];
1622
1623	retv = EACCES;				/* assume the worst */
1624
1625	/*
1626	 * Start by getting the dataset label if it exists.
1627	 */
1628	error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
1629	    1, sizeof (ds_hexsl), &ds_hexsl, NULL);
1630	if (error)
1631		return (SET_ERROR(EACCES));
1632
1633	/*
1634	 * If labeling is NOT enabled, then disallow the mount of datasets
1635	 * which have a non-default label already.  No other label checks
1636	 * are needed.
1637	 */
1638	if (!is_system_labeled()) {
1639		if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
1640			return (0);
1641		return (SET_ERROR(EACCES));
1642	}
1643
1644	/*
1645	 * Get the label of the mountpoint.  If mounting into the global
1646	 * zone (i.e. mountpoint is not within an active zone and the
1647	 * zoned property is off), the label must be default or
1648	 * admin_low/admin_high only; no other checks are needed.
1649	 */
1650	mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
1651	if (mntzone->zone_id == GLOBAL_ZONEID) {
1652		uint64_t zoned;
1653
1654		zone_rele(mntzone);
1655
1656		if (dsl_prop_get_integer(osname,
1657		    zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
1658			return (SET_ERROR(EACCES));
1659		if (!zoned)
1660			return (zfs_check_global_label(osname, ds_hexsl));
1661		else
			/*
			 * This is the case of a zone dataset being mounted
			 * initially, before the zone has been fully created;
			 * allow this mount into the global zone.
			 */
1667			return (0);
1668	}
1669
1670	mnt_tsl = mntzone->zone_slabel;
1671	ASSERT(mnt_tsl != NULL);
1672	label_hold(mnt_tsl);
1673	mnt_sl = label2bslabel(mnt_tsl);
1674
1675	if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) {
1676		/*
1677		 * The dataset doesn't have a real label, so fabricate one.
1678		 */
1679		char *str = NULL;
1680
1681		if (l_to_str_internal(mnt_sl, &str) == 0 &&
1682		    dsl_prop_set_string(osname,
1683		    zfs_prop_to_name(ZFS_PROP_MLSLABEL),
1684		    ZPROP_SRC_LOCAL, str) == 0)
1685			retv = 0;
1686		if (str != NULL)
1687			kmem_free(str, strlen(str) + 1);
1688	} else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) {
1689		/*
1690		 * Now compare labels to complete the MAC check.  If the
1691		 * labels are equal then allow access.  If the mountpoint
1692		 * label dominates the dataset label, allow readonly access.
1693		 * Otherwise, access is denied.
1694		 */
1695		if (blequal(mnt_sl, &ds_sl))
1696			retv = 0;
1697		else if (bldominates(mnt_sl, &ds_sl)) {
1698			vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
1699			retv = 0;
1700		}
1701	}
1702
1703	label_rele(mnt_tsl);
1704	zone_rele(mntzone);
1705	return (retv);
1706}
1707
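/*
 * Handle mounting, remounting, and unmounting of the root filesystem,
 * which is named by the "zfs-bootfs" boot property.
 */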
1708static int
1709zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
1710{
1711	int error = 0;
1712	static int zfsrootdone = 0;
1713	zfsvfs_t *zfsvfs = NULL;
1714	znode_t *zp = NULL;
1715	vnode_t *vp = NULL;
1716	char *zfs_bootfs;
1717	char *zfs_devid;
1718
1719	ASSERT(vfsp);
1720
1721	/*
1722	 * The filesystem that we mount as root is defined in the
1723	 * boot property "zfs-bootfs" with a format of
1724	 * "poolname/root-dataset-objnum".
1725	 */
1726	if (why == ROOT_INIT) {
1727		if (zfsrootdone++)
1728			return (SET_ERROR(EBUSY));
		/*
		 * The process of doing a spa_load will require the
		 * clock to be set before we could (for example) do
		 * something better by looking at the timestamp on
		 * an uberblock, so just set it to -1.
		 */
1735		clkset(-1);
1736
1737		if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) {
1738			cmn_err(CE_NOTE, "spa_get_bootfs: can not get "
1739			    "bootfs name");
1740			return (SET_ERROR(EINVAL));
1741		}
1742		zfs_devid = spa_get_bootprop("diskdevid");
1743		error = spa_import_rootpool(rootfs.bo_name, zfs_devid);
1744		if (zfs_devid)
1745			spa_free_bootprop(zfs_devid);
1746		if (error) {
1747			spa_free_bootprop(zfs_bootfs);
1748			cmn_err(CE_NOTE, "spa_import_rootpool: error %d",
1749			    error);
1750			return (error);
1751		}
1752		if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
1753			spa_free_bootprop(zfs_bootfs);
1754			cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d",
1755			    error);
1756			return (error);
1757		}
1758
1759		spa_free_bootprop(zfs_bootfs);
1760
1761		if (error = vfs_lock(vfsp))
1762			return (error);
1763
1764		if (error = zfs_domount(vfsp, rootfs.bo_name)) {
1765			cmn_err(CE_NOTE, "zfs_domount: error %d", error);
1766			goto out;
1767		}
1768
1769		zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
1770		ASSERT(zfsvfs);
1771		if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) {
1772			cmn_err(CE_NOTE, "zfs_zget: error %d", error);
1773			goto out;
1774		}
1775
1776		vp = ZTOV(zp);
1777		mutex_enter(&vp->v_lock);
1778		vp->v_flag |= VROOT;
1779		mutex_exit(&vp->v_lock);
1780		rootvp = vp;
1781
1782		/*
1783		 * Leave rootvp held.  The root file system is never unmounted.
1784		 */
1785
1786		vfs_add((struct vnode *)0, vfsp,
1787		    (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
1788out:
1789		vfs_unlock(vfsp);
1790		return (error);
1791	} else if (why == ROOT_REMOUNT) {
1792		readonly_changed_cb(vfsp->vfs_data, B_FALSE);
1793		vfsp->vfs_flag |= VFS_REMOUNT;
1794
1795		/* refresh mount options */
1796		zfs_unregister_callbacks(vfsp->vfs_data);
1797		return (zfs_register_callbacks(vfsp));
1798
1799	} else if (why == ROOT_UNMOUNT) {
1800		zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
1801		(void) zfs_sync(vfsp, 0, 0);
1802		return (0);
1803	}
1804
	/*
	 * If "why" is anything other than ROOT_INIT, ROOT_REMOUNT,
	 * or ROOT_UNMOUNT, we do not support it.
	 */
1809	return (SET_ERROR(ENOTSUP));
1810}
1811
1812/*ARGSUSED*/
1813static int
1814zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
1815{
1816	char		*osname;
1817	pathname_t	spn;
1818	int		error = 0;
1819	uio_seg_t	fromspace = (uap->flags & MS_SYSSPACE) ?
1820	    UIO_SYSSPACE : UIO_USERSPACE;
1821	int		canwrite;
1822
1823	if (mvp->v_type != VDIR)
1824		return (SET_ERROR(ENOTDIR));
1825
1826	mutex_enter(&mvp->v_lock);
1827	if ((uap->flags & MS_REMOUNT) == 0 &&
1828	    (uap->flags & MS_OVERLAY) == 0 &&
1829	    (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
1830		mutex_exit(&mvp->v_lock);
1831		return (SET_ERROR(EBUSY));
1832	}
1833	mutex_exit(&mvp->v_lock);
1834
1835	/*
1836	 * ZFS does not support passing unparsed data in via MS_DATA.
1837	 * Users should use the MS_OPTIONSTR interface; this means
1838	 * that all option parsing is already done and the options struct
1839	 * can be interrogated.
1840	 */
1841	if ((uap->flags & MS_DATA) && uap->datalen > 0)
1842		return (SET_ERROR(EINVAL));
1843
1844	/*
1845	 * Get the objset name (the "special" mount argument).
1846	 */
1847	if (error = pn_get(uap->spec, fromspace, &spn))
1848		return (error);
1849
1850	osname = spn.pn_path;
1851
	/*
	 * Check for mount privilege.
	 *
	 * If we don't have privilege, see if we have local (delegated)
	 * permission to allow it.
	 */
1858	error = secpolicy_fs_mount(cr, mvp, vfsp);
1859	if (error) {
1860		if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) == 0) {
1861			vattr_t		vattr;
1862
1863			/*
1864			 * Make sure user is the owner of the mount point
1865			 * or has sufficient privileges.
1866			 */
1867
1868			vattr.va_mask = AT_UID;
1869
1870			if (VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) {
1871				goto out;
1872			}
1873
1874			if (secpolicy_vnode_owner(cr, vattr.va_uid) != 0 &&
1875			    VOP_ACCESS(mvp, VWRITE, 0, cr, NULL) != 0) {
1876				goto out;
1877			}
1878			secpolicy_fs_mount_clearopts(cr, vfsp);
1879		} else {
1880			goto out;
1881		}
1882	}
1883
1884	/*
1885	 * Refuse to mount a filesystem if we are in a local zone and the
1886	 * dataset is not visible.
1887	 */
1888	if (!INGLOBALZONE(curproc) &&
1889	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
1890		error = SET_ERROR(EPERM);
1891		goto out;
1892	}
1893
1894	error = zfs_mount_label_policy(vfsp, osname);
1895	if (error)
1896		goto out;
1897
1898	/*
1899	 * When doing a remount, we simply refresh our temporary properties
1900	 * according to those options set in the current VFS options.
1901	 */
1902	if (uap->flags & MS_REMOUNT) {
1903		/* refresh mount options */
1904		zfs_unregister_callbacks(vfsp->vfs_data);
1905		error = zfs_register_callbacks(vfsp);
1906		goto out;
1907	}
1908
1909	error = zfs_domount(vfsp, osname);
1910
1911	/*
1912	 * Add an extra VFS_HOLD on our parent vfs so that it can't
1913	 * disappear due to a forced unmount.
1914	 */
1915	if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
1916		VFS_HOLD(mvp->v_vfsp);
1917
1918out:
1919	pn_free(&spn);
1920	return (error);
1921}
1922
1923static int
1924zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp)
1925{
1926	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1927	dev32_t d32;
1928	uint64_t refdbytes, availbytes, usedobjs, availobjs;
1929	int err = 0;
1930
1931	ZFS_ENTER(zfsvfs);
1932
1933	dmu_objset_space(zfsvfs->z_os,
1934	    &refdbytes, &availbytes, &usedobjs, &availobjs);
1935
1936	/*
1937	 * The underlying storage pool actually uses multiple block sizes.
1938	 * We report the fragsize as the smallest block size we support,
1939	 * and we report our blocksize as the filesystem's maximum blocksize.
1940	 */
1941	statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT;
1942	statp->f_bsize = zfsvfs->z_max_blksz;
1943
1944	/*
1945	 * The following report "total" blocks of various kinds in the
1946	 * file system, expressed in units of f_frsize, the
1947	 * "fragment" size.
1948	 */
1949
1950	statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
1951	statp->f_bfree = availbytes >> SPA_MINBLOCKSHIFT;
1952	statp->f_bavail = statp->f_bfree; /* no root reservation */
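
	/*
	 * Worked example with hypothetical numbers: with 1 GB referenced,
	 * 3 GB available and SPA_MINBLOCKSHIFT == 9, f_frsize is 512,
	 * f_blocks is (4 GB >> 9) == 8388608 and f_bfree is
	 * (3 GB >> 9) == 6291456, i.e. df(1M) reports a 4 GB file system
	 * with 3 GB free regardless of the larger f_bsize above.
	 */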
1953
1954	/*
1955	 * statvfs() should really be called statufs(), because it assumes
1956	 * static metadata.  ZFS doesn't preallocate files, so the best
1957	 * we can do is report the max that could possibly fit in f_files,
1958	 * and that minus the number actually used in f_ffree.
1959	 * For f_ffree, report the smaller of the number of objects available
1960	 * and the number of blocks (each object will take at least a block).
1961	 */
1962	statp->f_ffree = MIN(availobjs, statp->f_bfree);
1963	statp->f_favail = statp->f_ffree;	/* no "root reservation" */
1964	statp->f_files = statp->f_ffree + usedobjs;
1965
1966	(void) cmpldev(&d32, vfsp->vfs_dev);
1967	statp->f_fsid = d32;
1968
1969	/*
1970	 * We're a zfs filesystem.
1971	 */
1972	(void) strcpy(statp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name);
1973
1974	statp->f_flag = vf_to_stf(vfsp->vfs_flag);
1975
1976	statp->f_namemax = MAXNAMELEN - 1;
1977
1978	/*
1979	 * We have all of 32 characters to stuff a string here.
1980	 * Is there anything useful we could/should provide?
1981	 */
1982	bzero(statp->f_fstr, sizeof (statp->f_fstr));
1983
1984	if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
1985	    dmu_objset_projectquota_present(zfsvfs->z_os)) {
1986		znode_t *zp;
1987
1988		/*
1989		 * In ZoL, zfs_statvfs is passed a Linux dentry (directory
1990		 * entry), instead of a vfsp. The ZoL code uses the dentry
1991		 * to get the znode from the dentry's inode. This represents
1992		 * whatever filename was passed to the user-level statvfs
1993		 * syscall.
1994		 *
1995		 * We're using the VFS root znode here, so this represents a
1996		 * potential difference from ZoL.
1997		 */
1998		if (zfs_zget(zfsvfs, zfsvfs->z_root, &zp) == 0) {
1999			uint32_t bshift = ddi_fls(statp->f_bsize) - 1;
2000
2001			if (zp->z_pflags & ZFS_PROJINHERIT && zp->z_projid &&
2002			    zpl_is_valid_projid(zp->z_projid))
2003				err = zfs_statfs_project(zfsvfs, zp, statp,
2004				    bshift);
2005			VN_RELE(ZTOV(zp));
2006		}
2007	}
2008
2009	ZFS_EXIT(zfsvfs);
2010	return (err);
2011}
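
/*
 * Illustrative sketch only: these figures surface at user level through
 * statvfs(2), and capacity should be computed with f_frsize, since that
 * is the unit of the block counts filled in above.  Roughly (path is
 * hypothetical):
 *
 *	struct statvfs64 st;
 *	if (statvfs64("/tank/fs", &st) == 0) {
 *		uint64_t total_bytes = (uint64_t)st.f_blocks * st.f_frsize;
 *		uint64_t avail_bytes = (uint64_t)st.f_bavail * st.f_frsize;
 *	}
 */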
2012
2013static int
2014zfs_root(vfs_t *vfsp, vnode_t **vpp)
2015{
2016	zfsvfs_t *zfsvfs = vfsp->vfs_data;
2017	znode_t *rootzp;
2018	int error;
2019
2020	ZFS_ENTER(zfsvfs);
2021
2022	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
2023	if (error == 0)
2024		*vpp = ZTOV(rootzp);
2025
2026	ZFS_EXIT(zfsvfs);
2027	return (error);
2028}
2029
2030/*
2031 * Teardown the zfsvfs::z_os.
2032 *
2033 * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
2034 * and 'z_teardown_inactive_lock' held.
2035 */
2036static int
2037zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
2038{
2039	znode_t	*zp;
2040
2041	rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
2042
2043	if (!unmounting) {
2044		/*
2045		 * We purge the parent filesystem's vfsp as the parent
2046		 * filesystem and all of its snapshots have their vnode's
2047		 * v_vfsp set to the parent filesystem's vfsp.  Note,
2048		 * 'z_parent' is self-referential for non-snapshots.
2049		 */
2050		(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
2051	}
2052
2053	/*
2054	 * Close the zil. NB: Can't close the zil while zfs_inactive
2055	 * threads are blocked as zil_close can call zfs_inactive.
2056	 */
2057	if (zfsvfs->z_log) {
2058		zil_close(zfsvfs->z_log);
2059		zfsvfs->z_log = NULL;
2060	}
2061
2062	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
2063
2064	/*
2065	 * If we are not unmounting (i.e., an online recv) and someone already
2066	 * unmounted this file system while we were doing the switcheroo,
2067	 * or a reopen of z_os failed, then just bail out now.
2068	 */
2069	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
2070		rw_exit(&zfsvfs->z_teardown_inactive_lock);
2071		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
2072		return (SET_ERROR(EIO));
2073	}
2074
2075	/*
2076	 * At this point there are no vops active, and any new vops will
2077	 * fail with EIO since we have z_teardown_lock for writer (only
2078	 * relevant for forced unmount).
2079	 *
2080	 * Release all holds on dbufs.
2081	 */
2082	mutex_enter(&zfsvfs->z_znodes_lock);
2083	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
2084	    zp = list_next(&zfsvfs->z_all_znodes, zp))
2085		if (zp->z_sa_hdl) {
2086			ASSERT(ZTOV(zp)->v_count > 0);
2087			zfs_znode_dmu_fini(zp);
2088		}
2089	mutex_exit(&zfsvfs->z_znodes_lock);
2090
2091	/*
2092	 * If we are unmounting, set the unmounted flag and let new vops
2093	 * unblock.  zfs_inactive will have the unmounted behavior, and all
2094	 * other vops will fail with EIO.
2095	 */
2096	if (unmounting) {
2097		zfsvfs->z_unmounted = B_TRUE;
2098		rw_exit(&zfsvfs->z_teardown_inactive_lock);
2099		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
2100	}
2101
2102	/*
2103	 * z_os will be NULL if there was an error in attempting to reopen
2104	 * zfsvfs, so just return: the properties have already been
2105	 * unregistered and the cached data has already been evicted.
2106	 */
2107	if (zfsvfs->z_os == NULL)
2108		return (0);
2109
2110	/*
2111	 * Unregister properties.
2112	 */
2113	zfs_unregister_callbacks(zfsvfs);
2114
2115	/*
2116	 * Evict cached data
2117	 */
2118	if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) &&
2119	    !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
2120		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
2121	dmu_objset_evict_dbufs(zfsvfs->z_os);
2122
2123	return (0);
2124}
2125
2126/*ARGSUSED*/
2127static int
2128zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
2129{
2130	zfsvfs_t *zfsvfs = vfsp->vfs_data;
2131	objset_t *os;
2132	int ret;
2133
2134	ret = secpolicy_fs_unmount(cr, vfsp);
2135	if (ret) {
2136		if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
2137		    ZFS_DELEG_PERM_MOUNT, cr))
2138			return (ret);
2139	}
2140
2141	/*
2142	 * We purge the parent filesystem's vfsp as the parent filesystem
2143	 * and all of its snapshots have their vnode's v_vfsp set to the
2144	 * parent filesystem's vfsp.  Note, 'z_parent' is self-referential
2145	 * for non-snapshots.
2146	 */
2147	(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
2148
2149	/*
2150	 * Unmount any snapshots mounted under .zfs before unmounting the
2151	 * dataset itself.
2152	 */
2153	if (zfsvfs->z_ctldir != NULL &&
2154	    (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) {
2155		return (ret);
2156	}
2157
2158	if (!(fflag & MS_FORCE)) {
2159		/*
2160		 * Check the number of active vnodes in the file system.
2161		 * Our count is maintained in the vfs structure, but the
2162		 * number is off by 1 to indicate a hold on the vfs
2163		 * structure itself.
2164		 *
2165		 * The '.zfs' directory maintains a reference of its
2166		 * own, and any active references underneath are
2167		 * reflected in the vnode count.
2168		 */
2169		if (zfsvfs->z_ctldir == NULL) {
2170			if (vfsp->vfs_count > 1)
2171				return (SET_ERROR(EBUSY));
2172		} else {
2173			if (vfsp->vfs_count > 2 ||
2174			    zfsvfs->z_ctldir->v_count > 1)
2175				return (SET_ERROR(EBUSY));
2176		}
2177	}
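
	/*
	 * For example (counts are illustrative): an otherwise idle
	 * filesystem with a '.zfs' directory has vfs_count == 2 (the
	 * hold on the vfs itself plus the one taken for z_ctldir) and
	 * z_ctldir->v_count == 1, so the check above passes; any other
	 * held vnode pushes a count past its threshold and the unmount
	 * fails with EBUSY.
	 */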
2178
2179	vfsp->vfs_flag |= VFS_UNMOUNTED;
2180
2181	VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
2182	os = zfsvfs->z_os;
2183
2184	/*
2185	 * z_os will be NULL if there was an error in
2186	 * attempting to reopen zfsvfs.
2187	 */
2188	if (os != NULL) {
2189		/*
2190		 * Unset the objset user_ptr.
2191		 */
2192		mutex_enter(&os->os_user_ptr_lock);
2193		dmu_objset_set_user(os, NULL);
2194		mutex_exit(&os->os_user_ptr_lock);
2195
2196		/*
2197		 * Finally release the objset
2198		 */
2199		dmu_objset_disown(os, B_TRUE, zfsvfs);
2200	}
2201
2202	/*
2203	 * We can now safely destroy the '.zfs' directory node.
2204	 */
2205	if (zfsvfs->z_ctldir != NULL)
2206		zfsctl_destroy(zfsvfs);
2207
2208	return (0);
2209}
2210
2211static int
2212zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
2213{
2214	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
2215	znode_t		*zp;
2216	uint64_t	object = 0;
2217	uint64_t	fid_gen = 0;
2218	uint64_t	gen_mask;
2219	uint64_t	zp_gen;
2220	int		i, err;
2221
2222	*vpp = NULL;
2223
2224	ZFS_ENTER(zfsvfs);
2225
2226	if (fidp->fid_len == LONG_FID_LEN) {
2227		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
2228		uint64_t	objsetid = 0;
2229		uint64_t	setgen = 0;
2230
2231		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
2232			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
2233
2234		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
2235			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
2236
2237		ZFS_EXIT(zfsvfs);
2238
2239		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
2240		if (err)
2241			return (SET_ERROR(EINVAL));
2242		ZFS_ENTER(zfsvfs);
2243	}
2244
2245	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
2246		zfid_short_t	*zfid = (zfid_short_t *)fidp;
2247
2248		for (i = 0; i < sizeof (zfid->zf_object); i++)
2249			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
2250
2251		for (i = 0; i < sizeof (zfid->zf_gen); i++)
2252			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
2253	} else {
2254		ZFS_EXIT(zfsvfs);
2255		return (SET_ERROR(EINVAL));
2256	}
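
	/*
	 * Worked example with hypothetical values: a short fid for
	 * object 4, generation 1 stores each field byte-wise, least
	 * significant byte first, so zf_object[] is { 4, 0, 0, 0, 0, 0 }
	 * and zf_gen[] is { 1, 0, 0, 0 }.  The loops above reassemble
	 * object == 4 and fid_gen == 1, and since zf_gen holds four
	 * bytes, i is 4 and gen_mask below becomes 0xffffffff.
	 */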
2257
2258	/* A zero fid_gen means we are in the .zfs control directories */
2259	if (fid_gen == 0 &&
2260	    (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
2261		*vpp = zfsvfs->z_ctldir;
2262		ASSERT(*vpp != NULL);
2263		if (object == ZFSCTL_INO_SNAPDIR) {
2264			VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
2265			    0, NULL, NULL, NULL, NULL, NULL) == 0);
2266		} else {
2267			VN_HOLD(*vpp);
2268		}
2269		ZFS_EXIT(zfsvfs);
2270		return (0);
2271	}
2272
2273	gen_mask = -1ULL >> (64 - 8 * i);
2274
2275	dprintf("getting %llu [%llu mask %llx]\n", object, fid_gen, gen_mask);
2276	if (err = zfs_zget(zfsvfs, object, &zp)) {
2277		ZFS_EXIT(zfsvfs);
2278		return (err);
2279	}
2280	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
2281	    sizeof (uint64_t));
2282	zp_gen = zp_gen & gen_mask;
2283	if (zp_gen == 0)
2284		zp_gen = 1;
2285	if (zp->z_unlinked || zp_gen != fid_gen) {
2286		dprintf("znode gen (%llu) != fid gen (%llu)\n", zp_gen, fid_gen);
2287		VN_RELE(ZTOV(zp));
2288		ZFS_EXIT(zfsvfs);
2289		return (SET_ERROR(EINVAL));
2290	}
2291
2292	*vpp = ZTOV(zp);
2293	ZFS_EXIT(zfsvfs);
2294	return (0);
2295}
2296
2297/*
2298 * Block out VOPs and close zfsvfs_t::z_os
2299 *
2300 * Note, if successful, then we return with the 'z_teardown_lock' and
2301 * 'z_teardown_inactive_lock' write held.  We leave ownership of the underlying
2302 * dataset and objset intact so that they can be atomically handed off during
2303 * a subsequent rollback or recv operation and the resume thereafter.
2304 */
2305int
2306zfs_suspend_fs(zfsvfs_t *zfsvfs)
2307{
2308	int error;
2309
2310	if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
2311		return (error);
2312
2313	return (0);
2314}
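
/*
 * Illustrative sketch only: the expected suspend/resume pattern, as used
 * by rollback and receive, assuming the caller already owns and long
 * holds the dataset 'ds' backing 'zfsvfs':
 *
 *	if ((error = zfs_suspend_fs(zfsvfs)) == 0) {
 *		... roll back or receive into the owned dataset ...
 *		error = zfs_resume_fs(zfsvfs, ds);
 *	}
 *
 * zfs_resume_fs() drops 'z_teardown_lock' and 'z_teardown_inactive_lock'
 * whether or not it succeeds.
 */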
2315
2316/*
2317 * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
2318 * is an invariant across any of the operations that can be performed while the
2319 * filesystem was suspended.  Whether it succeeded or failed, the preconditions
2320 * are the same: the relevant objset and associated dataset are owned by
2321 * zfsvfs, held, and long held on entry.
2322 */
2323int
2324zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
2325{
2326	int err;
2327	znode_t *zp;
2328
2329	ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
2330	ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
2331
2332	/*
2333	 * We already own this, so just update the objset_t, as the one we
2334	 * had before may have been evicted.
2335	 */
2336	objset_t *os;
2337	VERIFY3P(ds->ds_owner, ==, zfsvfs);
2338	VERIFY(dsl_dataset_long_held(ds));
2339	VERIFY0(dmu_objset_from_ds(ds, &os));
2340
2341	err = zfsvfs_init(zfsvfs, os);
2342	if (err != 0)
2343		goto bail;
2344
2345	VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
2346
2347	zfs_set_fuid_feature(zfsvfs);
2348
2349	/*
2350	 * Attempt to re-establish all the active znodes with
2351	 * their dbufs.  If a zfs_rezget() fails, then we'll let
2352	 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
2353	 * when they try to use their znode.
2354	 */
2355	mutex_enter(&zfsvfs->z_znodes_lock);
2356	for (zp = list_head(&zfsvfs->z_all_znodes); zp;
2357	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
2358		(void) zfs_rezget(zp);
2359	}
2360	mutex_exit(&zfsvfs->z_znodes_lock);
2361
2362bail:
2363	/* release the VOPs */
2364	rw_exit(&zfsvfs->z_teardown_inactive_lock);
2365	rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
2366
2367	if (err) {
2368		/*
2369		 * Since we couldn't set up the SA framework, try to force
2370		 * unmount this file system.
2371		 */
2372		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
2373			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, CRED());
2374	}
2375	return (err);
2376}
2377
2378static void
2379zfs_freevfs(vfs_t *vfsp)
2380{
2381	zfsvfs_t *zfsvfs = vfsp->vfs_data;
2382
2383	/*
2384	 * If this is a snapshot, we have an extra VFS_HOLD on our parent
2385	 * from zfs_mount().  Release it here.  If we came through
2386	 * zfs_mountroot() instead, we didn't grab an extra hold, so
2387	 * skip the VFS_RELE for rootvfs.
2388	 */
2389	if (zfsvfs->z_issnap && (vfsp != rootvfs))
2390		VFS_RELE(zfsvfs->z_parent->z_vfs);
2391
2392	zfsvfs_free(zfsvfs);
2393
2394	atomic_dec_32(&zfs_active_fs_count);
2395}
2396
2397/*
2398 * VFS_INIT() initialization.  Note that there is no VFS_FINI(),
2399 * so we can't safely do any non-idempotent initialization here.
2400 * Leave that to zfs_init() and zfs_fini(), which are called
2401 * from the module's _init() and _fini() entry points.
2402 */
2403/*ARGSUSED*/
2404static int
2405zfs_vfsinit(int fstype, char *name)
2406{
2407	int error;
2408
2409	zfsfstype = fstype;
2410
2411	/*
2412	 * Setup vfsops and vnodeops tables.
2413	 */
2414	error = vfs_setfsops(fstype, zfs_vfsops_template, &zfs_vfsops);
2415	if (error != 0) {
2416		cmn_err(CE_WARN, "zfs: bad vfs ops template");
		return (error);
2417	}
2418
2419	error = zfs_create_op_tables();
2420	if (error) {
2421		zfs_remove_op_tables();
2422		cmn_err(CE_WARN, "zfs: bad vnode ops template");
2423		(void) vfs_freevfsops_by_type(zfsfstype);
2424		return (error);
2425	}
2426
2427	mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
2428
2429	/*
2430	 * Unique major number for all zfs mounts.
2431	 * If we run out of 32-bit minors, we'll getudev() another major.
2432	 */
2433	zfs_major = ddi_name_to_major(ZFS_DRIVER);
2434	zfs_minor = ZFS_MIN_MINOR;
2435
2436	return (0);
2437}
2438
2439void
2440zfs_init(void)
2441{
2442	/*
2443	 * Initialize .zfs directory structures
2444	 */
2445	zfsctl_init();
2446
2447	/*
2448	 * Initialize znode cache, vnode ops, etc...
2449	 */
2450	zfs_znode_init();
2451
2452	dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
2453}
2454
2455void
2456zfs_fini(void)
2457{
2458	zfsctl_fini();
2459	zfs_znode_fini();
2460}
2461
2462int
2463zfs_busy(void)
2464{
2465	return (zfs_active_fs_count != 0);
2466}
2467
2468int
2469zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
2470{
2471	int error;
2472	objset_t *os = zfsvfs->z_os;
2473	dmu_tx_t *tx;
2474
2475	if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
2476		return (SET_ERROR(EINVAL));
2477
2478	if (newvers < zfsvfs->z_version)
2479		return (SET_ERROR(EINVAL));
2480
2481	if (zfs_spa_version_map(newvers) >
2482	    spa_version(dmu_objset_spa(zfsvfs->z_os)))
2483		return (SET_ERROR(ENOTSUP));
2484
2485	tx = dmu_tx_create(os);
2486	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
2487	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2488		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
2489		    ZFS_SA_ATTRS);
2490		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_FALSE, NULL);
2491	}
2492	error = dmu_tx_assign(tx, TXG_WAIT);
2493	if (error) {
2494		dmu_tx_abort(tx);
2495		return (error);
2496	}
2497
2498	error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
2499	    8, 1, &newvers, tx);
2500
2501	if (error) {
2502		dmu_tx_commit(tx);
2503		return (error);
2504	}
2505
2506	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2507		uint64_t sa_obj;
2508
2509		ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
2510		    SPA_VERSION_SA);
2511		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
2512		    DMU_OT_NONE, 0, tx);
2513
2514		error = zap_add(os, MASTER_NODE_OBJ,
2515		    ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
2516		ASSERT0(error);
2517
2518		VERIFY(0 == sa_set_sa_object(os, sa_obj));
2519		sa_register_update_callback(os, zfs_sa_upgrade);
2520	}
2521
2522	spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
2523	    "from %llu to %llu", zfsvfs->z_version, newvers);
2524
2525	dmu_tx_commit(tx);
2526
2527	zfsvfs->z_version = newvers;
2528	os->os_version = newvers;
2529
2530	zfs_set_fuid_feature(zfsvfs);
2531
2532	return (0);
2533}
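
/*
 * Illustrative only: this path is normally driven from user level, e.g.
 * (dataset name is hypothetical):
 *
 *	# zfs upgrade -V 5 tank/fs
 *
 * which reaches zfs_set_version() through the ZFS property ioctl path
 * and, as above, creates the SA master node on an upgrade to
 * ZPL_VERSION_SA.
 */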
2534
2535/*
2536 * Read a property stored within the master node.
2537 */
2538int
2539zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
2540{
2541	uint64_t *cached_copy = NULL;
2542
2543	/*
2544	 * Figure out where in the objset_t the cached copy would live, if it
2545	 * is available for the requested property.
2546	 */
2547	if (os != NULL) {
2548		switch (prop) {
2549		case ZFS_PROP_VERSION:
2550			cached_copy = &os->os_version;
2551			break;
2552		case ZFS_PROP_NORMALIZE:
2553			cached_copy = &os->os_normalization;
2554			break;
2555		case ZFS_PROP_UTF8ONLY:
2556			cached_copy = &os->os_utf8only;
2557			break;
2558		case ZFS_PROP_CASE:
2559			cached_copy = &os->os_casesensitivity;
2560			break;
2561		default:
2562			break;
2563		}
2564	}
2565	if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
2566		*value = *cached_copy;
2567		return (0);
2568	}
2569
2570	/*
2571	 * If the property wasn't cached, look up the file system's value for
2572	 * the property. For the version property, we look up a slightly
2573	 * different string.
2574	 */
2575	const char *pname;
2576	int error = ENOENT;
2577	if (prop == ZFS_PROP_VERSION) {
2578		pname = ZPL_VERSION_STR;
2579	} else {
2580		pname = zfs_prop_to_name(prop);
2581	}
2582
2583	if (os != NULL) {
2584		ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
2585		error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
2586	}
2587
2588	if (error == ENOENT) {
2589		/* No value set, use the default value */
2590		switch (prop) {
2591		case ZFS_PROP_VERSION:
2592			*value = ZPL_VERSION;
2593			break;
2594		case ZFS_PROP_NORMALIZE:
2595		case ZFS_PROP_UTF8ONLY:
2596			*value = 0;
2597			break;
2598		case ZFS_PROP_CASE:
2599			*value = ZFS_CASE_SENSITIVE;
2600			break;
2601		default:
2602			return (error);
2603		}
2604		error = 0;
2605	}
2606
2607	/*
2608	 * If one of the methods for getting the property value above worked,
2609	 * copy it into the objset_t's cache.
2610	 */
2611	if (error == 0 && cached_copy != NULL) {
2612		*cached_copy = *value;
2613	}
2614
2615	return (error);
2616}
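
/*
 * Illustrative sketch only: a typical in-kernel caller reads the ZPL
 * version roughly as follows (error handling elided):
 *
 *	uint64_t zplver;
 *	if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplver) == 0 &&
 *	    zplver >= ZPL_VERSION_SA)
 *		... SA-style attributes may be in use ...
 *
 * Passing a NULL objset skips both the cache and the ZAP lookup and
 * returns the compiled-in defaults chosen in the ENOENT branch above.
 */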
2617
2618/*
2619 * Return true if the corresponding vfs's unmounted flag is set;
2620 * otherwise return false.
2621 * If this function returns true, we know a VFS unmount has been initiated.
2622 */
2623boolean_t
2624zfs_get_vfs_flag_unmounted(objset_t *os)
2625{
2626	zfsvfs_t *zfvp;
2627	boolean_t unmounted = B_FALSE;
2628
2629	ASSERT(dmu_objset_type(os) == DMU_OST_ZFS);
2630
2631	mutex_enter(&os->os_user_ptr_lock);
2632	zfvp = dmu_objset_get_user(os);
2633	if (zfvp != NULL && zfvp->z_vfs != NULL &&
2634	    (zfvp->z_vfs->vfs_flag & VFS_UNMOUNTED))
2635		unmounted = B_TRUE;
2636	mutex_exit(&os->os_user_ptr_lock);
2637
2638	return (unmounted);
2639}
2640
2641static vfsdef_t vfw = {
2642	VFSDEF_VERSION,
2643	MNTTYPE_ZFS,
2644	zfs_vfsinit,
2645	VSW_HASPROTO|VSW_CANRWRO|VSW_CANREMOUNT|VSW_VOLATILEDEV|VSW_STATS|
2646	    VSW_XID|VSW_ZMOUNT,
2647	&zfs_mntopts
2648};
2649
2650struct modlfs zfs_modlfs = {
2651	&mod_fsops, "ZFS filesystem version " SPA_VERSION_STRING, &vfw
2652};
2653