1fa9e406ahrens/*
2fa9e406ahrens * CDDL HEADER START
3fa9e406ahrens *
4fa9e406ahrens * The contents of this file are subject to the terms of the
5ea8dc4beschrock * Common Development and Distribution License (the "License").
6ea8dc4beschrock * You may not use this file except in compliance with the License.
7fa9e406ahrens *
8fa9e406ahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e406ahrens * or http://www.opensolaris.org/os/licensing.
10fa9e406ahrens * See the License for the specific language governing permissions
11fa9e406ahrens * and limitations under the License.
12fa9e406ahrens *
13fa9e406ahrens * When distributing Covered Code, include this CDDL HEADER in each
14fa9e406ahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e406ahrens * If applicable, add the following below this CDDL HEADER, with the
16fa9e406ahrens * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e406ahrens * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e406ahrens *
19fa9e406ahrens * CDDL HEADER END
20fa9e406ahrens */
21fa9e406ahrens/*
228f2529dMark Shellenbaum * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
237931524Matthew Ahrens * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
24c3d26abMatthew Ahrens * Copyright (c) 2014 Integros [integros.com]
25fa9e406ahrens */
26fa9e406ahrens
2775c7619peteh/* Portions Copyright 2007 Jeremy Teo */
2875c7619peteh
2955434c7ek#ifdef _KERNEL
30fa9e406ahrens#include <sys/types.h>
31fa9e406ahrens#include <sys/param.h>
32fa9e406ahrens#include <sys/time.h>
33fa9e406ahrens#include <sys/systm.h>
34fa9e406ahrens#include <sys/sysmacros.h>
35fa9e406ahrens#include <sys/resource.h>
36fa9e406ahrens#include <sys/mntent.h>
3772fc53bmarks#include <sys/mkdev.h>
38de8267etimh#include <sys/u8_textprep.h>
39ab04eb8timh#include <sys/dsl_dataset.h>
40fa9e406ahrens#include <sys/vfs.h>
41aa59c4crsb#include <sys/vfs_opreg.h>
42fa9e406ahrens#include <sys/vnode.h>
43fa9e406ahrens#include <sys/file.h>
44fa9e406ahrens#include <sys/kmem.h>
45fa9e406ahrens#include <sys/errno.h>
46fa9e406ahrens#include <sys/unistd.h>
47fa9e406ahrens#include <sys/mode.h>
48fa9e406ahrens#include <sys/atomic.h>
49fa9e406ahrens#include <vm/pvn.h>
50fa9e406ahrens#include "fs/fs_subr.h"
51fa9e406ahrens#include <sys/zfs_dir.h>
52fa9e406ahrens#include <sys/zfs_acl.h>
53fa9e406ahrens#include <sys/zfs_ioctl.h>
54104e2edperrin#include <sys/zfs_rlock.h>
55da6c28aamw#include <sys/zfs_fuid.h>
560a586ceMark Shellenbaum#include <sys/dnode.h>
57fa9e406ahrens#include <sys/fs/zfs.h>
58da6c28aamw#include <sys/kidmap.h>
5955434c7ek#endif /* _KERNEL */
6055434c7ek
6155434c7ek#include <sys/dmu.h>
62b515258Matthew Ahrens#include <sys/dmu_objset.h>
6354811daToomas Soome#include <sys/dmu_tx.h>
6455434c7ek#include <sys/refcount.h>
6555434c7ek#include <sys/stat.h>
6655434c7ek#include <sys/zap.h>
6755434c7ek#include <sys/zfs_znode.h>
680a586ceMark Shellenbaum#include <sys/sa.h>
690a586ceMark Shellenbaum#include <sys/zfs_sa.h>
7099d5e17Tim Haley#include <sys/zfs_stat.h>
71fa9e406ahrens
72de8267etimh#include "zfs_prop.h"
730a586ceMark Shellenbaum#include "zfs_comutil.h"
74de8267etimh
/*
 * Statistics gathering is compiled in only when ZNODE_STATS is defined.
 * DEBUG builds define it automatically; other builds may define it by hand.
 * Without it, ZNODE_STAT_ADD() expands to nothing.
 */
#if defined(DEBUG)
#define	ZNODE_STATS
#endif	/* DEBUG */

#if !defined(ZNODE_STATS)
#define	ZNODE_STAT_ADD(stat)			/* nothing */
#else
#define	ZNODE_STAT_ADD(stat)			((stat)++)
#endif	/* !ZNODE_STATS */
88b5fca8ftomee
/*
 * Functions needed for userland (i.e. libzpool) are not put under
 * #ifdef _KERNEL; the rest of the functions have dependencies
 * (such as VFS logic) that will not compile easily in userland.
 */
9455434c7ek#ifdef _KERNEL
954e9583bTom Erickson/*
964e9583bTom Erickson * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to
974e9583bTom Erickson * be freed before it can be safely accessed.
984e9583bTom Erickson */
994e9583bTom Ericksonkrwlock_t zfsvfs_lock;
1004e9583bTom Erickson
101b5fca8ftomeestatic kmem_cache_t *znode_cache = NULL;
102fa9e406ahrens
/*
 * Tunable for the test suite: lets the tests delay the freeing of znodes so
 * that the unlinked set can be inspected.  Disabled (0) by default.
 */
int zfs_unlink_suspend_progress = 0;
108c5832a5Alek Pinchuk
109fa9e406ahrens/*ARGSUSED*/
110fa9e406ahrensstatic void
111874395dmaybeeznode_evict_error(dmu_buf_t *dbuf, void *user_ptr)
112fa9e406ahrens{
113874395dmaybee	/*
114874395dmaybee	 * We should never drop all dbuf refs without first clearing
115874395dmaybee	 * the eviction callback.
116874395dmaybee	 */
117874395dmaybee	panic("evicting znode %p\n", user_ptr);
118fa9e406ahrens}
119fa9e406ahrens
1207931524Matthew Ahrens/*
1217931524Matthew Ahrens * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
1227931524Matthew Ahrens * z_rangelock. It will modify the offset and length of the lock to reflect
1237931524Matthew Ahrens * znode-specific information, and convert RL_APPEND to RL_WRITER.  This is
1247931524Matthew Ahrens * called with the rangelock_t's rl_lock held, which avoids races.
1257931524Matthew Ahrens */
1267931524Matthew Ahrensstatic void
1277931524Matthew Ahrenszfs_rangelock_cb(locked_range_t *new, void *arg)
1287931524Matthew Ahrens{
1297931524Matthew Ahrens	znode_t *zp = arg;
1307931524Matthew Ahrens
1317931524Matthew Ahrens	/*
1327931524Matthew Ahrens	 * If in append mode, convert to writer and lock starting at the
1337931524Matthew Ahrens	 * current end of file.
1347931524Matthew Ahrens	 */
1357931524Matthew Ahrens	if (new->lr_type == RL_APPEND) {
1367931524Matthew Ahrens		new->lr_offset = zp->z_size;
1377931524Matthew Ahrens		new->lr_type = RL_WRITER;
1387931524Matthew Ahrens	}
1397931524Matthew Ahrens
1407931524Matthew Ahrens	/*
1417931524Matthew Ahrens	 * If we need to grow the block size then lock the whole file range.
1427931524Matthew Ahrens	 */
1437931524Matthew Ahrens	uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
1447931524Matthew Ahrens	if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
1457931524Matthew Ahrens	    zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
1467931524Matthew Ahrens		new->lr_offset = 0;
1477931524Matthew Ahrens		new->lr_length = UINT64_MAX;
1487931524Matthew Ahrens	}
1497931524Matthew Ahrens}
1507931524Matthew Ahrens
151fa9e406ahrens/*ARGSUSED*/
152fa9e406ahrensstatic int
153b5fca8ftomeezfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
154fa9e406ahrens{
155fa9e406ahrens	znode_t *zp = buf;
156fa9e406ahrens
157b5fca8ftomee	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
158b5fca8ftomee
159b5fca8ftomee	zp->z_vnode = vn_alloc(kmflags);
160b5fca8ftomee	if (zp->z_vnode == NULL) {
161b5fca8ftomee		return (-1);
162b5fca8ftomee	}
163b5fca8ftomee	ZTOV(zp)->v_data = zp;
164b5fca8ftomee
165b5fca8ftomee	list_link_init(&zp->z_link_node);
166b5fca8ftomee
167fa9e406ahrens	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
168104e2edperrin	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
169af2c482maybee	rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
170fa9e406ahrens	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
171104e2edperrin
1727931524Matthew Ahrens	rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);
173104e2edperrin
174b5fca8ftomee	zp->z_dirlocks = NULL;
175d47621aTim Haley	zp->z_acl_cached = NULL;
176744947dTom Erickson	zp->z_moved = 0;
177fa9e406ahrens	return (0);
178fa9e406ahrens}
179fa9e406ahrens
180fa9e406ahrens/*ARGSUSED*/
181fa9e406ahrensstatic void
182b5fca8ftomeezfs_znode_cache_destructor(void *buf, void *arg)
183fa9e406ahrens{
184fa9e406ahrens	znode_t *zp = buf;
185fa9e406ahrens
186b5fca8ftomee	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
187b5fca8ftomee	ASSERT(ZTOV(zp)->v_data == zp);
188b5fca8ftomee	vn_free(ZTOV(zp));
189b5fca8ftomee	ASSERT(!list_link_active(&zp->z_link_node));
190fa9e406ahrens	mutex_destroy(&zp->z_lock);
191104e2edperrin	rw_destroy(&zp->z_parent_lock);
192af2c482maybee	rw_destroy(&zp->z_name_lock);
193fa9e406ahrens	mutex_destroy(&zp->z_acl_lock);
1947931524Matthew Ahrens	rangelock_fini(&zp->z_rangelock);
195fa9e406ahrens
196b5fca8ftomee	ASSERT(zp->z_dirlocks == NULL);
1974929fd5Tim Haley	ASSERT(zp->z_acl_cached == NULL);
198b5fca8ftomee}
199b5fca8ftomee
#if defined(ZNODE_STATS)
/*
 * Counters recording why zfs_znode_move() declined to move a znode.  Each
 * one is bumped through ZNODE_STAT_ADD() on the matching early-return path.
 */
static struct {
	uint64_t zms_zfsvfs_invalid;	/* z_zfsvfs was not valid */
	uint64_t zms_zfsvfs_recheck1;	/* changed once zfsvfs_lock held */
	uint64_t zms_zfsvfs_unmounted;	/* file system was unmounted */
	uint64_t zms_zfsvfs_recheck2;	/* changed once z_znodes_lock held */
	uint64_t zms_obj_held;		/* object hold could not be taken */
	uint64_t zms_vnode_locked;	/* v_lock could not be taken */
	uint64_t zms_not_only_dnlc;	/* not referenced solely by the DNLC */
} znode_move_stats;
#endif	/* ZNODE_STATS */
211b5fca8ftomee
212b5fca8ftomeestatic void
213b5fca8ftomeezfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
214b5fca8ftomee{
215b5fca8ftomee	vnode_t *vp;
216b5fca8ftomee
217b5fca8ftomee	/* Copy fields. */
218b5fca8ftomee	nzp->z_zfsvfs = ozp->z_zfsvfs;
219b5fca8ftomee
220b5fca8ftomee	/* Swap vnodes. */
221b5fca8ftomee	vp = nzp->z_vnode;
222b5fca8ftomee	nzp->z_vnode = ozp->z_vnode;
223b5fca8ftomee	ozp->z_vnode = vp; /* let destructor free the overwritten vnode */
224b5fca8ftomee	ZTOV(ozp)->v_data = ozp;
225b5fca8ftomee	ZTOV(nzp)->v_data = nzp;
226b5fca8ftomee
227b5fca8ftomee	nzp->z_id = ozp->z_id;
228b5fca8ftomee	ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */
229b5fca8ftomee	nzp->z_unlinked = ozp->z_unlinked;
230b5fca8ftomee	nzp->z_atime_dirty = ozp->z_atime_dirty;
231b5fca8ftomee	nzp->z_zn_prefetch = ozp->z_zn_prefetch;
232b5fca8ftomee	nzp->z_blksz = ozp->z_blksz;
233b5fca8ftomee	nzp->z_seq = ozp->z_seq;
234b5fca8ftomee	nzp->z_mapcnt = ozp->z_mapcnt;
235b5fca8ftomee	nzp->z_gen = ozp->z_gen;
236b5fca8ftomee	nzp->z_sync_cnt = ozp->z_sync_cnt;
2370a586ceMark Shellenbaum	nzp->z_is_sa = ozp->z_is_sa;
2380a586ceMark Shellenbaum	nzp->z_sa_hdl = ozp->z_sa_hdl;
2390a586ceMark Shellenbaum	bcopy(ozp->z_atime, nzp->z_atime, sizeof (uint64_t) * 2);
2400a586ceMark Shellenbaum	nzp->z_links = ozp->z_links;
2410a586ceMark Shellenbaum	nzp->z_size = ozp->z_size;
2420a586ceMark Shellenbaum	nzp->z_pflags = ozp->z_pflags;
2430a586ceMark Shellenbaum	nzp->z_uid = ozp->z_uid;
2440a586ceMark Shellenbaum	nzp->z_gid = ozp->z_gid;
2450a586ceMark Shellenbaum	nzp->z_mode = ozp->z_mode;
246d98a623Mark Shellenbaum
247d98a623Mark Shellenbaum	/*
2486638ae1Mark Shellenbaum	 * Since this is just an idle znode and kmem is already dealing with
2496638ae1Mark Shellenbaum	 * memory pressure, release any cached ACL.
250d98a623Mark Shellenbaum	 */
251d98a623Mark Shellenbaum	if (ozp->z_acl_cached) {
252d98a623Mark Shellenbaum		zfs_acl_free(ozp->z_acl_cached);
253d98a623Mark Shellenbaum		ozp->z_acl_cached = NULL;
254d98a623Mark Shellenbaum	}
255b5fca8ftomee
2560a586ceMark Shellenbaum	sa_set_userp(nzp->z_sa_hdl, nzp);
257b5fca8ftomee
258b5fca8ftomee	/*
259b5fca8ftomee	 * Invalidate the original znode by clearing fields that provide a
260b5fca8ftomee	 * pointer back to the znode. Set the low bit of the vfs pointer to
261b5fca8ftomee	 * ensure that zfs_znode_move() recognizes the znode as invalid in any
262b5fca8ftomee	 * subsequent callback.
263b5fca8ftomee	 */
2640a586ceMark Shellenbaum	ozp->z_sa_hdl = NULL;
265b5fca8ftomee	POINTER_INVALIDATE(&ozp->z_zfsvfs);
266744947dTom Erickson
267744947dTom Erickson	/*
268744947dTom Erickson	 * Mark the znode.
269744947dTom Erickson	 */
270744947dTom Erickson	nzp->z_moved = 1;
271744947dTom Erickson	ozp->z_moved = (uint8_t)-1;
272b5fca8ftomee}
273b5fca8ftomee
274b5fca8ftomee/*ARGSUSED*/
275b5fca8ftomeestatic kmem_cbrc_t
276b5fca8ftomeezfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
277b5fca8ftomee{
278b5fca8ftomee	znode_t *ozp = buf, *nzp = newbuf;
279b5fca8ftomee	zfsvfs_t *zfsvfs;
280b5fca8ftomee	vnode_t *vp;
281b5fca8ftomee
282b5fca8ftomee	/*
283b5fca8ftomee	 * The znode is on the file system's list of known znodes if the vfs
284b5fca8ftomee	 * pointer is valid. We set the low bit of the vfs pointer when freeing
285b5fca8ftomee	 * the znode to invalidate it, and the memory patterns written by kmem
286b5fca8ftomee	 * (baddcafe and deadbeef) set at least one of the two low bits. A newly
287b5fca8ftomee	 * created znode sets the vfs pointer last of all to indicate that the
288b5fca8ftomee	 * znode is known and in a valid state to be moved by this function.
289b5fca8ftomee	 */
290b5fca8ftomee	zfsvfs = ozp->z_zfsvfs;
291b5fca8ftomee	if (!POINTER_IS_VALID(zfsvfs)) {
292b5fca8ftomee		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid);
293b5fca8ftomee		return (KMEM_CBRC_DONT_KNOW);
294b5fca8ftomee	}
295b5fca8ftomee
296b5fca8ftomee	/*
2974e9583bTom Erickson	 * Close a small window in which it's possible that the filesystem could
2984e9583bTom Erickson	 * be unmounted and freed, and zfsvfs, though valid in the previous
2994e9583bTom Erickson	 * statement, could point to unrelated memory by the time we try to
3004e9583bTom Erickson	 * prevent the filesystem from being unmounted.
3014e9583bTom Erickson	 */
3024e9583bTom Erickson	rw_enter(&zfsvfs_lock, RW_WRITER);
3034e9583bTom Erickson	if (zfsvfs != ozp->z_zfsvfs) {
3044e9583bTom Erickson		rw_exit(&zfsvfs_lock);
3054e9583bTom Erickson		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck1);
3064e9583bTom Erickson		return (KMEM_CBRC_DONT_KNOW);
3074e9583bTom Erickson	}
3084e9583bTom Erickson
3094e9583bTom Erickson	/*
3104e9583bTom Erickson	 * If the znode is still valid, then so is the file system. We know that
3114e9583bTom Erickson	 * no valid file system can be freed while we hold zfsvfs_lock, so we
3124e9583bTom Erickson	 * can safely ensure that the filesystem is not and will not be
3134e9583bTom Erickson	 * unmounted. The next statement is equivalent to ZFS_ENTER().
314b5fca8ftomee	 */
315c9030f6Alexander Motin	rrm_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
3161484342Matthew Ahrens	if (zfsvfs->z_unmounted) {
3171484342Matthew Ahrens		ZFS_EXIT(zfsvfs);
3184e9583bTom Erickson		rw_exit(&zfsvfs_lock);
319b5fca8ftomee		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted);
320b5fca8ftomee		return (KMEM_CBRC_DONT_KNOW);
321b5fca8ftomee	}
3224e9583bTom Erickson	rw_exit(&zfsvfs_lock);
323b5fca8ftomee
324b5fca8ftomee	mutex_enter(&zfsvfs->z_znodes_lock);
325b5fca8ftomee	/*
326b5fca8ftomee	 * Recheck the vfs pointer in case the znode was removed just before
327b5fca8ftomee	 * acquiring the lock.
328b5fca8ftomee	 */
329b5fca8ftomee	if (zfsvfs != ozp->z_zfsvfs) {
330b5fca8ftomee		mutex_exit(&zfsvfs->z_znodes_lock);
331b5fca8ftomee		ZFS_EXIT(zfsvfs);
3324e9583bTom Erickson		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck2);
333b5fca8ftomee		return (KMEM_CBRC_DONT_KNOW);
334b5fca8ftomee	}
335b5fca8ftomee
336b5fca8ftomee	/*
337b5fca8ftomee	 * At this point we know that as long as we hold z_znodes_lock, the
338b5fca8ftomee	 * znode cannot be freed and fields within the znode can be safely
339a66b2b3Tom Erickson	 * accessed. Now, prevent a race with zfs_zget().
340b5fca8ftomee	 */
341a66b2b3Tom Erickson	if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) {
342a66b2b3Tom Erickson		mutex_exit(&zfsvfs->z_znodes_lock);
343a66b2b3Tom Erickson		ZFS_EXIT(zfsvfs);
344a66b2b3Tom Erickson		ZNODE_STAT_ADD(znode_move_stats.zms_obj_held);
345a66b2b3Tom Erickson		return (KMEM_CBRC_LATER);
346a66b2b3Tom Erickson	}
347a66b2b3Tom Erickson
348b5fca8ftomee	vp = ZTOV(ozp);
349b5fca8ftomee	if (mutex_tryenter(&vp->v_lock) == 0) {
350a66b2b3Tom Erickson		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
351b5fca8ftomee		mutex_exit(&zfsvfs->z_znodes_lock);
352b5fca8ftomee		ZFS_EXIT(zfsvfs);
353b5fca8ftomee		ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked);
354b5fca8ftomee		return (KMEM_CBRC_LATER);
355b5fca8ftomee	}
356a66b2b3Tom Erickson
357b5fca8ftomee	/* Only move znodes that are referenced _only_ by the DNLC. */
358b5fca8ftomee	if (vp->v_count != 1 || !vn_in_dnlc(vp)) {
359b5fca8ftomee		mutex_exit(&vp->v_lock);
360a66b2b3Tom Erickson		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
361b5fca8ftomee		mutex_exit(&zfsvfs->z_znodes_lock);
362b5fca8ftomee		ZFS_EXIT(zfsvfs);
363a66b2b3Tom Erickson		ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc);
364b5fca8ftomee		return (KMEM_CBRC_LATER);
365b5fca8ftomee	}
366b5fca8ftomee
367b5fca8ftomee	/*
368b5fca8ftomee	 * The znode is known and in a valid state to move. We're holding the
369b5fca8ftomee	 * locks needed to execute the critical section.
370b5fca8ftomee	 */
371b5fca8ftomee	zfs_znode_move_impl(ozp, nzp);
372b5fca8ftomee	mutex_exit(&vp->v_lock);
373a66b2b3Tom Erickson	ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
374b5fca8ftomee
375b5fca8ftomee	list_link_replace(&ozp->z_link_node, &nzp->z_link_node);
376b5fca8ftomee	mutex_exit(&zfsvfs->z_znodes_lock);
377b5fca8ftomee	ZFS_EXIT(zfsvfs);
378b5fca8ftomee
379b5fca8ftomee	return (KMEM_CBRC_YES);
380fa9e406ahrens}
381fa9e406ahrens
382fa9e406ahrensvoid
383fa9e406ahrenszfs_znode_init(void)
384fa9e406ahrens{
385fa9e406ahrens	/*
386fa9e406ahrens	 * Initialize zcache
387fa9e406ahrens	 */
3884e9583bTom Erickson	rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL);
389fa9e406ahrens	ASSERT(znode_cache == NULL);
390fa9e406ahrens	znode_cache = kmem_cache_create("zfs_znode_cache",
391fa9e406ahrens	    sizeof (znode_t), 0, zfs_znode_cache_constructor,
392fa9e406ahrens	    zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
393b5fca8ftomee	kmem_cache_set_move(znode_cache, zfs_znode_move);
394fa9e406ahrens}
395fa9e406ahrens
396fa9e406ahrensvoid
397fa9e406ahrenszfs_znode_fini(void)
398fa9e406ahrens{
399fa9e406ahrens	/*
400fa9e406ahrens	 * Cleanup vfs & vnode ops
401fa9e406ahrens	 */
402fa9e406ahrens	zfs_remove_op_tables();
403fa9e406ahrens
404fa9e406ahrens	/*
405fa9e406ahrens	 * Cleanup zcache
406fa9e406ahrens	 */
407fa9e406ahrens	if (znode_cache)
408fa9e406ahrens		kmem_cache_destroy(znode_cache);
409fa9e406ahrens	znode_cache = NULL;
4104e9583bTom Erickson	rw_destroy(&zfsvfs_lock);
411fa9e406ahrens}
412fa9e406ahrens
/*
 * Vnode operation vectors, built by zfs_create_op_tables() and released by
 * zfs_remove_op_tables().  zfs_znode_alloc() picks one per vnode type.
 */
struct vnodeops *zfs_dvnodeops;		/* directories */
struct vnodeops *zfs_fvnodeops;		/* regular and special files */
struct vnodeops *zfs_symvnodeops;	/* symbolic links */
struct vnodeops *zfs_xdvnodeops;	/* extended attribute directories */
struct vnodeops *zfs_evnodeops;		/* any other vnode type */
struct vnodeops *zfs_sharevnodeops;	/* files in the shares directory */
419fa9e406ahrens
420fa9e406ahrensvoid
421fa9e406ahrenszfs_remove_op_tables()
422fa9e406ahrens{
423fa9e406ahrens	/*
424fa9e406ahrens	 * Remove vfs ops
425fa9e406ahrens	 */
426fa9e406ahrens	ASSERT(zfsfstype);
427fa9e406ahrens	(void) vfs_freevfsops_by_type(zfsfstype);
428fa9e406ahrens	zfsfstype = 0;
429fa9e406ahrens
430fa9e406ahrens	/*
431fa9e406ahrens	 * Remove vnode ops
432fa9e406ahrens	 */
433fa9e406ahrens	if (zfs_dvnodeops)
434fa9e406ahrens		vn_freevnodeops(zfs_dvnodeops);
435fa9e406ahrens	if (zfs_fvnodeops)
436fa9e406ahrens		vn_freevnodeops(zfs_fvnodeops);
437fa9e406ahrens	if (zfs_symvnodeops)
438fa9e406ahrens		vn_freevnodeops(zfs_symvnodeops);
439fa9e406ahrens	if (zfs_xdvnodeops)
440fa9e406ahrens		vn_freevnodeops(zfs_xdvnodeops);
441fa9e406ahrens	if (zfs_evnodeops)
442fa9e406ahrens		vn_freevnodeops(zfs_evnodeops);
443743a77eAlan Wright	if (zfs_sharevnodeops)
444743a77eAlan Wright		vn_freevnodeops(zfs_sharevnodeops);
445fa9e406ahrens
446fa9e406ahrens	zfs_dvnodeops = NULL;
447fa9e406ahrens	zfs_fvnodeops = NULL;
448fa9e406ahrens	zfs_symvnodeops = NULL;
449fa9e406ahrens	zfs_xdvnodeops = NULL;
450fa9e406ahrens	zfs_evnodeops = NULL;
451743a77eAlan Wright	zfs_sharevnodeops = NULL;
452fa9e406ahrens}
453fa9e406ahrens
454fa9e406ahrensextern const fs_operation_def_t zfs_dvnodeops_template[];
455fa9e406ahrensextern const fs_operation_def_t zfs_fvnodeops_template[];
456fa9e406ahrensextern const fs_operation_def_t zfs_xdvnodeops_template[];
457fa9e406ahrensextern const fs_operation_def_t zfs_symvnodeops_template[];
458fa9e406ahrensextern const fs_operation_def_t zfs_evnodeops_template[];
459743a77eAlan Wrightextern const fs_operation_def_t zfs_sharevnodeops_template[];
460fa9e406ahrens
461fa9e406ahrensint
462fa9e406ahrenszfs_create_op_tables()
463fa9e406ahrens{
464fa9e406ahrens	int error;
465fa9e406ahrens
466fa9e406ahrens	/*
467fa9e406ahrens	 * zfs_dvnodeops can be set if mod_remove() calls mod_installfs()
468fa9e406ahrens	 * due to a failure to remove the the 2nd modlinkage (zfs_modldrv).
469fa9e406ahrens	 * In this case we just return as the ops vectors are already set up.
470fa9e406ahrens	 */
471fa9e406ahrens	if (zfs_dvnodeops)
472fa9e406ahrens		return (0);
473fa9e406ahrens
474fa9e406ahrens	error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
475fa9e406ahrens	    &zfs_dvnodeops);
476fa9e406ahrens	if (error)
477fa9e406ahrens		return (error);
478fa9e406ahrens
479fa9e406ahrens	error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template,
480fa9e406ahrens	    &zfs_fvnodeops);
481fa9e406ahrens	if (error)
482fa9e406ahrens		return (error);
483fa9e406ahrens
484fa9e406ahrens	error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template,
485fa9e406ahrens	    &zfs_symvnodeops);
486fa9e406ahrens	if (error)
487fa9e406ahrens		return (error);
488fa9e406ahrens
489fa9e406ahrens	error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template,
490fa9e406ahrens	    &zfs_xdvnodeops);
491fa9e406ahrens	if (error)
492fa9e406ahrens		return (error);
493fa9e406ahrens
494fa9e406ahrens	error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template,
495fa9e406ahrens	    &zfs_evnodeops);
496743a77eAlan Wright	if (error)
497743a77eAlan Wright		return (error);
498743a77eAlan Wright
499743a77eAlan Wright	error = vn_make_ops(MNTTYPE_ZFS, zfs_sharevnodeops_template,
500743a77eAlan Wright	    &zfs_sharevnodeops);
501743a77eAlan Wright
502743a77eAlan Wright	return (error);
503743a77eAlan Wright}
504743a77eAlan Wright
5059e1320cMark Shellenbaumint
506743a77eAlan Wrightzfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
507743a77eAlan Wright{
50889459e1Mark Shellenbaum	zfs_acl_ids_t acl_ids;
509743a77eAlan Wright	vattr_t vattr;
510743a77eAlan Wright	znode_t *sharezp;
511743a77eAlan Wright	vnode_t *vp;
512743a77eAlan Wright	znode_t *zp;
513743a77eAlan Wright	int error;
514743a77eAlan Wright
515743a77eAlan Wright	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
516743a77eAlan Wright	vattr.va_type = VDIR;
517743a77eAlan Wright	vattr.va_mode = S_IFDIR|0555;
518743a77eAlan Wright	vattr.va_uid = crgetuid(kcred);
519743a77eAlan Wright	vattr.va_gid = crgetgid(kcred);
520743a77eAlan Wright
521743a77eAlan Wright	sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
522744947dTom Erickson	ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs));
523744947dTom Erickson	sharezp->z_moved = 0;
524743a77eAlan Wright	sharezp->z_unlinked = 0;
525743a77eAlan Wright	sharezp->z_atime_dirty = 0;
526743a77eAlan Wright	sharezp->z_zfsvfs = zfsvfs;
5270a586ceMark Shellenbaum	sharezp->z_is_sa = zfsvfs->z_use_sa;
528f67950bNasf-Fan	sharezp->z_pflags = 0;
529743a77eAlan Wright
530743a77eAlan Wright	vp = ZTOV(sharezp);
531743a77eAlan Wright	vn_reinit(vp);
532743a77eAlan Wright	vp->v_type = VDIR;
533743a77eAlan Wright
53489459e1Mark Shellenbaum	VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
53589459e1Mark Shellenbaum	    kcred, NULL, &acl_ids));
5360a586ceMark Shellenbaum	zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids);
537743a77eAlan Wright	ASSERT3P(zp, ==, sharezp);
538743a77eAlan Wright	ASSERT(!vn_in_dnlc(ZTOV(sharezp))); /* not valid to move */
539743a77eAlan Wright	POINTER_INVALIDATE(&sharezp->z_zfsvfs);
540743a77eAlan Wright	error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
541743a77eAlan Wright	    ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
542743a77eAlan Wright	zfsvfs->z_shares_dir = sharezp->z_id;
543743a77eAlan Wright
54489459e1Mark Shellenbaum	zfs_acl_ids_free(&acl_ids);
545743a77eAlan Wright	ZTOV(sharezp)->v_count = 0;
5460a586ceMark Shellenbaum	sa_handle_destroy(sharezp->z_sa_hdl);
547743a77eAlan Wright	kmem_cache_free(znode_cache, sharezp);
548fa9e406ahrens
549fa9e406ahrens	return (error);
550fa9e406ahrens}
551fa9e406ahrens
/*
 * Fallback definitions of the 64-bit device number layout, so that the
 * values are available in both 64-bit and 32-bit environments.
 */
#if !defined(NBITSMINOR64)
#define	NBITSMINOR64	32
#endif
#if !defined(MAXMAJ64)
#define	MAXMAJ64	0xffffffffUL
#endif
#if !defined(MAXMIN64)
#define	MAXMIN64	0xffffffffUL
#endif
56572fc53bmarks
/*
 * ZFS-private replacement for expldev().  The standard expldev() expands a
 * dev32_t to a long dev_t in LP64, which is not what is wanted here: this
 * routine expands a dev32_t to the long form in ILP32.
 *
 * In LP64 the device number is returned unchanged.  In ILP32 the major and
 * minor numbers are extracted from the 32-bit layout and repacked using
 * NBITSMINOR64.
 */
static uint64_t
zfs_expldev(dev_t dev)
{
#ifdef _LP64
	return (dev);
#else
	major_t dmaj = (major_t)dev >> NBITSMINOR32 & MAXMAJ32;
	minor_t dmin = (minor_t)dev & MAXMIN32;

	return (((uint64_t)dmaj << NBITSMINOR64) | dmin);
#endif
}
58572fc53bmarks
/*
 * ZFS-private replacement for cmpldev().  The standard cmpldev() compresses
 * a long dev_t to a dev32_t in LP64, which is not what is wanted here: this
 * routine compacts a long device number to a dev32_t in ILP32.
 *
 * In LP64 the device number is returned unchanged.  In ILP32 the result is
 * NODEV32 when the major or minor number does not fit the 32-bit layout.
 */
dev_t
zfs_cmpldev(uint64_t dev)
{
#ifdef _LP64
	return (dev);
#else
	minor_t dmin = (minor_t)dev & MAXMIN64;
	major_t dmaj = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;

	if (dmaj <= MAXMAJ32 && dmin <= MAXMIN32)
		return (((dev32_t)dmaj << NBITSMINOR32) | dmin);

	return (NODEV32);
#endif
}
60872fc53bmarks
6094ccbb6eahrensstatic void
6100a586ceMark Shellenbaumzfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
6110a586ceMark Shellenbaum    dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
6124ccbb6eahrens{
613b5fca8ftomee	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
614b5fca8ftomee	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));
6154ccbb6eahrens
6164ccbb6eahrens	mutex_enter(&zp->z_lock);
6174ccbb6eahrens
6180a586ceMark Shellenbaum	ASSERT(zp->z_sa_hdl == NULL);
6196638ae1Mark Shellenbaum	ASSERT(zp->z_acl_cached == NULL);
6200a586ceMark Shellenbaum	if (sa_hdl == NULL) {
6210a586ceMark Shellenbaum		VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
6220a586ceMark Shellenbaum		    SA_HDL_SHARED, &zp->z_sa_hdl));
6230a586ceMark Shellenbaum	} else {
6240a586ceMark Shellenbaum		zp->z_sa_hdl = sa_hdl;
6250a586ceMark Shellenbaum		sa_set_userp(sa_hdl, zp);
6260a586ceMark Shellenbaum	}
6274ccbb6eahrens
6280a586ceMark Shellenbaum	zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
6294ccbb6eahrens
6304ccbb6eahrens	/*
6314ccbb6eahrens	 * Slap on VROOT if we are the root znode
6324ccbb6eahrens	 */
6334ccbb6eahrens	if (zp->z_id == zfsvfs->z_root)
6344ccbb6eahrens		ZTOV(zp)->v_flag |= VROOT;
6354ccbb6eahrens
6364ccbb6eahrens	mutex_exit(&zp->z_lock);
6374ccbb6eahrens	vn_exists(ZTOV(zp));
6384ccbb6eahrens}
6394ccbb6eahrens
640874395dmaybeevoid
6414ccbb6eahrenszfs_znode_dmu_fini(znode_t *zp)
6424ccbb6eahrens{
643b5fca8ftomee	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) ||
644b5fca8ftomee	    zp->z_unlinked ||
645874395dmaybee	    RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock));
6460a586ceMark Shellenbaum
6470a586ceMark Shellenbaum	sa_handle_destroy(zp->z_sa_hdl);
6480a586ceMark Shellenbaum	zp->z_sa_hdl = NULL;
6494ccbb6eahrens}
6504ccbb6eahrens
65172fc53bmarks/*
652fa9e406ahrens * Construct a new znode/vnode and intialize.
653fa9e406ahrens *
654fa9e406ahrens * This does not do a call to dmu_set_user() that is
655fa9e406ahrens * up to the caller to do, in case you don't want to
656fa9e406ahrens * return the znode
657fa9e406ahrens */
658ea8dc4beschrockstatic znode_t *
6590a586ceMark Shellenbaumzfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
6600a586ceMark Shellenbaum    dmu_object_type_t obj_type, sa_handle_t *hdl)
661fa9e406ahrens{
662fa9e406ahrens	znode_t	*zp;
663fa9e406ahrens	vnode_t *vp;
6640a586ceMark Shellenbaum	uint64_t mode;
6650a586ceMark Shellenbaum	uint64_t parent;
666f67950bNasf-Fan	uint64_t projid = ZFS_DEFAULT_PROJID;
667f67950bNasf-Fan	sa_bulk_attr_t bulk[11];
6680a586ceMark Shellenbaum	int count = 0;
669fa9e406ahrens
670fa9e406ahrens	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
671fa9e406ahrens
672fa9e406ahrens	ASSERT(zp->z_dirlocks == NULL);
673b5fca8ftomee	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
674744947dTom Erickson	zp->z_moved = 0;
675fa9e406ahrens
676b5fca8ftomee	/*
677b5fca8ftomee	 * Defer setting z_zfsvfs until the znode is ready to be a candidate for
678b5fca8ftomee	 * the zfs_znode_move() callback.
679b5fca8ftomee	 */
6800a586ceMark Shellenbaum	zp->z_sa_hdl = NULL;
681893a6d3ahrens	zp->z_unlinked = 0;
682fa9e406ahrens	zp->z_atime_dirty = 0;
683fa9e406ahrens	zp->z_mapcnt = 0;
6844ccbb6eahrens	zp->z_id = db->db_object;
685fa9e406ahrens	zp->z_blksz = blksz;
686fa9e406ahrens	zp->z_seq = 0x7A4653;
68767bd71cperrin	zp->z_sync_cnt = 0;
6884ccbb6eahrens
6894ccbb6eahrens	vp = ZTOV(zp);
6904ccbb6eahrens	vn_reinit(vp);
6914ccbb6eahrens
6920a586ceMark Shellenbaum	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
6930a586ceMark Shellenbaum
6940a586ceMark Shellenbaum	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
6950a586ceMark Shellenbaum	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8);
6960a586ceMark Shellenbaum	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
6970a586ceMark Shellenbaum	    &zp->z_size, 8);
6980a586ceMark Shellenbaum	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
6990a586ceMark Shellenbaum	    &zp->z_links, 8);
7000a586ceMark Shellenbaum	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
7010a586ceMark Shellenbaum	    &zp->z_pflags, 8);
7020a586ceMark Shellenbaum	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
7030a586ceMark Shellenbaum	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
7040a586ceMark Shellenbaum	    &zp->z_atime, 16);
7050a586ceMark Shellenbaum	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
706f1696b2Mark Shellenbaum	    &zp->z_uid, 8);
7070a586ceMark Shellenbaum	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
708f1696b2Mark Shellenbaum	    &zp->z_gid, 8);
7090a586ceMark Shellenbaum
710f67950bNasf-Fan	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0 ||
711f67950bNasf-Fan	    (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
712f67950bNasf-Fan	    (zp->z_pflags & ZFS_PROJID) &&
713f67950bNasf-Fan	    sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) {
7140a586ceMark Shellenbaum		if (hdl == NULL)
7150a586ceMark Shellenbaum			sa_handle_destroy(zp->z_sa_hdl);
7160a586ceMark Shellenbaum		kmem_cache_free(znode_cache, zp);
7170a586ceMark Shellenbaum		return (NULL);
7180a586ceMark Shellenbaum	}
719fa9e406ahrens
720f67950bNasf-Fan	zp->z_projid = projid;
7210a586ceMark Shellenbaum	zp->z_mode = mode;
722fa9e406ahrens	vp->v_vfsp = zfsvfs->z_parent->z_vfs;
7230a586ceMark Shellenbaum
7240a586ceMark Shellenbaum	vp->v_type = IFTOVT((mode_t)mode);
725fa9e406ahrens
726fa9e406ahrens	switch (vp->v_type) {
727fa9e406ahrens	case VDIR:
7280a586ceMark Shellenbaum		if (zp->z_pflags & ZFS_XATTR) {
729fa9e406ahrens			vn_setops(vp, zfs_xdvnodeops);
730fa9e406ahrens			vp->v_flag |= V_XATTRDIR;
7314ccbb6eahrens		} else {
732fa9e406ahrens			vn_setops(vp, zfs_dvnodeops);
7334ccbb6eahrens		}
7347f6e3e7perrin		zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
735fa9e406ahrens		break;
736fa9e406ahrens	case VBLK:
737fa9e406ahrens	case VCHR:
7380a586ceMark Shellenbaum		{
7390a586ceMark Shellenbaum			uint64_t rdev;
7400a586ceMark Shellenbaum			VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zfsvfs),
7410a586ceMark Shellenbaum			    &rdev, sizeof (rdev)) == 0);
7420a586ceMark Shellenbaum
7430a586ceMark Shellenbaum			vp->v_rdev = zfs_cmpldev(rdev);
7440a586ceMark Shellenbaum		}
745fa9e406ahrens		/*FALLTHROUGH*/
746fa9e406ahrens	case VFIFO:
747fa9e406ahrens	case VSOCK:
748fa9e406ahrens	case VDOOR:
749fa9e406ahrens		vn_setops(vp, zfs_fvnodeops);
750fa9e406ahrens		break;
751fa9e406ahrens	case VREG:
752fa9e406ahrens		vp->v_flag |= VMODSORT;
7530a586ceMark Shellenbaum		if (parent == zfsvfs->z_shares_dir) {
754f1696b2Mark Shellenbaum			ASSERT(zp->z_uid == 0 && zp->z_gid == 0);
755743a77eAlan Wright			vn_setops(vp, zfs_sharevnodeops);
7560a586ceMark Shellenbaum		} else {
757743a77eAlan Wright			vn_setops(vp, zfs_fvnodeops);
7580a586ceMark Shellenbaum		}
759fa9e406ahrens		break;
760fa9e406ahrens	case VLNK:
761fa9e406ahrens		vn_setops(vp, zfs_symvnodeops);
762fa9e406ahrens		break;
763fa9e406ahrens	default:
764fa9e406ahrens		vn_setops(vp, zfs_evnodeops);
765fa9e406ahrens		break;
766fa9e406ahrens	}
767fa9e406ahrens
768b5fca8ftomee	mutex_enter(&zfsvfs->z_znodes_lock);
769b5fca8ftomee	list_insert_tail(&zfsvfs->z_all_znodes, zp);
770b5fca8ftomee	membar_producer();
771b5fca8ftomee	/*
772b5fca8ftomee	 * Everything else must be valid before assigning z_zfsvfs makes the
773b5fca8ftomee	 * znode eligible for zfs_znode_move().
774b5fca8ftomee	 */
775b5fca8ftomee	zp->z_zfsvfs = zfsvfs;
776b5fca8ftomee	mutex_exit(&zfsvfs->z_znodes_lock);
777b5fca8ftomee
778874395dmaybee	VFS_HOLD(zfsvfs->z_vfs);
779fa9e406ahrens	return (zp);
780fa9e406ahrens}
781fa9e406ahrens
7820a586ceMark Shellenbaumstatic uint64_t empty_xattr;	/* static storage: zero unless written */
7830a586ceMark Shellenbaumstatic uint64_t pad[4];		/* static storage: zero unless written */
7840a586ceMark Shellenbaumstatic zfs_acl_phys_t acl_phys;	/* NOTE(review): presumably a zeroed placeholder -- confirm */
785fa9e406ahrens/*
786fa9e406ahrens * Create a new DMU object to hold a zfs znode.
787fa9e406ahrens *
788fa9e406ahrens *	IN:	dzp	- parent directory for new znode
789fa9e406ahrens *		vap	- file attributes for new znode
790fa9e406ahrens *		tx	- dmu transaction id for zap operations
791fa9e406ahrens *		cr	- credentials of caller
792fa9e406ahrens *		flag	- flags:
793fa9e406ahrens *			  IS_ROOT_NODE	- new object will be root
794fa9e406ahrens *			  IS_XATTR	- new object is an attribute
795da6c28aamw *		acl_ids	- caller-supplied ACL state for the new znode:
796da6c28aamw *			  its mode, owner/group FUIDs, and the ACL
797da6c28aamw *			  whose hints are merged into the flags
798fa9e406ahrens *
7994ccbb6eahrens *	OUT:	zpp	- allocated znode
800fa9e406ahrens *
801fa9e406ahrens */
802fa9e406ahrensvoid
8034ccbb6eahrenszfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
8040a586ceMark Shellenbaum    uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
805fa9e406ahrens{
8060a586ceMark Shellenbaum	uint64_t	crtime[2], atime[2], mtime[2], ctime[2];
8070a586ceMark Shellenbaum	uint64_t	mode, size, links, parent, pflags;
808744947dTom Erickson	uint64_t	dzp_pflags = 0;
809f67950bNasf-Fan	uint64_t	projid = ZFS_DEFAULT_PROJID;
8100a586ceMark Shellenbaum	uint64_t	rdev = 0;
811fa9e406ahrens	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
8120a586ceMark Shellenbaum	dmu_buf_t	*db;
813fa9e406ahrens	timestruc_t	now;
8144ccbb6eahrens	uint64_t	gen, obj;
8150a586ceMark Shellenbaum	int		bonuslen;
81654811daToomas Soome	int		dnodesize;
8170a586ceMark Shellenbaum	sa_handle_t	*sa_hdl;
8180a586ceMark Shellenbaum	dmu_object_type_t obj_type;
81954811daToomas Soome	sa_bulk_attr_t	*sa_attrs;
8200a586ceMark Shellenbaum	int		cnt = 0;
8210a586ceMark Shellenbaum	zfs_acl_locator_cb_t locate = { 0 };
822fa9e406ahrens
823fa9e406ahrens	ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
824fa9e406ahrens
8251209a47Neil Perrin	if (zfsvfs->z_replay) {
8264ccbb6eahrens		obj = vap->va_nodeid;
827fa9e406ahrens		now = vap->va_ctime;		/* see zfs_replay_create() */
828fa9e406ahrens		gen = vap->va_nblocks;		/* ditto */
82954811daToomas Soome		dnodesize = vap->va_fsid;	/* ditto */
830fa9e406ahrens	} else {
8314ccbb6eahrens		obj = 0;
832fa9e406ahrens		gethrestime(&now);
833fa9e406ahrens		gen = dmu_tx_get_txg(tx);
83454811daToomas Soome		dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
835fa9e406ahrens	}
836fa9e406ahrens
83754811daToomas Soome	if (dnodesize == 0)
83854811daToomas Soome		dnodesize = DNODE_MIN_SIZE;
83954811daToomas Soome
8400a586ceMark Shellenbaum	obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
8410a586ceMark Shellenbaum	bonuslen = (obj_type == DMU_OT_SA) ?
84254811daToomas Soome	    DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;
8430a586ceMark Shellenbaum
844fa9e406ahrens	/*
845fa9e406ahrens	 * Create a new DMU object.
846fa9e406ahrens	 */
847ea8dc4beschrock	/*
848ea8dc4beschrock	 * There's currently no mechanism for pre-reading the blocks that will
849744947dTom Erickson	 * be needed to allocate a new object, so we accept the small chance
850ea8dc4beschrock	 * that there will be an i/o error and we will fail one of the
851ea8dc4beschrock	 * assertions below.
852ea8dc4beschrock	 */
853fa9e406ahrens	if (vap->va_type == VDIR) {
8544a1f0ccMark Shellenbaum		if (zfsvfs->z_replay) {
85554811daToomas Soome			VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
856da6c28aamw			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
85754811daToomas Soome			    obj_type, bonuslen, dnodesize, tx));
858fa9e406ahrens		} else {
85954811daToomas Soome			obj = zap_create_norm_dnsize(zfsvfs->z_os,
860da6c28aamw			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
86154811daToomas Soome			    obj_type, bonuslen, dnodesize, tx);
862fa9e406ahrens		}
863fa9e406ahrens	} else {
8644a1f0ccMark Shellenbaum		if (zfsvfs->z_replay) {
86554811daToomas Soome			VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
866fa9e406ahrens			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
86754811daToomas Soome			    obj_type, bonuslen, dnodesize, tx));
868fa9e406ahrens		} else {
86954811daToomas Soome			obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
870fa9e406ahrens			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
87154811daToomas Soome			    obj_type, bonuslen, dnodesize, tx);
872fa9e406ahrens		}
873fa9e406ahrens	}
87459e7834Mark Shellenbaum
87559e7834Mark Shellenbaum	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
87654811daToomas Soome	VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
877fa9e406ahrens
878fa9e406ahrens	/*
879fa9e406ahrens	 * If this is the root, fix up the half-initialized parent pointer
880fa9e406ahrens	 * to reference the just-allocated physical data area.
881fa9e406ahrens	 */
882fa9e406ahrens	if (flag & IS_ROOT_NODE) {
8834ccbb6eahrens		dzp->z_id = obj;
884fa9e406ahrens	}
885fa9e406ahrens
886fa9e406ahrens	/*
887fa9e406ahrens	 * If parent is an xattr, so am I.
888fa9e406ahrens	 */
889f67950bNasf-Fan	if (dzp->z_pflags & ZFS_XATTR) {
890fa9e406ahrens		flag |= IS_XATTR;
891fa9e406ahrens	}
892fa9e406ahrens
893da6c28aamw	if (zfsvfs->z_use_fuids)
8940a586ceMark Shellenbaum		pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
8950a586ceMark Shellenbaum	else
8960a586ceMark Shellenbaum		pflags = 0;
897da6c28aamw
898fa9e406ahrens	if (vap->va_type == VDIR) {
8990a586ceMark Shellenbaum		size = 2;		/* contents ("." and "..") */
9000a586ceMark Shellenbaum		links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
9010a586ceMark Shellenbaum	} else {
9020a586ceMark Shellenbaum		size = links = 0;
9030a586ceMark Shellenbaum	}
9040a586ceMark Shellenbaum
9050a586ceMark Shellenbaum	if (vap->va_type == VBLK || vap->va_type == VCHR) {
9060a586ceMark Shellenbaum		rdev = zfs_expldev(vap->va_rdev);
907fa9e406ahrens	}
908fa9e406ahrens
9090a586ceMark Shellenbaum	parent = dzp->z_id;
9100a586ceMark Shellenbaum	mode = acl_ids->z_mode;
911fa9e406ahrens	if (flag & IS_XATTR)
9120a586ceMark Shellenbaum		pflags |= ZFS_XATTR;
913fa9e406ahrens
914f67950bNasf-Fan	if (vap->va_type == VREG || vap->va_type == VDIR) {
915f67950bNasf-Fan		/*
916f67950bNasf-Fan		 * With ZFS_PROJID flag, we can easily know whether there is
917f67950bNasf-Fan		 * project ID stored on disk or not. See zfs_space_delta_cb().
918f67950bNasf-Fan		 */
919f67950bNasf-Fan		if (obj_type != DMU_OT_ZNODE &&
920f67950bNasf-Fan		    dmu_objset_projectquota_enabled(zfsvfs->z_os))
921f67950bNasf-Fan			pflags |= ZFS_PROJID;
922f67950bNasf-Fan
923f67950bNasf-Fan		/*
924f67950bNasf-Fan		 * Inherit project ID from parent if required.
925f67950bNasf-Fan		 */
926f67950bNasf-Fan		projid = zfs_inherit_projid(dzp);
927f67950bNasf-Fan		if (dzp->z_pflags & ZFS_PROJINHERIT)
928f67950bNasf-Fan			pflags |= ZFS_PROJINHERIT;
929f67950bNasf-Fan	}
930f67950bNasf-Fan
9310a586ceMark Shellenbaum	/*
9320a586ceMark Shellenbaum	 * No execs denied will be determined when zfs_mode_compute() is called.
9330a586ceMark Shellenbaum	 */
9340a586ceMark Shellenbaum	pflags |= acl_ids->z_aclp->z_hints &
9350a586ceMark Shellenbaum	    (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
9360a586ceMark Shellenbaum	    ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);
937fa9e406ahrens
9380a586ceMark Shellenbaum	ZFS_TIME_ENCODE(&now, crtime);
9390a586ceMark Shellenbaum	ZFS_TIME_ENCODE(&now, ctime);
940fa9e406ahrens
941fa9e406ahrens	if (vap->va_mask & AT_ATIME) {
9420a586ceMark Shellenbaum		ZFS_TIME_ENCODE(&vap->va_atime, atime);
943fa9e406ahrens	} else {
9440a586ceMark Shellenbaum		ZFS_TIME_ENCODE(&now, atime);
945fa9e406ahrens	}
946fa9e406ahrens
947fa9e406ahrens	if (vap->va_mask & AT_MTIME) {
9480a586ceMark Shellenbaum		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
9490a586ceMark Shellenbaum	} else {
9500a586ceMark Shellenbaum		ZFS_TIME_ENCODE(&now, mtime);
9510a586ceMark Shellenbaum	}
9520a586ceMark Shellenbaum
9530a586ceMark Shellenbaum	/* Now add in all of the "SA" attributes */
9540a586ceMark Shellenbaum	VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
9550a586ceMark Shellenbaum	    &sa_hdl));
9560a586ceMark Shellenbaum
9570a586ceMark Shellenbaum	/*
9580a586ceMark Shellenbaum	 * Set up the array of attributes to be replaced/set on the new file.
9590a586ceMark Shellenbaum	 *
9600a586ceMark Shellenbaum	 * Order for DMU_OT_ZNODE is critical since it needs to be constructed
9610a586ceMark Shellenbaum	 * in the old znode_phys_t format.  Don't change this ordering.
9620a586ceMark Shellenbaum	 */
96354811daToomas Soome	sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
9640a586ceMark Shellenbaum
9650a586ceMark Shellenbaum	if (obj_type == DMU_OT_ZNODE) {
9660a586ceMark Shellenbaum		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
9670a586ceMark Shellenbaum		    NULL, &atime, 16);
9680a586ceMark Shellenbaum		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
9690a586ceMark Shellenbaum		    NULL, &mtime, 16);
9700a586ceMark Shellenbaum		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
9710a586ceMark Shellenbaum		    NULL, &ctime, 16);
9720a586ceMark Shellenbaum		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
9730a586ceMark Shellenbaum		    NULL, &crtime, 16);
9740a586ceMark Shellenbaum		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
9750a586ceMark Shellenbaum		    NULL, &gen, 8);
9760a586ceMark Shellenbaum		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
9770a586ceMark Shellenbaum		    NULL, &mode, 8);
9780a586ceMark Shellenbaum		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
9790a586ceMark Shellenbaum		    NULL, &size, 8);
9800a586ceMark Shellenbaum		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
9810a586ceMark Shellenbaum		    NULL, &parent, 8);
982fa9e406ahrens	} else {
9830a586ceMark Shellenbaum		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
9840a586ceMark Shellenbaum		    NULL, &mode, 8);
9850a586ceMark Shellenbaum		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
9860a586ceMark Shellenbaum		    NULL, &size, 8);
9870a586ceMark Shellenbaum		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
9880a586ceMark Shellenbaum		    NULL, &gen, 8);
98954811daToomas Soome		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
99054811daToomas Soome		    NULL, &acl_ids->z_fuid, 8);
99154811daToomas Soome		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
99254811daToomas Soome		    NULL, &acl_ids->z_fgid, 8);
9930a586ceMark Shellenbaum		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
9940a586ceMark Shellenbaum		    NULL, &parent, 8);
9950a586ceMark Shellenbaum		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
9960a586ceMark Shellenbaum		    NULL, &pflags, 8);
9970a586ceMark Shellenbaum		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
9980a586ceMark Shellenbaum		    NULL, &atime, 16);
9990a586ceMark Shellenbaum		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
10000a586ceMark Shellenbaum		    NULL, &mtime, 16);
1001