xref: /illumos-gate/usr/src/uts/common/fs/zfs/zfs_znode.c (revision 54811da5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
24  * Copyright (c) 2014 Integros [integros.com]
25  */
26 
27 /* Portions Copyright 2007 Jeremy Teo */
28 
29 #ifdef _KERNEL
30 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/time.h>
33 #include <sys/systm.h>
34 #include <sys/sysmacros.h>
35 #include <sys/resource.h>
36 #include <sys/mntent.h>
37 #include <sys/mkdev.h>
38 #include <sys/u8_textprep.h>
39 #include <sys/dsl_dataset.h>
40 #include <sys/vfs.h>
41 #include <sys/vfs_opreg.h>
42 #include <sys/vnode.h>
43 #include <sys/file.h>
44 #include <sys/kmem.h>
45 #include <sys/errno.h>
46 #include <sys/unistd.h>
47 #include <sys/mode.h>
48 #include <sys/atomic.h>
49 #include <vm/pvn.h>
50 #include "fs/fs_subr.h"
51 #include <sys/zfs_dir.h>
52 #include <sys/zfs_acl.h>
53 #include <sys/zfs_ioctl.h>
54 #include <sys/zfs_rlock.h>
55 #include <sys/zfs_fuid.h>
56 #include <sys/dnode.h>
57 #include <sys/fs/zfs.h>
58 #include <sys/kidmap.h>
59 #endif /* _KERNEL */
60 
61 #include <sys/dmu.h>
62 #include <sys/dmu_objset.h>
63 #include <sys/dmu_tx.h>
64 #include <sys/refcount.h>
65 #include <sys/stat.h>
66 #include <sys/zap.h>
67 #include <sys/zfs_znode.h>
68 #include <sys/sa.h>
69 #include <sys/zfs_sa.h>
70 #include <sys/zfs_stat.h>
71 
72 #include "zfs_prop.h"
73 #include "zfs_comutil.h"
74 
75 /*
76  * Define ZNODE_STATS to turn on statistic gathering. By default, it is only
77  * turned on when DEBUG is also defined.
78  */
79 #ifdef	DEBUG
80 #define	ZNODE_STATS
81 #endif	/* DEBUG */
82 
83 #ifdef	ZNODE_STATS
84 #define	ZNODE_STAT_ADD(stat)			((stat)++)
85 #else
86 #define	ZNODE_STAT_ADD(stat)			/* nothing */
87 #endif	/* ZNODE_STATS */
88 
89 /*
90  * Functions needed for userland (ie: libzpool) are not put under
91  * #ifdef_KERNEL; the rest of the functions have dependencies
92  * (such as VFS logic) that will not compile easily in userland.
93  */
94 #ifdef _KERNEL
95 /*
96  * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to
97  * be freed before it can be safely accessed.
98  */
99 krwlock_t zfsvfs_lock;
100 
101 static kmem_cache_t *znode_cache = NULL;
102 
/*ARGSUSED*/
/*
 * dbuf eviction callback installed on a znode's SA buffer.  A znode's
 * eviction callback must be cleared before the last dbuf reference is
 * dropped, so reaching this function at all is a fatal logic error.
 */
static void
znode_evict_error(dmu_buf_t *dbuf, void *user_ptr)
{
	/*
	 * We should never drop all dbuf refs without first clearing
	 * the eviction callback.
	 */
	panic("evicting znode %p\n", user_ptr);
}
113 
114 /*
115  * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
116  * z_rangelock. It will modify the offset and length of the lock to reflect
117  * znode-specific information, and convert RL_APPEND to RL_WRITER.  This is
118  * called with the rangelock_t's rl_lock held, which avoids races.
119  */
120 static void
121 zfs_rangelock_cb(locked_range_t *new, void *arg)
122 {
123 	znode_t *zp = arg;
124 
125 	/*
126 	 * If in append mode, convert to writer and lock starting at the
127 	 * current end of file.
128 	 */
129 	if (new->lr_type == RL_APPEND) {
130 		new->lr_offset = zp->z_size;
131 		new->lr_type = RL_WRITER;
132 	}
133 
134 	/*
135 	 * If we need to grow the block size then lock the whole file range.
136 	 */
137 	uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
138 	if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
139 	    zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
140 		new->lr_offset = 0;
141 		new->lr_length = UINT64_MAX;
142 	}
143 }
144 
/*ARGSUSED*/
/*
 * kmem cache constructor: set up the per-znode locks, the embedded
 * rangelock, and the paired vnode.  Returns -1 (constructor failure)
 * if a vnode cannot be allocated under the given kmflags.
 */
static int
zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
{
	znode_t *zp = buf;

	/* A cached znode must not claim membership in any filesystem. */
	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));

	zp->z_vnode = vn_alloc(kmflags);
	if (zp->z_vnode == NULL) {
		return (-1);
	}
	/* Link the vnode back to its znode. */
	ZTOV(zp)->v_data = zp;

	list_link_init(&zp->z_link_node);

	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
	rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);

	rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);

	zp->z_dirlocks = NULL;
	zp->z_acl_cached = NULL;
	zp->z_moved = 0;
	return (0);
}
173 
/*ARGSUSED*/
/*
 * kmem cache destructor: mirror of the constructor.  Frees the paired
 * vnode and tears down the locks and rangelock.  The ASSERTs verify the
 * znode was fully quiesced (detached from any filesystem, off the
 * z_all_znodes list, no dirlocks, no cached ACL) before being freed.
 */
static void
zfs_znode_cache_destructor(void *buf, void *arg)
{
	znode_t *zp = buf;

	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
	ASSERT(ZTOV(zp)->v_data == zp);
	vn_free(ZTOV(zp));
	ASSERT(!list_link_active(&zp->z_link_node));
	mutex_destroy(&zp->z_lock);
	rw_destroy(&zp->z_parent_lock);
	rw_destroy(&zp->z_name_lock);
	mutex_destroy(&zp->z_acl_lock);
	rangelock_fini(&zp->z_rangelock);

	ASSERT(zp->z_dirlocks == NULL);
	ASSERT(zp->z_acl_cached == NULL);
}
193 
194 #ifdef	ZNODE_STATS
195 static struct {
196 	uint64_t zms_zfsvfs_invalid;
197 	uint64_t zms_zfsvfs_recheck1;
198 	uint64_t zms_zfsvfs_unmounted;
199 	uint64_t zms_zfsvfs_recheck2;
200 	uint64_t zms_obj_held;
201 	uint64_t zms_vnode_locked;
202 	uint64_t zms_not_only_dnlc;
203 } znode_move_stats;
204 #endif	/* ZNODE_STATS */
205 
/*
 * Transfer all state from znode ozp to its kmem-relocated replacement
 * nzp.  Called from zfs_znode_move() with all required locks held; the
 * old znode must be idle (no dirlocks) and, per the caller's checks,
 * referenced only by the DNLC.  On return ozp is invalidated so any
 * later move callback will recognize it as dead.
 */
static void
zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
{
	vnode_t *vp;

	/* Copy fields. */
	nzp->z_zfsvfs = ozp->z_zfsvfs;

	/* Swap vnodes. */
	vp = nzp->z_vnode;
	nzp->z_vnode = ozp->z_vnode;
	ozp->z_vnode = vp; /* let destructor free the overwritten vnode */
	ZTOV(ozp)->v_data = ozp;
	ZTOV(nzp)->v_data = nzp;

	nzp->z_id = ozp->z_id;
	ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */
	nzp->z_unlinked = ozp->z_unlinked;
	nzp->z_atime_dirty = ozp->z_atime_dirty;
	nzp->z_zn_prefetch = ozp->z_zn_prefetch;
	nzp->z_blksz = ozp->z_blksz;
	nzp->z_seq = ozp->z_seq;
	nzp->z_mapcnt = ozp->z_mapcnt;
	nzp->z_gen = ozp->z_gen;
	nzp->z_sync_cnt = ozp->z_sync_cnt;
	nzp->z_is_sa = ozp->z_is_sa;
	nzp->z_sa_hdl = ozp->z_sa_hdl;
	bcopy(ozp->z_atime, nzp->z_atime, sizeof (uint64_t) * 2);
	nzp->z_links = ozp->z_links;
	nzp->z_size = ozp->z_size;
	nzp->z_pflags = ozp->z_pflags;
	nzp->z_uid = ozp->z_uid;
	nzp->z_gid = ozp->z_gid;
	nzp->z_mode = ozp->z_mode;

	/*
	 * Since this is just an idle znode and kmem is already dealing with
	 * memory pressure, release any cached ACL.
	 */
	if (ozp->z_acl_cached) {
		zfs_acl_free(ozp->z_acl_cached);
		ozp->z_acl_cached = NULL;
	}

	/* Point the shared SA handle's user data at the new znode. */
	sa_set_userp(nzp->z_sa_hdl, nzp);

	/*
	 * Invalidate the original znode by clearing fields that provide a
	 * pointer back to the znode. Set the low bit of the vfs pointer to
	 * ensure that zfs_znode_move() recognizes the znode as invalid in any
	 * subsequent callback.
	 */
	ozp->z_sa_hdl = NULL;
	POINTER_INVALIDATE(&ozp->z_zfsvfs);

	/*
	 * Mark the znode.
	 */
	nzp->z_moved = 1;
	ozp->z_moved = (uint8_t)-1;
}
267 
/*ARGSUSED*/
/*
 * kmem cache move callback: attempt to relocate znode 'buf' into the
 * freshly-constructed 'newbuf'.  Returns KMEM_CBRC_YES on success,
 * KMEM_CBRC_DONT_KNOW when the znode cannot be identified or its
 * filesystem is going away, and KMEM_CBRC_LATER when transient lock or
 * hold contention prevents a safe move right now.  The body is a series
 * of progressively stronger validity checks, each taken under a new
 * lock, before zfs_znode_move_impl() does the actual transfer.
 */
static kmem_cbrc_t
zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
{
	znode_t *ozp = buf, *nzp = newbuf;
	zfsvfs_t *zfsvfs;
	vnode_t *vp;

	/*
	 * The znode is on the file system's list of known znodes if the vfs
	 * pointer is valid. We set the low bit of the vfs pointer when freeing
	 * the znode to invalidate it, and the memory patterns written by kmem
	 * (baddcafe and deadbeef) set at least one of the two low bits. A newly
	 * created znode sets the vfs pointer last of all to indicate that the
	 * znode is known and in a valid state to be moved by this function.
	 */
	zfsvfs = ozp->z_zfsvfs;
	if (!POINTER_IS_VALID(zfsvfs)) {
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * Close a small window in which it's possible that the filesystem could
	 * be unmounted and freed, and zfsvfs, though valid in the previous
	 * statement, could point to unrelated memory by the time we try to
	 * prevent the filesystem from being unmounted.
	 */
	rw_enter(&zfsvfs_lock, RW_WRITER);
	if (zfsvfs != ozp->z_zfsvfs) {
		rw_exit(&zfsvfs_lock);
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck1);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * If the znode is still valid, then so is the file system. We know that
	 * no valid file system can be freed while we hold zfsvfs_lock, so we
	 * can safely ensure that the filesystem is not and will not be
	 * unmounted. The next statement is equivalent to ZFS_ENTER().
	 */
	rrm_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
	if (zfsvfs->z_unmounted) {
		ZFS_EXIT(zfsvfs);
		rw_exit(&zfsvfs_lock);
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted);
		return (KMEM_CBRC_DONT_KNOW);
	}
	rw_exit(&zfsvfs_lock);

	mutex_enter(&zfsvfs->z_znodes_lock);
	/*
	 * Recheck the vfs pointer in case the znode was removed just before
	 * acquiring the lock.
	 */
	if (zfsvfs != ozp->z_zfsvfs) {
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck2);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * At this point we know that as long as we hold z_znodes_lock, the
	 * znode cannot be freed and fields within the znode can be safely
	 * accessed. Now, prevent a race with zfs_zget().
	 */
	if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) {
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_obj_held);
		return (KMEM_CBRC_LATER);
	}

	vp = ZTOV(ozp);
	if (mutex_tryenter(&vp->v_lock) == 0) {
		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked);
		return (KMEM_CBRC_LATER);
	}

	/* Only move znodes that are referenced _only_ by the DNLC. */
	if (vp->v_count != 1 || !vn_in_dnlc(vp)) {
		mutex_exit(&vp->v_lock);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc);
		return (KMEM_CBRC_LATER);
	}

	/*
	 * The znode is known and in a valid state to move. We're holding the
	 * locks needed to execute the critical section.
	 */
	zfs_znode_move_impl(ozp, nzp);
	mutex_exit(&vp->v_lock);
	ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);

	/* Replace the old znode with the new one on the per-fs znode list. */
	list_link_replace(&ozp->z_link_node, &nzp->z_link_node);
	mutex_exit(&zfsvfs->z_znodes_lock);
	ZFS_EXIT(zfsvfs);

	return (KMEM_CBRC_YES);
}
375 
/*
 * Initialize the znode subsystem: create the znode kmem cache and
 * register zfs_znode_move() so kmem may relocate idle znodes under
 * memory pressure.
 */
void
zfs_znode_init(void)
{
	/*
	 * Initialize zcache
	 */
	rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL);
	ASSERT(znode_cache == NULL);
	znode_cache = kmem_cache_create("zfs_znode_cache",
	    sizeof (znode_t), 0, zfs_znode_cache_constructor,
	    zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
	kmem_cache_set_move(znode_cache, zfs_znode_move);
}
389 
/*
 * Tear down the znode subsystem: release the vnode operation tables,
 * destroy the znode kmem cache, and destroy zfsvfs_lock.  Inverse of
 * zfs_znode_init() plus zfs_create_op_tables().
 */
void
zfs_znode_fini(void)
{
	/*
	 * Cleanup vfs & vnode ops
	 */
	zfs_remove_op_tables();

	/*
	 * Cleanup zcache
	 */
	if (znode_cache)
		kmem_cache_destroy(znode_cache);
	znode_cache = NULL;
	rw_destroy(&zfsvfs_lock);
}
406 
/*
 * Vnode operation tables, one per vnode flavor.  Populated by
 * zfs_create_op_tables() and released by zfs_remove_op_tables();
 * zfs_znode_alloc() selects among them based on vnode type and flags.
 */
struct vnodeops *zfs_dvnodeops;		/* directories */
struct vnodeops *zfs_fvnodeops;		/* regular files and devices */
struct vnodeops *zfs_symvnodeops;	/* symbolic links */
struct vnodeops *zfs_xdvnodeops;	/* extended-attribute directories */
struct vnodeops *zfs_evnodeops;		/* fallback for other vnode types */
struct vnodeops *zfs_sharevnodeops;	/* objects under the shares dir */
413 
414 void
415 zfs_remove_op_tables()
416 {
417 	/*
418 	 * Remove vfs ops
419 	 */
420 	ASSERT(zfsfstype);
421 	(void) vfs_freevfsops_by_type(zfsfstype);
422 	zfsfstype = 0;
423 
424 	/*
425 	 * Remove vnode ops
426 	 */
427 	if (zfs_dvnodeops)
428 		vn_freevnodeops(zfs_dvnodeops);
429 	if (zfs_fvnodeops)
430 		vn_freevnodeops(zfs_fvnodeops);
431 	if (zfs_symvnodeops)
432 		vn_freevnodeops(zfs_symvnodeops);
433 	if (zfs_xdvnodeops)
434 		vn_freevnodeops(zfs_xdvnodeops);
435 	if (zfs_evnodeops)
436 		vn_freevnodeops(zfs_evnodeops);
437 	if (zfs_sharevnodeops)
438 		vn_freevnodeops(zfs_sharevnodeops);
439 
440 	zfs_dvnodeops = NULL;
441 	zfs_fvnodeops = NULL;
442 	zfs_symvnodeops = NULL;
443 	zfs_xdvnodeops = NULL;
444 	zfs_evnodeops = NULL;
445 	zfs_sharevnodeops = NULL;
446 }
447 
448 extern const fs_operation_def_t zfs_dvnodeops_template[];
449 extern const fs_operation_def_t zfs_fvnodeops_template[];
450 extern const fs_operation_def_t zfs_xdvnodeops_template[];
451 extern const fs_operation_def_t zfs_symvnodeops_template[];
452 extern const fs_operation_def_t zfs_evnodeops_template[];
453 extern const fs_operation_def_t zfs_sharevnodeops_template[];
454 
455 int
456 zfs_create_op_tables()
457 {
458 	int error;
459 
460 	/*
461 	 * zfs_dvnodeops can be set if mod_remove() calls mod_installfs()
462 	 * due to a failure to remove the the 2nd modlinkage (zfs_modldrv).
463 	 * In this case we just return as the ops vectors are already set up.
464 	 */
465 	if (zfs_dvnodeops)
466 		return (0);
467 
468 	error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
469 	    &zfs_dvnodeops);
470 	if (error)
471 		return (error);
472 
473 	error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template,
474 	    &zfs_fvnodeops);
475 	if (error)
476 		return (error);
477 
478 	error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template,
479 	    &zfs_symvnodeops);
480 	if (error)
481 		return (error);
482 
483 	error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template,
484 	    &zfs_xdvnodeops);
485 	if (error)
486 		return (error);
487 
488 	error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template,
489 	    &zfs_evnodeops);
490 	if (error)
491 		return (error);
492 
493 	error = vn_make_ops(MNTTYPE_ZFS, zfs_sharevnodeops_template,
494 	    &zfs_sharevnodeops);
495 
496 	return (error);
497 }
498 
/*
 * Create the special "shares" directory object and record its object
 * number both in the master node (under ZFS_SHARES_DIR) and in
 * zfsvfs->z_shares_dir.  A throwaway znode is built around the new
 * object just long enough to run zfs_mknode(), then dismantled without
 * ever being inserted into the normal znode lists.  Returns the
 * zap_add() error code (0 on success).
 */
int
zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
{
	zfs_acl_ids_t acl_ids;
	vattr_t vattr;
	znode_t *sharezp;
	vnode_t *vp;
	znode_t *zp;
	int error;

	/* Read-only directory owned by the kernel credential. */
	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
	vattr.va_type = VDIR;
	vattr.va_mode = S_IFDIR|0555;
	vattr.va_uid = crgetuid(kcred);
	vattr.va_gid = crgetgid(kcred);

	/*
	 * Hand-initialize just enough of the znode for zfs_mknode();
	 * the cache constructor has already set up locks and the vnode.
	 */
	sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs));
	sharezp->z_moved = 0;
	sharezp->z_unlinked = 0;
	sharezp->z_atime_dirty = 0;
	sharezp->z_zfsvfs = zfsvfs;
	sharezp->z_is_sa = zfsvfs->z_use_sa;

	vp = ZTOV(sharezp);
	vn_reinit(vp);
	vp->v_type = VDIR;

	VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
	    kcred, NULL, &acl_ids));
	zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids);
	ASSERT3P(zp, ==, sharezp);
	ASSERT(!vn_in_dnlc(ZTOV(sharezp))); /* not valid to move */
	/* Record the shares directory in the master node and in core. */
	POINTER_INVALIDATE(&sharezp->z_zfsvfs);
	error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
	    ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
	zfsvfs->z_shares_dir = sharezp->z_id;

	/* Dismantle the temporary znode and return it to the cache. */
	zfs_acl_ids_free(&acl_ids);
	ZTOV(sharezp)->v_count = 0;
	sa_handle_destroy(sharezp->z_sa_hdl);
	kmem_cache_free(znode_cache, sharezp);

	return (error);
}
544 
/*
 * define a couple of values we need available
 * for both 64 and 32 bit environments.
 */
#ifndef NBITSMINOR64
#define	NBITSMINOR64	32	/* minor-number bits in a 64-bit dev_t */
#endif
#ifndef MAXMAJ64
#define	MAXMAJ64	0xffffffffUL	/* largest 64-bit major number */
#endif
#ifndef	MAXMIN64
#define	MAXMIN64	0xffffffffUL	/* largest 64-bit minor number */
#endif
558 
/*
 * Create special expldev for ZFS private use.
 * Can't use standard expldev since it doesn't do
 * what we want.  The standard expldev() takes a
 * dev32_t in LP64 and expands it to a long dev_t.
 * We need an interface that takes a dev32_t in ILP32
 * and expands it to a long dev_t.
 */
static uint64_t
zfs_expldev(dev_t dev)
{
#ifndef _LP64
	/* Split the 32-bit dev into major/minor and re-pack in 64-bit form. */
	major_t major = (major_t)dev >> NBITSMINOR32 & MAXMAJ32;
	return (((uint64_t)major << NBITSMINOR64) |
	    ((minor_t)dev & MAXMIN32));
#else
	/* LP64 dev_t is already the on-disk 64-bit representation. */
	return (dev);
#endif
}
578 
/*
 * Special cmpldev for ZFS private use.
 * Can't use standard cmpldev since it takes
 * a long dev_t and compresses it to dev32_t in
 * LP64.  We need to do a compaction of a long dev_t
 * to a dev32_t in ILP32.
 */
dev_t
zfs_cmpldev(uint64_t dev)
{
#ifndef _LP64
	minor_t minor = (minor_t)dev & MAXMIN64;
	major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;

	/* The 64-bit value may not be representable in 32 bits. */
	if (major > MAXMAJ32 || minor > MAXMIN32)
		return (NODEV32);

	return (((dev32_t)major << NBITSMINOR32) | minor);
#else
	/* LP64 dev_t holds the 64-bit value directly. */
	return (dev);
#endif
}
601 
/*
 * Attach a system-attribute (SA) handle to znode zp for object db.
 * If sa_hdl is NULL a new shared handle is created from the buffer;
 * otherwise the supplied handle is adopted and its user pointer is
 * redirected to zp.  Also records whether the object uses the SA
 * bonus layout and flags the root vnode.  Caller must hold the
 * per-object mutex (asserted below).
 */
static void
zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
    dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
{
	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));

	mutex_enter(&zp->z_lock);

	ASSERT(zp->z_sa_hdl == NULL);
	ASSERT(zp->z_acl_cached == NULL);
	if (sa_hdl == NULL) {
		VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
		    SA_HDL_SHARED, &zp->z_sa_hdl));
	} else {
		zp->z_sa_hdl = sa_hdl;
		sa_set_userp(sa_hdl, zp);
	}

	zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;

	/*
	 * Slap on VROOT if we are the root znode
	 */
	if (zp->z_id == zfsvfs->z_root)
		ZTOV(zp)->v_flag |= VROOT;

	mutex_exit(&zp->z_lock);
	vn_exists(ZTOV(zp));
}
632 
/*
 * Detach znode zp from the DMU by destroying its SA handle.  Safe only
 * when the caller holds the per-object mutex, or the znode is unlinked,
 * or the filesystem teardown lock is write-held (asserted below).
 */
void
zfs_znode_dmu_fini(znode_t *zp)
{
	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) ||
	    zp->z_unlinked ||
	    RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock));

	sa_handle_destroy(zp->z_sa_hdl);
	zp->z_sa_hdl = NULL;
}
643 
/*
 * Construct a new znode/vnode and initialize.
 *
 * This does not do a call to dmu_set_user() that is
 * up to the caller to do, in case you don't want to
 * return the znode
 *
 * Returns the new znode, or NULL if the object's SA attributes cannot
 * be read (or its generation is 0, i.e. the object is not a valid
 * ZPL file).  On success the znode is on z_all_znodes and holds a
 * reference on the vfs.
 */
static znode_t *
zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
    dmu_object_type_t obj_type, sa_handle_t *hdl)
{
	znode_t	*zp;
	vnode_t *vp;
	uint64_t mode;
	uint64_t parent;
	sa_bulk_attr_t bulk[9];
	int count = 0;

	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);

	ASSERT(zp->z_dirlocks == NULL);
	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
	zp->z_moved = 0;

	/*
	 * Defer setting z_zfsvfs until the znode is ready to be a candidate for
	 * the zfs_znode_move() callback.
	 */
	zp->z_sa_hdl = NULL;
	zp->z_unlinked = 0;
	zp->z_atime_dirty = 0;
	zp->z_mapcnt = 0;
	zp->z_id = db->db_object;
	zp->z_blksz = blksz;
	zp->z_seq = 0x7A4653;
	zp->z_sync_cnt = 0;

	vp = ZTOV(zp);
	vn_reinit(vp);

	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);

	/* Fetch the cached attributes from the object's SA in one pass. */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
	    &zp->z_links, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
	    &zp->z_atime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
	    &zp->z_uid, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
	    &zp->z_gid, 8);

	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0) {
		/* Only destroy the handle if we created it above. */
		if (hdl == NULL)
			sa_handle_destroy(zp->z_sa_hdl);
		kmem_cache_free(znode_cache, zp);
		return (NULL);
	}

	zp->z_mode = mode;
	vp->v_vfsp = zfsvfs->z_parent->z_vfs;

	vp->v_type = IFTOVT((mode_t)mode);

	/* Select the vnode ops table that matches the vnode type. */
	switch (vp->v_type) {
	case VDIR:
		if (zp->z_pflags & ZFS_XATTR) {
			vn_setops(vp, zfs_xdvnodeops);
			vp->v_flag |= V_XATTRDIR;
		} else {
			vn_setops(vp, zfs_dvnodeops);
		}
		zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
		break;
	case VBLK:
	case VCHR:
		{
			uint64_t rdev;
			VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zfsvfs),
			    &rdev, sizeof (rdev)) == 0);

			vp->v_rdev = zfs_cmpldev(rdev);
		}
		/*FALLTHROUGH*/
	case VFIFO:
	case VSOCK:
	case VDOOR:
		vn_setops(vp, zfs_fvnodeops);
		break;
	case VREG:
		vp->v_flag |= VMODSORT;
		if (parent == zfsvfs->z_shares_dir) {
			ASSERT(zp->z_uid == 0 && zp->z_gid == 0);
			vn_setops(vp, zfs_sharevnodeops);
		} else {
			vn_setops(vp, zfs_fvnodeops);
		}
		break;
	case VLNK:
		vn_setops(vp, zfs_symvnodeops);
		break;
	default:
		vn_setops(vp, zfs_evnodeops);
		break;
	}

	mutex_enter(&zfsvfs->z_znodes_lock);
	list_insert_tail(&zfsvfs->z_all_znodes, zp);
	membar_producer();
	/*
	 * Everything else must be valid before assigning z_zfsvfs makes the
	 * znode eligible for zfs_znode_move().
	 */
	zp->z_zfsvfs = zfsvfs;
	mutex_exit(&zfsvfs->z_znodes_lock);

	VFS_HOLD(zfsvfs->z_vfs);
	return (zp);
}
769 
/* Zero-filled templates for legacy (DMU_OT_ZNODE) bonus-buffer fields. */
static uint64_t empty_xattr;
static uint64_t pad[4];
static zfs_acl_phys_t acl_phys;
/*
 * Create a new DMU object to hold a zfs znode.
 *
 *	IN:	dzp	- parent directory for new znode
 *		vap	- file attributes for new znode
 *		tx	- dmu transaction id for zap operations
 *		cr	- credentials of caller
 *		flag	- flags:
 *			  IS_ROOT_NODE	- new object will be root
 *			  IS_XATTR	- new object is an attribute
 *		bonuslen - length of bonus buffer
 *		setaclp  - File/Dir initial ACL
 *		fuidp	 - Tracks fuid allocation.
 *
 *	OUT:	zpp	- allocated znode
 *
 */
void
zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
    uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
{
	uint64_t	crtime[2], atime[2], mtime[2], ctime[2];
	uint64_t	mode, size, links, parent, pflags;
	uint64_t	dzp_pflags = 0;
	uint64_t	rdev = 0;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	dmu_buf_t	*db;
	timestruc_t	now;
	uint64_t	gen, obj;
	int		bonuslen;
	int		dnodesize;
	sa_handle_t	*sa_hdl;
	dmu_object_type_t obj_type;
	sa_bulk_attr_t	*sa_attrs;
	int		cnt = 0;
	zfs_acl_locator_cb_t locate = { 0 };

	ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));

	/*
	 * During replay, recreate the object with the values recorded in
	 * the log record; otherwise pick fresh values.
	 */
	if (zfsvfs->z_replay) {
		obj = vap->va_nodeid;
		now = vap->va_ctime;		/* see zfs_replay_create() */
		gen = vap->va_nblocks;		/* ditto */
		dnodesize = vap->va_fsid;	/* ditto */
	} else {
		obj = 0;
		gethrestime(&now);
		gen = dmu_tx_get_txg(tx);
		dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
	}

	if (dnodesize == 0)
		dnodesize = DNODE_MIN_SIZE;

	obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
	bonuslen = (obj_type == DMU_OT_SA) ?
	    DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;

	/*
	 * Create a new DMU object.
	 */
	/*
	 * There's currently no mechanism for pre-reading the blocks that will
	 * be needed to allocate a new object, so we accept the small chance
	 * that there will be an i/o error and we will fail one of the
	 * assertions below.
	 */
	if (vap->va_type == VDIR) {
		if (zfsvfs->z_replay) {
			VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
			    obj_type, bonuslen, dnodesize, tx));
		} else {
			obj = zap_create_norm_dnsize(zfsvfs->z_os,
			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
			    obj_type, bonuslen, dnodesize, tx);
		}
	} else {
		if (zfsvfs->z_replay) {
			VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    obj_type, bonuslen, dnodesize, tx));
		} else {
			obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    obj_type, bonuslen, dnodesize, tx);
		}
	}

	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
	VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));

	/*
	 * If this is the root, fix up the half-initialized parent pointer
	 * to reference the just-allocated physical data area.
	 */
	if (flag & IS_ROOT_NODE) {
		dzp->z_id = obj;
	} else {
		dzp_pflags = dzp->z_pflags;
	}

	/*
	 * If parent is an xattr, so am I.
	 */
	if (dzp_pflags & ZFS_XATTR) {
		flag |= IS_XATTR;
	}

	if (zfsvfs->z_use_fuids)
		pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
	else
		pflags = 0;

	if (vap->va_type == VDIR) {
		size = 2;		/* contents ("." and "..") */
		links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
	} else {
		size = links = 0;
	}

	if (vap->va_type == VBLK || vap->va_type == VCHR) {
		rdev = zfs_expldev(vap->va_rdev);
	}

	parent = dzp->z_id;
	mode = acl_ids->z_mode;
	if (flag & IS_XATTR)
		pflags |= ZFS_XATTR;

	/*
	 * No execs denied will be determined when zfs_mode_compute() is called.
	 */
	pflags |= acl_ids->z_aclp->z_hints &
	    (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
	    ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);

	ZFS_TIME_ENCODE(&now, crtime);
	ZFS_TIME_ENCODE(&now, ctime);

	if (vap->va_mask & AT_ATIME) {
		ZFS_TIME_ENCODE(&vap->va_atime, atime);
	} else {
		ZFS_TIME_ENCODE(&now, atime);
	}

	if (vap->va_mask & AT_MTIME) {
		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
	} else {
		ZFS_TIME_ENCODE(&now, mtime);
	}

	/* Now add in all of the "SA" attributes */
	VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
	    &sa_hdl));

	/*
	 * Setup the array of attributes to be replaced/set on the new file
	 *
	 * order for  DMU_OT_ZNODE is critical since it needs to be constructed
	 * in the old znode_phys_t format.  Don't change this ordering
	 */
	sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);

	if (obj_type == DMU_OT_ZNODE) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
		    NULL, &atime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
		    NULL, &mtime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
		    NULL, &ctime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
		    NULL, &crtime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
		    NULL, &gen, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
		    NULL, &mode, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
		    NULL, &size, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
		    NULL, &parent, 8);
	} else {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
		    NULL, &mode, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
		    NULL, &size, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
		    NULL, &gen, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
		    NULL, &acl_ids->z_fuid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
		    NULL, &acl_ids->z_fgid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
		    NULL, &parent, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
		    NULL, &pflags, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
		    NULL, &atime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
		    NULL, &mtime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
		    NULL, &ctime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
		    NULL, &crtime, 16);
	}

	SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);

	if (obj_type == DMU_OT_ZNODE) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
		    &empty_xattr, 8);
	}
	if (obj_type == DMU_OT_ZNODE ||
	    (vap->va_type == VBLK || vap->va_type == VCHR)) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
		    NULL, &rdev, 8);

	}
	if (obj_type == DMU_OT_ZNODE) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
		    NULL, &pflags, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
		    &acl_ids->z_fuid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
		    &acl_ids->z_fgid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
		    sizeof (uint64_t) * 4);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
		    &acl_phys, sizeof (zfs_acl_phys_t));
	} else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
		    &acl_ids->z_aclp->z_acl_count, 8);
		locate.cb_aclp = acl_ids->z_aclp;
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
		    zfs_acl_data_locator, &locate,
		    acl_ids->z_aclp->z_acl_bytes);
		mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
		    acl_ids->z_fuid, acl_ids->z_fgid);
	}

	VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);

	if (!(flag & IS_ROOT_NODE)) {
		*zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
		ASSERT(*zpp != NULL);
	} else {
		/*
		 * If we are creating the root node, the "parent" we
		 * passed in is the znode for the root.
		 */
		*zpp = dzp;

		(*zpp)->z_sa_hdl = sa_hdl;
	}

	(*zpp)->z_pflags = pflags;
	(*zpp)->z_mode = mode;
	(*zpp)->z_dnodesize = dnodesize;

	if (vap->va_mask & AT_XVATTR)
		zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx);

	if (obj_type == DMU_OT_ZNODE ||
	    acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
		VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
	}
	kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
}
1042 
1043 /*
1044  * Update in-core attributes.  It is assumed the caller will be doing an
1045  * sa_bulk_update to push the changes out.
1046  */
1047 void
1048 zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
1049 {
1050 	xoptattr_t *xoap;
1051 
1052 	xoap = xva_getxoptattr(xvap);
1053 	ASSERT(xoap);
1054 
1055 	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
1056 		uint64_t times[2];
1057 		ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
1058 		(void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
1059 		    &times, sizeof (times), tx);
1060 		XVA_SET_RTN(xvap, XAT_CREATETIME);
1061 	}
1062 	if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
1063 		ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
1064 		    zp->z_pflags, tx);
1065 		XVA_SET_RTN(xvap, XAT_READONLY);
1066 	}
1067 	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
1068 		ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
1069 		    zp->z_pflags, tx);
1070 		XVA_SET_RTN(xvap, XAT_HIDDEN);
1071 	}
1072 	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
1073 		ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
1074 		    zp->z_pflags, tx);
1075 		XVA_SET_RTN(xvap, XAT_SYSTEM);
1076 	}
1077 	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
1078 		ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
1079 		    zp->z_pflags, tx);
1080 		XVA_SET_RTN(xvap, XAT_ARCHIVE);
1081 	}
1082 	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
1083 		ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
1084 		    zp->z_pflags, tx);
1085 		XVA_SET_RTN(xvap, XAT_IMMUTABLE);
1086 	}
1087 	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
1088 		ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
1089 		    zp->z_pflags, tx);
1090 		XVA_SET_RTN(xvap, XAT_NOUNLINK);
1091 	}
1092 	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
1093 		ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
1094 		    zp->z_pflags, tx);
1095 		XVA_SET_RTN(xvap, XAT_APPENDONLY);
1096 	}
1097 	if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
1098 		ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
1099 		    zp->z_pflags, tx);
1100 		XVA_SET_RTN(xvap, XAT_NODUMP);
1101 	}
1102 	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
1103 		ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
1104 		    zp->z_pflags, tx);
1105 		XVA_SET_RTN(xvap, XAT_OPAQUE);
1106 	}
1107 	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
1108 		ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
1109 		    xoap->xoa_av_quarantined, zp->z_pflags, tx);
1110 		XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
1111 	}
1112 	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
1113 		ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
1114 		    zp->z_pflags, tx);
1115 		XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
1116 	}
1117 	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
1118 		zfs_sa_set_scanstamp(zp, xvap, tx);
1119 		XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
1120 	}
1121 	if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
1122 		ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
1123 		    zp->z_pflags, tx);
1124 		XVA_SET_RTN(xvap, XAT_REPARSE);
1125 	}
1126 	if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
1127 		ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
1128 		    zp->z_pflags, tx);
1129 		XVA_SET_RTN(xvap, XAT_OFFLINE);
1130 	}
1131 	if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
1132 		ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
1133 		    zp->z_pflags, tx);
1134 		XVA_SET_RTN(xvap, XAT_SPARSE);
1135 	}
1136 }
1137 
/*
 * Look up a znode by object number, returning a held znode in *zpp.
 *
 * If an in-core znode already exists (found via the SA handle's user
 * data), a new vnode hold is taken on it; otherwise a fresh znode is
 * constructed from the on-disk object.  Returns 0 on success, ENOENT
 * if the object is unlinked or mid-create, or another error code.
 */
int
zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
{
	dmu_object_info_t doi;
	dmu_buf_t	*db;
	znode_t		*zp;
	int err;
	sa_handle_t	*hdl;

	*zpp = NULL;

	/* Serialize against concurrent zget/zinactive on this object. */
	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);

	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
	if (err) {
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (err);
	}

	/*
	 * Sanity check the bonus buffer: it must be either an SA spill
	 * or a legacy znode_phys_t of at least full size.
	 */
	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_SA &&
	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
		sa_buf_rele(db, NULL);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (SET_ERROR(EINVAL));
	}

	hdl = dmu_buf_get_user(db);
	if (hdl != NULL) {
		zp  = sa_get_userdata(hdl);


		/*
		 * Since "SA" does immediate eviction we
		 * should never find a sa handle that doesn't
		 * know about the znode.
		 */

		ASSERT3P(zp, !=, NULL);

		mutex_enter(&zp->z_lock);
		ASSERT3U(zp->z_id, ==, obj_num);
		if (zp->z_unlinked) {
			/* Object is on the delete queue; don't revive it. */
			err = SET_ERROR(ENOENT);
		} else {
			VN_HOLD(ZTOV(zp));
			*zpp = zp;
			err = 0;
		}
		mutex_exit(&zp->z_lock);
		sa_buf_rele(db, NULL);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (err);
	}

	/*
	 * Not found create new znode/vnode
	 * but only if file exists.
	 *
	 * There is a small window where zfs_vget() could
	 * find this object while a file create is still in
	 * progress.  This is checked for in zfs_znode_alloc()
	 *
	 * if zfs_znode_alloc() fails it will drop the hold on the
	 * bonus buffer.
	 */
	zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
	    doi.doi_bonus_type, NULL);
	if (zp == NULL) {
		err = SET_ERROR(ENOENT);
	} else {
		*zpp = zp;
	}
	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
	return (err);
}
1216 
/*
 * Reattach an existing in-core znode to its on-disk object and reload
 * the cached attributes (used after a rollback/receive has invalidated
 * the old SA handle).  Returns 0 on success, EIO if the attributes
 * can't be read or the generation number no longer matches (i.e. the
 * object was reallocated), or another error code.
 */
int
zfs_rezget(znode_t *zp)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	dmu_object_info_t doi;
	dmu_buf_t *db;
	uint64_t obj_num = zp->z_id;
	uint64_t mode;
	sa_bulk_attr_t bulk[8];
	int err;
	int count = 0;
	uint64_t gen;

	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);

	/* Drop any cached ACL; it may be stale after the rollback. */
	mutex_enter(&zp->z_acl_lock);
	if (zp->z_acl_cached) {
		zfs_acl_free(zp->z_acl_cached);
		zp->z_acl_cached = NULL;
	}

	mutex_exit(&zp->z_acl_lock);
	ASSERT(zp->z_sa_hdl == NULL);
	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
	if (err) {
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (err);
	}

	/*
	 * Same bonus-buffer validation as zfs_zget(): SA spill, or a
	 * full-size legacy znode_phys_t.  (The inner == DMU_OT_ZNODE
	 * clause is redundant given the preceding != test, but harmless.)
	 */
	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_SA &&
	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
		sa_buf_rele(db, NULL);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (SET_ERROR(EINVAL));
	}

	zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);

	/* reload cached values */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
	    &gen, sizeof (gen));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, sizeof (zp->z_size));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
	    &zp->z_links, sizeof (zp->z_links));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, sizeof (zp->z_pflags));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
	    &zp->z_atime, sizeof (zp->z_atime));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
	    &zp->z_uid, sizeof (zp->z_uid));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
	    &zp->z_gid, sizeof (zp->z_gid));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
	    &mode, sizeof (mode));

	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
		zfs_znode_dmu_fini(zp);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (SET_ERROR(EIO));
	}

	zp->z_mode = mode;

	/* Different generation => same object number, different file. */
	if (gen != zp->z_gen) {
		zfs_znode_dmu_fini(zp);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (SET_ERROR(EIO));
	}

	zp->z_blksz = doi.doi_data_block_size;

	/*
	 * If the file has zero links, then it has been unlinked on the send
	 * side and it must be in the received unlinked set.
	 * We call zfs_znode_dmu_fini() now to prevent any accesses to the
	 * stale data and to prevent automatical removal of the file in
	 * zfs_zinactive().  The file will be removed either when it is removed
	 * on the send side and the next incremental stream is received or
	 * when the unlinked set gets processed.
	 */
	zp->z_unlinked = (zp->z_links == 0);
	if (zp->z_unlinked)
		zfs_znode_dmu_fini(zp);

	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);

	return (0);
}
1309 
1310 void
1311 zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
1312 {
1313 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1314 	objset_t *os = zfsvfs->z_os;
1315 	uint64_t obj = zp->z_id;
1316 	uint64_t acl_obj = zfs_external_acl(zp);
1317 
1318 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
1319 	if (acl_obj) {
1320 		VERIFY(!zp->z_is_sa);
1321 		VERIFY(0 == dmu_object_free(os, acl_obj, tx));
1322 	}
1323 	VERIFY(0 == dmu_object_free(os, obj, tx));
1324 	zfs_znode_dmu_fini(zp);
1325 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
1326 	zfs_znode_free(zp);
1327 }
1328 
/*
 * Vnode inactive callback for znodes: called when the last reference
 * is being released.  Either re-arms the znode (if a new hold or cached
 * pages appeared), removes an unlinked file from disk, or tears down
 * and frees the in-core znode.
 */
void
zfs_zinactive(znode_t *zp)
{
	vnode_t	*vp = ZTOV(zp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	uint64_t z_id = zp->z_id;

	ASSERT(zp->z_sa_hdl);

	/*
	 * Don't allow a zfs_zget() while were trying to release this znode
	 */
	ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);

	/* z_lock before v_lock; matches the order used by zfs_zget(). */
	mutex_enter(&zp->z_lock);
	mutex_enter(&vp->v_lock);
	VN_RELE_LOCKED(vp);
	if (vp->v_count > 0 || vn_has_cached_data(vp)) {
		/*
		 * If the hold count is greater than zero, somebody has
		 * obtained a new reference on this znode while we were
		 * processing it here, so we are done.  If we still have
		 * mapped pages then we are also done, since we don't
		 * want to inactivate the znode until the pages get pushed.
		 *
		 * XXX - if vn_has_cached_data(vp) is true, but count == 0,
		 * this seems like it would leave the znode hanging with
		 * no chance to go inactive...
		 */
		mutex_exit(&vp->v_lock);
		mutex_exit(&zp->z_lock);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * If this was the last reference to a file with no links, remove
	 * the file from the file system unless the file system is mounted
	 * read-only.  That can happen, for example, if the file system was
	 * originally read-write, the file was opened, then unlinked and
	 * the file system was made read-only before the file was finally
	 * closed.  The file will remain in the unlinked set.
	 */
	if (zp->z_unlinked) {
		ASSERT(!zfsvfs->z_issnap);
		if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0) {
			/* zfs_rmnode() frees the znode; drop locks first. */
			mutex_exit(&zp->z_lock);
			ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
			zfs_rmnode(zp);
			return;
		}
	}

	mutex_exit(&zp->z_lock);
	zfs_znode_dmu_fini(zp);
	ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
	zfs_znode_free(zp);
}
1388 
/*
 * Final teardown of an in-core znode: unlink it from the per-fs znode
 * list, free its cached ACL, return it to the kmem cache, and release
 * the hold it contributed on the vfs.  The vnode must have no holds.
 */
void
zfs_znode_free(znode_t *zp)
{
	/* Save the zfsvfs pointer; zp is invalid after kmem_cache_free(). */
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	vn_invalid(ZTOV(zp));

	ASSERT(ZTOV(zp)->v_count == 0);

	mutex_enter(&zfsvfs->z_znodes_lock);
	POINTER_INVALIDATE(&zp->z_zfsvfs);
	list_remove(&zfsvfs->z_all_znodes, zp);
	mutex_exit(&zfsvfs->z_znodes_lock);

	if (zp->z_acl_cached) {
		zfs_acl_free(zp->z_acl_cached);
		zp->z_acl_cached = NULL;
	}

	kmem_cache_free(znode_cache, zp);

	VFS_RELE(zfsvfs->z_vfs);
}
1412 
1413 void
1414 zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
1415     uint64_t ctime[2], boolean_t have_tx)
1416 {
1417 	timestruc_t	now;
1418 
1419 	gethrestime(&now);
1420 
1421 	if (have_tx) {	/* will sa_bulk_update happen really soon? */
1422 		zp->z_atime_dirty = 0;
1423 		zp->z_seq++;
1424 	} else {
1425 		zp->z_atime_dirty = 1;
1426 	}
1427 
1428 	if (flag & AT_ATIME) {
1429 		ZFS_TIME_ENCODE(&now, zp->z_atime);
1430 	}
1431 
1432 	if (flag & AT_MTIME) {
1433 		ZFS_TIME_ENCODE(&now, mtime);
1434 		if (zp->z_zfsvfs->z_use_fuids) {
1435 			zp->z_pflags |= (ZFS_ARCHIVE |
1436 			    ZFS_AV_MODIFIED);
1437 		}
1438 	}
1439 
1440 	if (flag & AT_CTIME) {
1441 		ZFS_TIME_ENCODE(&now, ctime);
1442 		if (zp->z_zfsvfs->z_use_fuids)
1443 			zp->z_pflags |= ZFS_ARCHIVE;
1444 	}
1445 }
1446 
1447 /*
1448  * Grow the block size for a file.
1449  *
1450  *	IN:	zp	- znode of file to free data in.
1451  *		size	- requested block size
1452  *		tx	- open transaction.
1453  *
1454  * NOTE: this function assumes that the znode is write locked.
1455  */
1456 void
1457 zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
1458 {
1459 	int		error;
1460 	u_longlong_t	dummy;
1461 
1462 	if (size <= zp->z_blksz)
1463 		return;
1464 	/*
1465 	 * If the file size is already greater than the current blocksize,
1466 	 * we will not grow.  If there is more than one block in a file,
1467 	 * the blocksize cannot change.
1468 	 */
1469 	if (zp->z_blksz && zp->z_size > zp->z_blksz)
1470 		return;
1471 
1472 	error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
1473 	    size, 0, tx);
1474 
1475 	if (error == ENOTSUP)
1476 		return;
1477 	ASSERT0(error);
1478 
1479 	/* What blocksize did we actually get? */
1480 	dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
1481 }
1482 
1483 /*
1484  * This is a dummy interface used when pvn_vplist_dirty() should *not*
1485  * be calling back into the fs for a putpage().  E.g.: when truncating
1486  * a file, the pages being "thrown away* don't need to be written out.
1487  */
1488 /* ARGSUSED */
1489 static int
1490 zfs_no_putpage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
1491     int flags, cred_t *cr)
1492 {
1493 	ASSERT(0);
1494 	return (0);
1495 }
1496 
1497 /*
1498  * Increase the file length
1499  *
1500  *	IN:	zp	- znode of file to free data in.
1501  *		end	- new end-of-file
1502  *
1503  *	RETURN:	0 on success, error code on failure
1504  */
1505 static int
1506 zfs_extend(znode_t *zp, uint64_t end)
1507 {
1508 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1509 	dmu_tx_t *tx;
1510 	locked_range_t *lr;
1511 	uint64_t newblksz;
1512 	int error;
1513 
1514 	/*
1515 	 * We will change zp_size, lock the whole file.
1516 	 */
1517 	lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
1518 
1519 	/*
1520 	 * Nothing to do if file already at desired length.
1521 	 */
1522 	if (end <= zp->z_size) {
1523 		rangelock_exit(lr);
1524 		return (0);
1525 	}
1526 	tx = dmu_tx_create(zfsvfs->z_os);
1527 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1528 	zfs_sa_upgrade_txholds(tx, zp);
1529 	if (end > zp->z_blksz &&
1530 	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
1531 		/*
1532 		 * We are growing the file past the current block size.
1533 		 */
1534 		if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
1535 			/*
1536 			 * File's blocksize is already larger than the
1537 			 * "recordsize" property.  Only let it grow to
1538 			 * the next power of 2.
1539 			 */
1540 			ASSERT(!ISP2(zp->z_blksz));
1541 			newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
1542 		} else {
1543 			newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
1544 		}
1545 		dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
1546 	} else {
1547 		newblksz = 0;
1548 	}
1549 
1550 	error = dmu_tx_assign(tx, TXG_WAIT);
1551 	if (error) {
1552 		dmu_tx_abort(tx);
1553 		rangelock_exit(lr);
1554 		return (error);
1555 	}
1556 
1557 	if (newblksz)
1558 		zfs_grow_blocksize(zp, newblksz, tx);
1559 
1560 	zp->z_size = end;
1561 
1562 	VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs),
1563 	    &zp->z_size, sizeof (zp->z_size), tx));
1564 
1565 	rangelock_exit(lr);
1566 
1567 	dmu_tx_commit(tx);
1568 
1569 	return (0);
1570 }
1571 
1572 /*
1573  * Free space in a file.
1574  *
1575  *	IN:	zp	- znode of file to free data in.
1576  *		off	- start of section to free.
1577  *		len	- length of section to free.
1578  *
1579  *	RETURN:	0 on success, error code on failure
1580  */
1581 static int
1582 zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
1583 {
1584 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1585 	locked_range_t *lr;
1586 	int error;
1587 
1588 	/*
1589 	 * Lock the range being freed.
1590 	 */
1591 	lr = rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);
1592 
1593 	/*
1594 	 * Nothing to do if file already at desired length.
1595 	 */
1596 	if (off >= zp->z_size) {
1597 		rangelock_exit(lr);
1598 		return (0);
1599 	}
1600 
1601 	if (off + len > zp->z_size)
1602 		len = zp->z_size - off;
1603 
1604 	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
1605 
1606 	rangelock_exit(lr);
1607 
1608 	return (error);
1609 }
1610 
1611 /*
1612  * Truncate a file
1613  *
1614  *	IN:	zp	- znode of file to free data in.
1615  *		end	- new end-of-file.
1616  *
1617  *	RETURN:	0 on success, error code on failure
1618  */
1619 static int
1620 zfs_trunc(znode_t *zp, uint64_t end)
1621 {
1622 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1623 	vnode_t *vp = ZTOV(zp);
1624 	dmu_tx_t *tx;
1625 	locked_range_t *lr;
1626 	int error;
1627 	sa_bulk_attr_t bulk[2];
1628 	int count = 0;
1629 
1630 	/*
1631 	 * We will change zp_size, lock the whole file.
1632 	 */
1633 	lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
1634 
1635 	/*
1636 	 * Nothing to do if file already at desired length.
1637 	 */
1638 	if (end >= zp->z_size) {
1639 		rangelock_exit(lr);
1640 		return (0);
1641 	}
1642 
1643 	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
1644 	    DMU_OBJECT_END);
1645 	if (error) {
1646 		rangelock_exit(lr);
1647 		return (error);
1648 	}
1649 	tx = dmu_tx_create(zfsvfs->z_os);
1650 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1651 	zfs_sa_upgrade_txholds(tx, zp);
1652 	dmu_tx_mark_netfree(tx);
1653 	error = dmu_tx_assign(tx, TXG_WAIT);
1654 	if (error) {
1655 		dmu_tx_abort(tx);
1656 		rangelock_exit(lr);
1657 		return (error);
1658 	}
1659 
1660 	zp->z_size = end;
1661 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
1662 	    NULL, &zp->z_size, sizeof (zp->z_size));
1663 
1664 	if (end == 0) {
1665 		zp->z_pflags &= ~ZFS_SPARSE;
1666 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
1667 		    NULL, &zp->z_pflags, 8);
1668 	}
1669 	VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
1670 
1671 	dmu_tx_commit(tx);
1672 
1673 	/*
1674 	 * Clear any mapped pages in the truncated region.  This has to
1675 	 * happen outside of the transaction to avoid the possibility of
1676 	 * a deadlock with someone trying to push a page that we are
1677 	 * about to invalidate.
1678 	 */
1679 	if (vn_has_cached_data(vp)) {
1680 		page_t *pp;
1681 		uint64_t start = end & PAGEMASK;
1682 		int poff = end & PAGEOFFSET;
1683 
1684 		if (poff != 0 && (pp = page_lookup(vp, start, SE_SHARED))) {
1685 			/*
1686 			 * We need to zero a partial page.
1687 			 */
1688 			pagezero(pp, poff, PAGESIZE - poff);
1689 			start += PAGESIZE;
1690 			page_unlock(pp);
1691 		}
1692 		error = pvn_vplist_dirty(vp, start, zfs_no_putpage,
1693 		    B_INVAL | B_TRUNC, NULL);
1694 		ASSERT(error == 0);
1695 	}
1696 
1697 	rangelock_exit(lr);
1698 
1699 	return (0);
1700 }
1701 
1702 /*
1703  * Free space in a file
1704  *
1705  *	IN:	zp	- znode of file to free data in.
1706  *		off	- start of range
1707  *		len	- end of range (0 => EOF)
1708  *		flag	- current file open mode flags.
1709  *		log	- TRUE if this action should be logged
1710  *
1711  *	RETURN:	0 on success, error code on failure
1712  */
1713 int
1714 zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
1715 {
1716 	vnode_t *vp = ZTOV(zp);
1717 	dmu_tx_t *tx;
1718 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1719 	zilog_t *zilog = zfsvfs->z_log;
1720 	uint64_t mode;
1721 	uint64_t mtime[2], ctime[2];
1722 	sa_bulk_attr_t bulk[3];
1723 	int count = 0;
1724 	int error;
1725 
1726 	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
1727 	    sizeof (mode))) != 0)
1728 		return (error);
1729 
1730 	if (off > zp->z_size) {
1731 		error =  zfs_extend(zp, off+len);
1732 		if (error == 0 && log)
1733 			goto log;
1734 		else
1735 			return (error);
1736 	}
1737 
1738 	/*
1739 	 * Check for any locks in the region to be freed.
1740 	 */
1741 
1742 	if (MANDLOCK(vp, (mode_t)mode)) {
1743 		uint64_t length = (len ? len : zp->z_size - off);
1744 		if (error = chklock(vp, FWRITE, off, length, flag, NULL))
1745 			return (error);
1746 	}
1747 
1748 	if (len == 0) {
1749 		error = zfs_trunc(zp, off);
1750 	} else {
1751 		if ((error = zfs_free_range(zp, off, len)) == 0 &&
1752 		    off + len > zp->z_size)
1753 			error = zfs_extend(zp, off+len);
1754 	}
1755 	if (error || !log)
1756 		return (error);
1757 log:
1758 	tx = dmu_tx_create(zfsvfs->z_os);
1759 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1760 	zfs_sa_upgrade_txholds(tx, zp);
1761 	error = dmu_tx_assign(tx, TXG_WAIT);
1762 	if (error) {
1763 		dmu_tx_abort(tx);
1764 		return (error);
1765 	}
1766 
1767 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
1768 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
1769 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
1770 	    NULL, &zp->z_pflags, 8);
1771 	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
1772 	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1773 	ASSERT(error == 0);
1774 
1775 	zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
1776 
1777 	dmu_tx_commit(tx);
1778 	return (0);
1779 }
1780 
/*
 * Create a brand-new ZPL filesystem in 'os' inside transaction 'tx':
 * master node, version and zpl properties, SA registration object
 * (for SA-capable versions), unlinked set, root directory, and shares
 * directory.  A minimal throw-away znode/zfsvfs is constructed just so
 * zfs_mknode() can build the root, then torn down again.
 */
void
zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
{
	uint64_t	moid, obj, sa_obj, version;
	uint64_t	sense = ZFS_CASE_SENSITIVE;
	uint64_t	norm = 0;
	nvpair_t	*elem;
	int		error;
	int		i;
	znode_t		*rootzp = NULL;
	zfsvfs_t	*zfsvfs;
	vnode_t		*vp;
	vattr_t		vattr;
	znode_t		*zp;
	zfs_acl_ids_t	acl_ids;

	/*
	 * First attempt to create master node.
	 */
	/*
	 * In an empty objset, there are no blocks to read and thus
	 * there can be no i/o errors (which we assert below).
	 */
	moid = MASTER_NODE_OBJ;
	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	/*
	 * Set starting attributes.
	 */
	version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
	elem = NULL;
	while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
		/* For the moment we expect all zpl props to be uint64_ts */
		uint64_t val;
		char *name;

		ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
		VERIFY(nvpair_value_uint64(elem, &val) == 0);
		name = nvpair_name(elem);
		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
			/* Caller may request an older on-disk version. */
			if (val < version)
				version = val;
		} else {
			error = zap_update(os, moid, name, 8, 1, &val, tx);
		}
		ASSERT(error == 0);
		/* Remember normalization/case props for the bootstrap vfs. */
		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
			norm = val;
		else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
			sense = val;
	}
	ASSERT(version != 0);
	error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);

	/*
	 * Create zap object used for SA attribute registration
	 */

	if (version >= ZPL_VERSION_SA) {
		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
		    DMU_OT_NONE, 0, tx);
		error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
		ASSERT(error == 0);
	} else {
		sa_obj = 0;
	}
	/*
	 * Create a delete queue.
	 */
	obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);

	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
	ASSERT(error == 0);

	/*
	 * Create root znode.  Create minimal znode/vnode/zfsvfs
	 * to allow zfs_mknode to work.
	 */
	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
	vattr.va_type = VDIR;
	vattr.va_mode = S_IFDIR|0755;
	vattr.va_uid = crgetuid(cr);
	vattr.va_gid = crgetgid(cr);

	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
	rootzp->z_moved = 0;
	rootzp->z_unlinked = 0;
	rootzp->z_atime_dirty = 0;
	rootzp->z_is_sa = USE_SA(version, os);

	vp = ZTOV(rootzp);
	vn_reinit(vp);
	vp->v_type = VDIR;

	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
	zfsvfs->z_os = os;
	zfsvfs->z_parent = zfsvfs;
	zfsvfs->z_version = version;
	zfsvfs->z_use_fuids = USE_FUIDS(version, os);
	zfsvfs->z_use_sa = USE_SA(version, os);
	zfsvfs->z_norm = norm;

	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
	    &zfsvfs->z_attr_table);

	ASSERT(error == 0);

	/*
	 * Fold case on file systems that are always or sometimes case
	 * insensitive.
	 */
	if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;

	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
	    offsetof(znode_t, z_link_node));

	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);

	rootzp->z_zfsvfs = zfsvfs;
	VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
	    cr, NULL, &acl_ids));
	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
	ASSERT3P(zp, ==, rootzp);
	ASSERT(!vn_in_dnlc(ZTOV(rootzp))); /* not valid to move */
	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
	ASSERT(error == 0);
	zfs_acl_ids_free(&acl_ids);
	POINTER_INVALIDATE(&rootzp->z_zfsvfs);

	/* Tear down the bootstrap root znode; only the objset state stays. */
	ZTOV(rootzp)->v_count = 0;
	sa_handle_destroy(rootzp->z_sa_hdl);
	kmem_cache_free(znode_cache, rootzp);

	/*
	 * Create shares directory
	 */

	error = zfs_create_share_dir(zfsvfs, tx);

	ASSERT(error == 0);

	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
	kmem_free(zfsvfs, sizeof (zfsvfs_t));
}
1932 
1933 #endif /* _KERNEL */
1934 
1935 static int
1936 zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
1937 {
1938 	uint64_t sa_obj = 0;
1939 	int error;
1940 
1941 	error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
1942 	if (error != 0 && error != ENOENT)
1943 		return (error);
1944 
1945 	error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
1946 	return (error);
1947 }
1948 
1949 static int
1950 zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
1951     dmu_buf_t **db, void *tag)
1952 {
1953 	dmu_object_info_t doi;
1954 	int error;
1955 
1956 	if ((error = sa_buf_hold(osp, obj, tag, db)) != 0)
1957 		return (error);
1958 
1959 	dmu_object_info_from_db(*db, &doi);
1960 	if ((doi.doi_bonus_type != DMU_OT_SA &&
1961 	    doi.doi_bonus_type != DMU_OT_ZNODE) ||
1962 	    doi.doi_bonus_type == DMU_OT_ZNODE &&
1963 	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
1964 		sa_buf_rele(*db, tag);
1965 		return (SET_ERROR(ENOTSUP));
1966 	}
1967 
1968 	error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
1969 	if (error != 0) {
1970 		sa_buf_rele(*db, tag);
1971 		return (error);
1972 	}
1973 
1974 	return (0);
1975 }
1976 
/*
 * Release an SA handle and bonus-buffer hold obtained from
 * zfs_grab_sa_handle().
 */
void
zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag)
{
	sa_handle_destroy(hdl);
	sa_buf_rele(db, tag);
}
1983 
1984 /*
1985  * Given an object number, return its parent object number and whether
1986  * or not the object is an extended attribute directory.
1987  */
1988 static int
1989 zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table,
1990     uint64_t *pobjp, int *is_xattrdir)
1991 {
1992 	uint64_t parent;
1993 	uint64_t pflags;
1994 	uint64_t mode;
1995 	uint64_t parent_mode;
1996 	sa_bulk_attr_t bulk[3];
1997 	sa_handle_t *sa_hdl;
1998 	dmu_buf_t *sa_db;
1999 	int count = 0;
2000 	int error;
2001 
2002 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
2003 	    &parent, sizeof (parent));
2004 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
2005 	    &pflags, sizeof (pflags));
2006 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
2007 	    &mode, sizeof (mode));
2008 
2009 	if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
2010 		return (error);
2011 
2012 	/*
2013 	 * When a link is removed its parent pointer is not changed and will
2014 	 * be invalid.  There are two cases where a link is removed but the
2015 	 * file stays around, when it goes to the delete queue and when there
2016 	 * are additional links.
2017 	 */
2018 	error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG);
2019 	if (error != 0)
2020 		return (error);
2021 
2022 	error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode));
2023 	zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
2024 	if (error != 0)
2025 		return (error);
2026 
2027 	*is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);
2028 
2029 	/*
2030 	 * Extended attributes can be applied to files, directories, etc.
2031 	 * Otherwise the parent must be a directory.
2032 	 */
2033 	if (!*is_xattrdir && !S_ISDIR(parent_mode))
2034 		return (SET_ERROR(EINVAL));
2035 
2036 	*pobjp = parent;
2037 
2038 	return (0);
2039 }
2040 
2041 /*
2042  * Given an object number, return some zpl level statistics
2043  */
2044 static int
2045 zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
2046     zfs_stat_t *sb)
2047 {
2048 	sa_bulk_attr_t bulk[4];
2049 	int count = 0;
2050 
2051 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
2052 	    &sb->zs_mode, sizeof (sb->zs_mode));
2053 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
2054 	    &sb->zs_gen, sizeof (sb->zs_gen));
2055 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
2056 	    &sb->zs_links, sizeof (sb->zs_links));
2057 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
2058 	    &sb->zs_ctime, sizeof (sb->zs_ctime));
2059 
2060 	return (sa_bulk_lookup(hdl, bulk, count));
2061 }
2062 
2063 static int
2064 zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
2065     sa_attr_type_t *sa_table, char *buf, int len)
2066 {
2067 	sa_handle_t *sa_hdl;
2068 	sa_handle_t *prevhdl = NULL;
2069 	dmu_buf_t *prevdb = NULL;
2070 	dmu_buf_t *sa_db = NULL;
2071 	char *path = buf + len - 1;
2072 	int error;
2073 
2074 	*path = '\0';
2075 	sa_hdl = hdl;
2076 
2077 	uint64_t deleteq_obj;
2078 	VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ,
2079 	    ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj));
2080 	error = zap_lookup_int(osp, deleteq_obj, obj);
2081 	if (error == 0) {
2082 		return (ESTALE);
2083 	} else if (error != ENOENT) {
2084 		return (error);
2085 	}
2086 	error = 0;
2087 
2088 	for (;;) {
2089 		uint64_t pobj;
2090 		char component[MAXNAMELEN + 2];
2091 		size_t complen;
2092 		int is_xattrdir;
2093 
2094 		if (prevdb)
2095 			zfs_release_sa_handle(prevhdl, prevdb, FTAG);
2096 
2097 		if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj,
2098 		    &is_xattrdir)) != 0)
2099 			break;
2100 
2101 		if (pobj == obj) {
2102 			if (path[0] != '/')
2103 				*--path = '/';
2104 			break;
2105 		}
2106 
2107 		component[0] = '/';
2108 		if (is_xattrdir) {
2109 			(void) sprintf(component + 1, "<xattrdir>");
2110 		} else {
2111 			error = zap_value_search(osp, pobj, obj,
2112 			    ZFS_DIRENT_OBJ(-1ULL), component + 1);
2113 			if (error != 0)
2114 				break;
2115 		}
2116 
2117 		complen = strlen(component);
2118 		path -= complen;
2119 		ASSERT(path >= buf);
2120 		bcopy(component, path, complen);
2121 		obj = pobj;
2122 
2123 		if (sa_hdl != hdl) {
2124 			prevhdl = sa_hdl;
2125 			prevdb = sa_db;
2126 		}
2127 		error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
2128 		if (error != 0) {
2129 			sa_hdl = prevhdl;
2130 			sa_db = prevdb;
2131 			break;
2132 		}
2133 	}
2134 
2135 	if (sa_hdl != NULL && sa_hdl != hdl) {
2136 		ASSERT(sa_db != NULL);
2137 		zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
2138 	}
2139 
2140 	if (error == 0)
2141 		(void) memmove(buf, path, buf + len - path);
2142 
2143 	return (error);
2144 }
2145 
2146 int
2147 zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
2148 {
2149 	sa_attr_type_t *sa_table;
2150 	sa_handle_t *hdl;
2151 	dmu_buf_t *db;
2152 	int error;
2153 
2154 	error = zfs_sa_setup(osp, &sa_table);
2155 	if (error != 0)
2156 		return (error);
2157 
2158 	error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
2159 	if (error != 0)
2160 		return (error);
2161 
2162 	error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
2163 
2164 	zfs_release_sa_handle(hdl, db, FTAG);
2165 	return (error);
2166 }
2167 
2168 int
2169 zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
2170     char *buf, int len)
2171 {
2172 	char *path = buf + len - 1;
2173 	sa_attr_type_t *sa_table;
2174 	sa_handle_t *hdl;
2175 	dmu_buf_t *db;
2176 	int error;
2177 
2178 	*path = '\0';
2179 
2180 	error = zfs_sa_setup(osp, &sa_table);
2181 	if (error != 0)
2182 		return (error);
2183 
2184 	error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
2185 	if (error != 0)
2186 		return (error);
2187 
2188 	error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
2189 	if (error != 0) {
2190 		zfs_release_sa_handle(hdl, db, FTAG);
2191 		return (error);
2192 	}
2193 
2194 	error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
2195 
2196 	zfs_release_sa_handle(hdl, db, FTAG);
2197 	return (error);
2198 }
2199