1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
24 * Copyright (c) 2014 Integros [integros.com]
25 */
26
27/* Portions Copyright 2007 Jeremy Teo */
28
29#ifdef _KERNEL
30#include <sys/types.h>
31#include <sys/param.h>
32#include <sys/time.h>
33#include <sys/systm.h>
34#include <sys/sysmacros.h>
35#include <sys/resource.h>
36#include <sys/mntent.h>
37#include <sys/mkdev.h>
38#include <sys/u8_textprep.h>
39#include <sys/dsl_dataset.h>
40#include <sys/vfs.h>
41#include <sys/vfs_opreg.h>
42#include <sys/vnode.h>
43#include <sys/file.h>
44#include <sys/kmem.h>
45#include <sys/errno.h>
46#include <sys/unistd.h>
47#include <sys/mode.h>
48#include <sys/atomic.h>
49#include <vm/pvn.h>
50#include "fs/fs_subr.h"
51#include <sys/zfs_dir.h>
52#include <sys/zfs_acl.h>
53#include <sys/zfs_ioctl.h>
54#include <sys/zfs_rlock.h>
55#include <sys/zfs_fuid.h>
56#include <sys/dnode.h>
57#include <sys/fs/zfs.h>
58#include <sys/kidmap.h>
59#endif /* _KERNEL */
60
61#include <sys/dmu.h>
62#include <sys/dmu_objset.h>
63#include <sys/dmu_tx.h>
64#include <sys/refcount.h>
65#include <sys/stat.h>
66#include <sys/zap.h>
67#include <sys/zfs_znode.h>
68#include <sys/sa.h>
69#include <sys/zfs_sa.h>
70#include <sys/zfs_stat.h>
71
72#include "zfs_prop.h"
73#include "zfs_comutil.h"
74
75/*
76 * Define ZNODE_STATS to turn on statistic gathering. By default, it is only
77 * turned on when DEBUG is also defined.
78 */
79#ifdef	DEBUG
80#define	ZNODE_STATS
81#endif	/* DEBUG */
82
83#ifdef	ZNODE_STATS
84#define	ZNODE_STAT_ADD(stat)			((stat)++)
85#else
86#define	ZNODE_STAT_ADD(stat)			/* nothing */
87#endif	/* ZNODE_STATS */
88
/*
 * Functions needed for userland (i.e. libzpool) are not put under
 * #ifdef _KERNEL; the rest of the functions have dependencies
 * (such as VFS logic) that will not compile easily in userland.
 */
94#ifdef _KERNEL
95/*
96 * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to
97 * be freed before it can be safely accessed.
98 */
99krwlock_t zfsvfs_lock;
100
101static kmem_cache_t *znode_cache = NULL;
102
103/*
104 * This is used by the test suite so that it can delay znodes from being
105 * freed in order to inspect the unlinked set.
106 */
107int zfs_unlink_suspend_progress = 0;
108
109/*ARGSUSED*/
110static void
111znode_evict_error(dmu_buf_t *dbuf, void *user_ptr)
112{
113	/*
114	 * We should never drop all dbuf refs without first clearing
115	 * the eviction callback.
116	 */
117	panic("evicting znode %p\n", user_ptr);
118}
119
120/*
121 * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
122 * z_rangelock. It will modify the offset and length of the lock to reflect
123 * znode-specific information, and convert RL_APPEND to RL_WRITER.  This is
124 * called with the rangelock_t's rl_lock held, which avoids races.
125 */
126static void
127zfs_rangelock_cb(locked_range_t *new, void *arg)
128{
129	znode_t *zp = arg;
130
131	/*
132	 * If in append mode, convert to writer and lock starting at the
133	 * current end of file.
134	 */
135	if (new->lr_type == RL_APPEND) {
136		new->lr_offset = zp->z_size;
137		new->lr_type = RL_WRITER;
138	}
139
140	/*
141	 * If we need to grow the block size then lock the whole file range.
142	 */
143	uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
144	if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
145	    zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
146		new->lr_offset = 0;
147		new->lr_length = UINT64_MAX;
148	}
149}
150
151/*ARGSUSED*/
152static int
153zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
154{
155	znode_t *zp = buf;
156
157	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
158
159	zp->z_vnode = vn_alloc(kmflags);
160	if (zp->z_vnode == NULL) {
161		return (-1);
162	}
163	ZTOV(zp)->v_data = zp;
164
165	list_link_init(&zp->z_link_node);
166
167	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
168	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
169	rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
170	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
171
172	rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);
173
174	zp->z_dirlocks = NULL;
175	zp->z_acl_cached = NULL;
176	zp->z_moved = 0;
177	return (0);
178}
179
180/*ARGSUSED*/
181static void
182zfs_znode_cache_destructor(void *buf, void *arg)
183{
184	znode_t *zp = buf;
185
186	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
187	ASSERT(ZTOV(zp)->v_data == zp);
188	vn_free(ZTOV(zp));
189	ASSERT(!list_link_active(&zp->z_link_node));
190	mutex_destroy(&zp->z_lock);
191	rw_destroy(&zp->z_parent_lock);
192	rw_destroy(&zp->z_name_lock);
193	mutex_destroy(&zp->z_acl_lock);
194	rangelock_fini(&zp->z_rangelock);
195
196	ASSERT(zp->z_dirlocks == NULL);
197	ASSERT(zp->z_acl_cached == NULL);
198}
199
#ifdef	ZNODE_STATS
/*
 * Counters recording why zfs_znode_move() declined to move a znode.  Each
 * field is bumped through ZNODE_STAT_ADD() on exactly one early-return
 * path of that function.  The structure only exists when ZNODE_STATS is
 * defined.
 */
static struct {
	/* z_zfsvfs was not a valid pointer on entry */
	uint64_t zms_zfsvfs_invalid;
	/* z_zfsvfs changed before zfsvfs_lock was acquired */
	uint64_t zms_zfsvfs_recheck1;
	/* the file system was already marked unmounted */
	uint64_t zms_zfsvfs_unmounted;
	/* z_zfsvfs changed before z_znodes_lock was acquired */
	uint64_t zms_zfsvfs_recheck2;
	/* the object hold for z_id could not be taken without blocking */
	uint64_t zms_obj_held;
	/* the vnode's v_lock could not be taken without blocking */
	uint64_t zms_vnode_locked;
	/* v_count != 1 or the vnode is not in the DNLC */
	uint64_t zms_not_only_dnlc;
} znode_move_stats;
#endif	/* ZNODE_STATS */
211
212static void
213zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
214{
215	vnode_t *vp;
216
217	/* Copy fields. */
218	nzp->z_zfsvfs = ozp->z_zfsvfs;
219
220	/* Swap vnodes. */
221	vp = nzp->z_vnode;
222	nzp->z_vnode = ozp->z_vnode;
223	ozp->z_vnode = vp; /* let destructor free the overwritten vnode */
224	ZTOV(ozp)->v_data = ozp;
225	ZTOV(nzp)->v_data = nzp;
226
227	nzp->z_id = ozp->z_id;
228	ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */
229	nzp->z_unlinked = ozp->z_unlinked;
230	nzp->z_atime_dirty = ozp->z_atime_dirty;
231	nzp->z_zn_prefetch = ozp->z_zn_prefetch;
232	nzp->z_blksz = ozp->z_blksz;
233	nzp->z_seq = ozp->z_seq;
234	nzp->z_mapcnt = ozp->z_mapcnt;
235	nzp->z_gen = ozp->z_gen;
236	nzp->z_sync_cnt = ozp->z_sync_cnt;
237	nzp->z_is_sa = ozp->z_is_sa;
238	nzp->z_sa_hdl = ozp->z_sa_hdl;
239	bcopy(ozp->z_atime, nzp->z_atime, sizeof (uint64_t) * 2);
240	nzp->z_links = ozp->z_links;
241	nzp->z_size = ozp->z_size;
242	nzp->z_pflags = ozp->z_pflags;
243	nzp->z_uid = ozp->z_uid;
244	nzp->z_gid = ozp->z_gid;
245	nzp->z_mode = ozp->z_mode;
246
247	/*
248	 * Since this is just an idle znode and kmem is already dealing with
249	 * memory pressure, release any cached ACL.
250	 */
251	if (ozp->z_acl_cached) {
252		zfs_acl_free(ozp->z_acl_cached);
253		ozp->z_acl_cached = NULL;
254	}
255
256	sa_set_userp(nzp->z_sa_hdl, nzp);
257
258	/*
259	 * Invalidate the original znode by clearing fields that provide a
260	 * pointer back to the znode. Set the low bit of the vfs pointer to
261	 * ensure that zfs_znode_move() recognizes the znode as invalid in any
262	 * subsequent callback.
263	 */
264	ozp->z_sa_hdl = NULL;
265	POINTER_INVALIDATE(&ozp->z_zfsvfs);
266
267	/*
268	 * Mark the znode.
269	 */
270	nzp->z_moved = 1;
271	ozp->z_moved = (uint8_t)-1;
272}
273
/*
 * kmem move callback for znode_cache (installed by zfs_znode_init() with
 * kmem_cache_set_move()).  Tries to relocate the znode in 'buf' to the
 * buffer 'newbuf'; 'size' and 'arg' are not used.
 *
 * Returns:
 *	KMEM_CBRC_DONT_KNOW	- the znode's z_zfsvfs pointer is invalid or
 *				  changed underneath us, or the file system
 *				  is unmounted
 *	KMEM_CBRC_LATER		- the znode is currently busy: its object
 *				  hold or vnode lock could not be taken
 *				  without blocking, or the vnode is
 *				  referenced by something other than the
 *				  DNLC
 *	KMEM_CBRC_YES		- the znode was moved to 'newbuf'
 *
 * Every failure path drops exactly the locks taken so far, in reverse
 * order of acquisition, before returning.
 */
/*ARGSUSED*/
static kmem_cbrc_t
zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
{
	znode_t *ozp = buf, *nzp = newbuf;
	zfsvfs_t *zfsvfs;
	vnode_t *vp;

	/*
	 * The znode is on the file system's list of known znodes if the vfs
	 * pointer is valid. We set the low bit of the vfs pointer when freeing
	 * the znode to invalidate it, and the memory patterns written by kmem
	 * (baddcafe and deadbeef) set at least one of the two low bits. A newly
	 * created znode sets the vfs pointer last of all to indicate that the
	 * znode is known and in a valid state to be moved by this function.
	 */
	zfsvfs = ozp->z_zfsvfs;
	if (!POINTER_IS_VALID(zfsvfs)) {
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * Close a small window in which it's possible that the filesystem could
	 * be unmounted and freed, and zfsvfs, though valid in the previous
	 * statement, could point to unrelated memory by the time we try to
	 * prevent the filesystem from being unmounted.
	 */
	rw_enter(&zfsvfs_lock, RW_WRITER);
	if (zfsvfs != ozp->z_zfsvfs) {
		rw_exit(&zfsvfs_lock);
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck1);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * If the znode is still valid, then so is the file system. We know that
	 * no valid file system can be freed while we hold zfsvfs_lock, so we
	 * can safely ensure that the filesystem is not and will not be
	 * unmounted. The next statement is equivalent to ZFS_ENTER().
	 */
	rrm_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
	if (zfsvfs->z_unmounted) {
		ZFS_EXIT(zfsvfs);
		rw_exit(&zfsvfs_lock);
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted);
		return (KMEM_CBRC_DONT_KNOW);
	}
	/* The teardown lock now pins the file system; the global lock can go. */
	rw_exit(&zfsvfs_lock);

	mutex_enter(&zfsvfs->z_znodes_lock);
	/*
	 * Recheck the vfs pointer in case the znode was removed just before
	 * acquiring the lock.
	 */
	if (zfsvfs != ozp->z_zfsvfs) {
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck2);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * At this point we know that as long as we hold z_znodes_lock, the
	 * znode cannot be freed and fields within the znode can be safely
	 * accessed. Now, prevent a race with zfs_zget().
	 */
	if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) {
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_obj_held);
		return (KMEM_CBRC_LATER);
	}

	/* Only try the vnode lock; never block inside the move callback. */
	vp = ZTOV(ozp);
	if (mutex_tryenter(&vp->v_lock) == 0) {
		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked);
		return (KMEM_CBRC_LATER);
	}

	/* Only move znodes that are referenced _only_ by the DNLC. */
	if (vp->v_count != 1 || !vn_in_dnlc(vp)) {
		mutex_exit(&vp->v_lock);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc);
		return (KMEM_CBRC_LATER);
	}

	/*
	 * The znode is known and in a valid state to move. We're holding the
	 * locks needed to execute the critical section.
	 */
	zfs_znode_move_impl(ozp, nzp);
	/*
	 * zfs_znode_move_impl() swapped the vnodes, so 'vp' (captured above)
	 * is the vnode whose v_lock we hold even though it now belongs to
	 * nzp.  ozp->z_id is left untouched by the move and still names the
	 * object hold to release.
	 */
	mutex_exit(&vp->v_lock);
	ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);

	/* Put nzp in ozp's place on the file system's znode list. */
	list_link_replace(&ozp->z_link_node, &nzp->z_link_node);
	mutex_exit(&zfsvfs->z_znodes_lock);
	ZFS_EXIT(zfsvfs);

	return (KMEM_CBRC_YES);
}
381
382void
383zfs_znode_init(void)
384{
385	/*
386	 * Initialize zcache
387	 */
388	rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL);
389	ASSERT(znode_cache == NULL);
390	znode_cache = kmem_cache_create("zfs_znode_cache",
391	    sizeof (znode_t), 0, zfs_znode_cache_constructor,
392	    zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
393	kmem_cache_set_move(znode_cache, zfs_znode_move);
394}
395
396void
397zfs_znode_fini(void)
398{
399	/*
400	 * Cleanup vfs & vnode ops
401	 */
402	zfs_remove_op_tables();
403
404	/*
405	 * Cleanup zcache
406	 */
407	if (znode_cache)
408		kmem_cache_destroy(znode_cache);
409	znode_cache = NULL;
410	rw_destroy(&zfsvfs_lock);
411}
412
413struct vnodeops *zfs_dvnodeops;
414struct vnodeops *zfs_fvnodeops;
415struct vnodeops *zfs_symvnodeops;
416struct vnodeops *zfs_xdvnodeops;
417struct vnodeops *zfs_evnodeops;
418struct vnodeops *zfs_sharevnodeops;
419
420void
421zfs_remove_op_tables()
422{
423	/*
424	 * Remove vfs ops
425	 */
426	ASSERT(zfsfstype);
427	(void) vfs_freevfsops_by_type(zfsfstype);
428	zfsfstype = 0;
429
430	/*
431	 * Remove vnode ops
432	 */
433	if (zfs_dvnodeops)
434		vn_freevnodeops(zfs_dvnodeops);
435	if (zfs_fvnodeops)
436		vn_freevnodeops(zfs_fvnodeops);
437	if (zfs_symvnodeops)
438		vn_freevnodeops(zfs_symvnodeops);
439	if (zfs_xdvnodeops)
440		vn_freevnodeops(zfs_xdvnodeops);
441	if (zfs_evnodeops)
442		vn_freevnodeops(zfs_evnodeops);
443	if (zfs_sharevnodeops)
444		vn_freevnodeops(zfs_sharevnodeops);
445
446	zfs_dvnodeops = NULL;
447	zfs_fvnodeops = NULL;
448	zfs_symvnodeops = NULL;
449	zfs_xdvnodeops = NULL;
450	zfs_evnodeops = NULL;
451	zfs_sharevnodeops = NULL;
452}
453
454extern const fs_operation_def_t zfs_dvnodeops_template[];
455extern const fs_operation_def_t zfs_fvnodeops_template[];
456extern const fs_operation_def_t zfs_xdvnodeops_template[];
457extern const fs_operation_def_t zfs_symvnodeops_template[];
458extern const fs_operation_def_t zfs_evnodeops_template[];
459extern const fs_operation_def_t zfs_sharevnodeops_template[];
460
461int
462zfs_create_op_tables()
463{
464	int error;
465
466	/*
467	 * zfs_dvnodeops can be set if mod_remove() calls mod_installfs()
468	 * due to a failure to remove the the 2nd modlinkage (zfs_modldrv).
469	 * In this case we just return as the ops vectors are already set up.
470	 */
471	if (zfs_dvnodeops)
472		return (0);
473
474	error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
475	    &zfs_dvnodeops);
476	if (error)
477		return (error);
478
479	error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template,
480	    &zfs_fvnodeops);
481	if (error)
482		return (error);
483
484	error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template,
485	    &zfs_symvnodeops);
486	if (error)
487		return (error);
488
489	error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template,
490	    &zfs_xdvnodeops);
491	if (error)
492		return (error);
493
494	error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template,
495	    &zfs_evnodeops);
496	if (error)
497		return (error);
498
499	error = vn_make_ops(MNTTYPE_ZFS, zfs_sharevnodeops_template,
500	    &zfs_sharevnodeops);
501
502	return (error);
503}
504
505int
506zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
507{
508	zfs_acl_ids_t acl_ids;
509	vattr_t vattr;
510	znode_t *sharezp;
511	vnode_t *vp;
512	znode_t *zp;
513	int error;
514
515	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
516	vattr.va_type = VDIR;
517	vattr.va_mode = S_IFDIR|0555;
518	vattr.va_uid = crgetuid(kcred);
519	vattr.va_gid = crgetgid(kcred);
520
521	sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
522	ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs));
523	sharezp->z_moved = 0;
524	sharezp->z_unlinked = 0;
525	sharezp->z_atime_dirty = 0;
526	sharezp->z_zfsvfs = zfsvfs;
527	sharezp->z_is_sa = zfsvfs->z_use_sa;
528	sharezp->z_pflags = 0;
529
530	vp = ZTOV(sharezp);
531	vn_reinit(vp);
532	vp->v_type = VDIR;
533
534	VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
535	    kcred, NULL, &acl_ids));
536	zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids);
537	ASSERT3P(zp, ==, sharezp);
538	ASSERT(!vn_in_dnlc(ZTOV(sharezp))); /* not valid to move */
539	POINTER_INVALIDATE(&sharezp->z_zfsvfs);
540	error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
541	    ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
542	zfsvfs->z_shares_dir = sharezp->z_id;
543
544	zfs_acl_ids_free(&acl_ids);
545	ZTOV(sharezp)->v_count = 0;
546	sa_handle_destroy(sharezp->z_sa_hdl);
547	kmem_cache_free(znode_cache, sharezp);
548
549	return (error);
550}
551
552/*
553 * define a couple of values we need available
554 * for both 64 and 32 bit environments.
555 */
556#ifndef NBITSMINOR64
557#define	NBITSMINOR64	32
558#endif
559#ifndef MAXMAJ64
560#define	MAXMAJ64	0xffffffffUL
561#endif
562#ifndef	MAXMIN64
563#define	MAXMIN64	0xffffffffUL
564#endif
565
/*
 * ZFS-private counterpart of expldev().
 *
 * The standard expldev() takes a dev32_t in LP64 and expands it to a long
 * dev_t, which is not what is needed here: ZFS needs to take a dev32_t in
 * ILP32 and expand it to the long (64-bit) form.  On LP64 the value is
 * returned unchanged.
 */
static uint64_t
zfs_expldev(dev_t dev)
{
#ifdef _LP64
	return (dev);
#else
	/* Split the 32-bit device number, then re-pack it in 64-bit form. */
	major_t maj32 = ((major_t)dev >> NBITSMINOR32) & MAXMAJ32;
	minor_t min32 = (minor_t)dev & MAXMIN32;

	return (((uint64_t)maj32 << NBITSMINOR64) | min32);
#endif
}
585
/*
 * ZFS-private counterpart of cmpldev().
 *
 * The standard cmpldev() takes a long dev_t and compresses it to a dev32_t
 * in LP64; ZFS instead needs to compact a long (64-bit) device number to a
 * dev32_t in ILP32.  In ILP32, NODEV32 is returned when the major or minor
 * number does not fit the 32-bit layout.  On LP64 the value is returned
 * unchanged.
 */
dev_t
zfs_cmpldev(uint64_t dev)
{
#ifdef _LP64
	return (dev);
#else
	major_t maj64 = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
	minor_t min64 = (minor_t)dev & MAXMIN64;

	/* Only device numbers that fit the 32-bit layout can be packed. */
	if (maj64 <= MAXMAJ32 && min64 <= MAXMIN32)
		return (((dev32_t)maj64 << NBITSMINOR32) | min64);

	return (NODEV32);
#endif
}
608
609static void
610zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
611    dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
612{
613	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
614	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));
615
616	mutex_enter(&zp->z_lock);
617
618	ASSERT(zp->z_sa_hdl == NULL);
619	ASSERT(zp->z_acl_cached == NULL);
620	if (sa_hdl == NULL) {
621		VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
622		    SA_HDL_SHARED, &zp->z_sa_hdl));
623	} else {
624		zp->z_sa_hdl = sa_hdl;
625		sa_set_userp(sa_hdl, zp);
626	}
627
628	zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
629
630	/*
631	 * Slap on VROOT if we are the root znode
632	 */
633	if (zp->z_id == zfsvfs->z_root)
634		ZTOV(zp)->v_flag |= VROOT;
635
636	mutex_exit(&zp->z_lock);
637	vn_exists(ZTOV(zp));
638}
639
640void
641zfs_znode_dmu_fini(znode_t *zp)
642{
643	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) ||
644	    zp->z_unlinked ||
645	    RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock));
646
647	sa_handle_destroy(zp->z_sa_hdl);
648	zp->z_sa_hdl = NULL;
649}
650
/*
 * Construct a new znode/vnode and initialize.
 *
 * This does not do a call to dmu_set_user() that is
 * up to the caller to do, in case you don't want to
 * return the znode
 *
 *	zfsvfs	 - file system the znode belongs to
 *	db	 - dbuf of the object; db->db_object becomes z_id
 *	blksz	 - initial value for z_blksz
 *	obj_type - bonus type of the object (see zfs_znode_sa_init())
 *	hdl	 - existing SA handle to adopt, or NULL to obtain one from db
 *
 * Returns the new znode with a hold taken on zfsvfs->z_vfs, or NULL if the
 * attributes could not be looked up or the generation number is 0.  On
 * failure the SA handle is destroyed only if it was obtained here
 * (hdl == NULL); a handle passed in by the caller is left alone.
 */
static znode_t *
zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
    dmu_object_type_t obj_type, sa_handle_t *hdl)
{
	znode_t	*zp;
	vnode_t *vp;
	uint64_t mode;
	uint64_t parent;
	uint64_t projid = ZFS_DEFAULT_PROJID;
	sa_bulk_attr_t bulk[11];
	int count = 0;

	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);

	ASSERT(zp->z_dirlocks == NULL);
	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
	zp->z_moved = 0;

	/*
	 * Defer setting z_zfsvfs until the znode is ready to be a candidate for
	 * the zfs_znode_move() callback.
	 */
	zp->z_sa_hdl = NULL;
	zp->z_unlinked = 0;
	zp->z_atime_dirty = 0;
	zp->z_mapcnt = 0;
	zp->z_id = db->db_object;
	zp->z_blksz = blksz;
	zp->z_seq = 0x7A4653;
	zp->z_sync_cnt = 0;

	vp = ZTOV(zp);
	vn_reinit(vp);

	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);

	/*
	 * Fetch the attributes that are cached in the znode with a single
	 * bulk lookup.  'mode' and 'parent' are only needed locally.
	 */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
	    &zp->z_links, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
	    &zp->z_atime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
	    &zp->z_uid, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
	    &zp->z_gid, 8);

	/*
	 * Fail if the bulk lookup fails, if the generation is 0, or if the
	 * object is flagged as carrying a project ID (on a dataset with
	 * project quota enabled) but that ID cannot be read.  When no
	 * project ID is read, projid keeps ZFS_DEFAULT_PROJID.
	 */
	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0 ||
	    (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
	    (zp->z_pflags & ZFS_PROJID) &&
	    sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) {
		/* Only destroy a handle that was obtained on our behalf. */
		if (hdl == NULL)
			sa_handle_destroy(zp->z_sa_hdl);
		kmem_cache_free(znode_cache, zp);
		return (NULL);
	}

	zp->z_projid = projid;
	zp->z_mode = mode;
	vp->v_vfsp = zfsvfs->z_parent->z_vfs;

	vp->v_type = IFTOVT((mode_t)mode);

	/* Select the vnode operations vector that matches the file type. */
	switch (vp->v_type) {
	case VDIR:
		if (zp->z_pflags & ZFS_XATTR) {
			vn_setops(vp, zfs_xdvnodeops);
			vp->v_flag |= V_XATTRDIR;
		} else {
			vn_setops(vp, zfs_dvnodeops);
		}
		zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
		break;
	case VBLK:
	case VCHR:
		{
			uint64_t rdev;
			VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zfsvfs),
			    &rdev, sizeof (rdev)) == 0);

			/* Convert the stored 64-bit device number to a dev_t. */
			vp->v_rdev = zfs_cmpldev(rdev);
		}
		/*FALLTHROUGH*/
	case VFIFO:
	case VSOCK:
	case VDOOR:
		vn_setops(vp, zfs_fvnodeops);
		break;
	case VREG:
		vp->v_flag |= VMODSORT;
		/* Regular files under the shares directory get their own ops. */
		if (parent == zfsvfs->z_shares_dir) {
			ASSERT(zp->z_uid == 0 && zp->z_gid == 0);
			vn_setops(vp, zfs_sharevnodeops);
		} else {
			vn_setops(vp, zfs_fvnodeops);
		}
		break;
	case VLNK:
		vn_setops(vp, zfs_symvnodeops);
		break;
	default:
		vn_setops(vp, zfs_evnodeops);
		break;
	}

	mutex_enter(&zfsvfs->z_znodes_lock);
	list_insert_tail(&zfsvfs->z_all_znodes, zp);
	membar_producer();
	/*
	 * Everything else must be valid before assigning z_zfsvfs makes the
	 * znode eligible for zfs_znode_move().
	 */
	zp->z_zfsvfs = zfsvfs;
	mutex_exit(&zfsvfs->z_znodes_lock);

	VFS_HOLD(zfsvfs->z_vfs);
	return (zp);
}
781
/*
 * File-scope statics (and therefore zero-initialized) that zfs_mknode()
 * uses as the source values for the XATTR, PAD and ZNODE_ACL attributes of
 * a new DMU_OT_ZNODE object.
 */
static uint64_t empty_xattr;
static uint64_t pad[4];
static zfs_acl_phys_t acl_phys;
/*
 * Create a new DMU object to hold a zfs znode.
 *
 *	IN:	dzp	- parent directory for new znode
 *		vap	- file attributes for new znode
 *		tx	- dmu transaction id for zap operations
 *		cr	- credentials of caller
 *		flag	- flags:
 *			  IS_ROOT_NODE	- new object will be root
 *			  IS_XATTR	- new object is an attribute
 *		acl_ids	- mode, owner/group FUIDs and initial ACL for the
 *			  new object
 *
 *	OUT:	zpp	- allocated znode
 *
 * When IS_ROOT_NODE is set no znode is allocated: dzp itself is filled in
 * and returned through zpp.
 *
 * During ZIL replay (zfsvfs->z_replay) the object number, timestamp,
 * generation and dnode size are taken from fields of vap instead of being
 * generated here; see zfs_replay_create().
 */
void
zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
    uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
{
	uint64_t	crtime[2], atime[2], mtime[2], ctime[2];
	uint64_t	mode, size, links, parent, pflags;
	uint64_t	dzp_pflags = 0;
	uint64_t	projid = ZFS_DEFAULT_PROJID;
	uint64_t	rdev = 0;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	dmu_buf_t	*db;
	timestruc_t	now;
	uint64_t	gen, obj;
	int		bonuslen;
	int		dnodesize;
	sa_handle_t	*sa_hdl;
	dmu_object_type_t obj_type;
	sa_bulk_attr_t	*sa_attrs;
	int		cnt = 0;
	zfs_acl_locator_cb_t locate = { 0 };

	ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));

	if (zfsvfs->z_replay) {
		obj = vap->va_nodeid;
		now = vap->va_ctime;		/* see zfs_replay_create() */
		gen = vap->va_nblocks;		/* ditto */
		dnodesize = vap->va_fsid;	/* ditto */
	} else {
		obj = 0;
		gethrestime(&now);
		gen = dmu_tx_get_txg(tx);
		dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
	}

	/* A dnode size of 0 means "use the minimum". */
	if (dnodesize == 0)
		dnodesize = DNODE_MIN_SIZE;

	obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
	bonuslen = (obj_type == DMU_OT_SA) ?
	    DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;

	/*
	 * Create a new DMU object.
	 */
	/*
	 * There's currently no mechanism for pre-reading the blocks that will
	 * be needed to allocate a new object, so we accept the small chance
	 * that there will be an i/o error and we will fail one of the
	 * assertions below.
	 */
	if (vap->va_type == VDIR) {
		if (zfsvfs->z_replay) {
			VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
			    obj_type, bonuslen, dnodesize, tx));
		} else {
			obj = zap_create_norm_dnsize(zfsvfs->z_os,
			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
			    obj_type, bonuslen, dnodesize, tx);
		}
	} else {
		if (zfsvfs->z_replay) {
			VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    obj_type, bonuslen, dnodesize, tx));
		} else {
			obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    obj_type, bonuslen, dnodesize, tx);
		}
	}

	/* Held until the end of the function, across all znode setup. */
	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
	VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));

	/*
	 * If this is the root, fix up the half-initialized parent pointer
	 * to reference the just-allocated physical data area.
	 */
	if (flag & IS_ROOT_NODE) {
		dzp->z_id = obj;
	}

	/*
	 * If parent is an xattr, so am I.
	 */
	if (dzp->z_pflags & ZFS_XATTR) {
		flag |= IS_XATTR;
	}

	if (zfsvfs->z_use_fuids)
		pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
	else
		pflags = 0;

	if (vap->va_type == VDIR) {
		size = 2;		/* contents ("." and "..") */
		links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
	} else {
		size = links = 0;
	}

	if (vap->va_type == VBLK || vap->va_type == VCHR) {
		rdev = zfs_expldev(vap->va_rdev);
	}

	parent = dzp->z_id;
	mode = acl_ids->z_mode;
	if (flag & IS_XATTR)
		pflags |= ZFS_XATTR;

	if (vap->va_type == VREG || vap->va_type == VDIR) {
		/*
		 * With ZFS_PROJID flag, we can easily know whether there is
		 * project ID stored on disk or not. See zfs_space_delta_cb().
		 */
		if (obj_type != DMU_OT_ZNODE &&
		    dmu_objset_projectquota_enabled(zfsvfs->z_os))
			pflags |= ZFS_PROJID;

		/*
		 * Inherit project ID from parent if required.
		 */
		projid = zfs_inherit_projid(dzp);
		if (dzp->z_pflags & ZFS_PROJINHERIT)
			pflags |= ZFS_PROJINHERIT;
	}

	/*
	 * No execs denied will be determined when zfs_mode_compute() is called.
	 */
	pflags |= acl_ids->z_aclp->z_hints &
	    (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
	    ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);

	/* Creation and change times are always "now". */
	ZFS_TIME_ENCODE(&now, crtime);
	ZFS_TIME_ENCODE(&now, ctime);

	/* Access and modification times may be supplied by the caller. */
	if (vap->va_mask & AT_ATIME) {
		ZFS_TIME_ENCODE(&vap->va_atime, atime);
	} else {
		ZFS_TIME_ENCODE(&now, atime);
	}

	if (vap->va_mask & AT_MTIME) {
		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
	} else {
		ZFS_TIME_ENCODE(&now, mtime);
	}

	/* Now add in all of the "SA" attributes */
	VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
	    &sa_hdl));

	/*
	 * Setup the array of attributes to be replaced/set on the new file
	 *
	 * order for  DMU_OT_ZNODE is critical since it needs to be constructed
	 * in the old znode_phys_t format.  Don't change this ordering
	 */
	sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);

	if (obj_type == DMU_OT_ZNODE) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
		    NULL, &atime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
		    NULL, &mtime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
		    NULL, &ctime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
		    NULL, &crtime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
		    NULL, &gen, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
		    NULL, &mode, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
		    NULL, &size, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
		    NULL, &parent, 8);
	} else {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
		    NULL, &mode, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
		    NULL, &size, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
		    NULL, &gen, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
		    NULL, &acl_ids->z_fuid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
		    NULL, &acl_ids->z_fgid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
		    NULL, &parent, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
		    NULL, &pflags, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
		    NULL, &atime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
		    NULL, &mtime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
		    NULL, &ctime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
		    NULL, &crtime, 16);
	}

	SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);

	if (obj_type == DMU_OT_ZNODE) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
		    &empty_xattr, 8);
	} else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
	    pflags & ZFS_PROJID) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs),
		    NULL, &projid, 8);
	}
	/* Old-format znodes always carry rdev; SA znodes only for devices. */
	if (obj_type == DMU_OT_ZNODE ||
	    (vap->va_type == VBLK || vap->va_type == VCHR)) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
		    NULL, &rdev, 8);

	}
	if (obj_type == DMU_OT_ZNODE) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
		    NULL, &pflags, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
		    &acl_ids->z_fuid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
		    &acl_ids->z_fgid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
		    sizeof (uint64_t) * 4);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
		    &acl_phys, sizeof (zfs_acl_phys_t));
	} else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
		/*
		 * The ACL is written together with the other attributes.
		 * The bulk entries hold pointers to 'mode' and 'pflags', so
		 * the values updated by zfs_mode_compute() below are the ones
		 * stored by sa_replace_all_by_template().
		 */
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
		    &acl_ids->z_aclp->z_acl_count, 8);
		locate.cb_aclp = acl_ids->z_aclp;
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
		    zfs_acl_data_locator, &locate,
		    acl_ids->z_aclp->z_acl_bytes);
		mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
		    acl_ids->z_fuid, acl_ids->z_fgid);
	}

	VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);

	if (!(flag & IS_ROOT_NODE)) {
		/*
		 * NOTE(review): zfs_znode_alloc() can return NULL; only the
		 * ASSERT guards against that before *zpp is dereferenced
		 * below.
		 */
		*zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
		ASSERT(*zpp != NULL);
	} else {
		/*
		 * If we are creating the root node, the "parent" we
		 * passed in is the znode for the root.
		 */
		*zpp = dzp;

		(*zpp)->z_sa_hdl = sa_hdl;
	}

	(*zpp)->z_pflags = pflags;
	(*zpp)->z_mode = mode;
	(*zpp)->z_dnodesize = dnodesize;
	(*zpp)->z_projid = projid;

	if (vap->va_mask & AT_XVATTR)
		zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx);

	/* ACLs not written with the bulk attributes above are set here. */
	if (obj_type == DMU_OT_ZNODE ||
	    acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
		VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
	}
	kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
}
1075
1076/*
1077 * Update in-core attributes.  It is assumed the caller will be doing an
1078 * sa_bulk_update to push the changes out.
1079 */
1080void
1081zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
1082{
1083	xoptattr_t *xoap;
1084
1085	xoap = xva_getxoptattr(xvap);
1086	ASSERT(xoap);
1087
1088	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
1089		uint64_t times[2];
1090		ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
1091		(void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
1092		    &times, sizeof (times), tx);
1093		XVA_SET_RTN(xvap, XAT_CREATETIME);
1094	}
1095	if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
1096		ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
1097		    zp->z_pflags, tx);
1098		XVA_SET_RTN(xvap, XAT_READONLY);
1099	}
1100	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
1101		ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
1102		    zp->z_pflags, tx);
1103		XVA_SET_RTN(xvap, XAT_HIDDEN);
1104	}
1105	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
1106		ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
1107		    zp->z_pflags, tx);
1108		XVA_SET_RTN(xvap, XAT_SYSTEM);
1109	}
1110	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
1111		ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
1112		    zp->z_pflags, tx);
1113		XVA_SET_RTN(xvap, XAT_ARCHIVE);
1114	}
1115	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
1116		ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
1117		    zp->z_pflags, tx);
1118		XVA_SET_RTN(xvap, XAT_IMMUTABLE);
1119	}
1120	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
1121		ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
1122		    zp->z_pflags, tx);
1123		XVA_SET_RTN(xvap, XAT_NOUNLINK);
1124	}
1125	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
1126		ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
1127		    zp->z_pflags, tx);
1128		XVA_SET_RTN(xvap, XAT_APPENDONLY);
1129	}
1130	if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
1131		ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
1132		    zp->z_pflags, tx);
1133		XVA_SET_RTN(xvap, XAT_NODUMP);
1134	}
1135	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
1136		ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
1137		    zp->z_pflags, tx);
1138		XVA_SET_RTN(xvap, XAT_OPAQUE);
1139	}
1140	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
1141		ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
1142		    xoap->xoa_av_quarantined, zp->z_pflags, tx);
1143		XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
1144	}
1145	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
1146		ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
1147		    zp->z_pflags, tx);
1148		XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
1149	}
1150	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
1151		zfs_sa_set_scanstamp(zp, xvap, tx);
1152		XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
1153	}
1154	if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
1155		ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
1156		    zp->z_pflags, tx);
1157		XVA_SET_RTN(xvap, XAT_REPARSE);
1158	}
1159	if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
1160		ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
1161		    zp->z_pflags, tx);
1162		XVA_SET_RTN(xvap, XAT_OFFLINE);
1163	}
1164	if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
1165		ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
1166		    zp->z_pflags, tx);
1167		XVA_SET_RTN(xvap, XAT_SPARSE);
1168	}
1169	if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
1170		ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit,
1171		    zp->z_pflags, tx);
1172		XVA_SET_RTN(xvap, XAT_PROJINHERIT);
1173	}
1174}
1175
1176int
1177zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
1178{
1179	dmu_object_info_t doi;
1180	dmu_buf_t	*db;
1181	znode_t		*zp;
1182	int err;
1183	sa_handle_t	*hdl;
1184
1185	*zpp = NULL;
1186
1187	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
1188
1189	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
1190	if (err) {
1191		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1192		return (err);
1193	}
1194
1195	dmu_object_info_from_db(db, &doi);
1196	if (doi.doi_bonus_type != DMU_OT_SA &&
1197	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
1198	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
1199	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
1200		sa_buf_rele(db, NULL);
1201		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1202		return (SET_ERROR(EINVAL));
1203	}
1204
1205	hdl = dmu_buf_get_user(db);
1206	if (hdl != NULL) {
1207		zp  = sa_get_userdata(hdl);
1208
1209
1210		/*
1211		 * Since "SA" does immediate eviction we
1212		 * should never find a sa handle that doesn't
1213		 * know about the znode.
1214		 */
1215
1216		ASSERT3P(zp, !=, NULL);
1217
1218		mutex_enter(&zp->z_lock);
1219		ASSERT3U(zp->z_id, ==, obj_num);
1220		if (zp->z_unlinked) {
1221			err = SET_ERROR(ENOENT);
1222		} else {
1223			VN_HOLD(ZTOV(zp));
1224			*zpp = zp;
1225			err = 0;
1226		}
1227		mutex_exit(&zp->z_lock);
1228		sa_buf_rele(db, NULL);
1229		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1230		return (err);
1231	}
1232
1233	/*
1234	 * Not found create new znode/vnode
1235	 * but only if file exists.
1236	 *
1237	 * There is a small window where zfs_vget() could
1238	 * find this object while a file create is still in
1239	 * progress.  This is checked for in zfs_znode_alloc()
1240	 *
1241	 * if zfs_znode_alloc() fails it will drop the hold on the
1242	 * bonus buffer.
1243	 */
1244	zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
1245	    doi.doi_bonus_type, NULL);
1246	if (zp == NULL) {
1247		err = SET_ERROR(ENOENT);
1248	} else {
1249		*zpp = zp;
1250	}
1251	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1252	return (err);
1253}
1254
1255int
1256zfs_rezget(znode_t *zp)
1257{
1258	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1259	dmu_object_info_t doi;
1260	dmu_buf_t *db;
1261	uint64_t obj_num = zp->z_id;
1262	uint64_t mode;
1263	sa_bulk_attr_t bulk[10];
1264	int err;
1265	int count = 0;
1266	uint64_t gen;
1267	uint64_t projid = ZFS_DEFAULT_PROJID;
1268
1269	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
1270
1271	mutex_enter(&zp->z_acl_lock);
1272	if (zp->z_acl_cached) {
1273		zfs_acl_free(zp->z_acl_cached);
1274		zp->z_acl_cached = NULL;
1275	}
1276
1277	mutex_exit(&zp->z_acl_lock);
1278	ASSERT(zp->z_sa_hdl == NULL);
1279	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
1280	if (err) {
1281		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1282		return (err);
1283	}
1284
1285	dmu_object_info_from_db(db, &doi);
1286	if (doi.doi_bonus_type != DMU_OT_SA &&
1287	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
1288	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
1289	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
1290		sa_buf_rele(db, NULL);
1291		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1292		return (SET_ERROR(EINVAL));
1293	}
1294
1295	zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);
1296
1297	/* reload cached values */
1298	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
1299	    &gen, sizeof (gen));
1300	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
1301	    &zp->z_size, sizeof (zp->z_size));
1302	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
1303	    &zp->z_links, sizeof (zp->z_links));
1304	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
1305	    &zp->z_pflags, sizeof (zp->z_pflags));
1306	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
1307	    &zp->z_atime, sizeof (zp->z_atime));
1308	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
1309	    &zp->z_uid, sizeof (zp->z_uid));
1310	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
1311	    &zp->z_gid, sizeof (zp->z_gid));
1312	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
1313	    &mode, sizeof (mode));
1314
1315	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
1316		zfs_znode_dmu_fini(zp);
1317		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1318		return (SET_ERROR(EIO));
1319	}
1320
1321	if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) {
1322		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs),
1323		    &projid, 8);
1324		if (err != 0 && err != ENOENT) {
1325			zfs_znode_dmu_fini(zp);
1326			ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1327			return (SET_ERROR(err));
1328		}
1329	}
1330
1331	zp->z_projid = projid;
1332	zp->z_mode = mode;
1333
1334	if (gen != zp->z_gen) {
1335		zfs_znode_dmu_fini(zp);
1336		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1337		return (SET_ERROR(EIO));
1338	}
1339
1340	zp->z_blksz = doi.doi_data_block_size;
1341
1342	/*
1343	 * If the file has zero links, then it has been unlinked on the send
1344	 * side and it must be in the received unlinked set.
1345	 * We call zfs_znode_dmu_fini() now to prevent any accesses to the
1346	 * stale data and to prevent automatical removal of the file in
1347	 * zfs_zinactive().  The file will be removed either when it is removed
1348	 * on the send side and the next incremental stream is received or
1349	 * when the unlinked set gets processed.
1350	 */
1351	zp->z_unlinked = (zp->z_links == 0);
1352	if (zp->z_unlinked)
1353		zfs_znode_dmu_fini(zp);
1354
1355	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1356
1357	return (0);
1358}
1359
1360void
1361zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
1362{
1363	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1364	objset_t *os = zfsvfs->z_os;
1365	uint64_t obj = zp->z_id;
1366	uint64_t acl_obj = zfs_external_acl(zp);
1367
1368	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
1369	if (acl_obj) {
1370		VERIFY(!zp->z_is_sa);
1371		VERIFY(0 == dmu_object_free(os, acl_obj, tx));
1372	}
1373	VERIFY(0 == dmu_object_free(os, obj, tx));
1374	zfs_znode_dmu_fini(zp);
1375	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
1376	zfs_znode_free(zp);
1377}
1378
1379void
1380zfs_zinactive(znode_t *zp)
1381{
1382	vnode_t	*vp = ZTOV(zp);
1383	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1384	uint64_t z_id = zp->z_id;
1385
1386	ASSERT(zp->z_sa_hdl);
1387
1388	/*
1389	 * Don't allow a zfs_zget() while were trying to release this znode
1390	 */
1391	ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
1392
1393	mutex_enter(&zp->z_lock);
1394	mutex_enter(&vp->v_lock);
1395	VN_RELE_LOCKED(vp);
1396	if (vp->v_count > 0 || vn_has_cached_data(vp)) {
1397		/*
1398		 * If the hold count is greater than zero, somebody has
1399		 * obtained a new reference on this znode while we were
1400		 * processing it here, so we are done.  If we still have
1401		 * mapped pages then we are also done, since we don't
1402		 * want to inactivate the znode until the pages get pushed.
1403		 *
1404		 * XXX - if vn_has_cached_data(vp) is true, but count == 0,
1405		 * this seems like it would leave the znode hanging with
1406		 * no chance to go inactive...
1407		 */
1408		mutex_exit(&vp->v_lock);
1409		mutex_exit(&zp->z_lock);
1410		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
1411		return;
1412	}
1413	mutex_exit(&vp->v_lock);
1414
1415	/*
1416	 * If this was the last reference to a file with no links, remove
1417	 * the file from the file system unless the file system is mounted
1418	 * read-only.  That can happen, for example, if the file system was
1419	 * originally read-write, the file was opened, then unlinked and
1420	 * the file system was made read-only before the file was finally
1421	 * closed.  The file will remain in the unlinked set.
1422	 */
1423	if (zp->z_unlinked) {
1424		ASSERT(!zfsvfs->z_issnap);
1425		if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0 &&
1426		    !zfs_unlink_suspend_progress) {
1427			mutex_exit(&zp->z_lock);
1428			ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
1429			zfs_rmnode(zp);
1430			return;
1431		}
1432	}
1433
1434	mutex_exit(&zp->z_lock);
1435	zfs_znode_dmu_fini(zp);
1436	ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
1437	zfs_znode_free(zp);
1438}
1439
1440void
1441zfs_znode_free(znode_t *zp)
1442{
1443	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1444
1445	vn_invalid(ZTOV(zp));
1446
1447	ASSERT(ZTOV(zp)->v_count == 0);
1448
1449	mutex_enter(&zfsvfs->z_znodes_lock);
1450	POINTER_INVALIDATE(&zp->z_zfsvfs);
1451	list_remove(&zfsvfs->z_all_znodes, zp);
1452	mutex_exit(&zfsvfs->z_znodes_lock);
1453
1454	if (zp->z_acl_cached) {
1455		zfs_acl_free(zp->z_acl_cached);
1456		zp->z_acl_cached = NULL;
1457	}
1458
1459	kmem_cache_free(znode_cache, zp);
1460
1461	VFS_RELE(zfsvfs->z_vfs);
1462}
1463
1464void
1465zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
1466    uint64_t ctime[2], boolean_t have_tx)
1467{
1468	timestruc_t	now;
1469
1470	gethrestime(&now);
1471
1472	if (have_tx) {	/* will sa_bulk_update happen really soon? */
1473		zp->z_atime_dirty = 0;
1474		zp->z_seq++;
1475	} else {
1476		zp->z_atime_dirty = 1;
1477	}
1478
1479	if (flag & AT_ATIME) {
1480		ZFS_TIME_ENCODE(&now, zp->z_atime);
1481	}
1482
1483	if (flag & AT_MTIME) {
1484		ZFS_TIME_ENCODE(&now, mtime);
1485		if (zp->z_zfsvfs->z_use_fuids) {
1486			zp->z_pflags |= (ZFS_ARCHIVE |
1487			    ZFS_AV_MODIFIED);
1488		}
1489	}
1490
1491	if (flag & AT_CTIME) {
1492		ZFS_TIME_ENCODE(&now, ctime);
1493		if (zp->z_zfsvfs->z_use_fuids)
1494			zp->z_pflags |= ZFS_ARCHIVE;
1495	}
1496}
1497
1498/*
1499 * Grow the block size for a file.
1500 *
1501 *	IN:	zp	- znode of file to free data in.
1502 *		size	- requested block size
1503 *		tx	- open transaction.
1504 *
1505 * NOTE: this function assumes that the znode is write locked.
1506 */
1507void
1508zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
1509{
1510	int		error;
1511	u_longlong_t	dummy;
1512
1513	if (size <= zp->z_blksz)
1514		return;
1515	/*
1516	 * If the file size is already greater than the current blocksize,
1517	 * we will not grow.  If there is more than one block in a file,
1518	 * the blocksize cannot change.
1519	 */
1520	if (zp->z_blksz && zp->z_size > zp->z_blksz)
1521		return;
1522
1523	error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
1524	    size, 0, tx);
1525
1526	if (error == ENOTSUP)
1527		return;
1528	ASSERT0(error);
1529
1530	/* What blocksize did we actually get? */
1531	dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
1532}
1533
1534/*
1535 * This is a dummy interface used when pvn_vplist_dirty() should *not*
1536 * be calling back into the fs for a putpage().  E.g.: when truncating
1537 * a file, the pages being "thrown away* don't need to be written out.
1538 */
1539/* ARGSUSED */
1540static int
1541zfs_no_putpage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
1542    int flags, cred_t *cr)
1543{
1544	ASSERT(0);
1545	return (0);
1546}
1547
1548/*
1549 * Increase the file length
1550 *
1551 *	IN:	zp	- znode of file to free data in.
1552 *		end	- new end-of-file
1553 *
1554 *	RETURN:	0 on success, error code on failure
1555 */
1556static int
1557zfs_extend(znode_t *zp, uint64_t end)
1558{
1559	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1560	dmu_tx_t *tx;
1561	locked_range_t *lr;
1562	uint64_t newblksz;
1563	int error;
1564
1565	/*
1566	 * We will change zp_size, lock the whole file.
1567	 */
1568	lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
1569
1570	/*
1571	 * Nothing to do if file already at desired length.
1572	 */
1573	if (end <= zp->z_size) {
1574		rangelock_exit(lr);
1575		return (0);
1576	}
1577	tx = dmu_tx_create(zfsvfs->z_os);
1578	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1579	zfs_sa_upgrade_txholds(tx, zp);
1580	if (end > zp->z_blksz &&
1581	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
1582		/*
1583		 * We are growing the file past the current block size.
1584		 */
1585		if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
1586			/*
1587			 * File's blocksize is already larger than the
1588			 * "recordsize" property.  Only let it grow to
1589			 * the next power of 2.
1590			 */
1591			ASSERT(!ISP2(zp->z_blksz));
1592			newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
1593		} else {
1594			newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
1595		}
1596		dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
1597	} else {
1598		newblksz = 0;
1599	}
1600
1601	error = dmu_tx_assign(tx, TXG_WAIT);
1602	if (error) {
1603		dmu_tx_abort(tx);
1604		rangelock_exit(lr);
1605		return (error);
1606	}
1607
1608	if (newblksz)
1609		zfs_grow_blocksize(zp, newblksz, tx);
1610
1611	zp->z_size = end;
1612
1613	VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs),
1614	    &zp->z_size, sizeof (zp->z_size), tx));
1615
1616	rangelock_exit(lr);
1617
1618	dmu_tx_commit(tx);
1619
1620	return (0);
1621}
1622
1623/*
1624 * Free space in a file.
1625 *
1626 *	IN:	zp	- znode of file to free data in.
1627 *		off	- start of section to free.
1628 *		len	- length of section to free.
1629 *
1630 *	RETURN:	0 on success, error code on failure
1631 */
1632static int
1633zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
1634{
1635	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1636	locked_range_t *lr;
1637	int error;
1638
1639	/*
1640	 * Lock the range being freed.
1641	 */
1642	lr = rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);
1643
1644	/*
1645	 * Nothing to do if file already at desired length.
1646	 */
1647	if (off >= zp->z_size) {
1648		rangelock_exit(lr);
1649		return (0);
1650	}
1651
1652	if (off + len > zp->z_size)
1653		len = zp->z_size - off;
1654
1655	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
1656
1657	rangelock_exit(lr);
1658
1659	return (error);
1660}
1661
1662/*
1663 * Truncate a file
1664 *
1665 *	IN:	zp	- znode of file to free data in.
1666 *		end	- new end-of-file.
1667 *
1668 *	RETURN:	0 on success, error code on failure
1669 */
1670static int
1671zfs_trunc(znode_t *zp, uint64_t end)
1672{
1673	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1674	vnode_t *vp = ZTOV(zp);
1675	dmu_tx_t *tx;
1676	locked_range_t *lr;
1677	int error;
1678	sa_bulk_attr_t bulk[2];
1679	int count = 0;
1680
1681	/*
1682	 * We will change zp_size, lock the whole file.
1683	 */
1684	lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
1685
1686	/*
1687	 * Nothing to do if file already at desired length.
1688	 */
1689	if (end >= zp->z_size) {
1690		rangelock_exit(lr);
1691		return (0);
1692	}
1693
1694	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
1695	    DMU_OBJECT_END);
1696	if (error) {
1697		rangelock_exit(lr);
1698		return (error);
1699	}
1700	tx = dmu_tx_create(zfsvfs->z_os);
1701	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1702	zfs_sa_upgrade_txholds(tx, zp);
1703	dmu_tx_mark_netfree(tx);
1704	error = dmu_tx_assign(tx, TXG_WAIT);
1705	if (error) {
1706		dmu_tx_abort(tx);
1707		rangelock_exit(lr);
1708		return (error);
1709	}
1710
1711	zp->z_size = end;
1712	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
1713	    NULL, &zp->z_size, sizeof (zp->z_size));
1714
1715	if (end == 0) {
1716		zp->z_pflags &= ~ZFS_SPARSE;
1717		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
1718		    NULL, &zp->z_pflags, 8);
1719	}
1720	VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
1721
1722	dmu_tx_commit(tx);
1723
1724	/*
1725	 * Clear any mapped pages in the truncated region.  This has to
1726	 * happen outside of the transaction to avoid the possibility of
1727	 * a deadlock with someone trying to push a page that we are
1728	 * about to invalidate.
1729	 */
1730	if (vn_has_cached_data(vp)) {
1731		page_t *pp;
1732		uint64_t start = end & PAGEMASK;
1733		int poff = end & PAGEOFFSET;
1734
1735		if (poff != 0 && (pp = page_lookup(vp, start, SE_SHARED))) {
1736			/*
1737			 * We need to zero a partial page.
1738			 */
1739			pagezero(pp, poff, PAGESIZE - poff);
1740			start += PAGESIZE;
1741			page_unlock(pp);
1742		}
1743		error = pvn_vplist_dirty(vp, start, zfs_no_putpage,
1744		    B_INVAL | B_TRUNC, NULL);
1745		ASSERT(error == 0);
1746	}
1747
1748	rangelock_exit(lr);
1749
1750	return (0);
1751}
1752
1753/*
1754 * Free space in a file
1755 *
1756 *	IN:	zp	- znode of file to free data in.
1757 *		off	- start of range
1758 *		len	- end of range (0 => EOF)
1759 *		flag	- current file open mode flags.
1760 *		log	- TRUE if this action should be logged
1761 *
1762 *	RETURN:	0 on success, error code on failure
1763 */
1764int
1765zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
1766{
1767	vnode_t *vp = ZTOV(zp);
1768	dmu_tx_t *tx;
1769	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1770	zilog_t *zilog = zfsvfs->z_log;
1771	uint64_t mode;
1772	uint64_t mtime[2], ctime[2];
1773	sa_bulk_attr_t bulk[3];
1774	int count = 0;
1775	int error;
1776
1777	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
1778	    sizeof (mode))) != 0)
1779		return (error);
1780
1781	if (off > zp->z_size) {
1782		error =  zfs_extend(zp, off+len);
1783		if (error == 0 && log)
1784			goto log;
1785		else
1786			return (error);
1787	}
1788
1789	/*
1790	 * Check for any locks in the region to be freed.
1791	 */
1792
1793	if (MANDLOCK(vp, (mode_t)mode)) {
1794		uint64_t length = (len ? len : zp->z_size - off);
1795		if (error = chklock(vp, FWRITE, off, length, flag, NULL))
1796			return (error);
1797	}
1798
1799	if (len == 0) {
1800		error = zfs_trunc(zp, off);
1801	} else {
1802		if ((error = zfs_free_range(zp, off, len)) == 0 &&
1803		    off + len > zp->z_size)
1804			error = zfs_extend(zp, off+len);
1805	}
1806	if (error || !log)
1807		return (error);
1808log:
1809	tx = dmu_tx_create(zfsvfs->z_os);
1810	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1811	zfs_sa_upgrade_txholds(tx, zp);
1812	error = dmu_tx_assign(tx, TXG_WAIT);
1813	if (error) {
1814		dmu_tx_abort(tx);
1815		return (error);
1816	}
1817
1818	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
1819	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
1820	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
1821	    NULL, &zp->z_pflags, 8);
1822	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
1823	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1824	ASSERT(error == 0);
1825
1826	zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
1827
1828	dmu_tx_commit(tx);
1829	return (0);
1830}
1831
/*
 * Populate the objset 'os' with the initial ZPL on-disk structures, all
 * within transaction 'tx':
 *
 *	- the master node, holding the ZPL version and every property in
 *	  'zplprops' other than the version itself;
 *	- the SA attribute registration object (ZPL_VERSION_SA and later);
 *	- the unlinked set (delete queue);
 *	- the root directory, owned by the uid/gid of 'cr';
 *	- the shares directory.
 *
 * A scratch zfsvfs_t and root znode are built just so that zfs_mknode()
 * can be used to create the root directory; both are released before
 * returning.  Failures are only checked with ASSERT/VERIFY.
 */
void
zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
{
	uint64_t	moid, obj, sa_obj, version;
	uint64_t	sense = ZFS_CASE_SENSITIVE;
	uint64_t	norm = 0;
	nvpair_t	*elem;
	int		error;
	int		i;
	znode_t		*rootzp = NULL;
	zfsvfs_t	*zfsvfs;
	vnode_t		*vp;
	vattr_t		vattr;
	znode_t		*zp;
	zfs_acl_ids_t	acl_ids;

	/*
	 * First attempt to create master node.
	 */
	/*
	 * In an empty objset, there are no blocks to read and thus
	 * there can be no i/o errors (which we assert below).
	 */
	moid = MASTER_NODE_OBJ;
	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	/*
	 * Set starting attributes.
	 */
	version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
	elem = NULL;
	while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
		/* For the moment we expect all zpl props to be uint64_ts */
		uint64_t val;
		char *name;

		ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
		VERIFY(nvpair_value_uint64(elem, &val) == 0);
		name = nvpair_name(elem);
		/*
		 * A requested version can only lower the version derived
		 * from the pool; it is written out after the loop.  Every
		 * other property is stored in the master node as-is.
		 */
		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
			if (val < version)
				version = val;
		} else {
			error = zap_update(os, moid, name, 8, 1, &val, tx);
		}
		ASSERT(error == 0);
		/* Remember the settings needed to set up z_norm below. */
		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
			norm = val;
		else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
			sense = val;
	}
	ASSERT(version != 0);
	error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);

	/*
	 * Create zap object used for SA attribute registration
	 */

	if (version >= ZPL_VERSION_SA) {
		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
		    DMU_OT_NONE, 0, tx);
		error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
		ASSERT(error == 0);
	} else {
		sa_obj = 0;
	}
	/*
	 * Create a delete queue.
	 */
	obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);

	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
	ASSERT(error == 0);

	/*
	 * Create root znode.  Create minimal znode/vnode/zfsvfs
	 * to allow zfs_mknode to work.
	 */
	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
	vattr.va_type = VDIR;
	vattr.va_mode = S_IFDIR|0755;
	vattr.va_uid = crgetuid(cr);
	vattr.va_gid = crgetgid(cr);

	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
	rootzp->z_moved = 0;
	rootzp->z_unlinked = 0;
	rootzp->z_atime_dirty = 0;
	rootzp->z_is_sa = USE_SA(version, os);
	rootzp->z_pflags = 0;

	vp = ZTOV(rootzp);
	vn_reinit(vp);
	vp->v_type = VDIR;

	/* Scratch zfsvfs: zeroed, with only the fields set below filled in. */
	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
	zfsvfs->z_os = os;
	zfsvfs->z_parent = zfsvfs;
	zfsvfs->z_version = version;
	zfsvfs->z_use_fuids = USE_FUIDS(version, os);
	zfsvfs->z_use_sa = USE_SA(version, os);
	zfsvfs->z_norm = norm;

	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
	    &zfsvfs->z_attr_table);

	ASSERT(error == 0);

	/*
	 * Fold case on file systems that are always or sometimes case
	 * insensitive.
	 */
	if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;

	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
	    offsetof(znode_t, z_link_node));

	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);

	/*
	 * With IS_ROOT_NODE, zfs_mknode() is handed rootzp as the "parent"
	 * and hands the same znode back through &zp.
	 */
	rootzp->z_zfsvfs = zfsvfs;
	VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
	    cr, NULL, &acl_ids));
	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
	ASSERT3P(zp, ==, rootzp);
	ASSERT(!vn_in_dnlc(ZTOV(rootzp))); /* not valid to move */
	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
	ASSERT(error == 0);
	zfs_acl_ids_free(&acl_ids);
	POINTER_INVALIDATE(&rootzp->z_zfsvfs);

	/* The root znode was only scaffolding; dismantle it by hand. */
	ZTOV(rootzp)->v_count = 0;
	sa_handle_destroy(rootzp->z_sa_hdl);
	kmem_cache_free(znode_cache, rootzp);

	/*
	 * Create shares directory
	 */

	error = zfs_create_share_dir(zfsvfs, tx);

	ASSERT(error == 0);

	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
	kmem_free(zfsvfs, sizeof (zfsvfs_t));
}
1984
1985#endif /* _KERNEL */
1986
1987static int
1988zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
1989{
1990	uint64_t sa_obj = 0;
1991	int error;
1992
1993	error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
1994	if (error != 0 && error != ENOENT)
1995		return (error);
1996
1997	error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
1998	return (error);
1999}
2000
2001static int
2002zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
2003    dmu_buf_t **db, void *tag)
2004{
2005	dmu_object_info_t doi;
2006	int error;
2007
2008	if ((error = sa_buf_hold(osp, obj, tag, db)) != 0)
2009		return (error);
2010
2011	dmu_object_info_from_db(*db, &doi);
2012	if ((doi.doi_bonus_type != DMU_OT_SA &&
2013	    doi.doi_bonus_type != DMU_OT_ZNODE) ||
2014	    doi.doi_bonus_type == DMU_OT_ZNODE &&
2015	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
2016		sa_buf_rele(*db, tag);
2017		return (SET_ERROR(ENOTSUP));
2018	}
2019
2020	error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
2021	if (error != 0) {
2022		sa_buf_rele(*db, tag);
2023		return (error);
2024	}
2025
2026	return (0);
2027}
2028
/*
 * Destroy the SA handle 'hdl' and release the hold (taken with 'tag') on
 * its bonus buffer 'db'.
 */
void
zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag)
{
	sa_handle_destroy(hdl);
	sa_buf_rele(db, tag);
}
2035
2036/*
2037 * Given an object number, return its parent object number and whether
2038 * or not the object is an extended attribute directory.
2039 */
2040static int
2041zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table,
2042    uint64_t *pobjp, int *is_xattrdir)
2043{
2044	uint64_t parent;
2045	uint64_t pflags;
2046	uint64_t mode;
2047	uint64_t parent_mode;
2048	sa_bulk_attr_t bulk[3];
2049	sa_handle_t *sa_hdl;
2050	dmu_buf_t *sa_db;
2051	int count = 0;
2052	int error;
2053
2054	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
2055	    &parent, sizeof (parent));
2056	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
2057	    &pflags, sizeof (pflags));
2058	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
2059	    &mode, sizeof (mode));
2060
2061	if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
2062		return (error);
2063
2064	/*
2065	 * When a link is removed its parent pointer is not changed and will
2066	 * be invalid.  There are two cases where a link is removed but the
2067	 * file stays around, when it goes to the delete queue and when there
2068	 * are additional links.
2069	 */
2070	error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG);
2071	if (error != 0)
2072		return (error);
2073
2074	error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode));
2075	zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
2076	if (error != 0)
2077		return (error);
2078
2079	*is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);
2080
2081	/*
2082	 * Extended attributes can be applied to files, directories, etc.
2083	 * Otherwise the parent must be a directory.
2084	 */
2085	if (!*is_xattrdir && !S_ISDIR(parent_mode))
2086		return (SET_ERROR(EINVAL));
2087
2088	*pobjp = parent;
2089
2090	return (0);
2091}
2092
2093/*
2094 * Given an object number, return some zpl level statistics
2095 */
2096static int
2097zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
2098    zfs_stat_t *sb)
2099{
2100	sa_bulk_attr_t bulk[4];
2101	int count = 0;
2102
2103	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
2104	    &sb->zs_mode, sizeof (sb->zs_mode));
2105	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
2106	    &sb->zs_gen, sizeof (sb->zs_gen));
2107	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
2108	    &sb->zs_links, sizeof (sb->zs_links));
2109	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
2110	    &sb->zs_ctime, sizeof (sb->zs_ctime));
2111
2112	return (sa_bulk_lookup(hdl, bulk, count));
2113}
2114
2115static int
2116zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
2117    sa_attr_type_t *sa_table, char *buf, int len)
2118{
2119	sa_handle_t *sa_hdl;
2120	sa_handle_t *prevhdl = NULL;
2121	dmu_buf_t *prevdb = NULL;
2122	dmu_buf_t *sa_db = NULL;
2123	char *path = buf + len - 1;
2124	int error;
2125
2126	*path = '\0';
2127	sa_hdl = hdl;
2128
2129	uint64_t deleteq_obj;
2130	VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ,
2131	    ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj));
2132	error = zap_lookup_int(osp, deleteq_obj, obj);
2133	if (error == 0) {
2134		return (ESTALE);
2135	} else if (error != ENOENT) {
2136		return (error);
2137	}
2138	error = 0;
2139
2140	for (;;) {
2141		uint64_t pobj;
2142		char component[MAXNAMELEN + 2];
2143		size_t complen;
2144		int is_xattrdir;
2145
2146		if (prevdb)
2147			zfs_release_sa_handle(prevhdl, prevdb, FTAG);
2148
2149		if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj,
2150		    &is_xattrdir)) != 0)
2151			break;
2152
2153		if (pobj == obj) {
2154			if (path[0] != '/')
2155				*--path = '/';
2156			break;
2157		}
2158
2159		component[0] = '/';
2160		if (is_xattrdir) {
2161			(void) sprintf(component + 1, "<xattrdir>");
2162		} else {
2163			error = zap_value_search(osp, pobj, obj,
2164			    ZFS_DIRENT_OBJ(-1ULL), component + 1);
2165			if (error != 0)
2166				break;
2167		}
2168
2169		complen = strlen(component);
2170		path -= complen;
2171		ASSERT(path >= buf);
2172		bcopy(component, path, complen);
2173		obj = pobj;
2174
2175		if (sa_hdl != hdl) {
2176			prevhdl = sa_hdl;
2177			prevdb = sa_db;
2178		}
2179		error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
2180		if (error != 0) {
2181			sa_hdl = prevhdl;
2182			sa_db = prevdb;
2183			break;
2184		}
2185	}
2186
2187	if (sa_hdl != NULL && sa_hdl != hdl) {
2188		ASSERT(sa_db != NULL);
2189		zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
2190	}
2191
2192	if (error == 0)
2193		(void) memmove(buf, path, buf + len - path);
2194
2195	return (error);
2196}
2197
2198int
2199zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
2200{
2201	sa_attr_type_t *sa_table;
2202	sa_handle_t *hdl;
2203	dmu_buf_t *db;
2204	int error;
2205
2206	error = zfs_sa_setup(osp, &sa_table);
2207	if (error != 0)
2208		return (error);
2209
2210	error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
2211	if (error != 0)
2212		return (error);
2213
2214	error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
2215
2216	zfs_release_sa_handle(hdl, db, FTAG);
2217	return (error);
2218}
2219
2220int
2221zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
2222    char *buf, int len)
2223{
2224	char *path = buf + len - 1;
2225	sa_attr_type_t *sa_table;
2226	sa_handle_t *hdl;
2227	dmu_buf_t *db;
2228	int error;
2229
2230	*path = '\0';
2231
2232	error = zfs_sa_setup(osp, &sa_table);
2233	if (error != 0)
2234		return (error);
2235
2236	error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
2237	if (error != 0)
2238		return (error);
2239
2240	error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
2241	if (error != 0) {
2242		zfs_release_sa_handle(hdl, db, FTAG);
2243		return (error);
2244	}
2245
2246	error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
2247
2248	zfs_release_sa_handle(hdl, db, FTAG);
2249	return (error);
2250}
2251