/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2020 Joyent, Inc.
 * Copyright 2017 Nexenta Systems, Inc.
 */

/* Portions Copyright 2007 Jeremy Teo */
/* Portions Copyright 2010 Robert Milkowski */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/kmem.h>
#include <sys/taskq.h>
#include <sys/uio.h>
#include <sys/vmsystm.h>
#include <sys/atomic.h>
#include <sys/vm.h>
#include <vm/seg_vn.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/kpm.h>
#include <vm/seg_kpm.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/unistd.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/sa.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/sunddi.h>
#include <sys/filio.h>
#include <sys/sid.h>
#include "fs/fs_subr.h"
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
#include <sys/zfs_sa.h>
#include <sys/dnlc.h>
#include <sys/zfs_rlock.h>
#include <sys/extdirent.h>
#include <sys/kidmap.h>
#include <sys/cred.h>
#include <sys/attr.h>
#include <sys/zil.h>
#include <sys/sa_impl.h>
#include <sys/zfs_project.h>

/*
 * Programming rules.
 *
 * Each vnode op performs some logical unit of work.  To do this, the ZPL must
 * properly lock its in-core state, create a DMU transaction, do the work,
 * record this work in the intent log (ZIL), commit the DMU transaction,
 * and wait for the intent log to commit if it is a synchronous operation.
 * Moreover, the vnode ops must work in both normal and log replay context.
 * The ordering of events is important to avoid deadlocks and references
 * to freed memory.  The example below illustrates the following Big Rules:
 *
 *  (1)	A check must be made in each zfs thread for a mounted file system.
 *	This is done while avoiding races, using ZFS_ENTER(zfsvfs).
 *	A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
 *	must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
 *	can return EIO from the calling function.
 *
 *  (2)	VN_RELE() should always be the last thing except for zil_commit()
 *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
 *	First, if it's the last reference, the vnode/znode
 *	can be freed, so the zp may point to freed memory.  Second, the last
 *	reference will call zfs_zinactive(), which may induce a lot of work --
 *	pushing cached pages (which acquires range locks) and syncing out
 *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
 *	which could deadlock the system if you were already holding one.
 *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
 *
 *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
 *	as they can span dmu_tx_assign() calls.
 *
 *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
 *      dmu_tx_assign().  This is critical because we don't want to block
 *      while holding locks.
 *
 *	If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
 *	reduces lock contention and CPU usage when we must wait (note that if
 *	throughput is constrained by the storage, nearly every transaction
 *	must wait).
 *
 *      Note, in particular, that if a lock is sometimes acquired before
 *      the tx assigns, and sometimes after (e.g. z_lock), then failing
 *      to use a non-blocking assign can deadlock the system.  The scenario:
 *
 *	Thread A has grabbed a lock before calling dmu_tx_assign().
 *	Thread B is in an already-assigned tx, and blocks for this lock.
 *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 *	forever, because the previous txg can't quiesce until B's tx commits.
 *
 *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
 *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
 *	calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
 *	to indicate that this operation has already called dmu_tx_wait().
 *	This will ensure that we don't retry forever, waiting a short bit
 *	each time.
 *
 *  (5)	If the operation succeeded, generate the intent log entry for it
 *	before dropping locks.  This ensures that the ordering of events
 *	in the intent log matches the order in which they actually occurred.
 *	During ZIL replay the zfs_log_* functions will update the sequence
 *	number to indicate the zil transaction has replayed.
 *
 *  (6)	At the end of each vnode op, the DMU tx must always commit,
 *	regardless of whether there were any errors.
 *
 *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
 *	to ensure that synchronous semantics are provided when necessary.
 *
 * In general, this is how things should be ordered in each vnode op:
 *
 *	ZFS_ENTER(zfsvfs);		// exit if unmounted
 * top:
 *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
 *	rw_enter(...);			// grab any other locks you need
 *	tx = dmu_tx_create(...);	// get DMU tx
 *	dmu_tx_hold_*();		// hold each object you might modify
 *	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 *	if (error) {
 *		rw_exit(...);		// drop locks
 *		zfs_dirent_unlock(dl);	// unlock directory entry
 *		VN_RELE(...);		// release held vnodes
 *		if (error == ERESTART) {
 *			waited = B_TRUE;
 *			dmu_tx_wait(tx);
 *			dmu_tx_abort(tx);
 *			goto top;
 *		}
 *		dmu_tx_abort(tx);	// abort DMU tx
 *		ZFS_EXIT(zfsvfs);	// finished in zfs
 *		return (error);		// really out of space
 *	}
 *	error = do_real_work();		// do whatever this VOP does
 *	if (error == 0)
 *		zfs_log_*(...);		// on success, make ZIL entry
 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
 *	rw_exit(...);			// drop locks
 *	zfs_dirent_unlock(dl);		// unlock directory entry
 *	VN_RELE(...);			// release held vnodes
 *	zil_commit(zilog, foid);	// synchronous when necessary
 *	ZFS_EXIT(zfsvfs);		// finished in zfs
 *	return (error);			// done, report error
 */

/* ARGSUSED */
static int
zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(*vpp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
	    ((flag & FAPPEND) == 0)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
		if (fs_vscan(*vpp, cr, 0) != 0) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EACCES));
		}
	}

	/* Keep a count of the synchronous opens in the znode */
	if (flag & (FSYNC | FDSYNC))
		atomic_inc_32(&zp->z_sync_cnt);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/* ARGSUSED */
static int
zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	/*
	 * Clean up any locks held by this process on the vp.
	 */
	cleanlocks(vp, ddi_get_pid(), 0);
	cleanshares(vp, ddi_get_pid());

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Decrement the synchronous opens in the znode */
	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
		atomic_dec_32(&zp->z_sync_cnt);

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
		VERIFY(fs_vscan(vp, cr, 1) == 0);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
 */
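/*
 * Illustrative usage sketch: userland normally reaches this search
 * through the _FIO_SEEK_HOLE and _FIO_SEEK_DATA ioctls handled in
 * zfs_ioctl() below.  A minimal example, assuming fd is open on a
 * ZFS regular file:
 *
 *	offset_t off = 0;		// search from start of file
 *	if (ioctl(fd, _FIO_SEEK_HOLE, &off) == 0)
 *		// off now holds the offset where the next hole begins
 */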
static int
zfs_holey(vnode_t *vp, int cmd, offset_t *off)
{
	znode_t	*zp = VTOZ(vp);
	uint64_t noff = (uint64_t)*off; /* new offset */
	uint64_t file_sz;
	int error;
	boolean_t hole;

	file_sz = zp->z_size;
	if (noff >= file_sz) {
		return (SET_ERROR(ENXIO));
	}

	if (cmd == _FIO_SEEK_HOLE)
		hole = B_TRUE;
	else
		hole = B_FALSE;

	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);

	if (error == ESRCH)
		return (SET_ERROR(ENXIO));

	/*
	 * We could find a hole that begins after the logical end-of-file,
	 * because dmu_offset_next() only works on whole blocks.  If the
	 * EOF falls mid-block, then indicate that the "virtual hole"
	 * at the end of the file begins at the logical EOF, rather than
	 * at the end of the last block.
	 */
	if (noff > file_sz) {
		ASSERT(hole);
		noff = file_sz;
	}

	if (noff < *off)
		return (error);
	*off = noff;
	return (error);
}

static int
zfs_ioctl_getxattr(vnode_t *vp, intptr_t data, int flag, cred_t *cr,
    caller_context_t *ct)
{
	zfsxattr_t fsx = { 0 };
	znode_t *zp = VTOZ(vp);

	if (zp->z_pflags & ZFS_PROJINHERIT)
		fsx.fsx_xflags = ZFS_PROJINHERIT_FL;
	if (zp->z_pflags & ZFS_PROJID)
		fsx.fsx_projid = zp->z_projid;
	if (ddi_copyout(&fsx, (void *)data, sizeof (fsx), flag))
		return (SET_ERROR(EFAULT));

	return (0);
}

static int zfs_setattr(vnode_t *, vattr_t *, int, cred_t *,
    caller_context_t *);

static int
zfs_ioctl_setxattr(vnode_t *vp, intptr_t data, int flags, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsxattr_t fsx;
	xvattr_t xva;
	xoptattr_t *xoap;
	int err;

	if (ddi_copyin((void *)data, &fsx, sizeof (fsx), flags))
		return (SET_ERROR(EFAULT));

	if (!zpl_is_valid_projid(fsx.fsx_projid))
		return (SET_ERROR(EINVAL));

	if (fsx.fsx_xflags & ~ZFS_PROJINHERIT_FL)
		return (SET_ERROR(EOPNOTSUPP));

	xva_init(&xva);
	xoap = xva_getxoptattr(&xva);

	XVA_SET_REQ(&xva, XAT_PROJINHERIT);
	if (fsx.fsx_xflags & ZFS_PROJINHERIT_FL)
		xoap->xoa_projinherit = B_TRUE;

	XVA_SET_REQ(&xva, XAT_PROJID);
	xoap->xoa_projid = fsx.fsx_projid;

	return (zfs_setattr(vp, (vattr_t *)&xva, flags, cr, ct));
}

/* ARGSUSED */
static int
zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
    int *rvalp, caller_context_t *ct)
{
	offset_t off;
	offset_t ndata;
	dmu_object_info_t doi;
	int error;
	zfsvfs_t *zfsvfs;
	znode_t *zp;

	switch (com) {
	case _FIOFFS:
	{
		return (zfs_sync(vp->v_vfsp, 0, cred));

		/*
		 * The following two ioctls are used by bfu.  Faking them
		 * out is necessary to avoid bfu errors.
		 */
	}
	case _FIOGDIO:
	case _FIOSDIO:
	{
		return (0);
	}

	case _FIODIRECTIO:
	{
		/*
		 * ZFS inherently provides the basic semantics for directio.
		 * This is the summary from the ZFS on Linux support for
		 * O_DIRECT, which is the common form of directio, and required
		 * no changes to ZFS.
		 *
		 * 1. Minimize cache effects of the I/O.
		 *
		 *    By design the ARC is already scan-resistant, which helps
		 *    mitigate the need for special O_DIRECT handling.
		 *
		 * 2. O_DIRECT _MAY_ impose restrictions on IO alignment and
		 *    length.
		 *
		 *    No additional alignment or length restrictions are
		 *    imposed by ZFS.
		 *
		 * 3. O_DIRECT _MAY_ perform unbuffered IO operations directly
		 *    between user memory and block device.
		 *
		 *    No unbuffered IO operations are currently supported. In
		 *    order to support features such as compression, encryption,
		 *    and checksumming a copy must be made to transform the
		 *    data.
		 *
		 * 4. O_DIRECT _MAY_ imply O_DSYNC (XFS).
		 *
		 *    O_DIRECT does not imply O_DSYNC for ZFS.
		 *
		 * 5. O_DIRECT _MAY_ disable file locking that serializes IO
		 *    operations.
		 *
		 *    All I/O in ZFS is locked for correctness and this locking
		 *    is not disabled by O_DIRECT.
		 */
		return (0);
	}

	case _FIO_SEEK_DATA:
	case _FIO_SEEK_HOLE:
	{
		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
			return (SET_ERROR(EFAULT));

		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/* offset parameter is in/out */
		error = zfs_holey(vp, com, &off);
		ZFS_EXIT(zfsvfs);
		if (error)
			return (error);
		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
		return (0);
	}
	case _FIO_COUNT_FILLED:
	{
		/*
		 * _FIO_COUNT_FILLED adds a new ioctl command which
		 * exposes the number of filled blocks in a
		 * ZFS object.
		 */
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/*
		 * Wait for all dirty blocks for this object
		 * to get synced out to disk, and the DMU info
		 * updated.
		 */
		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Retrieve fill count from DMU object.
		 */
		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		ndata = doi.doi_fill_count;

		ZFS_EXIT(zfsvfs);
		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
			return (SET_ERROR(EFAULT));
		return (0);
	}
	case ZFS_IOC_FSGETXATTR:
		return (zfs_ioctl_getxattr(vp, data, flag, cred, ct));
	case ZFS_IOC_FSSETXATTR:
		return (zfs_ioctl_setxattr(vp, data, flag, cred, ct));
	}
	return (SET_ERROR(ENOTTY));
}

/*
 * Utility functions to map and unmap a single physical page.  These
 * are used to manage the mappable copies of ZFS file data, and therefore
 * do not update ref/mod bits.
 */
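/*
 * A typical calling pattern brackets the data copy with a map/unmap
 * pair, as update_pages() and mappedread() below do:
 *
 *	caddr_t va = zfs_map_page(pp, S_WRITE);
 *	... copy data to or from va ...
 *	zfs_unmap_page(pp, va);
 */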
caddr_t
zfs_map_page(page_t *pp, enum seg_rw rw)
{
	if (kpm_enable)
		return (hat_kpm_mapin(pp, 0));
	ASSERT(rw == S_READ || rw == S_WRITE);
	return (ppmapin(pp, PROT_READ | ((rw == S_WRITE) ? PROT_WRITE : 0),
	    (caddr_t)-1));
}

void
zfs_unmap_page(page_t *pp, caddr_t addr)
{
	if (kpm_enable) {
		hat_kpm_mapout(pp, 0, addr);
	} else {
		ppmapout(addr);
	}
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Write:	If we find a memory mapped page, we write to *both*
 *		the page and the dmu buffer.
 */
static void
update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid)
{
	int64_t	off;

	off = start & PAGEOFFSET;
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		page_t *pp;
		uint64_t nbytes = MIN(PAGESIZE - off, len);

		if (pp = page_lookup(vp, start, SE_SHARED)) {
			caddr_t va;

			va = zfs_map_page(pp, S_WRITE);
			(void) dmu_read(os, oid, start+off, nbytes, va+off,
			    DMU_READ_PREFETCH);
			zfs_unmap_page(pp, va);
			page_unlock(pp);
		}
		len -= nbytes;
		off = 0;
	}
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Read:	We "read" preferentially from memory mapped pages,
 *		otherwise we fall back to the dmu buffer.
 *
 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 *	 the file is memory mapped.
 */
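/*
 * For example, with 4K pages a 6K read starting at file offset 1K is
 * satisfied in two uiomoves: 3K to the end of the first page, then 3K
 * from the start of the next.  Each chunk is copied from the page
 * cache if that page is resident, otherwise from the dmu buffer.
 */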
static int
mappedread(vnode_t *vp, int nbytes, uio_t *uio)
{
	znode_t *zp = VTOZ(vp);
	int64_t	start, off;
	int len = nbytes;
	int error = 0;

	start = uio->uio_loffset;
	off = start & PAGEOFFSET;
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		page_t *pp;
		uint64_t bytes = MIN(PAGESIZE - off, len);

		if (pp = page_lookup(vp, start, SE_SHARED)) {
			caddr_t va;

			va = zfs_map_page(pp, S_READ);
			error = uiomove(va + off, bytes, UIO_READ, uio);
			zfs_unmap_page(pp, va);
			page_unlock(pp);
		} else {
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, bytes);
		}
		len -= bytes;
		off = 0;
		if (error)
			break;
	}
	return (error);
}

offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
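/*
 * Like other module globals, this should be tunable at boot from
 * /etc/system; an illustrative (not prescriptive) setting:
 *
 *	set zfs:zfs_read_chunk_size = 0x200000
 */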

/*
 * Read bytes from specified file into supplied buffer.
 *
 *	IN:	vp	- vnode of file to be read from.
 *		uio	- structure supplying read location, range info,
 *			  and return buffer.
 *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *
 *	OUT:	uio	- updated offset and range, buffer filled.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Side Effects:
 *	vp - atime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	ssize_t		n, nbytes;
	int		error = 0;
	boolean_t	frsync = B_FALSE;
	xuio_t		*xuio = NULL;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EACCES));
	}

	/*
	 * Validate file offset
	 */
	if (uio->uio_loffset < (offset_t)0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Fasttrack empty reads
	 */
	if (uio->uio_resid == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE(zp->z_mode)) {
		if (error = chklock(vp, FREAD,
		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

#ifdef FRSYNC
	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 * Only do this for non-snapshots.
	 *
	 * Some platforms do not support FRSYNC and instead map it
	 * to FSYNC, which results in unnecessary calls to zil_commit. We
	 * only honor FRSYNC requests on platforms which support it.
	 */
	frsync = !!(ioflag & FRSYNC);
#endif

	if (zfsvfs->z_log &&
	    (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
		zil_commit(zfsvfs->z_log, zp->z_id);

	/*
	 * Lock the range against changes.
	 */
	locked_range_t *lr = rangelock_enter(&zp->z_rangelock,
	    uio->uio_loffset, uio->uio_resid, RL_READER);

	/*
	 * If we are reading past end-of-file we can skip
	 * to the end; but we might still need to set atime.
	 */
	if (uio->uio_loffset >= zp->z_size) {
		error = 0;
		goto out;
	}

	ASSERT(uio->uio_loffset < zp->z_size);
	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);

	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
		int nblk;
		int blksz = zp->z_blksz;
		uint64_t offset = uio->uio_loffset;

		xuio = (xuio_t *)uio;
		if ((ISP2(blksz))) {
			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
			    blksz)) / blksz;
		} else {
			ASSERT(offset + n <= blksz);
			nblk = 1;
		}
		(void) dmu_xuio_init(xuio, nblk);

		if (vn_has_cached_data(vp)) {
			/*
			 * For simplicity, we always allocate a full buffer
			 * even if we only expect to read a portion of a block.
			 */
			while (--nblk >= 0) {
				(void) dmu_xuio_add(xuio,
				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
				    blksz), 0, blksz);
			}
		}
	}

	while (n > 0) {
		nbytes = MIN(n, zfs_read_chunk_size -
		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));

		if (vn_has_cached_data(vp)) {
			error = mappedread(vp, nbytes, uio);
		} else {
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes);
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}

		n -= nbytes;
	}
out:
	rangelock_exit(lr);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Write the bytes to a file.
 *
 *	IN:	vp	- vnode of file to be written to.
 *		uio	- structure supplying write location, range info,
 *			  and data buffer.
 *		ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
 *			  set if in append mode.
 *		cr	- credentials of caller.
 *		ct	- caller context (NFS/CIFS fem monitor only)
 *
 *	OUT:	uio	- updated offset and range.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - ctime|mtime updated if byte count > 0
 */

/* ARGSUSED */
static int
zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	rlim64_t	limit = uio->uio_llimit;
	ssize_t		start_resid = uio->uio_resid;
	ssize_t		tx_bytes;
	uint64_t	end_size;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	zilog_t		*zilog;
	offset_t	woff;
	ssize_t		n, nbytes;
	int		max_blksz = zfsvfs->z_max_blksz;
	int		error = 0;
	int		prev_error;
	arc_buf_t	*abuf;
	iovec_t		*aiov = NULL;
	xuio_t		*xuio = NULL;
	int		i_iov = 0;
	int		iovcnt = uio->uio_iovcnt;
	iovec_t		*iovp = uio->uio_iov;
	int		write_eof;
	int		count = 0;
	sa_bulk_attr_t	bulk[4];
	uint64_t	mtime[2], ctime[2];

	/*
	 * Fasttrack empty write
	 */
	n = start_resid;
	if (n == 0)
		return (0);

	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
		limit = MAXOFFSET_T;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);

	/*
	 * If vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. for snapshots), our
	 * callers might not be able to detect properly that we are read-only,
	 * so check it explicitly here.
	 */
	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EROFS));
	}

	/*
	 * If the file is immutable, or append-only and we are not appending,
	 * return EPERM.  Intentionally allow ZFS_READONLY through here.
	 * See zfs_zaccess_common().
	 */
	if ((zp->z_pflags & ZFS_IMMUTABLE) ||
	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
	    (uio->uio_loffset < zp->z_size))) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	zilog = zfsvfs->z_log;

	/*
	 * Validate file offset
	 */
	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
	if (woff < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Check for mandatory locks before calling rangelock_enter()
	 * in order to prevent a deadlock with locks set via fcntl().
	 */
	if (MANDMODE((mode_t)zp->z_mode) &&
	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Pre-fault the pages to ensure slow (e.g. NFS) pages
	 * don't hold up the txg.
	 * Skip this if the uio contains a loaned arc_buf.
	 */
	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
		xuio = (xuio_t *)uio;
	else
		uio_prefaultpages(MIN(n, max_blksz), uio);

	/*
	 * If in append mode, set the io offset pointer to eof.
	 */
	locked_range_t *lr;
	if (ioflag & FAPPEND) {
		/*
		 * Obtain an appending range lock to guarantee file append
		 * semantics.  We reset the write offset once we have the lock.
		 */
		lr = rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
		woff = lr->lr_offset;
		if (lr->lr_length == UINT64_MAX) {
			/*
			 * We overlocked the file because this write will cause
			 * the file block size to increase.
			 * Note that zp_size cannot change with this lock held.
			 */
			woff = zp->z_size;
		}
		uio->uio_loffset = woff;
	} else {
		/*
		 * Note that if the file block size will change as a result of
		 * this write, then this range lock will lock the entire file
		 * so that we can re-write the block safely.
		 */
		lr = rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
	}

	if (woff >= limit) {
		rangelock_exit(lr);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EFBIG));
	}

	if ((woff + n) > limit || woff > (limit - n))
		n = limit - woff;

	/* Will this write extend the file length? */
	write_eof = (woff + n > zp->z_size);

	end_size = MAX(zp->z_size, woff + n);

	/*
	 * Write the file in reasonable size chunks.  Each chunk is written
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (n > 0) {
		woff = uio->uio_loffset;

		if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
		    zp->z_uid) ||
		    zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
		    zp->z_gid) ||
		    (zp->z_projid != ZFS_DEFAULT_PROJID &&
		    zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
		    zp->z_projid))) {
			error = SET_ERROR(EDQUOT);
			break;
		}

		arc_buf_t *abuf = NULL;
		if (xuio) {
			ASSERT(i_iov < iovcnt);
			aiov = &iovp[i_iov];
			abuf = dmu_xuio_arcbuf(xuio, i_iov);
			dmu_xuio_clear(xuio, i_iov);
			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
			    iovec_t *, aiov, arc_buf_t *, abuf);
			ASSERT((aiov->iov_base == abuf->b_data) ||
			    ((char *)aiov->iov_base - (char *)abuf->b_data +
			    aiov->iov_len == arc_buf_size(abuf)));
			i_iov++;
		} else if (n >= max_blksz && woff >= zp->z_size &&
		    P2PHASE(woff, max_blksz) == 0 &&
		    zp->z_blksz == max_blksz) {
			/*
			 * This write covers a full block.  "Borrow" a buffer
			 * from the dmu so that we can fill it before we enter
			 * a transaction.  This avoids the possibility of
			 * holding up the transaction if the data copy hangs
			 * up on a pagefault (e.g., from an NFS server mapping).
			 */
			size_t cbytes;

			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
			    max_blksz);
			ASSERT(abuf != NULL);
			ASSERT(arc_buf_size(abuf) == max_blksz);
			if (error = uiocopy(abuf->b_data, max_blksz,
			    UIO_WRITE, uio, &cbytes)) {
				dmu_return_arcbuf(abuf);
				break;
			}
			ASSERT(cbytes == max_blksz);
		}

		/*
		 * Start a transaction.
		 */
		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			break;
		}

		/*
		 * If rangelock_enter() over-locked we grow the blocksize
		 * and then reduce the lock range.  This will only happen
		 * on the first iteration since rangelock_reduce() will
		 * shrink down lr_length to the appropriate size.
		 */
		if (lr->lr_length == UINT64_MAX) {
			uint64_t new_blksz;

			if (zp->z_blksz > max_blksz) {
				/*
				 * File's blocksize is already larger than the
				 * "recordsize" property.  Only let it grow to
				 * the next power of 2.
				 */
				ASSERT(!ISP2(zp->z_blksz));
				new_blksz = MIN(end_size,
				    1 << highbit64(zp->z_blksz));
			} else {
				new_blksz = MIN(end_size, max_blksz);
			}
			zfs_grow_blocksize(zp, new_blksz, tx);
			rangelock_reduce(lr, woff, n);
		}

		/*
		 * XXX - should we really limit each write to z_max_blksz?
		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
		 */
		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));

		if (abuf == NULL) {
			tx_bytes = uio->uio_resid;
			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes, tx);
			tx_bytes -= uio->uio_resid;
		} else {
			tx_bytes = nbytes;
			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
			/*
			 * If this is not a full block write, but we are
			 * extending the file past EOF and this data starts
			 * block-aligned, use assign_arcbuf().  Otherwise,
			 * write via dmu_write().
			 */
			if (tx_bytes < max_blksz && (!write_eof ||
			    aiov->iov_base != abuf->b_data)) {
				ASSERT(xuio);
				dmu_write(zfsvfs->z_os, zp->z_id, woff,
				    aiov->iov_len, aiov->iov_base, tx);
				dmu_return_arcbuf(abuf);
				xuio_stat_wbuf_copied();
			} else {
				ASSERT(xuio || tx_bytes == max_blksz);
				dmu_assign_arcbuf_by_dbuf(
				    sa_get_db(zp->z_sa_hdl), woff, abuf, tx);
			}
			ASSERT(tx_bytes <= uio->uio_resid);
			uioskip(uio, tx_bytes);
		}
		if (tx_bytes && vn_has_cached_data(vp)) {
			update_pages(vp, woff,
			    tx_bytes, zfsvfs->z_os, zp->z_id);
		}

		/*
		 * If we made no progress, we're done.  If we made even
		 * partial progress, update the znode and ZIL accordingly.
		 */
		if (tx_bytes == 0) {
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
			    (void *)&zp->z_size, sizeof (uint64_t), tx);
			dmu_tx_commit(tx);
			ASSERT(error != 0);
			break;
		}

		/*
		 * Clear Set-UID/Set-GID bits on successful write if not
		 * privileged and at least one of the execute bits is set.
		 *
		 * It would be nice to do this after all writes have
		 * been done, but that would still expose the ISUID/ISGID
		 * to another app after the partial write is committed.
		 *
		 * Note: we don't call zfs_fuid_map_id() here because
		 * user 0 is not an ephemeral uid.
		 */
		mutex_enter(&zp->z_acl_lock);
		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
		    (S_IXUSR >> 6))) != 0 &&
		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
		    secpolicy_vnode_setid_retain(cr,
		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
			uint64_t newmode;
			zp->z_mode &= ~(S_ISUID | S_ISGID);
			newmode = zp->z_mode;
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
			    (void *)&newmode, sizeof (uint64_t), tx);
		}
		mutex_exit(&zp->z_acl_lock);

		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
		    B_TRUE);

		/*
		 * Update the file size (zp_size) if it has changed;
		 * account for possible concurrent updates.
		 */
		while ((end_size = zp->z_size) < uio->uio_loffset) {
			(void) atomic_cas_64(&zp->z_size, end_size,
			    uio->uio_loffset);
		}
		/*
		 * If we are replaying and eof is non-zero then force
		 * the file size to the specified eof. Note, there's no
		 * concurrency during replay.
		 */
		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
			zp->z_size = zfsvfs->z_replay_eof;

		/*
		 * Keep track of a possible pre-existing error from a partial
		 * write via dmu_write_uio_dbuf above.
		 */
		prev_error = error;
		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);

		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
		dmu_tx_commit(tx);

		if (prev_error != 0 || error != 0)
			break;
		ASSERT(tx_bytes == nbytes);
		n -= nbytes;

		if (!xuio && n > 0)
			uio_prefaultpages(MIN(n, max_blksz), uio);
	}

	rangelock_exit(lr);

	/*
	 * If we're in replay mode, or we made no progress, return error.
	 * Otherwise, it's at least a partial write, so it's successful.
	 */
	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (ioflag & (FSYNC | FDSYNC) ||
	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, zp->z_id);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/* ARGSUSED */
void
zfs_get_done(zgd_t *zgd, int error)
{
	znode_t *zp = zgd->zgd_private;
	objset_t *os = zp->z_zfsvfs->z_os;

	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	rangelock_exit(zgd->zgd_lr);

	/*
	 * Release the vnode asynchronously as we currently have the
	 * txg stopped from syncing.
	 */
	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));

	kmem_free(zgd, sizeof (zgd_t));
}

#ifdef DEBUG
static int zil_fault_io = 0;
#endif

/*
 * Get data to generate a TX_WRITE intent log record.
 */
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t object = lr->lr_foid;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error = 0;

	ASSERT3P(lwb, !=, NULL);
	ASSERT3P(zio, !=, NULL);
	ASSERT3U(size, !=, 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, object, &zp) != 0)
		return (SET_ERROR(ENOENT));
	if (zp->z_unlinked) {
		/*
		 * Release the vnode asynchronously as we currently have the
		 * txg stopped from syncing.
		 */
		VN_RELE_ASYNC(ZTOV(zp),
		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
		return (SET_ERROR(ENOENT));
	}

	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_lwb = lwb;
	zgd->zgd_private = zp;

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
		    offset, size, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (offset >= zp->z_size) {
			error = SET_ERROR(ENOENT);
		} else {
			error = dmu_read(os, object, offset, size, buf,
			    DMU_READ_NO_PREFETCH);
		}
		ASSERT(error == 0 || error == ENOENT);
	} else { /* indirect write */
		/*
		 * Have to lock the whole block to ensure when it's
		 * written out and its checksum is being calculated
		 * that no one can change the data. We need to re-check
		 * blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			uint64_t blkoff;
			size = zp->z_blksz;
			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
			offset -= blkoff;
			zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
			    offset, size, RL_READER);
			if (zp->z_blksz == size)
				break;
			offset += blkoff;
			rangelock_exit(zgd->zgd_lr);
		}
		/* test for truncation needs to be done while range locked */
		if (lr->lr_offset >= zp->z_size)
			error = SET_ERROR(ENOENT);
#ifdef DEBUG
		if (zil_fault_io) {
			error = SET_ERROR(EIO);
			zil_fault_io = 0;
		}
#endif
		if (error == 0)
			error = dmu_buf_hold(os, object, offset, zgd, &db,
			    DMU_READ_NO_PREFETCH);

		if (error == 0) {
			blkptr_t *bp = &lr->lr_blkptr;

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zfs_get_done, zgd);
			ASSERT(error || lr->lr_length <= size);

			/*
			 * On success, we need to wait for the write I/O
			 * initiated by dmu_sync() to complete before we can
			 * release this dbuf.  We will finish everything up
			 * in the zfs_get_done() callback.
			 */
			if (error == 0)
				return (0);

			if (error == EALREADY) {
				lr->lr_common.lrc_txtype = TX_WRITE2;
				/*
				 * TX_WRITE2 relies on the data previously
				 * written by the TX_WRITE that caused
				 * EALREADY.  We zero out the BP because
				 * it is the old, currently-on-disk BP.
				 */
				zgd->zgd_bp = NULL;
				BP_ZERO(bp);
				error = 0;
			}
		}
	}

	zfs_get_done(zgd, error);

	return (error);
}

/*ARGSUSED*/
static int
zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (flag & V_ACE_MASK)
		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
	else
		error = zfs_zaccess_rwx(zp, mode, flag, cr);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * If vnode is for a device return a specfs vnode instead.
 */
static int
specvp_check(vnode_t **vpp, cred_t *cr)
{
	int error = 0;

	if (IS_DEVVP(*vpp)) {
		struct vnode *svp;

		svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
		VN_RELE(*vpp);
		if (svp == NULL)
			error = SET_ERROR(ENOSYS);
		*vpp = svp;
	}
	return (error);
}


/*
 * Lookup an entry in a directory, or an extended attribute directory.
 * If it exists, return a held vnode reference for it.
 *
 *	IN:	dvp	- vnode of directory to search.
 *		nm	- name of entry to lookup.
 *		pnp	- full pathname to lookup [UNUSED].
 *		flags	- LOOKUP_XATTR set if looking for an attribute.
 *		rdir	- root directory vnode [UNUSED].
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		direntflags - directory lookup flags
 *		realpnp - returned pathname.
 *
 *	OUT:	vpp	- vnode of located entry, NULL if not found.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	NA
 */
/* ARGSUSED */
static int
zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
    int flags, vnode_t *rdir, cred_t *cr,  caller_context_t *ct,
    int *direntflags, pathname_t *realpnp)
{
	znode_t *zdp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int	error = 0;

	/*
	 * Fast path lookup, however we must skip DNLC lookup
	 * for case folding or normalizing lookups because the
	 * DNLC code only stores the passed in name.  This means
	 * creating 'a' and removing 'A' on a case insensitive
	 * file system would work, but DNLC still thinks 'a'
	 * exists and won't let you create it again on the next
	 * pass through fast path.
	 */
	if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {

		if (dvp->v_type != VDIR) {
			return (SET_ERROR(ENOTDIR));
		} else if (zdp->z_sa_hdl == NULL) {
			return (SET_ERROR(EIO));
		}

		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
			error = zfs_fastaccesschk_execute(zdp, cr);
			if (!error) {
				*vpp = dvp;
				VN_HOLD(*vpp);
				return (0);
			}
			return (error);
		} else if (!zdp->z_zfsvfs->z_norm &&
		    (zdp->z_zfsvfs->z_case == ZFS_CASE_SENSITIVE)) {

			vnode_t *tvp = dnlc_lookup(dvp, nm);

			if (tvp) {
				error = zfs_fastaccesschk_execute(zdp, cr);
				if (error) {
					VN_RELE(tvp);
					return (error);
				}
				if (tvp == DNLC_NO_VNODE) {
					VN_RELE(tvp);
					return (SET_ERROR(ENOENT));
				} else {
					*vpp = tvp;
					return (specvp_check(vpp, cr));
				}
			}
		}
	}

	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zdp);

	*vpp = NULL;

	if (flags & LOOKUP_XATTR) {
		/*
		 * If the xattr property is off, refuse the lookup request.
		 */
		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}

		/*
		 * We don't allow recursive attributes.
		 * Maybe someday we will.
		 */
		if (zdp->z_pflags & ZFS_XATTR) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}

		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Do we have permission to get into attribute directory?
		 */

		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
		    B_FALSE, cr)) {
			VN_RELE(*vpp);
			*vpp = NULL;
		}

		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (dvp->v_type != VDIR) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENOTDIR));
	}

	/*
	 * Check accessibility of directory.
	 */

	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
	if (error == 0)
		error = specvp_check(vpp, cr);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Attempt to create a new entry in a directory.  If the entry
 * already exists, truncate the file if permissible, else return
 * an error.  Return the vp of the created or trunc'd file.
 *
 *	IN:	dvp	- vnode of directory to put new file entry in.
 *		name	- name of new file entry.
 *		vap	- attributes of new file.
 *		excl	- flag indicating exclusive or non-exclusive mode.
 *		mode	- mode to open file with.
 *		cr	- credentials of caller.
 *		flag	- large file flag [UNUSED].
 *		ct	- caller context
 *		vsecp	- ACL to be set
 *
 *	OUT:	vpp	- vnode of created or trunc'd entry.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated if new entry created
 *	 vp - ctime|mtime always, atime if new
 */

/* ARGSUSED */
static int
zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl,
    int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
    vsecattr_t *vsecp)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	objset_t	*os;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	ksid_t		*ksid;
	uid_t		uid;
	gid_t		gid = crgetgid(cr);
	zfs_acl_ids_t   acl_ids;
	boolean_t	fuid_dirtied;
	boolean_t	have_acl = B_FALSE;
	boolean_t	waited = B_FALSE;

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	ksid = crgetsid(cr, KSID_OWNER);
	if (ksid)
		uid = ksid_getid(ksid);
	else
		uid = crgetuid(cr);

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || (vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	os = zfsvfs->z_os;
	zilog = zfsvfs->z_log;

	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr((xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}
top:
	*vpp = NULL;

	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
		vap->va_mode &= ~VSVTX;

	if (*name == '\0') {
		/*
		 * Null component name refers to the directory itself.
		 */
		VN_HOLD(dvp);
		zp = dzp;
		dl = NULL;
		error = 0;
	} else {
		/* possible VN_HOLD(zp) */
		int zflg = 0;

		if (flag & FIGNORECASE)
			zflg |= ZCILOOK;

		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
		    NULL, NULL);
		if (error) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			if (strcmp(name, "..") == 0)
				error = SET_ERROR(EISDIR);
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	if (zp == NULL) {
		uint64_t txtype;
		uint64_t projid = ZFS_DEFAULT_PROJID;

		/*
		 * Create a new file object and update the directory
		 * to reference it.
		 */
		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			goto out;
		}

		/*
		 * We only support the creation of regular files in
		 * extended attribute directories.
		 */

		if ((dzp->z_pflags & ZFS_XATTR) &&
		    (vap->va_type != VREG)) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			error = SET_ERROR(EINVAL);
			goto out;
		}

		if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
		    cr, vsecp, &acl_ids)) != 0)
			goto out;
		have_acl = B_TRUE;

		if (vap->va_type == VREG || vap->va_type == VDIR)
			projid = zfs_inherit_projid(dzp);
		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
			zfs_acl_ids_free(&acl_ids);
			error = SET_ERROR(EDQUOT);
			goto out;
		}

		tx = dmu_tx_create(os);

		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
		    ZFS_SA_BASE_ATTR_SIZE);

		fuid_dirtied = zfsvfs->z_fuid_dirty;
		if (fuid_dirtied)
			zfs_fuid_txhold(zfsvfs, tx);
		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
		if (!zfsvfs->z_use_sa &&
		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
			    0, acl_ids.z_aclp->z_acl_bytes);
		}
		error = dmu_tx_assign(tx,
		    (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
		if (error) {
			zfs_dirent_unlock(dl);
			if (error == ERESTART) {
				waited = B_TRUE;
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				goto top;
			}
			zfs_acl_ids_free(&acl_ids);
			dmu_tx_abort(tx);
			ZFS_EXIT(zfsvfs);
			return (error);
		}
		zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

		if (fuid_dirtied)
			zfs_fuid_sync(zfsvfs, tx);

		(void) zfs_link_create(dl, zp, tx, ZNEW);
		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
		if (flag & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
		    vsecp, acl_ids.z_fuidp, vap);
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_commit(tx);
	} else {
		int aflags = (flag & FAPPEND) ? V_APPEND : 0;

		if (have_acl)
			zfs_acl_ids_free(&acl_ids);
		have_acl = B_FALSE;

		/*
		 * A directory entry already exists for this name.
		 */
		/*
		 * Can't truncate an existing file if in exclusive mode.
		 */
		if (excl == EXCL) {
			error = SET_ERROR(EEXIST);
			goto out;
		}
		/*
		 * Can't open a directory for writing.
		 */
		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
			error = SET_ERROR(EISDIR);
			goto out;
		}
		/*
		 * Verify requested access to file.
		 */
		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
			goto out;
		}

		mutex_enter(&dzp->z_lock);
		dzp->z_seq++;
		mutex_exit(&dzp->z_lock);

		/*
		 * Truncate regular files if requested.
		 */
		if ((ZTOV(zp)->v_type == VREG) &&
		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
			/* we can't hold any locks when calling zfs_freesp() */
			zfs_dirent_unlock(dl);
			dl = NULL;
			error = zfs_freesp(zp, 0, 0, mode, TRUE);
			if (error == 0) {
				vnevent_create(ZTOV(zp), ct);
			}
		}
	}
out:

	if (dl)
		zfs_dirent_unlock(dl);

	if (error) {
		if (zp)
			VN_RELE(ZTOV(zp));
	} else {
		*vpp = ZTOV(zp);
		error = specvp_check(vpp, cr);
	}

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Remove an entry from a directory.
 *
 *	IN:	dvp	- vnode of directory to remove entry from.
 *		name	- name of entry to remove.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime
 *	 vp - ctime (if nlink > 0)
 */

uint64_t null_xattr = 0;

/*ARGSUSED*/
static int
zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
    int flags)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	znode_t		*xzp;
	vnode_t		*vp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	uint64_t	acl_obj, xattr_obj;
	uint64_t	xattr_obj_unlinked = 0;
	uint64_t	obj = 0;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	boolean_t	may_delete_now, delete_now = FALSE;
	boolean_t	unlinked, toobig = FALSE;
	uint64_t	txtype;
	pathname_t	*realnmp = NULL;
	pathname_t	realnm;
	int		error;
	int		zflg = ZEXISTS;
	boolean_t	waited = B_FALSE;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (flags & FIGNORECASE) {
		zflg |= ZCILOOK;
		pn_alloc(&realnm);
		realnmp = &realnm;
	}

top:
	xattr_obj = 0;
	xzp = NULL;
	/*
	 * Attempt to lock directory; fail if entry doesn't exist.
	 */
	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
	    NULL, realnmp)) {
		if (realnmp)
			pn_free(realnmp);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	vp = ZTOV(zp);

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	/*
	 * Need to use rmdir for removing directories.
	 */
	if (vp->v_type == VDIR) {
		error = SET_ERROR(EPERM);
		goto out;
	}

	vnevent_remove(vp, dvp, name, ct);

	if (realnmp)
		dnlc_remove(dvp, realnmp->pn_buf);
	else
		dnlc_remove(dvp, name);

	mutex_enter(&vp->v_lock);
	may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
	mutex_exit(&vp->v_lock);

	/*
	 * We may delete the znode now, or we may put it in the unlinked set;
	 * it depends on whether we're the last link, and on whether there are
	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
	 * allow for either case.
	 */
	obj = zp->z_id;
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);
	if (may_delete_now) {
		toobig =
		    zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
		/* if the file is too big, only hold_free a token amount */
		dmu_tx_hold_free(tx, zp->z_id, 0,
		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
	}

	/* are there any extended attributes? */
	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
	    &xattr_obj, sizeof (xattr_obj));
	if (error == 0 && xattr_obj) {
		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
		ASSERT0(error);
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
	}

	mutex_enter(&zp->z_lock);
	if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
	mutex_exit(&zp->z_lock);

	/* charge as an update -- would be nice not to charge at all */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	/*
	 * Mark this transaction as typically resulting in a net free of
	 * space.
	 */
1878	dmu_tx_mark_netfree(tx);
1879
1880	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1881	if (error) {
1882		zfs_dirent_unlock(dl);
1883		VN_RELE(vp);
1884		if (xzp)
1885			VN_RELE(ZTOV(xzp));
1886		if (error == ERESTART) {
1887			waited = B_TRUE;
1888			dmu_tx_wait(tx);
1889			dmu_tx_abort(tx);
1890			goto top;
1891		}
1892		if (realnmp)
1893			pn_free(realnmp);
1894		dmu_tx_abort(tx);
1895		ZFS_EXIT(zfsvfs);
1896		return (error);
1897	}
1898
1899	/*
1900	 * Remove the directory entry.
1901	 */
1902	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1903
1904	if (error) {
1905		dmu_tx_commit(tx);
1906		goto out;
1907	}
1908
1909	if (unlinked) {
1910		/*
1911		 * Hold z_lock so that we can make sure that the ACL obj
1912		 * hasn't changed.  Could have been deleted due to
1913		 * zfs_sa_upgrade().
1914		 */
1915		mutex_enter(&zp->z_lock);
1916		mutex_enter(&vp->v_lock);
1917		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1918		    &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
1919		delete_now = may_delete_now && !toobig &&
1920		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
1921		    xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
1922		    acl_obj;
1923		mutex_exit(&vp->v_lock);
1924	}
1925
1926	if (delete_now) {
1927		if (xattr_obj_unlinked) {
1928			ASSERT3U(xzp->z_links, ==, 2);
1929			mutex_enter(&xzp->z_lock);
1930			xzp->z_unlinked = 1;
1931			xzp->z_links = 0;
1932			error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
1933			    &xzp->z_links, sizeof (xzp->z_links), tx);
1934			ASSERT3U(error,  ==,  0);
1935			mutex_exit(&xzp->z_lock);
1936			zfs_unlinked_add(xzp, tx);
1937
1938			if (zp->z_is_sa)
1939				error = sa_remove(zp->z_sa_hdl,
1940				    SA_ZPL_XATTR(zfsvfs), tx);
1941			else
1942				error = sa_update(zp->z_sa_hdl,
1943				    SA_ZPL_XATTR(zfsvfs), &null_xattr,
1944				    sizeof (uint64_t), tx);
1945			ASSERT0(error);
1946		}
1947		mutex_enter(&vp->v_lock);
1948		VN_RELE_LOCKED(vp);
1949		ASSERT0(vp->v_count);
1950		mutex_exit(&vp->v_lock);
1951		mutex_exit(&zp->z_lock);
1952		zfs_znode_delete(zp, tx);
1953	} else if (unlinked) {
1954		mutex_exit(&zp->z_lock);
1955		zfs_unlinked_add(zp, tx);
1956	}
1957
1958	txtype = TX_REMOVE;
1959	if (flags & FIGNORECASE)
1960		txtype |= TX_CI;
1961	zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);
1962
1963	dmu_tx_commit(tx);
1964out:
1965	if (realnmp)
1966		pn_free(realnmp);
1967
1968	zfs_dirent_unlock(dl);
1969
1970	if (!delete_now)
1971		VN_RELE(vp);
1972	if (xzp)
1973		VN_RELE(ZTOV(xzp));
1974
1975	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1976		zil_commit(zilog, 0);
1977
1978	ZFS_EXIT(zfsvfs);
1979	return (error);
1980}
1981
1982/*
1983 * Create a new directory and insert it into dvp using the name
1984 * provided.  Return a pointer to the inserted directory.
1985 *
1986 *	IN:	dvp	- vnode of directory to add subdir to.
1987 *		dirname	- name of new directory.
1988 *		vap	- attributes of new directory.
1989 *		cr	- credentials of caller.
1990 *		ct	- caller context
1991 *		flags	- case flags
1992 *		vsecp	- ACL to be set
1993 *
1994 *	OUT:	vpp	- vnode of created directory.
1995 *
1996 *	RETURN:	0 on success, error code on failure.
1997 *
1998 * Timestamps:
1999 *	dvp - ctime|mtime updated
2000 *	 vp - ctime|mtime|atime updated
2001 */
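/*
 * Illustrative only (a hypothetical sketch, not an interface defined in
 * this file): a VFS-layer caller would typically reach zfs_mkdir()
 * through the vnode op table, along these lines:
 *
 *	vnode_t *vp;
 *	vattr_t va;
 *
 *	va.va_type = VDIR;
 *	va.va_mode = 0755;
 *	va.va_mask = AT_TYPE | AT_MODE;
 *	error = VOP_MKDIR(dvp, "newdir", &va, &vp, cr, ct, 0, NULL);
 */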
2002/*ARGSUSED*/
2003static int
2004zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
2005    caller_context_t *ct, int flags, vsecattr_t *vsecp)
2006{
2007	znode_t		*zp, *dzp = VTOZ(dvp);
2008	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2009	zilog_t		*zilog;
2010	zfs_dirlock_t	*dl;
2011	uint64_t	txtype;
2012	dmu_tx_t	*tx;
2013	int		error;
2014	int		zf = ZNEW;
2015	ksid_t		*ksid;
2016	uid_t		uid;
2017	gid_t		gid = crgetgid(cr);
2018	zfs_acl_ids_t   acl_ids;
2019	boolean_t	fuid_dirtied;
2020	boolean_t	waited = B_FALSE;
2021
2022	ASSERT(vap->va_type == VDIR);
2023
2024	/*
	 * If we have an ephemeral id, ACL, or XVATTR, then
	 * make sure the file system is at the proper version.
2027	 */
2028
2029	ksid = crgetsid(cr, KSID_OWNER);
2030	if (ksid)
2031		uid = ksid_getid(ksid);
2032	else
2033		uid = crgetuid(cr);
2034	if (zfsvfs->z_use_fuids == B_FALSE &&
2035	    (vsecp || (vap->va_mask & AT_XVATTR) ||
2036	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
2037		return (SET_ERROR(EINVAL));
2038
2039	ZFS_ENTER(zfsvfs);
2040	ZFS_VERIFY_ZP(dzp);
2041	zilog = zfsvfs->z_log;
2042
2043	if (dzp->z_pflags & ZFS_XATTR) {
2044		ZFS_EXIT(zfsvfs);
2045		return (SET_ERROR(EINVAL));
2046	}
2047
2048	if (zfsvfs->z_utf8 && u8_validate(dirname,
2049	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2050		ZFS_EXIT(zfsvfs);
2051		return (SET_ERROR(EILSEQ));
2052	}
2053	if (flags & FIGNORECASE)
2054		zf |= ZCILOOK;
2055
2056	if (vap->va_mask & AT_XVATTR) {
2057		if ((error = secpolicy_xvattr((xvattr_t *)vap,
2058		    crgetuid(cr), cr, vap->va_type)) != 0) {
2059			ZFS_EXIT(zfsvfs);
2060			return (error);
2061		}
2062	}
2063
2064	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
2065	    vsecp, &acl_ids)) != 0) {
2066		ZFS_EXIT(zfsvfs);
2067		return (error);
2068	}
2069	/*
2070	 * First make sure the new directory doesn't exist.
2071	 *
	 * Existence is checked first to make sure we don't return
	 * EACCES instead of EEXIST, which can cause some applications
	 * to fail.
2075	 */
2076top:
2077	*vpp = NULL;
2078
2079	if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
2080	    NULL, NULL)) {
2081		zfs_acl_ids_free(&acl_ids);
2082		ZFS_EXIT(zfsvfs);
2083		return (error);
2084	}
2085
2086	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
2087		zfs_acl_ids_free(&acl_ids);
2088		zfs_dirent_unlock(dl);
2089		ZFS_EXIT(zfsvfs);
2090		return (error);
2091	}
2092
2093	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
2094		zfs_acl_ids_free(&acl_ids);
2095		zfs_dirent_unlock(dl);
2096		ZFS_EXIT(zfsvfs);
2097		return (SET_ERROR(EDQUOT));
2098	}
2099
2100	/*
2101	 * Add a new entry to the directory.
2102	 */
2103	tx = dmu_tx_create(zfsvfs->z_os);
2104	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
2105	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2106	fuid_dirtied = zfsvfs->z_fuid_dirty;
2107	if (fuid_dirtied)
2108		zfs_fuid_txhold(zfsvfs, tx);
2109	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2110		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
2111		    acl_ids.z_aclp->z_acl_bytes);
2112	}
2113
2114	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
2115	    ZFS_SA_BASE_ATTR_SIZE);
2116
2117	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
2118	if (error) {
2119		zfs_dirent_unlock(dl);
2120		if (error == ERESTART) {
2121			waited = B_TRUE;
2122			dmu_tx_wait(tx);
2123			dmu_tx_abort(tx);
2124			goto top;
2125		}
2126		zfs_acl_ids_free(&acl_ids);
2127		dmu_tx_abort(tx);
2128		ZFS_EXIT(zfsvfs);
2129		return (error);
2130	}
2131
2132	/*
2133	 * Create new node.
2134	 */
2135	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2136
2137	if (fuid_dirtied)
2138		zfs_fuid_sync(zfsvfs, tx);
2139
2140	/*
2141	 * Now put new name in parent dir.
2142	 */
2143	(void) zfs_link_create(dl, zp, tx, ZNEW);
2144
2145	*vpp = ZTOV(zp);
2146
2147	txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
2148	if (flags & FIGNORECASE)
2149		txtype |= TX_CI;
2150	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
2151	    acl_ids.z_fuidp, vap);
2152
2153	zfs_acl_ids_free(&acl_ids);
2154
2155	dmu_tx_commit(tx);
2156
2157	zfs_dirent_unlock(dl);
2158
2159	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2160		zil_commit(zilog, 0);
2161
2162	ZFS_EXIT(zfsvfs);
2163	return (0);
2164}
2165
2166/*
2167 * Remove a directory subdir entry.  If the current working
2168 * directory is the same as the subdir to be removed, the
2169 * remove will fail.
2170 *
2171 *	IN:	dvp	- vnode of directory to remove from.
2172 *		name	- name of directory to be removed.
2173 *		cwd	- vnode of current working directory.
2174 *		cr	- credentials of caller.
2175 *		ct	- caller context
2176 *		flags	- case flags
2177 *
2178 *	RETURN:	0 on success, error code on failure.
2179 *
2180 * Timestamps:
2181 *	dvp - ctime|mtime updated
2182 */
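/*
 * Illustrative only: a hypothetical caller removing "olddir" from dvp,
 * with cwd as the caller's current working directory, would issue:
 *
 *	error = VOP_RMDIR(dvp, "olddir", cwd, cr, ct, 0);
 *
 * and would get back EINVAL if cwd were the directory being removed.
 */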
2183/*ARGSUSED*/
2184static int
2185zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
2186    caller_context_t *ct, int flags)
2187{
2188	znode_t		*dzp = VTOZ(dvp);
2189	znode_t		*zp;
2190	vnode_t		*vp;
2191	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2192	zilog_t		*zilog;
2193	zfs_dirlock_t	*dl;
2194	dmu_tx_t	*tx;
2195	int		error;
2196	int		zflg = ZEXISTS;
2197	boolean_t	waited = B_FALSE;
2198
2199	ZFS_ENTER(zfsvfs);
2200	ZFS_VERIFY_ZP(dzp);
2201	zilog = zfsvfs->z_log;
2202
2203	if (flags & FIGNORECASE)
2204		zflg |= ZCILOOK;
2205top:
2206	zp = NULL;
2207
2208	/*
2209	 * Attempt to lock directory; fail if entry doesn't exist.
2210	 */
2211	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
2212	    NULL, NULL)) {
2213		ZFS_EXIT(zfsvfs);
2214		return (error);
2215	}
2216
2217	vp = ZTOV(zp);
2218
2219	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2220		goto out;
2221	}
2222
2223	if (vp->v_type != VDIR) {
2224		error = SET_ERROR(ENOTDIR);
2225		goto out;
2226	}
2227
2228	if (vp == cwd) {
2229		error = SET_ERROR(EINVAL);
2230		goto out;
2231	}
2232
2233	vnevent_rmdir(vp, dvp, name, ct);
2234
2235	/*
	 * Grab a lock on the directory to make sure that no one is
	 * trying to add (or look up) entries while we are removing it.
2238	 */
2239	rw_enter(&zp->z_name_lock, RW_WRITER);
2240
2241	/*
2242	 * Grab a lock on the parent pointer to make sure we play well
2243	 * with the treewalk and directory rename code.
2244	 */
2245	rw_enter(&zp->z_parent_lock, RW_WRITER);
2246
2247	tx = dmu_tx_create(zfsvfs->z_os);
2248	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2249	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2250	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2251	zfs_sa_upgrade_txholds(tx, zp);
2252	zfs_sa_upgrade_txholds(tx, dzp);
2253	dmu_tx_mark_netfree(tx);
2254	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
2255	if (error) {
2256		rw_exit(&zp->z_parent_lock);
2257		rw_exit(&zp->z_name_lock);
2258		zfs_dirent_unlock(dl);
2259		VN_RELE(vp);
2260		if (error == ERESTART) {
2261			waited = B_TRUE;
2262			dmu_tx_wait(tx);
2263			dmu_tx_abort(tx);
2264			goto top;
2265		}
2266		dmu_tx_abort(tx);
2267		ZFS_EXIT(zfsvfs);
2268		return (error);
2269	}
2270
2271	error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
2272
2273	if (error == 0) {
2274		uint64_t txtype = TX_RMDIR;
2275		if (flags & FIGNORECASE)
2276			txtype |= TX_CI;
2277		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT,
2278		    B_FALSE);
2279	}
2280
2281	dmu_tx_commit(tx);
2282
2283	rw_exit(&zp->z_parent_lock);
2284	rw_exit(&zp->z_name_lock);
2285out:
2286	zfs_dirent_unlock(dl);
2287
2288	VN_RELE(vp);
2289
2290	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2291		zil_commit(zilog, 0);
2292
2293	ZFS_EXIT(zfsvfs);
2294	return (error);
2295}
2296
2297/*
2298 * Read as many directory entries as will fit into the provided
2299 * buffer from the given directory cursor position (specified in
2300 * the uio structure).
2301 *
2302 *	IN:	vp	- vnode of directory to read.
2303 *		uio	- structure supplying read location, range info,
2304 *			  and return buffer.
2305 *		cr	- credentials of caller.
2306 *		ct	- caller context
2307 *		flags	- case flags
2308 *
2309 *	OUT:	uio	- updated offset and range, buffer filled.
2310 *		eofp	- set to true if end-of-file detected.
2311 *
2312 *	RETURN:	0 on success, error code on failure.
2313 *
2314 * Timestamps:
2315 *	vp - atime updated
2316 *
 * Note that the low 4 bits of the cookie returned by the ZAP are always
 * zero.
2318 * This allows us to use the low range for "special" directory entries:
2319 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
2320 * we use the offset 2 for the '.zfs' directory.
2321 */
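/*
 * For example, when reading the root of a filesystem whose '.zfs'
 * directory is visible, the first cookies a caller sees are:
 *
 *	offset 0 -> "."    offset 1 -> ".."    offset 2 -> ".zfs"
 *
 * and every later cookie is a serialized ZAP cursor, whose low 4 bits
 * are clear, so the two ranges can never collide.
 */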
2322/* ARGSUSED */
2323static int
2324zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
2325    caller_context_t *ct, int flags)
2326{
2327	znode_t		*zp = VTOZ(vp);
2328	iovec_t		*iovp;
2329	edirent_t	*eodp;
2330	dirent64_t	*odp;
2331	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2332	objset_t	*os;
2333	caddr_t		outbuf;
2334	size_t		bufsize;
2335	zap_cursor_t	zc;
2336	zap_attribute_t	zap;
2337	uint_t		bytes_wanted;
2338	uint64_t	offset; /* must be unsigned; checks for < 1 */
2339	uint64_t	parent;
2340	int		local_eof;
2341	int		outcount;
2342	int		error;
2343	uint8_t		prefetch;
2344	boolean_t	check_sysattrs;
2345
2346	ZFS_ENTER(zfsvfs);
2347	ZFS_VERIFY_ZP(zp);
2348
2349	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2350	    &parent, sizeof (parent))) != 0) {
2351		ZFS_EXIT(zfsvfs);
2352		return (error);
2353	}
2354
2355	/*
2356	 * If we are not given an eof variable,
2357	 * use a local one.
2358	 */
2359	if (eofp == NULL)
2360		eofp = &local_eof;
2361
2362	/*
2363	 * Check for valid iov_len.
2364	 */
2365	if (uio->uio_iov->iov_len <= 0) {
2366		ZFS_EXIT(zfsvfs);
2367		return (SET_ERROR(EINVAL));
2368	}
2369
2370	/*
	 * Quit if the directory has been removed (POSIX).
2372	 */
2373	if ((*eofp = zp->z_unlinked) != 0) {
2374		ZFS_EXIT(zfsvfs);
2375		return (0);
2376	}
2377
2378	error = 0;
2379	os = zfsvfs->z_os;
2380	offset = uio->uio_loffset;
2381	prefetch = zp->z_zn_prefetch;
2382
2383	/*
2384	 * Initialize the iterator cursor.
2385	 */
2386	if (offset <= 3) {
2387		/*
2388		 * Start iteration from the beginning of the directory.
2389		 */
2390		zap_cursor_init(&zc, os, zp->z_id);
2391	} else {
2392		/*
2393		 * The offset is a serialized cursor.
2394		 */
2395		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2396	}
2397
2398	/*
	 * Get space to change directory entries into fs-independent format.
2400	 */
2401	iovp = uio->uio_iov;
2402	bytes_wanted = iovp->iov_len;
2403	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
2404		bufsize = bytes_wanted;
2405		outbuf = kmem_alloc(bufsize, KM_SLEEP);
2406		odp = (struct dirent64 *)outbuf;
2407	} else {
2408		bufsize = bytes_wanted;
2409		outbuf = NULL;
2410		odp = (struct dirent64 *)iovp->iov_base;
2411	}
2412	eodp = (struct edirent *)odp;
2413
2414	/*
	 * If this VFS supports the system attribute view interface, and
	 * we're looking at an extended attribute directory, and we care
	 * about normalization conflicts on this vfs, then we must check
	 * for normalization conflicts with the sysattr name space.
2419	 */
2420	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2421	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2422	    (flags & V_RDDIR_ENTFLAGS);
2423
2424	/*
	 * Transform to file-system-independent format.
2426	 */
2427	outcount = 0;
2428	while (outcount < bytes_wanted) {
2429		ino64_t objnum;
2430		ushort_t reclen;
2431		off64_t *next = NULL;
2432
2433		/*
2434		 * Special case `.', `..', and `.zfs'.
2435		 */
2436		if (offset == 0) {
2437			(void) strcpy(zap.za_name, ".");
2438			zap.za_normalization_conflict = 0;
2439			objnum = zp->z_id;
2440		} else if (offset == 1) {
2441			(void) strcpy(zap.za_name, "..");
2442			zap.za_normalization_conflict = 0;
2443			objnum = parent;
2444		} else if (offset == 2 && zfs_show_ctldir(zp)) {
2445			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2446			zap.za_normalization_conflict = 0;
2447			objnum = ZFSCTL_INO_ROOT;
2448		} else {
2449			/*
2450			 * Grab next entry.
2451			 */
2452			if (error = zap_cursor_retrieve(&zc, &zap)) {
2453				if ((*eofp = (error == ENOENT)) != 0)
2454					break;
2455				else
2456					goto update;
2457			}
2458
2459			if (zap.za_integer_length != 8 ||
2460			    zap.za_num_integers != 1) {
2461				cmn_err(CE_WARN, "zap_readdir: bad directory "
2462				    "entry, obj = %lld, offset = %lld\n",
2463				    (u_longlong_t)zp->z_id,
2464				    (u_longlong_t)offset);
2465				error = SET_ERROR(ENXIO);
2466				goto update;
2467			}
2468
2469			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2470			/*
			 * Mac OS X can extract the object type here, e.g.:
2472			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2473			 */
2474
2475			if (check_sysattrs && !zap.za_normalization_conflict) {
2476				zap.za_normalization_conflict =
2477				    xattr_sysattr_casechk(zap.za_name);
2478			}
2479		}
2480
2481		if (flags & V_RDDIR_ACCFILTER) {
2482			/*
2483			 * If we have no access at all, don't include
2484			 * this entry in the returned information
2485			 */
2486			znode_t	*ezp;
2487			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
2488				goto skip_entry;
2489			if (!zfs_has_access(ezp, cr)) {
2490				VN_RELE(ZTOV(ezp));
2491				goto skip_entry;
2492			}
2493			VN_RELE(ZTOV(ezp));
2494		}
2495
2496		if (flags & V_RDDIR_ENTFLAGS)
2497			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2498		else
2499			reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2500
2501		/*
2502		 * Will this entry fit in the buffer?
2503		 */
2504		if (outcount + reclen > bufsize) {
2505			/*
2506			 * Did we manage to fit anything in the buffer?
2507			 */
2508			if (!outcount) {
2509				error = SET_ERROR(EINVAL);
2510				goto update;
2511			}
2512			break;
2513		}
2514		if (flags & V_RDDIR_ENTFLAGS) {
2515			/*
2516			 * Add extended flag entry:
2517			 */
2518			eodp->ed_ino = objnum;
2519			eodp->ed_reclen = reclen;
2520			/* NOTE: ed_off is the offset for the *next* entry */
2521			next = &(eodp->ed_off);
2522			eodp->ed_eflags = zap.za_normalization_conflict ?
2523			    ED_CASE_CONFLICT : 0;
2524			(void) strncpy(eodp->ed_name, zap.za_name,
2525			    EDIRENT_NAMELEN(reclen));
2526			eodp = (edirent_t *)((intptr_t)eodp + reclen);
2527		} else {
2528			/*
2529			 * Add normal entry:
2530			 */
2531			odp->d_ino = objnum;
2532			odp->d_reclen = reclen;
2533			/* NOTE: d_off is the offset for the *next* entry */
2534			next = &(odp->d_off);
2535			(void) strncpy(odp->d_name, zap.za_name,
2536			    DIRENT64_NAMELEN(reclen));
2537			odp = (dirent64_t *)((intptr_t)odp + reclen);
2538		}
2539		outcount += reclen;
2540
2541		ASSERT(outcount <= bufsize);
2542
2543		/* Prefetch znode */
2544		if (prefetch)
2545			dmu_prefetch(os, objnum, 0, 0, 0,
2546			    ZIO_PRIORITY_SYNC_READ);
2547
2548	skip_entry:
2549		/*
2550		 * Move to the next entry, fill in the previous offset.
2551		 */
2552		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2553			zap_cursor_advance(&zc);
2554			offset = zap_cursor_serialize(&zc);
2555		} else {
2556			offset += 1;
2557		}
2558		if (next)
2559			*next = offset;
2560	}
2561	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2562
2563	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
2564		iovp->iov_base += outcount;
2565		iovp->iov_len -= outcount;
2566		uio->uio_resid -= outcount;
2567	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
2568		/*
2569		 * Reset the pointer.
2570		 */
2571		offset = uio->uio_loffset;
2572	}
2573
2574update:
2575	zap_cursor_fini(&zc);
2576	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
2577		kmem_free(outbuf, bufsize);
2578
2579	if (error == ENOENT)
2580		error = 0;
2581
2582	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2583
2584	uio->uio_loffset = offset;
2585	ZFS_EXIT(zfsvfs);
2586	return (error);
2587}
2588
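/*
 * Published via the zfs_fsyncer_key TSD in zfs_fsync() below; a nonzero
 * value marks the calling thread as an active fsync()er, which the ZIL
 * write-logging path can use as a hint when choosing how to log writes.
 */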
2589ulong_t zfs_fsync_sync_cnt = 4;
2590
2591static int
2592zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2593{
2594	znode_t	*zp = VTOZ(vp);
2595	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2596
2597	/*
2598	 * Regardless of whether this is required for standards conformance,
2599	 * this is the logical behavior when fsync() is called on a file with
2600	 * dirty pages.  We use B_ASYNC since the ZIL transactions are already
2601	 * going to be pushed out as part of the zil_commit().
2602	 */
2603	if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) &&
2604	    (vp->v_type == VREG) && !(IS_SWAPVP(vp)))
2605		(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_ASYNC, cr, ct);
2606
2607	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2608
2609	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2610		ZFS_ENTER(zfsvfs);
2611		ZFS_VERIFY_ZP(zp);
2612		zil_commit(zfsvfs->z_log, zp->z_id);
2613		ZFS_EXIT(zfsvfs);
2614	}
2615	return (0);
2616}
2617
2618
2619/*
2620 * Get the requested file attributes and place them in the provided
2621 * vattr structure.
2622 *
2623 *	IN:	vp	- vnode of file.
2624 *		vap	- va_mask identifies requested attributes.
2625 *			  If AT_XVATTR set, then optional attrs are requested
2626 *		flags	- ATTR_NOACLCHECK (CIFS server context)
2627 *		cr	- credentials of caller.
2628 *		ct	- caller context
2629 *
2630 *	OUT:	vap	- attribute values.
2631 *
2632 *	RETURN:	0 (always succeeds).
2633 */
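/*
 * Illustrative only: a hypothetical caller requesting an optional
 * attribute would wrap the vattr_t in an xvattr_t, e.g.:
 *
 *	xvattr_t xva;
 *
 *	xva_init(&xva);
 *	XVA_SET_REQ(&xva, XAT_READONLY);
 *	error = VOP_GETATTR(vp, &xva.xva_vattr, 0, cr, ct);
 *	if (error == 0 && XVA_ISSET_RTN(&xva, XAT_READONLY))
 *		readonly = xva.xva_xoptattrs.xoa_readonly;
 */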
2634/* ARGSUSED */
2635static int
2636zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2637    caller_context_t *ct)
2638{
2639	znode_t *zp = VTOZ(vp);
2640	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2641	int	error = 0;
2642	uint64_t links;
2643	uint64_t mtime[2], ctime[2];
2644	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2645	xoptattr_t *xoap = NULL;
2646	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2647	sa_bulk_attr_t bulk[2];
2648	int count = 0;
2649
2650	ZFS_ENTER(zfsvfs);
2651	ZFS_VERIFY_ZP(zp);
2652
2653	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2654
2655	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
2656	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
2657
2658	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
2659		ZFS_EXIT(zfsvfs);
2660		return (error);
2661	}
2662
2663	/*
	 * If the ACL is trivial, don't bother looking for
	 * ACE_READ_ATTRIBUTES.  Also, if we are the owner, don't bother,
	 * since the owner should always be allowed to read the basic
	 * attributes of a file.
2667	 */
2668	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
2669	    (vap->va_uid != crgetuid(cr))) {
2670		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2671		    skipaclchk, cr)) {
2672			ZFS_EXIT(zfsvfs);
2673			return (error);
2674		}
2675	}
2676
2677	/*
2678	 * Return all attributes.  It's cheaper to provide the answer
2679	 * than to determine whether we were asked the question.
2680	 */
2681
2682	mutex_enter(&zp->z_lock);
2683	vap->va_type = vp->v_type;
2684	vap->va_mode = zp->z_mode & MODEMASK;
2685	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
2686	vap->va_nodeid = zp->z_id;
2687	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
2688		links = zp->z_links + 1;
2689	else
2690		links = zp->z_links;
2691	vap->va_nlink = MIN(links, UINT32_MAX);	/* nlink_t limit! */
2692	vap->va_size = zp->z_size;
2693	vap->va_rdev = vp->v_rdev;
2694	vap->va_seq = zp->z_seq;
2695
2696	/*
2697	 * Add in any requested optional attributes and the create time.
2698	 * Also set the corresponding bits in the returned attribute bitmap.
2699	 */
2700	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2701		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2702			xoap->xoa_archive =
2703			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
2704			XVA_SET_RTN(xvap, XAT_ARCHIVE);
2705		}
2706
2707		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2708			xoap->xoa_readonly =
2709			    ((zp->z_pflags & ZFS_READONLY) != 0);
2710			XVA_SET_RTN(xvap, XAT_READONLY);
2711		}
2712
2713		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2714			xoap->xoa_system =
2715			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
2716			XVA_SET_RTN(xvap, XAT_SYSTEM);
2717		}
2718
2719		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2720			xoap->xoa_hidden =
2721			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
2722			XVA_SET_RTN(xvap, XAT_HIDDEN);
2723		}
2724
2725		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2726			xoap->xoa_nounlink =
2727			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
2728			XVA_SET_RTN(xvap, XAT_NOUNLINK);
2729		}
2730
2731		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2732			xoap->xoa_immutable =
2733			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
2734			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2735		}
2736
2737		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2738			xoap->xoa_appendonly =
2739			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
2740			XVA_SET_RTN(xvap, XAT_APPENDONLY);
2741		}
2742
2743		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2744			xoap->xoa_nodump =
2745			    ((zp->z_pflags & ZFS_NODUMP) != 0);
2746			XVA_SET_RTN(xvap, XAT_NODUMP);
2747		}
2748
2749		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2750			xoap->xoa_opaque =
2751			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
2752			XVA_SET_RTN(xvap, XAT_OPAQUE);
2753		}
2754
2755		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2756			xoap->xoa_av_quarantined =
2757			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
2758			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2759		}
2760
2761		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2762			xoap->xoa_av_modified =
2763			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
2764			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2765		}
2766
2767		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2768		    vp->v_type == VREG) {
2769			zfs_sa_get_scanstamp(zp, xvap);
2770		}
2771
2772		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
2773			uint64_t times[2];
2774
2775			(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
2776			    times, sizeof (times));
2777			ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
2778			XVA_SET_RTN(xvap, XAT_CREATETIME);
2779		}
2780
2781		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2782			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
2783			XVA_SET_RTN(xvap, XAT_REPARSE);
2784		}
2785		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
2786			xoap->xoa_generation = zp->z_gen;
2787			XVA_SET_RTN(xvap, XAT_GEN);
2788		}
2789
2790		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
2791			xoap->xoa_offline =
2792			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
2793			XVA_SET_RTN(xvap, XAT_OFFLINE);
2794		}
2795
2796		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
2797			xoap->xoa_sparse =
2798			    ((zp->z_pflags & ZFS_SPARSE) != 0);
2799			XVA_SET_RTN(xvap, XAT_SPARSE);
2800		}
2801
2802		if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
2803			xoap->xoa_projinherit =
2804			    ((zp->z_pflags & ZFS_PROJINHERIT) != 0);
2805			XVA_SET_RTN(xvap, XAT_PROJINHERIT);
2806		}
2807
2808		if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
2809			xoap->xoa_projid = zp->z_projid;
2810			XVA_SET_RTN(xvap, XAT_PROJID);
2811		}
2812	}
2813
2814	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
2815	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2816	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2817
2818	mutex_exit(&zp->z_lock);
2819
2820	sa_object_size(zp->z_sa_hdl, &vap->va_blksize, &vap->va_nblocks);
2821
2822	if (zp->z_blksz == 0) {
2823		/*
2824		 * Block size hasn't been set; suggest maximal I/O transfers.
2825		 */
2826		vap->va_blksize = zfsvfs->z_max_blksz;
2827	}
2828
2829	ZFS_EXIT(zfsvfs);
2830	return (0);
2831}
2832
2833/*
 * For the operation of changing a file's user/group/project, we need to
 * handle not only the main object that is assigned to the file directly,
 * but also the ones that are used by the file via the hidden xattr directory.
 *
 * Because the xattr directory may contain many EA entries, it may be
 * impossible to change all of them in the same transaction as changing the
 * main object's user/group/project attributes. If so, we have to change them
 * one by one in multiple independent transactions. That may not be an ideal
 * solution, but we have no better one yet.
2843 */
2844static int
2845zfs_setattr_dir(znode_t *dzp)
2846{
2847	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2848	objset_t	*os = zfsvfs->z_os;
2849	zap_cursor_t	zc;
2850	zap_attribute_t	zap;
2851	zfs_dirlock_t	*dl;
2852	znode_t		*zp = NULL;
2853	dmu_tx_t	*tx = NULL;
2854	sa_bulk_attr_t	bulk[4];
2855	int		count;
2856	int		err;
2857
2858	zap_cursor_init(&zc, os, dzp->z_id);
2859	while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) {
2860		count = 0;
2861		if (zap.za_integer_length != 8 || zap.za_num_integers != 1) {
			err = SET_ERROR(ENXIO);
2863			break;
2864		}
2865
2866		err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp,
2867		    ZEXISTS, NULL, NULL);
2868		if (err == ENOENT)
2869			goto next;
2870		if (err)
2871			break;
2872
2873		if (zp->z_uid == dzp->z_uid &&
2874		    zp->z_gid == dzp->z_gid &&
2875		    zp->z_projid == dzp->z_projid)
2876			goto next;
2877
2878		tx = dmu_tx_create(os);
2879		if (!(zp->z_pflags & ZFS_PROJID))
2880			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2881		else
2882			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2883
2884		err = dmu_tx_assign(tx, TXG_WAIT);
2885		if (err)
2886			break;
2887
2888		mutex_enter(&dzp->z_lock);
2889
2890		if (zp->z_uid != dzp->z_uid) {
2891			zp->z_uid = dzp->z_uid;
2892			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
2893			    &dzp->z_uid, sizeof (dzp->z_uid));
2894		}
2895
2896		if (zp->z_gid != dzp->z_gid) {
2897			zp->z_gid = dzp->z_gid;
2898			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
2899			    &dzp->z_gid, sizeof (dzp->z_gid));
2900		}
2901
2902		if (zp->z_projid != dzp->z_projid) {
2903			if (!(zp->z_pflags & ZFS_PROJID)) {
2904				zp->z_pflags |= ZFS_PROJID;
2905				SA_ADD_BULK_ATTR(bulk, count,
2906				    SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags,
2907				    sizeof (zp->z_pflags));
2908			}
2909
2910			zp->z_projid = dzp->z_projid;
2911			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs),
2912			    NULL, &zp->z_projid, sizeof (zp->z_projid));
2913		}
2914
2915		mutex_exit(&dzp->z_lock);
2916
2917		if (likely(count > 0)) {
2918			err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
2919			dmu_tx_commit(tx);
2920		} else {
2921			dmu_tx_abort(tx);
2922		}
2923		tx = NULL;
2924		if (err != 0 && err != ENOENT)
2925			break;
2926
2927next:
2928		if (zp) {
2929			VN_RELE(ZTOV(zp));
2930			zp = NULL;
2931			zfs_dirent_unlock(dl);
2932		}
2933		zap_cursor_advance(&zc);
2934	}
2935
2936	if (tx)
2937		dmu_tx_abort(tx);
2938	if (zp) {
2939		VN_RELE(ZTOV(zp));
2940		zfs_dirent_unlock(dl);
2941	}
2942	zap_cursor_fini(&zc);
2943
2944	return (err == ENOENT ? 0 : err);
2945}
2946
2947/*
2948 * Set the file attributes to the values contained in the
2949 * vattr structure.
2950 *
2951 *	IN:	vp	- vnode of file to be modified.
2952 *		vap	- new attribute values.
2953 *			  If AT_XVATTR set, then optional attrs are being set
2954 *		flags	- ATTR_UTIME set if non-default time values provided.
2955 *			- ATTR_NOACLCHECK (CIFS context only).
2956 *		cr	- credentials of caller.
2957 *		ct	- caller context
2958 *
2959 *	RETURN:	0 on success, error code on failure.
2960 *
2961 * Timestamps:
2962 *	vp - ctime updated, mtime updated if size changed.
2963 */
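/*
 * Illustrative only: a hypothetical chmod-style caller fills in just
 * the fields named by va_mask, e.g.:
 *
 *	vattr_t va;
 *
 *	va.va_mask = AT_MODE;
 *	va.va_mode = 0644;
 *	error = VOP_SETATTR(vp, &va, 0, cr, ct);
 *
 * ctime is updated as a side effect; it cannot be set directly.
 */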
2964/* ARGSUSED */
2965static int
2966zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2967    caller_context_t *ct)
2968{
2969	znode_t		*zp = VTOZ(vp);
2970	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2971	objset_t	*os = zfsvfs->z_os;
2972	zilog_t		*zilog;
2973	dmu_tx_t	*tx;
2974	vattr_t		oldva;
2975	xvattr_t	tmpxvattr;
2976	uint_t		mask = vap->va_mask;
2977	uint_t		saved_mask = 0;
2978	int		trim_mask = 0;
2979	uint64_t	new_mode;
2980	uint64_t	new_uid, new_gid;
2981	uint64_t	xattr_obj;
2982	uint64_t	mtime[2], ctime[2];
2983	uint64_t	projid = ZFS_INVALID_PROJID;
2984	znode_t		*attrzp;
2985	int		need_policy = FALSE;
2986	int		err, err2 = 0;
2987	zfs_fuid_info_t *fuidp = NULL;
2988	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2989	xoptattr_t	*xoap;
2990	zfs_acl_t	*aclp;
2991	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2992	boolean_t	fuid_dirtied = B_FALSE;
2993	boolean_t	handle_eadir = B_FALSE;
2994	sa_bulk_attr_t	bulk[8], xattr_bulk[8];
2995	int		count = 0, xattr_count = 0;
2996
2997	if (mask == 0)
2998		return (0);
2999
3000	if (mask & AT_NOSET)
3001		return (SET_ERROR(EINVAL));
3002
3003	ZFS_ENTER(zfsvfs);
3004	ZFS_VERIFY_ZP(zp);
3005
3006	/*
	 * If this is an xvattr_t, then get a pointer to the structure of
3008	 * optional attributes.  If this is NULL, then we have a vattr_t.
3009	 */
3010	xoap = xva_getxoptattr(xvap);
3011	if (xoap != NULL && (mask & AT_XVATTR)) {
3012		if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
3013			if (!dmu_objset_projectquota_enabled(os) ||
3014			    (vp->v_type != VREG && vp->v_type != VDIR)) {
3015				ZFS_EXIT(zfsvfs);
3016				return (SET_ERROR(ENOTSUP));
3017			}
3018
3019			projid = xoap->xoa_projid;
3020			if (unlikely(projid == ZFS_INVALID_PROJID)) {
3021				ZFS_EXIT(zfsvfs);
3022				return (SET_ERROR(EINVAL));
3023			}
3024
3025			if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
3026				projid = ZFS_INVALID_PROJID;
3027			else
3028				need_policy = TRUE;
3029		}
3030
3031		if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
3032		    (!dmu_objset_projectquota_enabled(os) ||
3033		    (vp->v_type != VREG && vp->v_type != VDIR))) {
3034				ZFS_EXIT(zfsvfs);
3035				return (SET_ERROR(ENOTSUP));
3036		}
3037	}
3038
3039	zilog = zfsvfs->z_log;
3040
3041	/*
	 * Make sure that if an ephemeral uid/gid or an xvattr is specified,
	 * the file system is at the proper version level.
3044	 */
3045
3046	if (zfsvfs->z_use_fuids == B_FALSE &&
3047	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
3048	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
3049	    (mask & AT_XVATTR))) {
3050		ZFS_EXIT(zfsvfs);
3051		return (SET_ERROR(EINVAL));
3052	}
3053
3054	if (mask & AT_SIZE && vp->v_type == VDIR) {
3055		ZFS_EXIT(zfsvfs);
3056		return (SET_ERROR(EISDIR));
3057	}
3058
3059	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
3060		ZFS_EXIT(zfsvfs);
3061		return (SET_ERROR(EINVAL));
3062	}
3063
3064	xva_init(&tmpxvattr);
3065
3066	/*
	 * On immutable files, only the immutable bit and atime may be altered.
3068	 */
3069	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
3070	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
3071	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
3072		ZFS_EXIT(zfsvfs);
3073		return (SET_ERROR(EPERM));
3074	}
3075
3076	/*
3077	 * Note: ZFS_READONLY is handled in zfs_zaccess_common.
3078	 */
3079
3080	/*
	 * Verify that the timestamps don't overflow 32 bits.
	 * ZFS can handle large timestamps, but 32-bit syscalls can't
	 * handle times beyond 2038.  This check should be removed
	 * once large timestamps are fully supported.
3085	 */
3086	if (mask & (AT_ATIME | AT_MTIME)) {
3087		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
3088		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
3089			ZFS_EXIT(zfsvfs);
3090			return (SET_ERROR(EOVERFLOW));
3091		}
3092	}
3093
3094top:
3095	attrzp = NULL;
3096	aclp = NULL;
3097
3098	/* Can this be moved to before the top label? */
3099	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
3100		ZFS_EXIT(zfsvfs);
3101		return (SET_ERROR(EROFS));
3102	}
3103
3104	/*
3105	 * First validate permissions
3106	 */
3107
3108	if (mask & AT_SIZE) {
3109		err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
3110		if (err) {
3111			ZFS_EXIT(zfsvfs);
3112			return (err);
3113		}
3114		/*
3115		 * XXX - Note, we are not providing any open
3116		 * mode flags here (like FNDELAY), so we may
3117		 * block if there are locks present... this
3118		 * should be addressed in openat().
3119		 */
3120		/* XXX - would it be OK to generate a log record here? */
3121		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
3122		if (err) {
3123			ZFS_EXIT(zfsvfs);
3124			return (err);
3125		}
3126
3127		if (vap->va_size == 0)
3128			vnevent_truncate(ZTOV(zp), ct);
3129	}
3130
3131	if (mask & (AT_ATIME|AT_MTIME) ||
3132	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
3133	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
3134	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
3135	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
3136	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
3137	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
3138	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
3139		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
3140		    skipaclchk, cr);
3141	}
3142
3143	if (mask & (AT_UID|AT_GID)) {
3144		int	idmask = (mask & (AT_UID|AT_GID));
3145		int	take_owner;
3146		int	take_group;
3147
3148		/*
3149		 * NOTE: even if a new mode is being set,
3150		 * we may clear S_ISUID/S_ISGID bits.
3151		 */
3152
3153		if (!(mask & AT_MODE))
3154			vap->va_mode = zp->z_mode;
3155
3156		/*
3157		 * Take ownership or chgrp to group we are a member of
3158		 */
3159
3160		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
3161		take_group = (mask & AT_GID) &&
3162		    zfs_groupmember(zfsvfs, vap->va_gid, cr);
3163
3164		/*
3165		 * If both AT_UID and AT_GID are set then take_owner and
3166		 * take_group must both be set in order to allow taking
3167		 * ownership.
3168		 *
3169		 * Otherwise, send the check through secpolicy_vnode_setattr()
3170		 *
3171		 */
3172
3173		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
3174		    ((idmask == AT_UID) && take_owner) ||
3175		    ((idmask == AT_GID) && take_group)) {
3176			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
3177			    skipaclchk, cr) == 0) {
3178				/*
3179				 * Remove setuid/setgid for non-privileged users
3180				 */
3181				secpolicy_setid_clear(vap, cr);
3182				trim_mask = (mask & (AT_UID|AT_GID));
3183			} else {
				need_policy = TRUE;
3185			}
3186		} else {
			need_policy = TRUE;
3188		}
3189	}
3190
3191	mutex_enter(&zp->z_lock);
3192	oldva.va_mode = zp->z_mode;
3193	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
3194	if (mask & AT_XVATTR) {
3195		/*
3196		 * Update xvattr mask to include only those attributes
3197		 * that are actually changing.
3198		 *
		 * The bits will be restored prior to actually setting
		 * the attributes so that the caller thinks they were set.
3201		 */
3202		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
3203			if (xoap->xoa_appendonly !=
3204			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
3205				need_policy = TRUE;
3206			} else {
3207				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
3208				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
3209			}
3210		}
3211
3212		if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
3213			if (xoap->xoa_projinherit !=
3214			    ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) {
3215				need_policy = TRUE;
3216			} else {
3217				XVA_CLR_REQ(xvap, XAT_PROJINHERIT);
3218				XVA_SET_REQ(&tmpxvattr, XAT_PROJINHERIT);
3219			}
3220		}
3221
3222		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
3223			if (xoap->xoa_nounlink !=
3224			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
3225				need_policy = TRUE;
3226			} else {
3227				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
3228				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
3229			}
3230		}
3231
3232		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
3233			if (xoap->xoa_immutable !=
3234			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
3235				need_policy = TRUE;
3236			} else {
3237				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
3238				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
3239			}
3240		}
3241
3242		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
3243			if (xoap->xoa_nodump !=
3244			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
3245				need_policy = TRUE;
3246			} else {
3247				XVA_CLR_REQ(xvap, XAT_NODUMP);
3248				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
3249			}
3250		}
3251
3252		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
3253			if (xoap->xoa_av_modified !=
3254			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
3255				need_policy = TRUE;
3256			} else {
3257				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
3258				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
3259			}
3260		}
3261
3262		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
3263			if ((vp->v_type != VREG &&
3264			    xoap->xoa_av_quarantined) ||
3265			    xoap->xoa_av_quarantined !=
3266			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
3267				need_policy = TRUE;
3268			} else {
3269				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
3270				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
3271			}
3272		}
3273
3274		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
3275			mutex_exit(&zp->z_lock);
3276			ZFS_EXIT(zfsvfs);
3277			return (SET_ERROR(EPERM));
3278		}
3279
3280		if (need_policy == FALSE &&
3281		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
3282		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
3283			need_policy = TRUE;
3284		}
3285	}
3286
3287	mutex_exit(&zp->z_lock);
3288
3289	if (mask & AT_MODE) {
3290		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
3291			err = secpolicy_setid_setsticky_clear(vp, vap,
3292			    &oldva, cr);
3293			if (err) {
3294				ZFS_EXIT(zfsvfs);
3295				return (err);
3296			}
3297			trim_mask |= AT_MODE;
3298		} else {
3299			need_policy = TRUE;
3300		}
3301	}
3302
3303	if (need_policy) {
3304		/*
		 * If trim_mask is set, then take-ownership has been
		 * granted or write_acl is present and the user has the
		 * ability to modify the mode.  In that case remove
		 * UID|GID and/or MODE from the mask so that
		 * secpolicy_vnode_setattr() doesn't revoke it.
3310		 */
3311
3312		if (trim_mask) {
3313			saved_mask = vap->va_mask;
3314			vap->va_mask &= ~trim_mask;
3315		}
3316		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
3317		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
3318		if (err) {
3319			ZFS_EXIT(zfsvfs);
3320			return (err);
3321		}
3322
3323		if (trim_mask)
3324			vap->va_mask |= saved_mask;
3325	}
3326
3327	/*
	 * secpolicy_vnode_setattr() or take-ownership may have
	 * changed va_mask.
3330	 */
3331	mask = vap->va_mask;
3332
3333	if ((mask & (AT_UID | AT_GID)) || projid != ZFS_INVALID_PROJID) {
3334		handle_eadir = B_TRUE;
3335		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
3336		    &xattr_obj, sizeof (xattr_obj));
3337
3338		if (err == 0 && xattr_obj) {
3339			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
3340			if (err)
3341				goto out2;
3342		}
3343		if (mask & AT_UID) {
3344			new_uid = zfs_fuid_create(zfsvfs,
3345			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3346			if (new_uid != zp->z_uid &&
3347			    zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
3348			    new_uid)) {
3349				if (attrzp)
3350					VN_RELE(ZTOV(attrzp));
3351				err = SET_ERROR(EDQUOT);
3352				goto out2;
3353			}
3354		}
3355
3356		if (mask & AT_GID) {
3357			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
3358			    cr, ZFS_GROUP, &fuidp);
3359			if (new_gid != zp->z_gid &&
3360			    zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
3361			    new_gid)) {
3362				if (attrzp)
3363					VN_RELE(ZTOV(attrzp));
3364				err = SET_ERROR(EDQUOT);
3365				goto out2;
3366			}
3367		}
3368
3369		if (projid != ZFS_INVALID_PROJID &&
3370		    zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
3371			if (attrzp)
3372				VN_RELE(ZTOV(attrzp));
			err = SET_ERROR(EDQUOT);
3374			goto out2;
3375		}
3376	}
3377	tx = dmu_tx_create(os);
3378
3379	if (mask & AT_MODE) {
3380		uint64_t pmode = zp->z_mode;
3381		uint64_t acl_obj;
3382		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3383
3384		if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
3385		    !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
3386			err = SET_ERROR(EPERM);
3387			goto out;
3388		}
3389
3390		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
3391			goto out;
3392
3393		mutex_enter(&zp->z_lock);
3394		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3395			/*
3396			 * Are we upgrading ACL from old V0 format
3397			 * to V1 format?
3398			 */
3399			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3400			    zfs_znode_acl_version(zp) ==
3401			    ZFS_ACL_VERSION_INITIAL) {
3402				dmu_tx_hold_free(tx, acl_obj, 0,
3403				    DMU_OBJECT_END);
3404				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3405				    0, aclp->z_acl_bytes);
3406			} else {
3407				dmu_tx_hold_write(tx, acl_obj, 0,
3408				    aclp->z_acl_bytes);
3409			}
3410		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3411			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3412			    0, aclp->z_acl_bytes);
3413		}
3414		mutex_exit(&zp->z_lock);
3415		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3416	} else {
3417		if (((mask & AT_XVATTR) &&
3418		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
3419		    (projid != ZFS_INVALID_PROJID &&
3420		    !(zp->z_pflags & ZFS_PROJID)))
3421			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3422		else
3423			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3424	}
3425
3426	if (attrzp) {
3427		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3428	}
3429
3430	fuid_dirtied = zfsvfs->z_fuid_dirty;
3431	if (fuid_dirtied)
3432		zfs_fuid_txhold(zfsvfs, tx);
3433
3434	zfs_sa_upgrade_txholds(tx, zp);
3435
3436	err = dmu_tx_assign(tx, TXG_WAIT);
3437	if (err)
3438		goto out;
3439
3440	count = 0;
3441	/*
3442	 * Set each attribute requested.
3443	 * We group settings according to the locks they need to acquire.
3444	 *
3445	 * Note: you cannot set ctime directly, although it will be
3446	 * updated as a side-effect of calling this function.
3447	 */
3448
3449	if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
3450		/*
		 * For an existing object that was upgraded from an old
		 * system, the on-disk layout has no slot for the project ID
		 * attribute.  But the quota accounting logic needs to access
		 * the related slots by offset directly, so we need to adjust
		 * the old object's layout to move the project ID to a
		 * unified and fixed offset.
3456		 */
3457		if (attrzp)
3458			err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
3459		if (err == 0)
3460			err = sa_add_projid(zp->z_sa_hdl, tx, projid);
3461
3462		if (unlikely(err == EEXIST))
3463			err = 0;
3464		else if (err != 0)
3465			goto out;
3466		else
3467			projid = ZFS_INVALID_PROJID;
3468	}
3469
3470	if (mask & (AT_UID|AT_GID|AT_MODE))
3471		mutex_enter(&zp->z_acl_lock);
3472	mutex_enter(&zp->z_lock);
3473
3474	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3475	    &zp->z_pflags, sizeof (zp->z_pflags));
3476
3477	if (attrzp) {
3478		if (mask & (AT_UID|AT_GID|AT_MODE))
3479			mutex_enter(&attrzp->z_acl_lock);
3480		mutex_enter(&attrzp->z_lock);
3481		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3482		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3483		    sizeof (attrzp->z_pflags));
3484		if (projid != ZFS_INVALID_PROJID) {
3485			attrzp->z_projid = projid;
3486			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3487			    SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
3488			    sizeof (attrzp->z_projid));
3489		}
3490	}
3491
3492	if (mask & (AT_UID|AT_GID)) {
3493
3494		if (mask & AT_UID) {
3495			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3496			    &new_uid, sizeof (new_uid));
3497			zp->z_uid = new_uid;
3498			if (attrzp) {
3499				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3500				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3501				    sizeof (new_uid));
3502				attrzp->z_uid = new_uid;
3503			}
3504		}
3505
3506		if (mask & AT_GID) {
3507			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3508			    NULL, &new_gid, sizeof (new_gid));
3509			zp->z_gid = new_gid;
3510			if (attrzp) {
3511				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3512				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3513				    sizeof (new_gid));
3514				attrzp->z_gid = new_gid;
3515			}
3516		}
3517		if (!(mask & AT_MODE)) {
3518			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3519			    NULL, &new_mode, sizeof (new_mode));
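			/*
			 * The bulk entry above records the address of
			 * new_mode; assigning the current mode afterwards
			 * works because sa_bulk_update() dereferences the
			 * pointer later, when the updates are applied.
			 */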
3520			new_mode = zp->z_mode;
3521		}
3522		err = zfs_acl_chown_setattr(zp);
3523		ASSERT(err == 0);
3524		if (attrzp) {
3525			err = zfs_acl_chown_setattr(attrzp);
3526			ASSERT(err == 0);
3527		}
3528	}
3529
3530	if (mask & AT_MODE) {
3531		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3532		    &new_mode, sizeof (new_mode));
3533		zp->z_mode = new_mode;
		ASSERT3P(aclp, !=, NULL);
3535		err = zfs_aclset_common(zp, aclp, cr, tx);
3536		ASSERT0(err);
3537		if (zp->z_acl_cached)
3538			zfs_acl_free(zp->z_acl_cached);
3539		zp->z_acl_cached = aclp;
3540		aclp = NULL;
3541	}
3542
3543
3544	if (mask & AT_ATIME) {
3545		ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3546		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3547		    &zp->z_atime, sizeof (zp->z_atime));
3548	}
3549
3550	if (mask & AT_MTIME) {
3551		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3552		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3553		    mtime, sizeof (mtime));
3554	}
3555
3556	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3557	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3558		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3559		    NULL, mtime, sizeof (mtime));
3560		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3561		    &ctime, sizeof (ctime));
3562		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3563		    B_TRUE);
3564	} else if (mask != 0) {
3565		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3566		    &ctime, sizeof (ctime));
3567		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3568		    B_TRUE);
3569		if (attrzp) {
3570			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3571			    SA_ZPL_CTIME(zfsvfs), NULL,
3572			    &ctime, sizeof (ctime));
3573			zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3574			    mtime, ctime, B_TRUE);
3575		}
3576	}
3577
3578	if (projid != ZFS_INVALID_PROJID) {
3579		zp->z_projid = projid;
3580		SA_ADD_BULK_ATTR(bulk, count,
3581		    SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
3582		    sizeof (zp->z_projid));
3583	}
3584
3585	/*
	 * Do this after setting the timestamps to prevent a timestamp
	 * update from toggling the bit.
3588	 */
3589
3590	if (xoap && (mask & AT_XVATTR)) {
3591
3592		/*
		 * Restore the trimmed-off masks
		 * so that the return masks can be set for the caller.
3595		 */
3596
3597		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3598			XVA_SET_REQ(xvap, XAT_APPENDONLY);
3599		}
3600		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3601			XVA_SET_REQ(xvap, XAT_NOUNLINK);
3602		}
3603		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3604			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3605		}
3606		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3607			XVA_SET_REQ(xvap, XAT_NODUMP);
3608		}
3609		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3610			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3611		}
3612		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3613			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3614		}
3615		if (XVA_ISSET_REQ(&tmpxvattr, XAT_PROJINHERIT)) {
3616			XVA_SET_REQ(xvap, XAT_PROJINHERIT);
3617		}
3618
3619		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3620			ASSERT(vp->v_type == VREG);
3621
3622		zfs_xvattr_set(zp, xvap, tx);
3623	}
3624
3625	if (fuid_dirtied)
3626		zfs_fuid_sync(zfsvfs, tx);
3627
3628	if (mask != 0)
3629		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3630
3631	mutex_exit(&zp->z_lock);
3632	if (mask & (AT_UID|AT_GID|AT_MODE))
3633		mutex_exit(&zp->z_acl_lock);
3634
3635	if (attrzp) {
3636		if (mask & (AT_UID|AT_GID|AT_MODE))
3637			mutex_exit(&attrzp->z_acl_lock);
3638		mutex_exit(&attrzp->z_lock);
3639	}
3640out:
3641	if (err == 0 && xattr_count > 0) {
3642		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3643		    xattr_count, tx);
3644		ASSERT(err2 == 0);
3645	}
3646
3647	if (aclp)
3648		zfs_acl_free(aclp);
3649
3650	if (fuidp) {
3651		zfs_fuid_info_free(fuidp);
3652		fuidp = NULL;
3653	}
3654
3655	if (err) {
3656		dmu_tx_abort(tx);
3657		if (attrzp)
3658			VN_RELE(ZTOV(attrzp));
3659		if (err == ERESTART)
3660			goto top;
3661	} else {
3662		if (count > 0)
3663			err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3664		dmu_tx_commit(tx);
3665		if (attrzp) {
3666			if (err2 == 0 && handle_eadir)
3667				err2 = zfs_setattr_dir(attrzp);
3668			VN_RELE(ZTOV(attrzp));
3669		}
3670	}
3671
3672out2:
3673	if (os->os_sync == ZFS_SYNC_ALWAYS)
3674		zil_commit(zilog, 0);
3675
3676	ZFS_EXIT(zfsvfs);
3677	return (err);
3678}
3679
3680typedef struct zfs_zlock {
3681	krwlock_t	*zl_rwlock;	/* lock we acquired */
3682	znode_t		*zl_znode;	/* znode we held */
3683	struct zfs_zlock *zl_next;	/* next in list */
3684} zfs_zlock_t;
3685
3686/*
3687 * Drop locks and release vnodes that were held by zfs_rename_lock().
3688 */
3689static void
3690zfs_rename_unlock(zfs_zlock_t **zlpp)
3691{
3692	zfs_zlock_t *zl;
3693
3694	while ((zl = *zlpp) != NULL) {
3695		if (zl->zl_znode != NULL)
3696			VN_RELE(ZTOV(zl->zl_znode));
3697		rw_exit(zl->zl_rwlock);
3698		*zlpp = zl->zl_next;
3699		kmem_free(zl, sizeof (*zl));
3700	}
3701}
3702
3703/*
3704 * Search back through the directory tree, using the ".." entries.
3705 * Lock each directory in the chain to prevent concurrent renames.
3706 * Fail any attempt to move a directory into one of its own descendants.
3707 * XXX - z_parent_lock can overlap with map or grow locks
3708 */
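/*
 * For example, an attempt to rename /usr/a/b into /usr/a/b/c/d walks
 * upward from the target directory via "..": d -> c -> b.  When the
 * walk reaches b, the source itself, the rename is rejected with
 * EINVAL before any entries have been modified.
 */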
3709static int
3710zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
3711{
3712	zfs_zlock_t	*zl;
3713	znode_t		*zp = tdzp;
3714	uint64_t	rootid = zp->z_zfsvfs->z_root;
3715	uint64_t	oidp = zp->z_id;
3716	krwlock_t	*rwlp = &szp->z_parent_lock;
3717	krw_t		rw = RW_WRITER;
3718
3719	/*
3720	 * First pass write-locks szp and compares to zp->z_id.
3721	 * Later passes read-lock zp and compare to zp->z_parent.
3722	 */
3723	do {
3724		if (!rw_tryenter(rwlp, rw)) {
3725			/*
3726			 * Another thread is renaming in this path.
3727			 * Note that if we are a WRITER, we don't have any
3728			 * parent_locks held yet.
3729			 */
3730			if (rw == RW_READER && zp->z_id > szp->z_id) {
3731				/*
3732				 * Drop our locks and restart
3733				 */
3734				zfs_rename_unlock(&zl);
3735				*zlpp = NULL;
3736				zp = tdzp;
3737				oidp = zp->z_id;
3738				rwlp = &szp->z_parent_lock;
3739				rw = RW_WRITER;
3740				continue;
3741			} else {
3742				/*
3743				 * Wait for other thread to drop its locks
3744				 */
3745				rw_enter(rwlp, rw);
3746			}
3747		}
3748
3749		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
3750		zl->zl_rwlock = rwlp;
3751		zl->zl_znode = NULL;
3752		zl->zl_next = *zlpp;
3753		*zlpp = zl;
3754
3755		if (oidp == szp->z_id)		/* We're a descendant of szp */
3756			return (SET_ERROR(EINVAL));
3757
3758		if (oidp == rootid)		/* We've hit the top */
3759			return (0);
3760
3761		if (rw == RW_READER) {		/* i.e. not the first pass */
3762			int error = zfs_zget(zp->z_zfsvfs, oidp, &zp);
3763			if (error)
3764				return (error);
3765			zl->zl_znode = zp;
3766		}
3767		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs),
3768		    &oidp, sizeof (oidp));
3769		rwlp = &zp->z_parent_lock;
3770		rw = RW_READER;
3771
3772	} while (zp->z_id != sdzp->z_id);
3773
3774	return (0);
3775}
3776
3777/*
3778 * Move an entry from the provided source directory to the target
3779 * directory.  Change the entry name as indicated.
3780 *
3781 *	IN:	sdvp	- Source directory containing the "old entry".
3782 *		snm	- Old entry name.
3783 *		tdvp	- Target directory to contain the "new entry".
3784 *		tnm	- New entry name.
3785 *		cr	- credentials of caller.
3786 *		ct	- caller context
3787 *		flags	- case flags
3788 *
3789 *	RETURN:	0 on success, error code on failure.
3790 *
3791 * Timestamps:
3792 *	sdvp,tdvp - ctime|mtime updated
3793 */
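/*
 * Illustrative only: a hypothetical caller moving "old" under sdvp to
 * "new" under tdvp would issue:
 *
 *	error = VOP_RENAME(sdvp, "old", tdvp, "new", cr, ct, 0);
 *
 * Renames across filesystems (or into the ctldir) fail with EXDEV.
 */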
3794/*ARGSUSED*/
3795static int
3796zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
3797    caller_context_t *ct, int flags)
3798{
3799	znode_t		*tdzp, *szp, *tzp;
3800	znode_t		*sdzp = VTOZ(sdvp);
3801	zfsvfs_t	*zfsvfs = sdzp->z_zfsvfs;
3802	zilog_t		*zilog;
3803	vnode_t		*realvp;
3804	zfs_dirlock_t	*sdl, *tdl;
3805	dmu_tx_t	*tx;
3806	zfs_zlock_t	*zl;
3807	int		cmp, serr, terr;
3808	int		error = 0, rm_err = 0;
3809	int		zflg = 0;
3810	boolean_t	waited = B_FALSE;
3811
3812	ZFS_ENTER(zfsvfs);
3813	ZFS_VERIFY_ZP(sdzp);
3814	zilog = zfsvfs->z_log;
3815
3816	/*
3817	 * Make sure we have the real vp for the target directory.
3818	 */
3819	if (VOP_REALVP(tdvp, &realvp, ct) == 0)
3820		tdvp = realvp;
3821
3822	tdzp = VTOZ(tdvp);
3823	ZFS_VERIFY_ZP(tdzp);
3824
3825	/*
3826	 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
3827	 * ctldir appear to have the same v_vfsp.
3828	 */
3829	if (tdzp->z_zfsvfs != zfsvfs || zfsctl_is_node(tdvp)) {
3830		ZFS_EXIT(zfsvfs);
3831		return (SET_ERROR(EXDEV));
3832	}
3833
3834	if (zfsvfs->z_utf8 && u8_validate(tnm,
3835	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3836		ZFS_EXIT(zfsvfs);
3837		return (SET_ERROR(EILSEQ));
3838	}
3839
3840	if (flags & FIGNORECASE)
3841		zflg |= ZCILOOK;
3842
3843top:
3844	szp = NULL;
3845	tzp = NULL;
3846	zl = NULL;
3847
3848	/*
3849	 * This is to prevent the creation of links into attribute space
	 * by renaming a linked file into/out of an attribute directory.
3851	 * See the comment in zfs_link() for why this is considered bad.
3852	 */
3853	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
3854		ZFS_EXIT(zfsvfs);
3855		return (SET_ERROR(EINVAL));
3856	}
3857
3858	/*
3859	 * Lock source and target directory entries.  To prevent deadlock,
3860	 * a lock ordering must be defined.  We lock the directory with
3861	 * the smallest object id first, or if it's a tie, the one with
3862	 * the lexically first name.
3863	 */
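	/*
	 * For instance, if sdzp->z_id is 5 and tdzp->z_id is 9, the source
	 * entry is locked first (cmp < 0 below); the name comparison breaks
	 * the tie only when both entries live in the same directory.
	 */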
3864	if (sdzp->z_id < tdzp->z_id) {
3865		cmp = -1;
3866	} else if (sdzp->z_id > tdzp->z_id) {
3867		cmp = 1;
3868	} else {
3869		/*
3870		 * First compare the two name arguments without
3871		 * considering any case folding.
3872		 */
3873		int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
3874
3875		cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
3876		ASSERT(error == 0 || !zfsvfs->z_utf8);
3877		if (cmp == 0) {
3878			/*
3879			 * POSIX: "If the old argument and the new argument
3880			 * both refer to links to the same existing file,
3881			 * the rename() function shall return successfully
3882			 * and perform no other action."
3883			 */
3884			ZFS_EXIT(zfsvfs);
3885			return (0);
3886		}
3887		/*
		 * If the file system is case-folding, then we may
		 * have some more checking to do.  A case-folding file
		 * system either supports mixed case-sensitivity
		 * access or is completely case-insensitive.  Note
		 * that the file system is always case-preserving.
3893		 *
3894		 * In mixed sensitivity mode case sensitive behavior
3895		 * is the default.  FIGNORECASE must be used to
3896		 * explicitly request case insensitive behavior.
3897		 *
3898		 * If the source and target names provided differ only
3899		 * by case (e.g., a request to rename 'tim' to 'Tim'),
3900		 * we will treat this as a special case in the
3901		 * case-insensitive mode: as long as the source name
3902		 * is an exact match, we will allow this to proceed as
3903		 * a name-change request.
3904		 */
3905		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
3906		    (zfsvfs->z_case == ZFS_CASE_MIXED &&
3907		    flags & FIGNORECASE)) &&
3908		    u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
3909		    &error) == 0) {
3910			/*
3911			 * case preserving rename request, require exact
3912			 * name matches
3913			 */
3914			zflg |= ZCIEXACT;
3915			zflg &= ~ZCILOOK;
3916		}
3917	}
3918
3919	/*
3920	 * If the source and destination directories are the same, we should
3921	 * grab the z_name_lock of that directory only once.
3922	 */
3923	if (sdzp == tdzp) {
3924		zflg |= ZHAVELOCK;
3925		rw_enter(&sdzp->z_name_lock, RW_READER);
3926	}
3927
3928	if (cmp < 0) {
3929		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
3930		    ZEXISTS | zflg, NULL, NULL);
3931		terr = zfs_dirent_lock(&tdl,
3932		    tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
3933	} else {
3934		terr = zfs_dirent_lock(&tdl,
3935		    tdzp, tnm, &tzp, zflg, NULL, NULL);
3936		serr = zfs_dirent_lock(&sdl,
3937		    sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
3938		    NULL, NULL);
3939	}
3940
3941	if (serr) {
3942		/*
3943		 * Source entry invalid or not there.
3944		 */
3945		if (!terr) {
3946			zfs_dirent_unlock(tdl);
3947			if (tzp)
3948				VN_RELE(ZTOV(tzp));
3949		}
3950
3951		if (sdzp == tdzp)
3952			rw_exit(&sdzp->z_name_lock);
3953
3954		if (strcmp(snm, "..") == 0)
3955			serr = SET_ERROR(EINVAL);
3956		ZFS_EXIT(zfsvfs);
3957		return (serr);
3958	}
3959	if (terr) {
3960		zfs_dirent_unlock(sdl);
3961		VN_RELE(ZTOV(szp));
3962
3963		if (sdzp == tdzp)
3964			rw_exit(&sdzp->z_name_lock);
3965
3966		if (strcmp(tnm, "..") == 0)
3967			terr = SET_ERROR(EINVAL);
3968		ZFS_EXIT(zfsvfs);
3969		return (terr);
3970	}
3971
3972	/*
3973	 * Project inheritance means that if the directory has
3974	 * ZFS_PROJINHERIT set, its descendant directories inherit not
3975	 * only the project ID but also the ZFS_PROJINHERIT flag.  In
3976	 * that case, we only allow renames into our tree when the
3977	 * project IDs are the same.
3978	 */
3979	if (tdzp->z_pflags & ZFS_PROJINHERIT &&
3980	    tdzp->z_projid != szp->z_projid) {
3981		error = SET_ERROR(EXDEV);
3982		goto out;
3983	}
3984
3985	/*
3986	 * Must have write access at the source to remove the old entry
3987	 * and write access at the target to create the new entry.
3988	 * Note that if target and source are the same, this can be
3989	 * done in a single check.
3990	 */
3991
3992	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
3993		goto out;
3994
3995	if (ZTOV(szp)->v_type == VDIR) {
3996		/*
3997		 * Check to make sure rename is valid.
3998		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3999		 */
4000		if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
4001			goto out;
4002	}
4003
4004	/*
4005	 * Does target exist?
4006	 */
4007	if (tzp) {
4008		/*
4009		 * Source and target must be the same type.
4010		 */
4011		if (ZTOV(szp)->v_type == VDIR) {
4012			if (ZTOV(tzp)->v_type != VDIR) {
4013				error = SET_ERROR(ENOTDIR);
4014				goto out;
4015			}
4016		} else {
4017			if (ZTOV(tzp)->v_type == VDIR) {
4018				error = SET_ERROR(EISDIR);
4019				goto out;
4020			}
4021		}
4022		/*
4023		 * POSIX dictates that when the source and target
4024		 * entries refer to the same file object, rename
4025		 * must do nothing and exit without error.
4026		 */
4027		if (szp->z_id == tzp->z_id) {
4028			error = 0;
4029			goto out;
4030		}
4031	}
4032
4033	vnevent_pre_rename_src(ZTOV(szp), sdvp, snm, ct);
4034	if (tzp)
4035		vnevent_pre_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
4036
4037	/*
4038	 * Notify the target directory if it is not the same
4039	 * as the source directory.
4040	 */
4041	if (tdvp != sdvp) {
4042		vnevent_pre_rename_dest_dir(tdvp, ZTOV(szp), tnm, ct);
4043	}
4044
4045	tx = dmu_tx_create(zfsvfs->z_os);
4046	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4047	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
4048	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
4049	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
4050	if (sdzp != tdzp) {
4051		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
4052		zfs_sa_upgrade_txholds(tx, tdzp);
4053	}
4054	if (tzp) {
4055		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
4056		zfs_sa_upgrade_txholds(tx, tzp);
4057	}
4058
4059	zfs_sa_upgrade_txholds(tx, szp);
4060	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
4061	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
4062	if (error) {
4063		if (zl != NULL)
4064			zfs_rename_unlock(&zl);
4065		zfs_dirent_unlock(sdl);
4066		zfs_dirent_unlock(tdl);
4067
4068		if (sdzp == tdzp)
4069			rw_exit(&sdzp->z_name_lock);
4070
4071		VN_RELE(ZTOV(szp));
4072		if (tzp)
4073			VN_RELE(ZTOV(tzp));
4074		if (error == ERESTART) {
4075			waited = B_TRUE;
4076			dmu_tx_wait(tx);
4077			dmu_tx_abort(tx);
4078			goto top;
4079		}
4080		dmu_tx_abort(tx);
4081		ZFS_EXIT(zfsvfs);
4082		return (error);
4083	}
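
	/*
	 * A note on the retry idiom above (a description, not new
	 * behavior): the first dmu_tx_assign() uses TXG_NOWAIT so that
	 * the dirent locks are not held while waiting for a transaction;
	 * on ERESTART all locks are dropped, dmu_tx_wait() blocks until
	 * assignment is likely to succeed, and the operation restarts
	 * from `top' with TXG_NOTHROTTLE set via `waited'.
	 */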
4084
4085	if (tzp)	/* Attempt to remove the existing target */
4086		error = rm_err = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
4087
4088	if (error == 0) {
4089		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
4090		if (error == 0) {
4091			szp->z_pflags |= ZFS_AV_MODIFIED;
4092			if (tdzp->z_pflags & ZFS_PROJINHERIT)
4093				szp->z_pflags |= ZFS_PROJINHERIT;
4094
4095			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
4096			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
4097			ASSERT0(error);
4098
4099			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
4100			if (error == 0) {
4101				zfs_log_rename(zilog, tx, TX_RENAME |
4102				    (flags & FIGNORECASE ? TX_CI : 0), sdzp,
4103				    sdl->dl_name, tdzp, tdl->dl_name, szp);
4104
4105				/*
4106				 * Update path information for the target vnode
4107				 */
4108				vn_renamepath(tdvp, ZTOV(szp), tnm,
4109				    strlen(tnm));
4110			} else {
4111				/*
4112				 * At this point, we have successfully created
4113				 * the target name, but have failed to remove
4114				 * the source name.  Since the create was done
4115				 * with the ZRENAMING flag, there are
4116				 * complications; for one, the link count is
4117				 * wrong.  The easiest way to deal with this
4118				 * is to remove the newly created target, and
4119				 * return the original error.  This must
4120				 * succeed; fortunately, it is very unlikely to
4121				 * fail, since we just created it.
4122				 */
4123				VERIFY3U(zfs_link_destroy(tdl, szp, tx,
4124				    ZRENAMING, NULL), ==, 0);
4125			}
4126		}
4127	}
4128
4129	dmu_tx_commit(tx);
4130
4131	if (tzp && rm_err == 0)
4132		vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
4133
4134	if (error == 0) {
4135		vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
4136		/* notify the target dir if it is not the same as source dir */
4137		if (tdvp != sdvp)
4138			vnevent_rename_dest_dir(tdvp, ct);
4139	}
4140out:
4141	if (zl != NULL)
4142		zfs_rename_unlock(&zl);
4143
4144	zfs_dirent_unlock(sdl);
4145	zfs_dirent_unlock(tdl);
4146
4147	if (sdzp == tdzp)
4148		rw_exit(&sdzp->z_name_lock);
4149
4151	VN_RELE(ZTOV(szp));
4152	if (tzp)
4153		VN_RELE(ZTOV(tzp));
4154
4155	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4156		zil_commit(zilog, 0);
4157
4158	ZFS_EXIT(zfsvfs);
4159	return (error);
4160}
4161
4162/*
4163 * Insert the indicated symbolic reference entry into the directory.
4164 *
4165 *	IN:	dvp	- Directory to contain new symbolic link.
4166 *		link	- Name for new symlink entry.
4167 *		vap	- Attributes of new entry.
4168 *		cr	- credentials of caller.
4169 *		ct	- caller context
4170 *		flags	- case flags
4171 *
4172 *	RETURN:	0 on success, error code on failure.
4173 *
4174 * Timestamps:
4175 *	dvp - ctime|mtime updated
4176 */
4177/*ARGSUSED*/
4178static int
4179zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr,
4180    caller_context_t *ct, int flags)
4181{
4182	znode_t		*zp, *dzp = VTOZ(dvp);
4183	zfs_dirlock_t	*dl;
4184	dmu_tx_t	*tx;
4185	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
4186	zilog_t		*zilog;
4187	uint64_t	len = strlen(link);
4188	int		error;
4189	int		zflg = ZNEW;
4190	zfs_acl_ids_t	acl_ids;
4191	boolean_t	fuid_dirtied;
4192	uint64_t	txtype = TX_SYMLINK;
4193	boolean_t	waited = B_FALSE;
4194
4195	ASSERT(vap->va_type == VLNK);
4196
4197	ZFS_ENTER(zfsvfs);
4198	ZFS_VERIFY_ZP(dzp);
4199	zilog = zfsvfs->z_log;
4200
4201	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
4202	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4203		ZFS_EXIT(zfsvfs);
4204		return (SET_ERROR(EILSEQ));
4205	}
4206	if (flags & FIGNORECASE)
4207		zflg |= ZCILOOK;
4208
4209	if (len > MAXPATHLEN) {
4210		ZFS_EXIT(zfsvfs);
4211		return (SET_ERROR(ENAMETOOLONG));
4212	}
4213
4214	if ((error = zfs_acl_ids_create(dzp, 0,
4215	    vap, cr, NULL, &acl_ids)) != 0) {
4216		ZFS_EXIT(zfsvfs);
4217		return (error);
4218	}
4219top:
4220	/*
4221	 * Attempt to lock directory; fail if entry already exists.
4222	 */
4223	error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
4224	if (error) {
4225		zfs_acl_ids_free(&acl_ids);
4226		ZFS_EXIT(zfsvfs);
4227		return (error);
4228	}
4229
4230	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4231		zfs_acl_ids_free(&acl_ids);
4232		zfs_dirent_unlock(dl);
4233		ZFS_EXIT(zfsvfs);
4234		return (error);
4235	}
4236
4237	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) {
4238		zfs_acl_ids_free(&acl_ids);
4239		zfs_dirent_unlock(dl);
4240		ZFS_EXIT(zfsvfs);
4241		return (SET_ERROR(EDQUOT));
4242	}
4243	tx = dmu_tx_create(zfsvfs->z_os);
4244	fuid_dirtied = zfsvfs->z_fuid_dirty;
4245	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
4246	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4247	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
4248	    ZFS_SA_BASE_ATTR_SIZE + len);
4249	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
4250	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
4251		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
4252		    acl_ids.z_aclp->z_acl_bytes);
4253	}
4254	if (fuid_dirtied)
4255		zfs_fuid_txhold(zfsvfs, tx);
4256	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
4257	if (error) {
4258		zfs_dirent_unlock(dl);
4259		if (error == ERESTART) {
4260			waited = B_TRUE;
4261			dmu_tx_wait(tx);
4262			dmu_tx_abort(tx);
4263			goto top;
4264		}
4265		zfs_acl_ids_free(&acl_ids);
4266		dmu_tx_abort(tx);
4267		ZFS_EXIT(zfsvfs);
4268		return (error);
4269	}
4270
4271	/*
4272	 * Create a new object for the symlink.
4273	 * For version 4 ZPL datasets the symlink will be an SA attribute.
4274	 */
4275	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
4276
4277	if (fuid_dirtied)
4278		zfs_fuid_sync(zfsvfs, tx);
4279
4280	mutex_enter(&zp->z_lock);
4281	if (zp->z_is_sa)
4282		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
4283		    link, len, tx);
4284	else
4285		zfs_sa_symlink(zp, link, len, tx);
4286	mutex_exit(&zp->z_lock);
4287
4288	zp->z_size = len;
4289	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
4290	    &zp->z_size, sizeof (zp->z_size), tx);
4291	/*
4292	 * Insert the new object into the directory.
4293	 */
4294	(void) zfs_link_create(dl, zp, tx, ZNEW);
4295
4296	if (flags & FIGNORECASE)
4297		txtype |= TX_CI;
4298	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
4299
4300	zfs_acl_ids_free(&acl_ids);
4301
4302	dmu_tx_commit(tx);
4303
4304	zfs_dirent_unlock(dl);
4305
4306	VN_RELE(ZTOV(zp));
4307
4308	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4309		zil_commit(zilog, 0);
4310
4311	ZFS_EXIT(zfsvfs);
4312	return (error);
4313}
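
/*
 * Illustrative call path (hypothetical names): a user-level
 * symlink("/a/target", "/pool/fs/lnk") arrives here via VOP_SYMLINK()
 * as zfs_symlink(dvp, "lnk", vap, "/a/target", cr, ct, 0), with
 * vap->va_type already set to VLNK by the caller.
 */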
4314
4315/*
4316 * Return, in the buffer contained in the provided uio structure,
4317 * the symbolic path referred to by vp.
4318 *
4319 *	IN:	vp	- vnode of symbolic link.
4320 *		uio	- structure to contain the link path.
4321 *		cr	- credentials of caller.
4322 *		ct	- caller context
4323 *
4324 *	OUT:	uio	- structure containing the link path.
4325 *
4326 *	RETURN:	0 on success, error code on failure.
4327 *
4328 * Timestamps:
4329 *	vp - atime updated
4330 */
4331/* ARGSUSED */
4332static int
4333zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
4334{
4335	znode_t		*zp = VTOZ(vp);
4336	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4337	int		error;
4338
4339	ZFS_ENTER(zfsvfs);
4340	ZFS_VERIFY_ZP(zp);
4341
4342	mutex_enter(&zp->z_lock);
4343	if (zp->z_is_sa)
4344		error = sa_lookup_uio(zp->z_sa_hdl,
4345		    SA_ZPL_SYMLINK(zfsvfs), uio);
4346	else
4347		error = zfs_sa_readlink(zp, uio);
4348	mutex_exit(&zp->z_lock);
4349
4350	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4351
4352	ZFS_EXIT(zfsvfs);
4353	return (error);
4354}
4355
4356/*
4357 * Insert a new entry into directory tdvp referencing svp.
4358 *
4359 *	IN:	tdvp	- Directory to contain new entry.
4360 *		svp	- vnode of new entry.
4361 *		name	- name of new entry.
4362 *		cr	- credentials of caller.
4363 *		ct	- caller context
4364 *
4365 *	RETURN:	0 on success, error code on failure.
4366 *
4367 * Timestamps:
4368 *	tdvp - ctime|mtime updated
4369 *	 svp - ctime updated
4370 */
4371/* ARGSUSED */
4372static int
4373zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
4374    caller_context_t *ct, int flags)
4375{
4376	znode_t		*dzp = VTOZ(tdvp);
4377	znode_t		*tzp, *szp;
4378	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
4379	zilog_t		*zilog;
4380	zfs_dirlock_t	*dl;
4381	dmu_tx_t	*tx;
4382	vnode_t		*realvp;
4383	int		error;
4384	int		zf = ZNEW;
4385	uint64_t	parent;
4386	uid_t		owner;
4387	boolean_t	waited = B_FALSE;
4388
4389	ASSERT(tdvp->v_type == VDIR);
4390
4391	ZFS_ENTER(zfsvfs);
4392	ZFS_VERIFY_ZP(dzp);
4393	zilog = zfsvfs->z_log;
4394
4395	if (VOP_REALVP(svp, &realvp, ct) == 0)
4396		svp = realvp;
4397
4398	/*
4399	 * POSIX dictates that we return EPERM here.
4400	 * Better choices include ENOTSUP or EISDIR.
4401	 */
4402	if (svp->v_type == VDIR) {
4403		ZFS_EXIT(zfsvfs);
4404		return (SET_ERROR(EPERM));
4405	}
4406
4407	szp = VTOZ(svp);
4408	ZFS_VERIFY_ZP(szp);
4409
4410	/*
4411	 * Project inheritance means that if the directory has
4412	 * ZFS_PROJINHERIT set, its descendant directories inherit not
4413	 * only the project ID but also the ZFS_PROJINHERIT flag.  In
4414	 * that case, we only allow hard link creation in our tree when
4415	 * the project IDs are the same.
4416	 */
4417	if (dzp->z_pflags & ZFS_PROJINHERIT && dzp->z_projid != szp->z_projid) {
4418		ZFS_EXIT(zfsvfs);
4419		return (SET_ERROR(EXDEV));
4420	}
4421
4422	/*
4423	 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
4424	 * ctldir appear to have the same v_vfsp.
4425	 */
4426	if (szp->z_zfsvfs != zfsvfs || zfsctl_is_node(svp)) {
4427		ZFS_EXIT(zfsvfs);
4428		return (SET_ERROR(EXDEV));
4429	}
4430
4431	/* Prevent links to .zfs/shares files */
4432
4433	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
4434	    &parent, sizeof (uint64_t))) != 0) {
4435		ZFS_EXIT(zfsvfs);
4436		return (error);
4437	}
4438	if (parent == zfsvfs->z_shares_dir) {
4439		ZFS_EXIT(zfsvfs);
4440		return (SET_ERROR(EPERM));
4441	}
4442
4443	if (zfsvfs->z_utf8 && u8_validate(name,
4444	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4445		ZFS_EXIT(zfsvfs);
4446		return (SET_ERROR(EILSEQ));
4447	}
4448	if (flags & FIGNORECASE)
4449		zf |= ZCILOOK;
4450
4451	/*
4452	 * We do not support links between attributes and non-attributes
4453	 * because of the potential security risk of creating links
4454	 * into "normal" file space in order to circumvent restrictions
4455	 * imposed in attribute space.
4456	 */
4457	if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
4458		ZFS_EXIT(zfsvfs);
4459		return (SET_ERROR(EINVAL));
4460	}
4461
4463	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
4464	if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
4465		ZFS_EXIT(zfsvfs);
4466		return (SET_ERROR(EPERM));
4467	}
4468
4469	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4470		ZFS_EXIT(zfsvfs);
4471		return (error);
4472	}
4473
4474top:
4475	/*
4476	 * Attempt to lock directory; fail if entry already exists.
4477	 */
4478	error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
4479	if (error) {
4480		ZFS_EXIT(zfsvfs);
4481		return (error);
4482	}
4483
4484	tx = dmu_tx_create(zfsvfs->z_os);
4485	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4486	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4487	zfs_sa_upgrade_txholds(tx, szp);
4488	zfs_sa_upgrade_txholds(tx, dzp);
4489	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
4490	if (error) {
4491		zfs_dirent_unlock(dl);
4492		if (error == ERESTART) {
4493			waited = B_TRUE;
4494			dmu_tx_wait(tx);
4495			dmu_tx_abort(tx);
4496			goto top;
4497		}
4498		dmu_tx_abort(tx);
4499		ZFS_EXIT(zfsvfs);
4500		return (error);
4501	}
4502
4503	error = zfs_link_create(dl, szp, tx, 0);
4504
4505	if (error == 0) {
4506		uint64_t txtype = TX_LINK;
4507		if (flags & FIGNORECASE)
4508			txtype |= TX_CI;
4509		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
4510	}
4511
4512	dmu_tx_commit(tx);
4513
4514	zfs_dirent_unlock(dl);
4515
4516	if (error == 0) {
4517		vnevent_link(svp, ct);
4518	}
4519
4520	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4521		zil_commit(zilog, 0);
4522
4523	ZFS_EXIT(zfsvfs);
4524	return (error);
4525}
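
/*
 * Illustrative call path (hypothetical names): a user-level
 * link("/pool/fs/a", "/pool/fs/dir/b") arrives here via VOP_LINK() as
 * zfs_link(tdvp, svp, "b", cr, ct, 0), where tdvp is the vnode for
 * "dir" and svp is the vnode for "a".
 */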
4526
4527/*
4528 * zfs_null_putapage() is used when the file system has been force
4529 * unmounted. It just drops the pages.
4530 */
4531/* ARGSUSED */
4532static int
4533zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4534    size_t *lenp, int flags, cred_t *cr)
4535{
4536	pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
4537	return (0);
4538}
4539
4540/*
4541 * Push a page out to disk, klustering if possible.
4542 *
4543 *	IN:	vp	- file to push page to.
4544 *		pp	- page to push.
4545 *		flags	- additional flags.
4546 *		cr	- credentials of caller.
4547 *
4548 *	OUT:	offp	- start of range pushed.
4549 *		lenp	- len of range pushed.
4550 *
4551 *	RETURN:	0 on success, error code on failure.
4552 *
4553 * NOTE: callers must have locked the page to be pushed.  On
4554 * exit, the page (and all other pages in the kluster) must be
4555 * unlocked.
4556 */
4557/* ARGSUSED */
4558static int
4559zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4560    size_t *lenp, int flags, cred_t *cr)
4561{
4562	znode_t		*zp = VTOZ(vp);
4563	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4564	dmu_tx_t	*tx;
4565	u_offset_t	off, koff;
4566	size_t		len, klen;
4567	int		err;
4568
4569	off = pp->p_offset;
4570	len = PAGESIZE;
4571	/*
4572	 * If our blocksize is bigger than the page size, try to kluster
4573	 * multiple pages so that we write a full block (thus avoiding
4574	 * a read-modify-write).
4575	 */
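	/*
	 * Worked example (hypothetical sizes): with 8K blocks and 4K
	 * pages, a dirty page at offset 20K yields klen = 8K and
	 * koff = P2ALIGN(20K, 8K) = 16K, so the pages at 16K and 20K
	 * are pushed together as one full-block write.
	 */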
4576	if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
4577		klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
4578		koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
4579		ASSERT(koff <= zp->z_size);
4580		if (koff + klen > zp->z_size)
4581			klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
4582		pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
4583	}
4584	ASSERT3U(btop(len), ==, btopr(len));
4585
4586	/*
4587	 * Can't push pages past end-of-file.
4588	 */
4589	if (off >= zp->z_size) {
4590		/* ignore all pages */
4591		err = 0;
4592		goto out;
4593	} else if (off + len > zp->z_size) {
4594		int npages = btopr(zp->z_size - off);
4595		page_t *trunc;
4596
4597		page_list_break(&pp, &trunc, npages);
4598		/* ignore pages past end of file */
4599		if (trunc)
4600			pvn_write_done(trunc, flags);
4601		len = zp->z_size - off;
4602	}
4603
4604	if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, zp->z_uid) ||
4605	    zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, zp->z_gid)) {
4606		err = SET_ERROR(EDQUOT);
4607		goto out;
4608	}
4609	tx = dmu_tx_create(zfsvfs->z_os);
4610	dmu_tx_hold_write(tx, zp->z_id, off, len);
4611
4612	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4613	zfs_sa_upgrade_txholds(tx, zp);
4614	err = dmu_tx_assign(tx, TXG_WAIT);
4615	if (err != 0) {
4616		dmu_tx_abort(tx);
4617		goto out;
4618	}
4619
4620	if (zp->z_blksz <= PAGESIZE) {
4621		caddr_t va = zfs_map_page(pp, S_READ);
4622		ASSERT3U(len, <=, PAGESIZE);
4623		dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
4624		zfs_unmap_page(pp, va);
4625	} else {
4626		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
4627	}
4628
4629	if (err == 0) {
4630		uint64_t mtime[2], ctime[2];
4631		sa_bulk_attr_t bulk[3];
4632		int count = 0;
4633
4634		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4635		    &mtime, 16);
4636		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
4637		    &ctime, 16);
4638		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
4639		    &zp->z_pflags, 8);
4640		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
4641		    B_TRUE);
4642		err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
4643		ASSERT0(err);
4644		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
4645	}
4646	dmu_tx_commit(tx);
4647
4648out:
4649	pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
4650	if (offp)
4651		*offp = off;
4652	if (lenp)
4653		*lenp = len;
4654
4655	return (err);
4656}
4657
4658/*
4659 * Copy the indicated portion of the file from in-core pages into the file.
4660 * The pages are stored in a page list attached to the file's vnode.
4661 *
4662 *	IN:	vp	- vnode of file to push page data to.
4663 *		off	- position in file to put data.
4664 *		len	- amount of data to write.
4665 *		flags	- flags to control the operation.
4666 *		cr	- credentials of caller.
4667 *		ct	- caller context.
4668 *
4669 *	RETURN:	0 on success, error code on failure.
4670 *
4671 * Timestamps:
4672 *	vp - ctime|mtime updated
4673 */
4674/*ARGSUSED*/
4675static int
4676zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
4677    caller_context_t *ct)
4678{
4679	znode_t		*zp = VTOZ(vp);
4680	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4681	page_t		*pp;
4682	size_t		io_len;
4683	u_offset_t	io_off;
4684	uint_t		blksz;
4685	locked_range_t	*lr;
4686	int		error = 0;
4687
4688	ZFS_ENTER(zfsvfs);
4689	ZFS_VERIFY_ZP(zp);
4690
4691	/*
4692	 * There's nothing to do if no data is cached.
4693	 */
4694	if (!vn_has_cached_data(vp)) {
4695		ZFS_EXIT(zfsvfs);
4696		return (0);
4697	}
4698
4699	/*
4700	 * Align this request to the file block size in case we kluster.
4701 * XXX - this can result in pretty aggressive locking, which can
4702 * impact simultaneous read/write access.  One option might be
4703	 * to break up long requests (len == 0) into block-by-block
4704	 * operations to get narrower locking.
4705	 */
4706	blksz = zp->z_blksz;
4707	if (ISP2(blksz))
4708		io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
4709	else
4710		io_off = 0;
4711	if (len > 0 && ISP2(blksz))
4712		io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
4713	else
4714		io_len = 0;
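
	/*
	 * Worked example (hypothetical sizes): blksz = 8K, off = 9K,
	 * len = 2K gives io_off = 8K and io_len = P2ROUNDUP(2K + 1K, 8K)
	 * = 8K, i.e. the single 8K block containing the request.
	 */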
4715
4716	if (io_len == 0) {
4717		/*
4718		 * Search the entire vp list for pages >= io_off.
4719		 */
4720		lr = rangelock_enter(&zp->z_rangelock,
4721		    io_off, UINT64_MAX, RL_WRITER);
4722		error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
4723		goto out;
4724	}
4725	lr = rangelock_enter(&zp->z_rangelock, io_off, io_len, RL_WRITER);
4726
4727	if (off > zp->z_size) {
4728		/* past end of file */
4729		rangelock_exit(lr);
4730		ZFS_EXIT(zfsvfs);
4731		return (0);
4732	}
4733
4734	len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);
4735
4736	for (off = io_off; io_off < off + len; io_off += io_len) {
4737		if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
4738			pp = page_lookup(vp, io_off,
4739			    (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
4740		} else {
4741			pp = page_lookup_nowait(vp, io_off,
4742			    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
4743		}
4744
4745		if (pp != NULL && pvn_getdirty(pp, flags)) {
4746			int err;
4747
4748			/*
4749			 * Found a dirty page to push
4750			 */
4751			err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
4752			if (err)
4753				error = err;
4754		} else {
4755			io_len = PAGESIZE;
4756		}
4757	}
4758out:
4759	rangelock_exit(lr);
4760	if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4761		zil_commit(zfsvfs->z_log, zp->z_id);
4762	ZFS_EXIT(zfsvfs);
4763	return (error);
4764}
4765
4766/*ARGSUSED*/
4767void
4768zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4769{
4770	znode_t	*zp = VTOZ(vp);
4771	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4772	int error;
4773
4774	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4775	if (zp->z_sa_hdl == NULL) {
4776		/*
4777		 * The fs has been unmounted, or we did a
4778		 * suspend/resume and this file no longer exists.
4779		 */
4780		if (vn_has_cached_data(vp)) {
4781			(void) pvn_vplist_dirty(vp, 0, zfs_null_putapage,
4782			    B_INVAL, cr);
4783		}
4784
4785		mutex_enter(&zp->z_lock);
4786		mutex_enter(&vp->v_lock);
4787		ASSERT(vp->v_count == 1);
4788		VN_RELE_LOCKED(vp);
4789		mutex_exit(&vp->v_lock);
4790		mutex_exit(&zp->z_lock);
4791		rw_exit(&zfsvfs->z_teardown_inactive_lock);
4792		zfs_znode_free(zp);
4793		return;
4794	}
4795
4796	/*
4797	 * Attempt to push any data in the page cache.  If this fails
4798	 * we will get kicked out later in zfs_zinactive().
4799	 */
4800	if (vn_has_cached_data(vp)) {
4801		(void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL|B_ASYNC,
4802		    cr);
4803	}
4804
4805	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4806		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4807
4808		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4809		zfs_sa_upgrade_txholds(tx, zp);
4810		error = dmu_tx_assign(tx, TXG_WAIT);
4811		if (error) {
4812			dmu_tx_abort(tx);
4813		} else {
4814			mutex_enter(&zp->z_lock);
4815			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4816			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
4817			zp->z_atime_dirty = 0;
4818			mutex_exit(&zp->z_lock);
4819			dmu_tx_commit(tx);
4820		}
4821	}
4822
4823	zfs_zinactive(zp);
4824	rw_exit(&zfsvfs->z_teardown_inactive_lock);
4825}
4826
4827/*
4828 * Bounds-check the seek operation.
4829 *
4830 *	IN:	vp	- vnode seeking within
4831 *		ooff	- old file offset
4832 *		noffp	- pointer to new file offset
4833 *		ct	- caller context
4834 *
4835 *	RETURN:	0 on success, EINVAL if new offset invalid.
4836 */
4837/* ARGSUSED */
4838static int
4839zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
4840    caller_context_t *ct)
4841{
4842	if (vp->v_type == VDIR)
4843		return (0);
4844	return ((*noffp < 0) ? EINVAL : 0);
4845}
4846
4847/*
4848 * Pre-filter the generic locking function to trap attempts to place
4849 * a mandatory lock on a memory mapped file.
4850 */
4851static int
4852zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
4853    flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
4854{
4855	znode_t *zp = VTOZ(vp);
4856	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4857
4858	ZFS_ENTER(zfsvfs);
4859	ZFS_VERIFY_ZP(zp);
4860
4861	/*
4862	 * We are following the UFS semantics with respect to mapcnt
4863	 * here: If we see that the file is mapped already, then we will
4864	 * return an error, but we don't worry about races between this
4865	 * function and zfs_map().
4866	 */
4867	if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) {
4868		ZFS_EXIT(zfsvfs);
4869		return (SET_ERROR(EAGAIN));
4870	}
4871	ZFS_EXIT(zfsvfs);
4872	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
4873}
4874
4875/*
4876 * If we can't find a page in the cache, we will create a new page
4877 * and fill it with file data.  For efficiency, we may try to fill
4878 * multiple pages at once (klustering) to fill up the supplied page
4879 * list.  Note that the pages to be filled are held with an exclusive
4880 * lock to prevent access by other threads while they are being filled.
4881 */
4882static int
4883zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
4884    caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
4885{
4886	znode_t *zp = VTOZ(vp);
4887	page_t *pp, *cur_pp;
4888	objset_t *os = zp->z_zfsvfs->z_os;
4889	u_offset_t io_off, total;
4890	size_t io_len;
4891	int err;
4892
4893	if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
4894		/*
4895		 * We only have a single page; don't bother klustering.
4896		 */
4897		io_off = off;
4898		io_len = PAGESIZE;
4899		pp = page_create_va(vp, io_off, io_len,
4900		    PG_EXCL | PG_WAIT, seg, addr);
4901	} else {
4902		/*
4903		 * Try to find enough pages to fill the page list
4904		 */
4905		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
4906		    &io_len, off, plsz, 0);
4907	}
4908	if (pp == NULL) {
4909		/*
4910		 * The page already exists; nothing to do here.
4911		 */
4912		*pl = NULL;
4913		return (0);
4914	}
4915
4916	/*
4917	 * Fill the pages in the kluster.
4918	 */
4919	cur_pp = pp;
4920	for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
4921		caddr_t va;
4922
4923		ASSERT3U(io_off, ==, cur_pp->p_offset);
4924		va = zfs_map_page(cur_pp, S_WRITE);
4925		err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
4926		    DMU_READ_PREFETCH);
4927		zfs_unmap_page(cur_pp, va);
4928		if (err) {
4929			/* On error, toss the entire kluster */
4930			pvn_read_done(pp, B_ERROR);
4931			/* convert checksum errors into IO errors */
4932			if (err == ECKSUM)
4933				err = SET_ERROR(EIO);
4934			return (err);
4935		}
4936		cur_pp = cur_pp->p_next;
4937	}
4938
4939	/*
4940	 * Fill in the page list array from the kluster starting
4941	 * from the desired offset `off'.
4942	 * NOTE: the page list will always be null terminated.
4943	 */
4944	pvn_plist_init(pp, pl, plsz, off, io_len, rw);
4945	ASSERT(pl == NULL || (*pl)->p_offset == off);
4946
4947	return (0);
4948}
4949
4950/*
4951 * Return pointers to the pages for the file region [off, off + len)
4952 * in the pl array.  If plsz is greater than len, this function may
4953 * also return page pointers from after the specified region
4954 * (i.e. the region [off, off + plsz)).  These additional pages are
4955 * only returned if they are already in the cache, or were created as
4956 * part of a klustered read.
4957 *
4958 *	IN:	vp	- vnode of file to get data from.
4959 *		off	- position in file to get data from.
4960 *		len	- amount of data to retrieve.
4961 *		plsz	- length of provided page list.
4962 *		seg	- segment to obtain pages for.
4963 *		addr	- virtual address of fault.
4964 *		rw	- mode of created pages.
4965 *		cr	- credentials of caller.
4966 *		ct	- caller context.
4967 *
4968 *	OUT:	protp	- protection mode of created pages.
4969 *		pl	- list of pages created.
4970 *
4971 *	RETURN:	0 on success, error code on failure.
4972 *
4973 * Timestamps:
4974 *	vp - atime updated
4975 */
4976/* ARGSUSED */
4977static int
4978zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
4979    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
4980    enum seg_rw rw, cred_t *cr, caller_context_t *ct)
4981{
4982	znode_t		*zp = VTOZ(vp);
4983	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4984	page_t		**pl0 = pl;
4985	int		err = 0;
4986
4987	/* we do our own caching, faultahead is unnecessary */
4988	if (pl == NULL)
4989		return (0);
4990	else if (len > plsz)
4991		len = plsz;
4992	else
4993		len = P2ROUNDUP(len, PAGESIZE);
4994	ASSERT(plsz >= len);
4995
4996	ZFS_ENTER(zfsvfs);
4997	ZFS_VERIFY_ZP(zp);
4998
4999	if (protp)
5000		*protp = PROT_ALL;
5001
5002	/*
5003	 * Loop through the requested range [off, off + len) looking
5004	 * for pages.  If we don't find a page, we will need to create
5005	 * a new page and fill it with data from the file.
5006	 */
5007	while (len > 0) {
5008		if (*pl = page_lookup(vp, off, SE_SHARED))
5009			*(pl+1) = NULL;
5010		else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
5011			goto out;
5012		while (*pl) {
5013			ASSERT3U((*pl)->p_offset, ==, off);
5014			off += PAGESIZE;
5015			addr += PAGESIZE;
5016			if (len > 0) {
5017				ASSERT3U(len, >=, PAGESIZE);
5018				len -= PAGESIZE;
5019			}
5020			ASSERT3U(plsz, >=, PAGESIZE);
5021			plsz -= PAGESIZE;
5022			pl++;
5023		}
5024	}
5025
5026	/*
5027	 * Fill out the page array with any pages already in the cache.
5028	 */
5029	while (plsz > 0 &&
5030	    (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
5031		off += PAGESIZE;
5032		plsz -= PAGESIZE;
5033	}
5034out:
5035	if (err) {
5036		/*
5037		 * Release any pages we have previously locked.
5038		 */
5039		while (pl > pl0)
5040			page_unlock(*--pl);
5041	} else {
5042		ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
5043	}
5044
5045	*pl = NULL;
5046
5047	ZFS_EXIT(zfsvfs);
5048	return (err);
5049}
5050
5051/*
5052 * Request a memory map for a section of a file.  This code interacts
5053 * with common code and the VM system as follows:
5054 *
5055 * - common code calls mmap(), which ends up in smmap_common()
5056 * - this calls VOP_MAP(), which takes you into (say) zfs
5057 * - zfs_map() calls as_map(), passing segvn_create() as the callback
5058 * - segvn_create() creates the new segment and calls VOP_ADDMAP()
5059 * - zfs_addmap() updates z_mapcnt
5060 */
5061/*ARGSUSED*/
5062static int
5063zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
5064    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
5065    caller_context_t *ct)
5066{
5067	znode_t *zp = VTOZ(vp);
5068	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5069	segvn_crargs_t	vn_a;
5070	int		error;
5071
5072	ZFS_ENTER(zfsvfs);
5073	ZFS_VERIFY_ZP(zp);
5074
5075	/*
5076	 * Note: ZFS_READONLY is handled in zfs_zaccess_common.
5077	 */
5078
5079	if ((prot & PROT_WRITE) && (zp->z_pflags &
5080	    (ZFS_IMMUTABLE | ZFS_APPENDONLY))) {
5081		ZFS_EXIT(zfsvfs);
5082		return (SET_ERROR(EPERM));
5083	}
5084
5085	if ((prot & (PROT_READ | PROT_EXEC)) &&
5086	    (zp->z_pflags & ZFS_AV_QUARANTINED)) {
5087		ZFS_EXIT(zfsvfs);
5088		return (SET_ERROR(EACCES));
5089	}
5090
5091	if (vp->v_flag & VNOMAP) {
5092		ZFS_EXIT(zfsvfs);
5093		return (SET_ERROR(ENOSYS));
5094	}
5095
5096	if (off < 0 || len > MAXOFFSET_T - off) {
5097		ZFS_EXIT(zfsvfs);
5098		return (SET_ERROR(ENXIO));
5099	}
5100
5101	if (vp->v_type != VREG) {
5102		ZFS_EXIT(zfsvfs);
5103		return (SET_ERROR(ENODEV));
5104	}
5105
5106	/*
5107	 * If file is locked, disallow mapping.
5108	 */
5109	if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
5110		ZFS_EXIT(zfsvfs);
5111		return (SET_ERROR(EAGAIN));
5112	}
5113
5114	as_rangelock(as);
5115	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
5116	if (error != 0) {
5117		as_rangeunlock(as);
5118		ZFS_EXIT(zfsvfs);
5119		return (error);
5120	}
5121
5122	vn_a.vp = vp;
5123	vn_a.offset = (u_offset_t)off;
5124	vn_a.type = flags & MAP_TYPE;
5125	vn_a.prot = prot;
5126	vn_a.maxprot = maxprot;
5127	vn_a.cred = cr;
5128	vn_a.amp = NULL;
5129	vn_a.flags = flags & ~MAP_TYPE;
5130	vn_a.szc = 0;
5131	vn_a.lgrp_mem_policy_flags = 0;
5132
5133	error = as_map(as, *addrp, len, segvn_create, &vn_a);
5134
5135	as_rangeunlock(as);
5136	ZFS_EXIT(zfsvfs);
5137	return (error);
5138}
5139
5140/* ARGSUSED */
5141static int
5142zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
5143    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
5144    caller_context_t *ct)
5145{
5146	uint64_t pages = btopr(len);
5147
5148	atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
5149	return (0);
5150}
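
/*
 * Illustrative accounting (hypothetical sizes): an 8K mapping on a
 * system with 4K pages adds btopr(8K) == 2 to z_mapcnt here;
 * zfs_delmap() below subtracts the same two pages when the mapping
 * is torn down.
 */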
5151
5152/*
5153 * The reason we push dirty pages as part of zfs_delmap() is so that we get a
5154 * more accurate mtime for the associated file.  Since we don't have a way of
5155 * detecting when the data was actually modified, we have to resort to
5156 * heuristics.  If an explicit msync() is done, then we mark the mtime when the
5157 * last page is pushed.  The problem occurs when the msync() call is omitted,
5158 * which is by far the most common case:
5159 *
5160 *	open()
5161 *	mmap()
5162 *	<modify memory>
5163 *	munmap()
5164 *	close()
5165 *	<time lapse>
5166 *	putpage() via fsflush
5167 *
5168 * If we wait until fsflush to come along, we can have a modification time that
5169 * is some arbitrary point in the future.  In order to prevent this in the
5170 * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
5171 * torn down.
5172 */
5173/* ARGSUSED */
5174static int
5175zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
5176    size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
5177    caller_context_t *ct)
5178{
5179	uint64_t pages = btopr(len);
5180
5181	ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
5182	atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
5183
5184	if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
5185	    vn_has_cached_data(vp))
5186		(void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
5187
5188	return (0);
5189}
5190
5191/*
5192 * Free or allocate space in a file.  Currently, this function only
5193 * supports the `F_FREESP' command.  However, this command is somewhat
5194 * misnamed, as its functionality includes the ability to allocate as
5195 * well as free space.
5196 *
5197 *	IN:	vp	- vnode of file to free data in.
5198 *		cmd	- action to take (only F_FREESP supported).
5199 *		bfp	- section of file to free/alloc.
5200 *		flag	- current file open mode flags.
5201 *		offset	- current file offset.
5202 *		cr	- credentials of caller [UNUSED].
5203 *		ct	- caller context.
5204 *
5205 *	RETURN:	0 on success, error code on failure.
5206 *
5207 * Timestamps:
5208 *	vp - ctime|mtime updated
5209 */
5210/* ARGSUSED */
5211static int
5212zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
5213    offset_t offset, cred_t *cr, caller_context_t *ct)
5214{
5215	znode_t		*zp = VTOZ(vp);
5216	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
5217	uint64_t	off, len;
5218	int		error;
5219
5220	ZFS_ENTER(zfsvfs);
5221	ZFS_VERIFY_ZP(zp);
5222
5223	if (cmd != F_FREESP) {
5224		ZFS_EXIT(zfsvfs);
5225		return (SET_ERROR(EINVAL));
5226	}
5227
5228	/*
5229	 * In the case that vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots),
5230	 * our callers might not be able to properly detect that we are
5231	 * read-only, so check it explicitly here.
5232	 */
5233	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
5234		ZFS_EXIT(zfsvfs);
5235		return (SET_ERROR(EROFS));
5236	}
5237
5238	if (error = convoff(vp, bfp, 0, offset)) {
5239		ZFS_EXIT(zfsvfs);
5240		return (error);
5241	}
5242
5243	if (bfp->l_len < 0) {
5244		ZFS_EXIT(zfsvfs);
5245		return (SET_ERROR(EINVAL));
5246	}
5247
5248	off = bfp->l_start;
5249	len = bfp->l_len; /* 0 means from off to end of file */
5250
5251	error = zfs_freesp(zp, off, len, flag, TRUE);
5252
5253	if (error == 0 && off == 0 && len == 0)
5254		vnevent_truncate(ZTOV(zp), ct);
5255
5256	ZFS_EXIT(zfsvfs);
5257	return (error);
5258}
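
/*
 * Illustrative trigger (hypothetical path): truncating a file with
 * fcntl(fd, F_FREESP, &fl) where fl.l_start == 0 and fl.l_len == 0
 * frees from offset 0 to end of file via zfs_freesp() above, and is
 * the case for which vnevent_truncate() fires.
 */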
5259
5260/*ARGSUSED*/
5261static int
5262zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
5263{
5264	znode_t		*zp = VTOZ(vp);
5265	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
5266	uint32_t	gen;
5267	uint64_t	gen64;
5268	uint64_t	object = zp->z_id;
5269	zfid_short_t	*zfid;
5270	int		size, i, error;
5271
5272	ZFS_ENTER(zfsvfs);
5273	ZFS_VERIFY_ZP(zp);
5274
5275	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
5276	    &gen64, sizeof (uint64_t))) != 0) {
5277		ZFS_EXIT(zfsvfs);
5278		return (error);
5279	}
5280
5281	gen = (uint32_t)gen64;
5282
5283	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
5284	if (fidp->fid_len < size) {
5285		fidp->fid_len = size;
5286		ZFS_EXIT(zfsvfs);
5287		return (SET_ERROR(ENOSPC));
5288	}
5289
5290	zfid = (zfid_short_t *)fidp;
5291
5292	zfid->zf_len = size;
5293
5294	for (i = 0; i < sizeof (zfid->zf_object); i++)
5295		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
5296
5297	/* Must have a non-zero generation number to distinguish from .zfs */
5298	if (gen == 0)
5299		gen = 1;
5300	for (i = 0; i < sizeof (zfid->zf_gen); i++)
5301		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
5302
5303	if (size == LONG_FID_LEN) {
5304		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
5305		zfid_long_t	*zlfid;
5306
5307		zlfid = (zfid_long_t *)fidp;
5308
5309		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
5310			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
5311
5312		/* XXX - this should be the generation number for the objset */
5313		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
5314			zlfid->zf_setgen[i] = 0;
5315	}
5316
5317	ZFS_EXIT(zfsvfs);
5318	return (0);
5319}
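
/*
 * Worked example (hypothetical values): object id 0x1122 is stored
 * little-endian by the loop above, so zf_object[0] == 0x22,
 * zf_object[1] == 0x11 and the remaining bytes are zero; a gen of 0
 * is bumped to 1 so the fid can never be confused with .zfs entries.
 */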
5320
5321static int
5322zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
5323    caller_context_t *ct)
5324{
5325	znode_t		*zp, *xzp;
5326	zfsvfs_t	*zfsvfs;
5327	zfs_dirlock_t	*dl;
5328	int		error;
5329
5330	switch (cmd) {
5331	case _PC_LINK_MAX:
5332		*valp = ULONG_MAX;
5333		return (0);
5334
5335	case _PC_FILESIZEBITS:
5336		*valp = 64;
5337		return (0);
5338
5339	case _PC_XATTR_EXISTS:
5340		zp = VTOZ(vp);
5341		zfsvfs = zp->z_zfsvfs;
5342		ZFS_ENTER(zfsvfs);
5343		ZFS_VERIFY_ZP(zp);
5344		*valp = 0;
5345		error = zfs_dirent_lock(&dl, zp, "", &xzp,
5346		    ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
5347		if (error == 0) {
5348			zfs_dirent_unlock(dl);
5349			if (!zfs_dirempty(xzp))
5350				*valp = 1;
5351			VN_RELE(ZTOV(xzp));
5352		} else if (error == ENOENT) {
5353			/*
5354			 * If there aren't extended attributes, it's the
5355			 * same as having zero of them.
5356			 */
5357			error = 0;
5358		}
5359		ZFS_EXIT(zfsvfs);
5360		return (error);
5361
5362	case _PC_SATTR_ENABLED:
5363	case _PC_SATTR_EXISTS:
5364		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
5365		    (vp->v_type == VREG || vp->v_type == VDIR);
5366		return (0);
5367
5368	case _PC_ACCESS_FILTERING:
5369		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
5370		    vp->v_type == VDIR;
5371		return (0);
5372
5373	case _PC_ACL_ENABLED:
5374		*valp = _ACL_ACE_ENABLED;
5375		return (0);
5376
5377	case _PC_MIN_HOLE_SIZE:
5378		*valp = (ulong_t)SPA_MINBLOCKSIZE;
5379		return (0);
5380
5381	case _PC_TIMESTAMP_RESOLUTION:
5382		/* nanosecond timestamp resolution */
5383		*valp = 1L;
5384		return (0);
5385
5386	default:
5387		return (fs_pathconf(vp, cmd, valp, cr, ct));
5388	}
5389}
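
/*
 * Illustrative probe (hypothetical path): pathconf("/pool/fs/f",
 * _PC_FILESIZEBITS) reports 64 here, and _PC_MIN_HOLE_SIZE reports
 * SPA_MINBLOCKSIZE, which callers may use to decide whether probing
 * for holes is worthwhile.
 */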
5390
5391/*ARGSUSED*/
5392static int
5393zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5394    caller_context_t *ct)
5395{
5396	znode_t *zp = VTOZ(vp);
5397	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5398	int error;
5399	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5400
5401	ZFS_ENTER(zfsvfs);
5402	ZFS_VERIFY_ZP(zp);
5403	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
5404	ZFS_EXIT(zfsvfs);
5405
5406	return (error);
5407}
5408
5409/*ARGSUSED*/
5410static int
5411zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5412    caller_context_t *ct)
5413{
5414	znode_t *zp = VTOZ(vp);
5415	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5416	int error;
5417	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5418	zilog_t	*zilog = zfsvfs->z_log;
5419
5420	ZFS_ENTER(zfsvfs);
5421	ZFS_VERIFY_ZP(zp);
5422
5423	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
5424
5425	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
5426		zil_commit(zilog, 0);
5427
5428	ZFS_EXIT(zfsvfs);
5429	return (error);
5430}
5431
5432/*
5433 * The smallest read for which we may consider loaning out an arcbuf.
5434 * This must be a power of 2.
5435 */
5436int zcr_blksz_min = (1 << 10);	/* 1K */
5437/*
5438 * If set to less than the file block size, allow loaning out of an
5439 * arcbuf for a partial block read.  This must be a power of 2.
5440 */
5441int zcr_blksz_max = (1 << 17);	/* 128K */
5442
5443/*ARGSUSED*/
5444static int
5445zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
5446    caller_context_t *ct)
5447{
5448	znode_t	*zp = VTOZ(vp);
5449	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5450	int max_blksz = zfsvfs->z_max_blksz;
5451	uio_t *uio = &xuio->xu_uio;
5452	ssize_t size = uio->uio_resid;
5453	offset_t offset = uio->uio_loffset;
5454	int blksz;
5455	int fullblk, i;
5456	arc_buf_t *abuf;
5457	ssize_t maxsize;
5458	int preamble, postamble;
5459
5460	if (xuio->xu_type != UIOTYPE_ZEROCOPY)
5461		return (SET_ERROR(EINVAL));
5462
5463	ZFS_ENTER(zfsvfs);
5464	ZFS_VERIFY_ZP(zp);
5465	switch (ioflag) {
5466	case UIO_WRITE:
5467		/*
5468		 * Loan out an arc_buf for write if write size is bigger than
5469		 * max_blksz, and the file's block size is also max_blksz.
5470		 */
5471		blksz = max_blksz;
5472		if (size < blksz || zp->z_blksz != blksz) {
5473			ZFS_EXIT(zfsvfs);
5474			return (SET_ERROR(EINVAL));
5475		}
5476		/*
5477		 * Caller requests buffers for write before knowing where the
5478		 * write offset might be (e.g. NFS TCP write).
5479		 */
5480		if (offset == -1) {
5481			preamble = 0;
5482		} else {
5483			preamble = P2PHASE(offset, blksz);
5484			if (preamble) {
5485				preamble = blksz - preamble;
5486				size -= preamble;
5487			}
5488		}
5489
5490		postamble = P2PHASE(size, blksz);
5491		size -= postamble;
5492
5493		fullblk = size / blksz;
5494		(void) dmu_xuio_init(xuio,
5495		    (preamble != 0) + fullblk + (postamble != 0));
5496		DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble,
5497		    int, postamble, int,
5498		    (preamble != 0) + fullblk + (postamble != 0));
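
		/*
		 * Worked example (hypothetical sizes): blksz = 128K,
		 * offset = 96K, size = 512K gives preamble = 32K,
		 * postamble = 96K and fullblk = 3, so five arc_bufs are
		 * requested: one leading partial, three full, and one
		 * trailing partial.
		 */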
5499
5500		/*
5501		 * Have to fix iov base/len for partial buffers.  They
5502		 * currently represent full arc_buf's.
5503		 */
5504		if (preamble) {
5505			/* data begins in the middle of the arc_buf */
5506			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5507			    blksz);
5508			ASSERT(abuf);
5509			(void) dmu_xuio_add(xuio, abuf,
5510			    blksz - preamble, preamble);
5511		}
5512
5513		for (i = 0; i < fullblk; i++) {
5514			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5515			    blksz);
5516			ASSERT(abuf);
5517			(void) dmu_xuio_add(xuio, abuf, 0, blksz);
5518		}
5519
5520		if (postamble) {
5521			/* data ends in the middle of the arc_buf */
5522			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5523			    blksz);
5524			ASSERT(abuf);
5525			(void) dmu_xuio_add(xuio, abuf, 0, postamble);
5526		}
5527		break;
5528	case UIO_READ:
5529		/*
5530		 * Loan out an arc_buf for read if the read size is larger than
5531		 * the current file block size.  Block alignment is not
5532		 * considered.  Partial arc_buf will be loaned out for read.
5533		 */
5534		blksz = zp->z_blksz;
5535		if (blksz < zcr_blksz_min)
5536			blksz = zcr_blksz_min;
5537		if (blksz > zcr_blksz_max)
5538			blksz = zcr_blksz_max;
5539		/* avoid the complexity of a block size larger than max_blksz */
5540		if (blksz > max_blksz) {
5541			ZFS_EXIT(zfsvfs);
5542			return (SET_ERROR(EINVAL));
5543		}
5544
5545		maxsize = zp->z_size - uio->uio_loffset;
5546		if (size > maxsize)
5547			size = maxsize;
5548
5549		if (size < blksz || vn_has_cached_data(vp)) {
5550			ZFS_EXIT(zfsvfs);
5551			return (SET_ERROR(EINVAL));
5552		}
5553		break;
5554	default:
5555		ZFS_EXIT(zfsvfs);
5556		return (SET_ERROR(EINVAL));
5557	}
5558
5559	uio->uio_extflg = UIO_XUIO;
5560	XUIO_XUZC_RW(xuio) = ioflag;
5561	ZFS_EXIT(zfsvfs);
5562	return (0);
5563}
5564
5565/*ARGSUSED*/
5566static int
5567zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct)
5568{
5569	int i;
5570	arc_buf_t *abuf;
5571	int ioflag = XUIO_XUZC_RW(xuio);
5572
5573	ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
5574
5575	i = dmu_xuio_cnt(xuio);
5576	while (i-- > 0) {
5577		abuf = dmu_xuio_arcbuf(xuio, i);
5578		/*
5579		 * If abuf == NULL, it must be a write buffer
5580		 * that has been returned in zfs_write().
5581		 */
5582		if (abuf)
5583			dmu_return_arcbuf(abuf);
5584		ASSERT(abuf || ioflag == UIO_WRITE);
5585	}
5586
5587	dmu_xuio_fini(xuio);
5588	return (0);
5589}
5590
5591/*
5592 * Predeclare these here so that the compiler assumes that
5593 * this is an "old style" function declaration that does
5594 * not include arguments => we won't get type mismatch errors
5595 * in the initializations that follow.
5596 */
5597static int zfs_inval();
5598static int zfs_isdir();
5599
5600static int
5601zfs_inval()
5602{
5603	return (SET_ERROR(EINVAL));
5604}
5605
5606static int
5607zfs_isdir()
5608{
5609	return (SET_ERROR(EISDIR));
5610}

5611/*
5612 * Directory vnode operations template
5613 */
5614vnodeops_t *zfs_dvnodeops;
5615const fs_operation_def_t zfs_dvnodeops_template[] = {
5616	VOPNAME_OPEN,		{ .vop_open = zfs_open },
5617	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
5618	VOPNAME_READ,		{ .error = zfs_isdir },
5619	VOPNAME_WRITE,		{ .error = zfs_isdir },
5620	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
5621	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5622	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5623	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5624	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
5625	VOPNAME_CREATE,		{ .vop_create = zfs_create },
5626	VOPNAME_REMOVE,		{ .vop_remove = zfs_remove },
5627	VOPNAME_LINK,		{ .vop_link = zfs_link },
5628	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5629	VOPNAME_MKDIR,		{ .vop_mkdir = zfs_mkdir },
5630	VOPNAME_RMDIR,		{ .vop_rmdir = zfs_rmdir },
5631	VOPNAME_READDIR,	{ .vop_readdir = zfs_readdir },
5632	VOPNAME_SYMLINK,	{ .vop_symlink = zfs_symlink },
5633	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
5634	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5635	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5636	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
5637	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5638	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5639	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5640	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5641	NULL,			NULL
5642};
5643
5644/*
5645 * Regular file vnode operations template
5646 */
5647vnodeops_t *zfs_fvnodeops;
5648const fs_operation_def_t zfs_fvnodeops_template[] = {
5649	VOPNAME_OPEN,		{ .vop_open = zfs_open },
5650	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
5651	VOPNAME_READ,		{ .vop_read = zfs_read },
5652	VOPNAME_WRITE,		{ .vop_write = zfs_write },
5653	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
5654	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5655	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5656	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5657	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
5658	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5659	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
5660	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5661	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5662	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
5663	VOPNAME_FRLOCK,		{ .vop_frlock = zfs_frlock },
5664	VOPNAME_SPACE,		{ .vop_space = zfs_space },
5665	VOPNAME_GETPAGE,	{ .vop_getpage = zfs_getpage },
5666	VOPNAME_PUTPAGE,	{ .vop_putpage = zfs_putpage },
5667	VOPNAME_MAP,		{ .vop_map = zfs_map },
5668	VOPNAME_ADDMAP,		{ .vop_addmap = zfs_addmap },
5669	VOPNAME_DELMAP,		{ .vop_delmap = zfs_delmap },
5670	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5671	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5672	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5673	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5674	VOPNAME_REQZCBUF,	{ .vop_reqzcbuf = zfs_reqzcbuf },
5675	VOPNAME_RETZCBUF,	{ .vop_retzcbuf = zfs_retzcbuf },
5676	NULL,			NULL
5677};
5678
5679/*
5680 * Symbolic link vnode operations template
5681 */
5682vnodeops_t *zfs_symvnodeops;
5683const fs_operation_def_t zfs_symvnodeops_template[] = {
5684	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5685	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5686	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5687	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5688	VOPNAME_READLINK,	{ .vop_readlink = zfs_readlink },
5689	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5690	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5691	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5692	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5693	NULL,			NULL
5694};
5695
5696/*
5697 * Special share hidden files vnode operations template
5698 */
5699vnodeops_t *zfs_sharevnodeops;
5700const fs_operation_def_t zfs_sharevnodeops_template[] = {
5701	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5702	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5703	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5704	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5705	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5706	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5707	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5708	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5709	NULL,			NULL
5710};
5711
5712/*
5713 * Extended attribute directory vnode operations template
5714 *
5715 * This template is identical to the directory vnodes
5716 * operation template except for restricted operations:
5717 *	VOP_MKDIR()
5718 *	VOP_SYMLINK()
5719 *
5720 * Note that there are other restrictions embedded in:
5721 *	zfs_create()	- restrict type to VREG
5722 *	zfs_link()	- no links into/out of attribute space
5723 *	zfs_rename()	- no moves into/out of attribute space
5724 */
5725vnodeops_t *zfs_xdvnodeops;
5726const fs_operation_def_t zfs_xdvnodeops_template[] = {
5727	VOPNAME_OPEN,		{ .vop_open = zfs_open },
5728	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
5729	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
5730	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5731	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5732	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5733	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
5734	VOPNAME_CREATE,		{ .vop_create = zfs_create },
5735	VOPNAME_REMOVE,		{ .vop_remove = zfs_remove },
5736	VOPNAME_LINK,		{ .vop_link = zfs_link },
5737	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5738	VOPNAME_MKDIR,		{ .error = zfs_inval },
5739	VOPNAME_RMDIR,		{ .vop_rmdir = zfs_rmdir },
5740	VOPNAME_READDIR,	{ .vop_readdir = zfs_readdir },
5741	VOPNAME_SYMLINK,	{ .error = zfs_inval },
5742	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
5743	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5744	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5745	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
5746	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5747	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5748	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5749	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5750	NULL,			NULL
5751};
5752
5753/*
5754 * Error vnode operations template
5755 */
5756vnodeops_t *zfs_evnodeops;
5757const fs_operation_def_t zfs_evnodeops_template[] = {
5758	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5759	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5760	NULL,			NULL
5761};
5762