/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2015 Joyent, Inc.
 * Copyright 2017 Nexenta Systems, Inc.
 */

/* Portions Copyright 2007 Jeremy Teo */
/* Portions Copyright 2010 Robert Milkowski */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/kmem.h>
#include <sys/taskq.h>
#include <sys/uio.h>
#include <sys/vmsystm.h>
#include <sys/atomic.h>
#include <sys/vm.h>
#include <vm/seg_vn.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/kpm.h>
#include <vm/seg_kpm.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/unistd.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/sa.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/sunddi.h>
#include <sys/filio.h>
#include <sys/sid.h>
#include "fs/fs_subr.h"
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
#include <sys/zfs_sa.h>
#include <sys/dnlc.h>
#include <sys/zfs_rlock.h>
#include <sys/extdirent.h>
#include <sys/kidmap.h>
#include <sys/cred.h>
#include <sys/attr.h>

/*
 * Programming rules.
 *
 * Each vnode op performs some logical unit of work.  To do this, the ZPL must
 * properly lock its in-core state, create a DMU transaction, do the work,
 * record this work in the intent log (ZIL), commit the DMU transaction,
 * and wait for the intent log to commit if it is a synchronous operation.
 * Moreover, the vnode ops must work in both normal and log replay context.
 * The ordering of events is important to avoid deadlocks and references
 * to freed memory.  The example below illustrates the following Big Rules:
 *
 *  (1)	A check must be made in each zfs thread for a mounted file system.
 *	This is done, while avoiding races, using ZFS_ENTER(zfsvfs).
 *	A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
 *	must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
 *	can return EIO from the calling function.
 *
 *  (2)	VN_RELE() should always be the last thing except for zil_commit()
 *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
 *	First, if it's the last reference, the vnode/znode
 *	can be freed, so the zp may point to freed memory.  Second, the last
 *	reference will call zfs_zinactive(), which may induce a lot of work --
 *	pushing cached pages (which acquires range locks) and syncing out
 *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
 *	which could deadlock the system if you were already holding one.
 *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
 *
 *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
 *	as they can span dmu_tx_assign() calls.
 *
 *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
 *      dmu_tx_assign().  This is critical because we don't want to block
 *      while holding locks.
 *
 *	If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
 *	reduces lock contention and CPU usage when we must wait (note that if
 *	throughput is constrained by the storage, nearly every transaction
 *	must wait).
 *
 *      Note, in particular, that if a lock is sometimes acquired before
 *      the tx assigns, and sometimes after (e.g. z_lock), then failing
 *      to use a non-blocking assign can deadlock the system.  The scenario:
 *
 *	Thread A has grabbed a lock before calling dmu_tx_assign().
 *	Thread B is in an already-assigned tx, and blocks for this lock.
 *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 *	forever, because the previous txg can't quiesce until B's tx commits.
 *
 *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
 *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
 *	calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
 *	to indicate that this operation has already called dmu_tx_wait().
 *	This will ensure that we don't retry forever, waiting a short bit
 *	each time.
 *
 *  (5)	If the operation succeeded, generate the intent log entry for it
 *	before dropping locks.  This ensures that the ordering of events
 *	in the intent log matches the order in which they actually occurred.
 *	During ZIL replay the zfs_log_* functions will update the sequence
 *	number to indicate that the zil transaction has been replayed.
 *
 *  (6)	At the end of each vnode op, the DMU tx must always commit,
 *	regardless of whether there were any errors.
 *
 *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
 *	to ensure that synchronous semantics are provided when necessary.
 *
 * In general, this is how things should be ordered in each vnode op:
 *
 *	ZFS_ENTER(zfsvfs);		// exit if unmounted
 * top:
 *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
 *	rw_enter(...);			// grab any other locks you need
 *	tx = dmu_tx_create(...);	// get DMU tx
 *	dmu_tx_hold_*();		// hold each object you might modify
 *	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
 *	if (error) {
 *		rw_exit(...);		// drop locks
 *		zfs_dirent_unlock(dl);	// unlock directory entry
 *		VN_RELE(...);		// release held vnodes
 *		if (error == ERESTART) {
 *			waited = B_TRUE;
 *			dmu_tx_wait(tx);
 *			dmu_tx_abort(tx);
 *			goto top;
 *		}
 *		dmu_tx_abort(tx);	// abort DMU tx
 *		ZFS_EXIT(zfsvfs);	// finished in zfs
 *		return (error);		// really out of space
 *	}
 *	error = do_real_work();		// do whatever this VOP does
 *	if (error == 0)
 *		zfs_log_*(...);		// on success, make ZIL entry
 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
 *	rw_exit(...);			// drop locks
 *	zfs_dirent_unlock(dl);		// unlock directory entry
 *	VN_RELE(...);			// release held vnodes
 *	zil_commit(zilog, foid);	// synchronous when necessary
 *	ZFS_EXIT(zfsvfs);		// finished in zfs
 *	return (error);			// done, report error
 */

/* ARGSUSED */
static int
zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(*vpp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
	    ((flag & FAPPEND) == 0)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
		if (fs_vscan(*vpp, cr, 0) != 0) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EACCES));
		}
	}

	/* Keep a count of the synchronous opens in the znode */
	if (flag & (FSYNC | FDSYNC))
		atomic_inc_32(&zp->z_sync_cnt);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/* ARGSUSED */
static int
zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	/*
	 * Clean up any locks held by this process on the vp.
	 */
	cleanlocks(vp, ddi_get_pid(), 0);
	cleanshares(vp, ddi_get_pid());

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Decrement the synchronous opens in the znode */
	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
		atomic_dec_32(&zp->z_sync_cnt);

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
		VERIFY(fs_vscan(vp, cr, 1) == 0);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
 */
static int
zfs_holey(vnode_t *vp, int cmd, offset_t *off)
{
	znode_t	*zp = VTOZ(vp);
	uint64_t noff = (uint64_t)*off; /* new offset */
	uint64_t file_sz;
	int error;
	boolean_t hole;

	file_sz = zp->z_size;
	if (noff >= file_sz)  {
		return (SET_ERROR(ENXIO));
	}

	if (cmd == _FIO_SEEK_HOLE)
		hole = B_TRUE;
	else
		hole = B_FALSE;

	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);

	if (error == ESRCH)
		return (SET_ERROR(ENXIO));

	/*
	 * We could find a hole that begins after the logical end-of-file,
	 * because dmu_offset_next() only works on whole blocks.  If the
	 * EOF falls mid-block, then indicate that the "virtual hole"
	 * at the end of the file begins at the logical EOF, rather than
	 * at the end of the last block.
	 */
	if (noff > file_sz) {
		ASSERT(hole);
		noff = file_sz;
	}

	if (noff < *off)
		return (error);
	*off = noff;
	return (error);
}
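
/*
 * Illustrative example for the clamp above (values assumed, not taken
 * from this code): for a file with a 128K block size whose logical size
 * is 100K, a hole query at *off = 0 may have dmu_offset_next() report
 * the first hole at 128K, the end of the last block; the clamp reports
 * it at 100K, the logical EOF, instead.
 */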

/* ARGSUSED */
static int
zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
    int *rvalp, caller_context_t *ct)
{
	offset_t off;
	offset_t ndata;
	dmu_object_info_t doi;
	int error;
	zfsvfs_t *zfsvfs;
	znode_t *zp;

	switch (com) {
	case _FIOFFS:
	{
		return (zfs_sync(vp->v_vfsp, 0, cred));
	}

	/*
	 * The following two ioctls are used by bfu.  They are faked out
	 * here; this is necessary to avoid bfu errors.
	 */
	case _FIOGDIO:
	case _FIOSDIO:
	{
		return (0);
	}

	case _FIO_SEEK_DATA:
	case _FIO_SEEK_HOLE:
	{
		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
			return (SET_ERROR(EFAULT));

		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/* offset parameter is in/out */
		error = zfs_holey(vp, com, &off);
		ZFS_EXIT(zfsvfs);
		if (error)
			return (error);
		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
		return (0);
	}
	case _FIO_COUNT_FILLED:
	{
		/*
		 * The _FIO_COUNT_FILLED ioctl exposes the number of
		 * filled blocks in a ZFS object.
		 */
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/*
		 * Wait for all dirty blocks for this object
		 * to get synced out to disk, and the DMU info
		 * updated.
		 */
		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Retrieve fill count from DMU object.
		 */
		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		ndata = doi.doi_fill_count;

		ZFS_EXIT(zfsvfs);
		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
			return (SET_ERROR(EFAULT));
		return (0);
	}
	}
	return (SET_ERROR(ENOTTY));
}
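
/*
 * A minimal userland sketch of the seek-hole ioctl above (assumed
 * usage; "fd" is a hypothetical open file descriptor):
 *
 *	offset_t off = 0;
 *	if (ioctl(fd, _FIO_SEEK_HOLE, &off) == 0)
 *		(void) printf("first hole at %lld\n", (long long)off);
 *
 * The offset is copied in, updated in place by zfs_holey(), and
 * copied back out on success.
 */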

/*
 * Utility functions to map and unmap a single physical page.  These
 * are used to manage the mappable copies of ZFS file data, and therefore
 * do not update ref/mod bits.
 */
caddr_t
zfs_map_page(page_t *pp, enum seg_rw rw)
{
	if (kpm_enable)
		return (hat_kpm_mapin(pp, 0));
	ASSERT(rw == S_READ || rw == S_WRITE);
	return (ppmapin(pp, PROT_READ | ((rw == S_WRITE) ? PROT_WRITE : 0),
	    (caddr_t)-1));
}

void
zfs_unmap_page(page_t *pp, caddr_t addr)
{
	if (kpm_enable) {
		hat_kpm_mapout(pp, 0, addr);
	} else {
		ppmapout(addr);
	}
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Write:	If we find a memory mapped page, we write to *both*
 *		the page and the dmu buffer.
 */
static void
update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid)
{
	int64_t	off;

	off = start & PAGEOFFSET;
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		page_t *pp;
		uint64_t nbytes = MIN(PAGESIZE - off, len);

		if (pp = page_lookup(vp, start, SE_SHARED)) {
			caddr_t va;

			va = zfs_map_page(pp, S_WRITE);
			(void) dmu_read(os, oid, start+off, nbytes, va+off,
			    DMU_READ_PREFETCH);
			zfs_unmap_page(pp, va);
			page_unlock(pp);
		}
		len -= nbytes;
		off = 0;
	}
}
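
/*
 * Worked example of the partial-page arithmetic above (values assumed):
 * with an 8K PAGESIZE, a write of len 10K starting at offset 12K yields
 * off = 4K, so the first pass copies nbytes = MIN(8K - 4K, 10K) = 4K to
 * cover the tail of the first page; off is then reset to 0 and the
 * second pass copies the remaining 6K into the next page.
 */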

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Read:	We "read" preferentially from memory mapped pages,
 *		else we fall back to the dmu buffer.
 *
 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 *	 the file is memory mapped.
 */
static int
mappedread(vnode_t *vp, int nbytes, uio_t *uio)
{
	znode_t *zp = VTOZ(vp);
	int64_t	start, off;
	int len = nbytes;
	int error = 0;

	start = uio->uio_loffset;
	off = start & PAGEOFFSET;
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		page_t *pp;
		uint64_t bytes = MIN(PAGESIZE - off, len);

		if (pp = page_lookup(vp, start, SE_SHARED)) {
			caddr_t va;

			va = zfs_map_page(pp, S_READ);
			error = uiomove(va + off, bytes, UIO_READ, uio);
			zfs_unmap_page(pp, va);
			page_unlock(pp);
		} else {
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, bytes);
		}
		len -= bytes;
		off = 0;
		if (error)
			break;
	}
	return (error);
}
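
/*
 * Note (informational): the page cache is preferred above so that a
 * read observes any mmap(2) store that has not yet been synced back to
 * the DMU.  Pages absent from the cache fall through to
 * dmu_read_uio_dbuf(), so a single call may mix both sources.
 */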

offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */

/*
 * Read bytes from specified file into supplied buffer.
 *
 *	IN:	vp	- vnode of file to be read from.
 *		uio	- structure supplying read location, range info,
 *			  and return buffer.
 *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *
 *	OUT:	uio	- updated offset and range, buffer filled.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Side Effects:
 *	vp - atime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	ssize_t		n, nbytes;
	int		error = 0;
	rl_t		*rl;
	xuio_t		*xuio = NULL;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EACCES));
	}

	/*
	 * Validate file offset
	 */
	if (uio->uio_loffset < (offset_t)0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Fasttrack empty reads
	 */
	if (uio->uio_resid == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE(zp->z_mode)) {
		if (error = chklock(vp, FREAD,
		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 */
	if (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zfsvfs->z_log, zp->z_id);

	/*
	 * Lock the range against changes.
	 */
	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);

	/*
	 * If we are reading past end-of-file we can skip
	 * to the end; but we might still need to set atime.
	 */
	if (uio->uio_loffset >= zp->z_size) {
		error = 0;
		goto out;
	}

	ASSERT(uio->uio_loffset < zp->z_size);
	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);

	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
		int nblk;
		int blksz = zp->z_blksz;
		uint64_t offset = uio->uio_loffset;

		xuio = (xuio_t *)uio;
		if (ISP2(blksz)) {
			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
			    blksz)) / blksz;
		} else {
			ASSERT(offset + n <= blksz);
			nblk = 1;
		}
		(void) dmu_xuio_init(xuio, nblk);

		if (vn_has_cached_data(vp)) {
			/*
			 * For simplicity, we always allocate a full buffer
			 * even if we only expect to read a portion of a block.
			 */
			while (--nblk >= 0) {
				(void) dmu_xuio_add(xuio,
				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
				    blksz), 0, blksz);
			}
		}
	}

	while (n > 0) {
		nbytes = MIN(n, zfs_read_chunk_size -
		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));

		if (vn_has_cached_data(vp)) {
			error = mappedread(vp, nbytes, uio);
		} else {
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes);
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}

		n -= nbytes;
	}
out:
	zfs_range_unlock(rl);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Write the bytes to a file.
 *
 *	IN:	vp	- vnode of file to be written to.
 *		uio	- structure supplying write location, range info,
 *			  and data buffer.
 *		ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
 *			  set if in append mode.
 *		cr	- credentials of caller.
 *		ct	- caller context (NFS/CIFS fem monitor only)
 *
 *	OUT:	uio	- updated offset and range.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - ctime|mtime updated if byte count > 0
 */

/* ARGSUSED */
static int
zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	rlim64_t	limit = uio->uio_llimit;
	ssize_t		start_resid = uio->uio_resid;
	ssize_t		tx_bytes;
	uint64_t	end_size;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	zilog_t		*zilog;
	offset_t	woff;
	ssize_t		n, nbytes;
	rl_t		*rl;
	int		max_blksz = zfsvfs->z_max_blksz;
	int		error = 0;
	arc_buf_t	*abuf;
	iovec_t		*aiov = NULL;
	xuio_t		*xuio = NULL;
	int		i_iov = 0;
	int		iovcnt = uio->uio_iovcnt;
	iovec_t		*iovp = uio->uio_iov;
	int		write_eof;
	int		count = 0;
	sa_bulk_attr_t	bulk[4];
	uint64_t	mtime[2], ctime[2];

	/*
	 * Fasttrack empty write
	 */
	n = start_resid;
	if (n == 0)
		return (0);

	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
		limit = MAXOFFSET_T;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);

	/*
	 * In the case where vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g.
	 * snapshots), our callers might not be able to detect that we
	 * are read-only, so check it explicitly here.
	 */
	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EROFS));
	}

	/*
	 * If immutable or not appending then return EPERM.
	 * Intentionally allow ZFS_READONLY through here.
	 * See zfs_zaccess_common()
	 */
	if ((zp->z_pflags & ZFS_IMMUTABLE) ||
	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
	    (uio->uio_loffset < zp->z_size))) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	zilog = zfsvfs->z_log;

	/*
	 * Validate file offset
	 */
	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
	if (woff < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Check for mandatory locks before calling zfs_range_lock()
	 * in order to prevent a deadlock with locks set via fcntl().
	 */
	if (MANDMODE((mode_t)zp->z_mode) &&
	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Pre-fault the pages to ensure slow (e.g. NFS) pages
	 * don't hold up the txg.
	 * Skip this if the uio contains a loaned arc_buf.
	 */
	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
		xuio = (xuio_t *)uio;
	else
		uio_prefaultpages(MIN(n, max_blksz), uio);

	/*
	 * If in append mode, set the io offset pointer to eof.
	 */
	if (ioflag & FAPPEND) {
		/*
		 * Obtain an appending range lock to guarantee file append
		 * semantics.  We reset the write offset once we have the lock.
		 */
		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
		woff = rl->r_off;
		if (rl->r_len == UINT64_MAX) {
			/*
			 * We overlocked the file because this write will cause
			 * the file block size to increase.
			 * Note that z_size cannot change with this lock held.
			 */
			woff = zp->z_size;
		}
		uio->uio_loffset = woff;
	} else {
		/*
		 * Note that if the file block size will change as a result of
		 * this write, then this range lock will lock the entire file
		 * so that we can re-write the block safely.
		 */
		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
	}

	if (woff >= limit) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EFBIG));
	}

	if ((woff + n) > limit || woff > (limit - n))
		n = limit - woff;

	/* Will this write extend the file length? */
	write_eof = (woff + n > zp->z_size);

	end_size = MAX(zp->z_size, woff + n);

	/*
	 * Write the file in reasonable size chunks.  Each chunk is written
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (n > 0) {
		abuf = NULL;
		woff = uio->uio_loffset;
		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			error = SET_ERROR(EDQUOT);
			break;
		}

		if (xuio && abuf == NULL) {
			ASSERT(i_iov < iovcnt);
			aiov = &iovp[i_iov];
			abuf = dmu_xuio_arcbuf(xuio, i_iov);
			dmu_xuio_clear(xuio, i_iov);
			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
			    iovec_t *, aiov, arc_buf_t *, abuf);
			ASSERT((aiov->iov_base == abuf->b_data) ||
			    ((char *)aiov->iov_base - (char *)abuf->b_data +
			    aiov->iov_len == arc_buf_size(abuf)));
			i_iov++;
		} else if (abuf == NULL && n >= max_blksz &&
		    woff >= zp->z_size &&
		    P2PHASE(woff, max_blksz) == 0 &&
		    zp->z_blksz == max_blksz) {
			/*
			 * This write covers a full block.  "Borrow" a buffer
			 * from the dmu so that we can fill it before we enter
			 * a transaction.  This avoids the possibility of
			 * holding up the transaction if the data copy hangs
			 * up on a pagefault (e.g., from an NFS server mapping).
			 */
			size_t cbytes;

			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
			    max_blksz);
			ASSERT(abuf != NULL);
			ASSERT(arc_buf_size(abuf) == max_blksz);
			if (error = uiocopy(abuf->b_data, max_blksz,
			    UIO_WRITE, uio, &cbytes)) {
				dmu_return_arcbuf(abuf);
				break;
			}
			ASSERT(cbytes == max_blksz);
		}

		/*
		 * Start a transaction.
		 */
		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			break;
		}

		/*
		 * If zfs_range_lock() over-locked we grow the blocksize
		 * and then reduce the lock range.  This will only happen
		 * on the first iteration since zfs_range_reduce() will
		 * shrink down r_len to the appropriate size.
		 */
		if (rl->r_len == UINT64_MAX) {
			uint64_t new_blksz;

			if (zp->z_blksz > max_blksz) {
				/*
				 * File's blocksize is already larger than the
				 * "recordsize" property.  Only let it grow to
				 * the next power of 2.
				 */
				ASSERT(!ISP2(zp->z_blksz));
				new_blksz = MIN(end_size,
				    1 << highbit64(zp->z_blksz));
			} else {
				new_blksz = MIN(end_size, max_blksz);
			}
			zfs_grow_blocksize(zp, new_blksz, tx);
			zfs_range_reduce(rl, woff, n);
		}

		/*
		 * XXX - should we really limit each write to z_max_blksz?
		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
		 */
		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));

		if (abuf == NULL) {
			tx_bytes = uio->uio_resid;
			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes, tx);
			tx_bytes -= uio->uio_resid;
		} else {
			tx_bytes = nbytes;
			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
			/*
			 * If this is not a full block write, but we are
			 * extending the file past EOF and this data starts
			 * block-aligned, use assign_arcbuf().  Otherwise,
			 * write via dmu_write().
			 */
			if (tx_bytes < max_blksz && (!write_eof ||
			    aiov->iov_base != abuf->b_data)) {
				ASSERT(xuio);
				dmu_write(zfsvfs->z_os, zp->z_id, woff,
				    aiov->iov_len, aiov->iov_base, tx);
				dmu_return_arcbuf(abuf);
				xuio_stat_wbuf_copied();
			} else {
				ASSERT(xuio || tx_bytes == max_blksz);
				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
				    woff, abuf, tx);
			}
			ASSERT(tx_bytes <= uio->uio_resid);
			uioskip(uio, tx_bytes);
		}
		if (tx_bytes && vn_has_cached_data(vp)) {
			update_pages(vp, woff,
			    tx_bytes, zfsvfs->z_os, zp->z_id);
		}

		/*
		 * If we made no progress, we're done.  If we made even
		 * partial progress, update the znode and ZIL accordingly.
		 */
		if (tx_bytes == 0) {
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
			    (void *)&zp->z_size, sizeof (uint64_t), tx);
			dmu_tx_commit(tx);
			ASSERT(error != 0);
			break;
		}

		/*
		 * Clear Set-UID/Set-GID bits on successful write if not
		 * privileged and at least one of the execute bits is set.
		 *
		 * It would be nice to do this after all writes have
		 * been done, but that would still expose the ISUID/ISGID
		 * bits to another app after the partial write is committed.
		 *
		 * Note: we don't call zfs_fuid_map_id() here because
		 * user 0 is not an ephemeral uid.
		 */
		mutex_enter(&zp->z_acl_lock);
		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
		    (S_IXUSR >> 6))) != 0 &&
		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
		    secpolicy_vnode_setid_retain(cr,
		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
			uint64_t newmode;
			zp->z_mode &= ~(S_ISUID | S_ISGID);
			newmode = zp->z_mode;
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
			    (void *)&newmode, sizeof (uint64_t), tx);
		}
		mutex_exit(&zp->z_acl_lock);

		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
		    B_TRUE);

		/*
		 * Update the file size (zp_size) if it has changed;
		 * account for possible concurrent updates.
		 */
		while ((end_size = zp->z_size) < uio->uio_loffset) {
			(void) atomic_cas_64(&zp->z_size, end_size,
			    uio->uio_loffset);
			ASSERT(error == 0);
		}
		/*
		 * If we are replaying and eof is non-zero then force
		 * the file size to the specified eof. Note, there's no
		 * concurrency during replay.
		 */
		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
			zp->z_size = zfsvfs->z_replay_eof;

		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);

		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
		dmu_tx_commit(tx);

		if (error != 0)
			break;
		ASSERT(tx_bytes == nbytes);
		n -= nbytes;

		if (!xuio && n > 0)
			uio_prefaultpages(MIN(n, max_blksz), uio);
	}

	zfs_range_unlock(rl);

	/*
	 * If we're in replay mode, or we made no progress, return error.
	 * Otherwise, it's at least a partial write, so it's successful.
	 */
	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (ioflag & (FSYNC | FDSYNC) ||
	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, zp->z_id);

	ZFS_EXIT(zfsvfs);
	return (0);
}

void
zfs_get_done(zgd_t *zgd, int error)
{
	znode_t *zp = zgd->zgd_private;
	objset_t *os = zp->z_zfsvfs->z_os;

	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_range_unlock(zgd->zgd_rl);

	/*
	 * Release the vnode asynchronously as we currently have the
	 * txg stopped from syncing.
	 */
	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));

	if (error == 0 && zgd->zgd_bp)
		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);

	kmem_free(zgd, sizeof (zgd_t));
}

#ifdef DEBUG
static int zil_fault_io = 0;
#endif

/*
 * Get data to generate a TX_WRITE intent log record.
 */
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t object = lr->lr_foid;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error = 0;

	ASSERT(zio != NULL);
	ASSERT(size != 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, object, &zp) != 0)
		return (SET_ERROR(ENOENT));
	if (zp->z_unlinked) {
		/*
		 * Release the vnode asynchronously as we currently have the
		 * txg stopped from syncing.
		 */
		VN_RELE_ASYNC(ZTOV(zp),
		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
		return (SET_ERROR(ENOENT));
	}

	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_zilog = zfsvfs->z_log;
	zgd->zgd_private = zp;

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (offset >= zp->z_size) {
			error = SET_ERROR(ENOENT);
		} else {
			error = dmu_read(os, object, offset, size, buf,
			    DMU_READ_NO_PREFETCH);
		}
		ASSERT(error == 0 || error == ENOENT);
	} else { /* indirect write */
		/*
		 * Have to lock the whole block to ensure that when it's
		 * written out and its checksum is being calculated
		 * no one can change the data.  We need to re-check the
		 * blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			uint64_t blkoff;
			size = zp->z_blksz;
			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
			offset -= blkoff;
			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
			    RL_READER);
			if (zp->z_blksz == size)
				break;
			offset += blkoff;
			zfs_range_unlock(zgd->zgd_rl);
		}
		/* test for truncation needs to be done while range locked */
		if (lr->lr_offset >= zp->z_size)
			error = SET_ERROR(ENOENT);
#ifdef DEBUG
		if (zil_fault_io) {
			error = SET_ERROR(EIO);
			zil_fault_io = 0;
		}
#endif
		if (error == 0)
			error = dmu_buf_hold(os, object, offset, zgd, &db,
			    DMU_READ_NO_PREFETCH);

		if (error == 0) {
			blkptr_t *bp = &lr->lr_blkptr;

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zfs_get_done, zgd);
			ASSERT(error || lr->lr_length <= size);

			/*
			 * On success, we need to wait for the write I/O
			 * initiated by dmu_sync() to complete before we can
			 * release this dbuf.  We will finish everything up
			 * in the zfs_get_done() callback.
			 */
			if (error == 0)
				return (0);

			if (error == EALREADY) {
				lr->lr_common.lrc_txtype = TX_WRITE2;
				error = 0;
			}
		}
	}

	zfs_get_done(zgd, error);

	return (error);
}
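
/*
 * Note (informational; record types assumed from the ZIL): a non-NULL
 * "buf" corresponds to a WR_NEED_COPY record, where the data is copied
 * into the log record itself; buf == NULL is the WR_INDIRECT case,
 * where dmu_sync() writes the block and the record stores only a block
 * pointer.  EALREADY from dmu_sync() means the block was already
 * committed by the DMU, so the record is downgraded to TX_WRITE2.
 */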
1164
1165/*ARGSUSED*/
1166static int
1167zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1168    caller_context_t *ct)
1169{
1170	znode_t *zp = VTOZ(vp);
1171	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1172	int error;
1173
1174	ZFS_ENTER(zfsvfs);
1175	ZFS_VERIFY_ZP(zp);
1176
1177	if (flag & V_ACE_MASK)
1178		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1179	else
1180		error = zfs_zaccess_rwx(zp, mode, flag, cr);
1181
1182	ZFS_EXIT(zfsvfs);
1183	return (error);
1184}
1185
1186/*
1187 * If vnode is for a device return a specfs vnode instead.
1188 */
1189static int
1190specvp_check(vnode_t **vpp, cred_t *cr)
1191{
1192	int error = 0;
1193
1194	if (IS_DEVVP(*vpp)) {
1195		struct vnode *svp;
1196
1197		svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1198		VN_RELE(*vpp);
1199		if (svp == NULL)
1200			error = SET_ERROR(ENOSYS);
1201		*vpp = svp;
1202	}
1203	return (error);
1204}
1205
1206
1207/*
1208 * Lookup an entry in a directory, or an extended attribute directory.
1209 * If it exists, return a held vnode reference for it.
1210 *
1211 *	IN:	dvp	- vnode of directory to search.
1212 *		nm	- name of entry to lookup.
1213 *		pnp	- full pathname to lookup [UNUSED].
1214 *		flags	- LOOKUP_XATTR set if looking for an attribute.
1215 *		rdir	- root directory vnode [UNUSED].
1216 *		cr	- credentials of caller.
1217 *		ct	- caller context
1218 *		direntflags - directory lookup flags
1219 *		realpnp - returned pathname.
1220 *
1221 *	OUT:	vpp	- vnode of located entry, NULL if not found.
1222 *
1223 *	RETURN:	0 on success, error code on failure.
1224 *
1225 * Timestamps:
1226 *	NA
1227 */
1228/* ARGSUSED */
1229static int
1230zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1231    int flags, vnode_t *rdir, cred_t *cr,  caller_context_t *ct,
1232    int *direntflags, pathname_t *realpnp)
1233{
1234	znode_t *zdp = VTOZ(dvp);
1235	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1236	int	error = 0;
1237
1238	/*
1239	 * Fast path lookup, however we must skip DNLC lookup
1240	 * for case folding or normalizing lookups because the
1241	 * DNLC code only stores the passed in name.  This means
1242	 * creating 'a' and removing 'A' on a case insensitive
1243	 * file system would work, but DNLC still thinks 'a'
1244	 * exists and won't let you create it again on the next
1245	 * pass through fast path.
1246	 */
1247	if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
1248
1249		if (dvp->v_type != VDIR) {
1250			return (SET_ERROR(ENOTDIR));
1251		} else if (zdp->z_sa_hdl == NULL) {
1252			return (SET_ERROR(EIO));
1253		}
1254
1255		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
1256			error = zfs_fastaccesschk_execute(zdp, cr);
1257			if (!error) {
1258				*vpp = dvp;
1259				VN_HOLD(*vpp);
1260				return (0);
1261			}
1262			return (error);
1263		} else if (!zdp->z_zfsvfs->z_norm &&
1264		    (zdp->z_zfsvfs->z_case == ZFS_CASE_SENSITIVE)) {
1265
1266			vnode_t *tvp = dnlc_lookup(dvp, nm);
1267
1268			if (tvp) {
1269				error = zfs_fastaccesschk_execute(zdp, cr);
1270				if (error) {
1271					VN_RELE(tvp);
1272					return (error);
1273				}
1274				if (tvp == DNLC_NO_VNODE) {
1275					VN_RELE(tvp);
1276					return (SET_ERROR(ENOENT));
1277				} else {
1278					*vpp = tvp;
1279					return (specvp_check(vpp, cr));
1280				}
1281			}
1282		}
1283	}
1284
1285	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
1286
1287	ZFS_ENTER(zfsvfs);
1288	ZFS_VERIFY_ZP(zdp);
1289
1290	*vpp = NULL;
1291
1292	if (flags & LOOKUP_XATTR) {
1293		/*
1294		 * If the xattr property is off, refuse the lookup request.
1295		 */
1296		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1297			ZFS_EXIT(zfsvfs);
1298			return (SET_ERROR(EINVAL));
1299		}
1300
1301		/*
1302		 * We don't allow recursive attributes..
1303		 * Maybe someday we will.
1304		 */
1305		if (zdp->z_pflags & ZFS_XATTR) {
1306			ZFS_EXIT(zfsvfs);
1307			return (SET_ERROR(EINVAL));
1308		}
1309
1310		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1311			ZFS_EXIT(zfsvfs);
1312			return (error);
1313		}
1314
1315		/*
1316		 * Do we have permission to get into attribute directory?
1317		 */
1318
1319		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
1320		    B_FALSE, cr)) {
1321			VN_RELE(*vpp);
1322			*vpp = NULL;
1323		}
1324
1325		ZFS_EXIT(zfsvfs);
1326		return (error);
1327	}
1328
1329	if (dvp->v_type != VDIR) {
1330		ZFS_EXIT(zfsvfs);
1331		return (SET_ERROR(ENOTDIR));
1332	}
1333
1334	/*
1335	 * Check accessibility of directory.
1336	 */
1337
1338	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
1339		ZFS_EXIT(zfsvfs);
1340		return (error);
1341	}
1342
1343	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1344	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1345		ZFS_EXIT(zfsvfs);
1346		return (SET_ERROR(EILSEQ));
1347	}
1348
1349	error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
1350	if (error == 0)
1351		error = specvp_check(vpp, cr);
1352
1353	ZFS_EXIT(zfsvfs);
1354	return (error);
1355}
1356
1357/*
1358 * Attempt to create a new entry in a directory.  If the entry
1359 * already exists, truncate the file if permissible, else return
1360 * an error.  Return the vp of the created or trunc'd file.
1361 *
1362 *	IN:	dvp	- vnode of directory to put new file entry in.
1363 *		name	- name of new file entry.
1364 *		vap	- attributes of new file.
1365 *		excl	- flag indicating exclusive or non-exclusive mode.
1366 *		mode	- mode to open file with.
1367 *		cr	- credentials of caller.
1368 *		flag	- large file flag [UNUSED].
1369 *		ct	- caller context
1370 *		vsecp	- ACL to be set
1371 *
1372 *	OUT:	vpp	- vnode of created or trunc'd entry.
1373 *
1374 *	RETURN:	0 on success, error code on failure.
1375 *
1376 * Timestamps:
1377 *	dvp - ctime|mtime updated if new entry created
1378 *	 vp - ctime|mtime always, atime if new
1379 */
1380
1381/* ARGSUSED */
1382static int
1383zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl,
1384    int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
1385    vsecattr_t *vsecp)
1386{
1387	znode_t		*zp, *dzp = VTOZ(dvp);
1388	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1389	zilog_t		*zilog;
1390	objset_t	*os;
1391	zfs_dirlock_t	*dl;
1392	dmu_tx_t	*tx;
1393	int		error;
1394	ksid_t		*ksid;
1395	uid_t		uid;
1396	gid_t		gid = crgetgid(cr);
1397	zfs_acl_ids_t   acl_ids;
1398	boolean_t	fuid_dirtied;
1399	boolean_t	have_acl = B_FALSE;
1400	boolean_t	waited = B_FALSE;
1401
1402	/*
1403	 * If we have an ephemeral id, ACL, or XVATTR then
1404	 * make sure file system is at proper version
1405	 */
1406
1407	ksid = crgetsid(cr, KSID_OWNER);
1408	if (ksid)
1409		uid = ksid_getid(ksid);
1410	else
1411		uid = crgetuid(cr);
1412
1413	if (zfsvfs->z_use_fuids == B_FALSE &&
1414	    (vsecp || (vap->va_mask & AT_XVATTR) ||
1415	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1416		return (SET_ERROR(EINVAL));
1417
1418	ZFS_ENTER(zfsvfs);
1419	ZFS_VERIFY_ZP(dzp);
1420	os = zfsvfs->z_os;
1421	zilog = zfsvfs->z_log;
1422
1423	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1424	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1425		ZFS_EXIT(zfsvfs);
1426		return (SET_ERROR(EILSEQ));
1427	}
1428
1429	if (vap->va_mask & AT_XVATTR) {
1430		if ((error = secpolicy_xvattr((xvattr_t *)vap,
1431		    crgetuid(cr), cr, vap->va_type)) != 0) {
1432			ZFS_EXIT(zfsvfs);
1433			return (error);
1434		}
1435	}
1436top:
1437	*vpp = NULL;
1438
1439	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
1440		vap->va_mode &= ~VSVTX;
1441
1442	if (*name == '\0') {
1443		/*
1444		 * Null component name refers to the directory itself.
1445		 */
1446		VN_HOLD(dvp);
1447		zp = dzp;
1448		dl = NULL;
1449		error = 0;
1450	} else {
1451		/* possible VN_HOLD(zp) */
1452		int zflg = 0;
1453
1454		if (flag & FIGNORECASE)
1455			zflg |= ZCILOOK;
1456
1457		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1458		    NULL, NULL);
1459		if (error) {
1460			if (have_acl)
1461				zfs_acl_ids_free(&acl_ids);
1462			if (strcmp(name, "..") == 0)
1463				error = SET_ERROR(EISDIR);
1464			ZFS_EXIT(zfsvfs);
1465			return (error);
1466		}
1467	}
1468
1469	if (zp == NULL) {
1470		uint64_t txtype;
1471
1472		/*
1473		 * Create a new file object and update the directory
1474		 * to reference it.
1475		 */
1476		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
1477			if (have_acl)
1478				zfs_acl_ids_free(&acl_ids);
1479			goto out;
1480		}
1481
1482		/*
1483		 * We only support the creation of regular files in
1484		 * extended attribute directories.
1485		 */
1486
1487		if ((dzp->z_pflags & ZFS_XATTR) &&
1488		    (vap->va_type != VREG)) {
1489			if (have_acl)
1490				zfs_acl_ids_free(&acl_ids);
1491			error = SET_ERROR(EINVAL);
1492			goto out;
1493		}
1494
1495		if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
1496		    cr, vsecp, &acl_ids)) != 0)
1497			goto out;
1498		have_acl = B_TRUE;
1499
1500		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1501			zfs_acl_ids_free(&acl_ids);
1502			error = SET_ERROR(EDQUOT);
1503			goto out;
1504		}
1505
1506		tx = dmu_tx_create(os);
1507
1508		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1509		    ZFS_SA_BASE_ATTR_SIZE);
1510
1511		fuid_dirtied = zfsvfs->z_fuid_dirty;
1512		if (fuid_dirtied)
1513			zfs_fuid_txhold(zfsvfs, tx);
1514		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1515		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1516		if (!zfsvfs->z_use_sa &&
1517		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1518			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1519			    0, acl_ids.z_aclp->z_acl_bytes);
1520		}
1521		error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
1522		if (error) {
1523			zfs_dirent_unlock(dl);
1524			if (error == ERESTART) {
1525				waited = B_TRUE;
1526				dmu_tx_wait(tx);
1527				dmu_tx_abort(tx);
1528				goto top;
1529			}
1530			zfs_acl_ids_free(&acl_ids);
1531			dmu_tx_abort(tx);
1532			ZFS_EXIT(zfsvfs);
1533			return (error);
1534		}
1535		zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1536
1537		if (fuid_dirtied)
1538			zfs_fuid_sync(zfsvfs, tx);
1539
1540		(void) zfs_link_create(dl, zp, tx, ZNEW);
1541		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1542		if (flag & FIGNORECASE)
1543			txtype |= TX_CI;
1544		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1545		    vsecp, acl_ids.z_fuidp, vap);
1546		zfs_acl_ids_free(&acl_ids);
1547		dmu_tx_commit(tx);
1548	} else {
1549		int aflags = (flag & FAPPEND) ? V_APPEND : 0;
1550
1551		if (have_acl)
1552			zfs_acl_ids_free(&acl_ids);
1553		have_acl = B_FALSE;
1554
1555		/*
1556		 * A directory entry already exists for this name.
1557		 */
1558		/*
1559		 * Can't truncate an existing file if in exclusive mode.
1560		 */
1561		if (excl == EXCL) {
1562			error = SET_ERROR(EEXIST);
1563			goto out;
1564		}
1565		/*
1566		 * Can't open a directory for writing.
1567		 */
1568		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
1569			error = SET_ERROR(EISDIR);
1570			goto out;
1571		}
1572		/*
1573		 * Verify requested access to file.
1574		 */
1575		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
1576			goto out;
1577		}
1578
1579		mutex_enter(&dzp->z_lock);
1580		dzp->z_seq++;
1581		mutex_exit(&dzp->z_lock);
1582
1583		/*
1584		 * Truncate regular files if requested.
1585		 */
1586		if ((ZTOV(zp)->v_type == VREG) &&
1587		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
1588			/* we can't hold any locks when calling zfs_freesp() */
1589			zfs_dirent_unlock(dl);
1590			dl = NULL;
1591			error = zfs_freesp(zp, 0, 0, mode, TRUE);
1592			if (error == 0) {
1593				vnevent_create(ZTOV(zp), ct);
1594			}
1595		}
1596	}
1597out:
1598
1599	if (dl)
1600		zfs_dirent_unlock(dl);
1601
1602	if (error) {
1603		if (zp)
1604			VN_RELE(ZTOV(zp));
1605	} else {
1606		*vpp = ZTOV(zp);
1607		error = specvp_check(vpp, cr);
1608	}
1609
1610	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1611		zil_commit(zilog, 0);
1612
1613	ZFS_EXIT(zfsvfs);
1614	return (error);
1615}
1616
1617/*
1618 * Remove an entry from a directory.
1619 *
1620 *	IN:	dvp	- vnode of directory to remove entry from.
1621 *		name	- name of entry to remove.
1622 *		cr	- credentials of caller.
1623 *		ct	- caller context
1624 *		flags	- case flags
1625 *
1626 *	RETURN:	0 on success, error code on failure.
1627 *
1628 * Timestamps:
1629 *	dvp - ctime|mtime
1630 *	 vp - ctime (if nlink > 0)
1631 */
1632
1633uint64_t null_xattr = 0;
1634
1635/*ARGSUSED*/
1636static int
1637zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
1638    int flags)
1639{
1640	znode_t		*zp, *dzp = VTOZ(dvp);
1641	znode_t		*xzp;
1642	vnode_t		*vp;
1643	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1644	zilog_t		*zilog;
1645	uint64_t	acl_obj, xattr_obj;
1646	uint64_t	xattr_obj_unlinked = 0;
1647	uint64_t	obj = 0;
1648	zfs_dirlock_t	*dl;
1649	dmu_tx_t	*tx;
1650	boolean_t	may_delete_now, delete_now = FALSE;
1651	boolean_t	unlinked, toobig = FALSE;
1652	uint64_t	txtype;
1653	pathname_t	*realnmp = NULL;
1654	pathname_t	realnm;
1655	int		error;
1656	int		zflg = ZEXISTS;
1657	boolean_t	waited = B_FALSE;
1658
1659	ZFS_ENTER(zfsvfs);
1660	ZFS_VERIFY_ZP(dzp);
1661	zilog = zfsvfs->z_log;
1662
1663	if (flags & FIGNORECASE) {
1664		zflg |= ZCILOOK;
1665		pn_alloc(&realnm);
1666		realnmp = &realnm;
1667	}
1668
1669top:
1670	xattr_obj = 0;
1671	xzp = NULL;
1672	/*
1673	 * Attempt to lock directory; fail if entry doesn't exist.
1674	 */
1675	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1676	    NULL, realnmp)) {
1677		if (realnmp)
1678			pn_free(realnmp);
1679		ZFS_EXIT(zfsvfs);
1680		return (error);
1681	}
1682
1683	vp = ZTOV(zp);
1684
1685	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1686		goto out;
1687	}
1688
1689	/*
1690	 * Need to use rmdir for removing directories.
1691	 */
1692	if (vp->v_type == VDIR) {
1693		error = SET_ERROR(EPERM);
1694		goto out;
1695	}
1696
1697	vnevent_remove(vp, dvp, name, ct);
1698
1699	if (realnmp)
1700		dnlc_remove(dvp, realnmp->pn_buf);
1701	else
1702		dnlc_remove(dvp, name);
1703
1704	mutex_enter(&vp->v_lock);
1705	may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
1706	mutex_exit(&vp->v_lock);
1707
1708	/*
1709	 * We may delete the znode now, or we may put it in the unlinked set;
1710	 * it depends on whether we're the last link, and on whether there are
1711	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
1712	 * allow for either case.
1713	 */
1714	obj = zp->z_id;
1715	tx = dmu_tx_create(zfsvfs->z_os);
1716	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1717	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1718	zfs_sa_upgrade_txholds(tx, zp);
1719	zfs_sa_upgrade_txholds(tx, dzp);
1720	if (may_delete_now) {
1721		toobig =
1722		    zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
1723		/* if the file is too big, only hold_free a token amount */
1724		dmu_tx_hold_free(tx, zp->z_id, 0,
1725		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
1726	}
1727
1728	/* are there any extended attributes? */
1729	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1730	    &xattr_obj, sizeof (xattr_obj));
1731	if (error == 0 && xattr_obj) {
1732		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1733		ASSERT0(error);
1734		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1735		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1736	}
1737
1738	mutex_enter(&zp->z_lock);
1739	if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
1740		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1741	mutex_exit(&zp->z_lock);
1742
1743	/* charge as an update -- would be nice not to charge at all */
1744	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1745
1746	/*
1747	 * Mark this transaction as typically resulting in a net free of space
1748	 */
1749	dmu_tx_mark_netfree(tx);
1750
1751	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
1752	if (error) {
1753		zfs_dirent_unlock(dl);
1754		VN_RELE(vp);
1755		if (xzp)
1756			VN_RELE(ZTOV(xzp));
1757		if (error == ERESTART) {
1758			waited = B_TRUE;
1759			dmu_tx_wait(tx);
1760			dmu_tx_abort(tx);
1761			goto top;
1762		}
1763		if (realnmp)
1764			pn_free(realnmp);
1765		dmu_tx_abort(tx);
1766		ZFS_EXIT(zfsvfs);
1767		return (error);
1768	}
1769
1770	/*
1771	 * Remove the directory entry.
1772	 */
1773	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1774
1775	if (error) {
1776		dmu_tx_commit(tx);
1777		goto out;
1778	}
1779
1780	if (unlinked) {
1781		/*
1782		 * Hold z_lock so that we can make sure that the ACL obj
1783		 * hasn't changed.  Could have been deleted due to
1784		 * zfs_sa_upgrade().
1785		 */
1786		mutex_enter(&zp->z_lock);
1787		mutex_enter(&vp->v_lock);
1788		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1789		    &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
1790		delete_now = may_delete_now && !toobig &&
1791		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
1792		    xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
1793		    acl_obj;
1794		mutex_exit(&vp->v_lock);
1795	}
1796
1797	if (delete_now) {
1798		if (xattr_obj_unlinked) {
1799			ASSERT3U(xzp->z_links, ==, 2);
1800			mutex_enter(&xzp->z_lock);
1801			xzp->z_unlinked = 1;
1802			xzp->z_links = 0;
1803			error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
1804			    &xzp->z_links, sizeof (xzp->z_links), tx);
1805			ASSERT3U(error,  ==,  0);
1806			mutex_exit(&xzp->z_lock);
1807			zfs_unlinked_add(xzp, tx);
1808
1809			if (zp->z_is_sa)
1810				error = sa_remove(zp->z_sa_hdl,
1811				    SA_ZPL_XATTR(zfsvfs), tx);
1812			else
1813				error = sa_update(zp->z_sa_hdl,
1814				    SA_ZPL_XATTR(zfsvfs), &null_xattr,
1815				    sizeof (uint64_t), tx);
1816			ASSERT0(error);
1817		}
1818		mutex_enter(&vp->v_lock);
1819		VN_RELE_LOCKED(vp);
1820		ASSERT0(vp->v_count);
1821		mutex_exit(&vp->v_lock);
1822		mutex_exit(&zp->z_lock);
1823		zfs_znode_delete(zp, tx);
1824	} else if (unlinked) {
1825		mutex_exit(&zp->z_lock);
1826		zfs_unlinked_add(zp, tx);
1827	}
1828
1829	txtype = TX_REMOVE;
1830	if (flags & FIGNORECASE)
1831		txtype |= TX_CI;
1832	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
1833
1834	dmu_tx_commit(tx);
1835out:
1836	if (realnmp)
1837		pn_free(realnmp);
1838
1839	zfs_dirent_unlock(dl);
1840
1841	if (!delete_now)
1842		VN_RELE(vp);
1843	if (xzp)
1844		VN_RELE(ZTOV(xzp));
1845
1846	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1847		zil_commit(zilog, 0);
1848
1849	ZFS_EXIT(zfsvfs);
1850	return (error);
1851}
1852
1853/*
1854 * Create a new directory and insert it into dvp using the name
1855 * provided.  Return a pointer to the inserted directory.
1856 *
1857 *	IN:	dvp	- vnode of directory to add subdir to.
1858 *		dirname	- name of new directory.
1859 *		vap	- attributes of new directory.
1860 *		cr	- credentials of caller.
1861 *		ct	- caller context
1862 *		flags	- case flags
1863 *		vsecp	- ACL to be set
1864 *
1865 *	OUT:	vpp	- vnode of created directory.
1866 *
1867 *	RETURN:	0 on success, error code on failure.
1868 *
1869 * Timestamps:
1870 *	dvp - ctime|mtime updated
1871 *	 vp - ctime|mtime|atime updated
1872 */
1873/*ARGSUSED*/
1874static int
1875zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
1876    caller_context_t *ct, int flags, vsecattr_t *vsecp)
1877{
1878	znode_t		*zp, *dzp = VTOZ(dvp);
1879	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1880	zilog_t		*zilog;
1881	zfs_dirlock_t	*dl;
1882	uint64_t	txtype;
1883	dmu_tx_t	*tx;
1884	int		error;
1885	int		zf = ZNEW;
1886	ksid_t		*ksid;
1887	uid_t		uid;
1888	gid_t		gid = crgetgid(cr);
1889	zfs_acl_ids_t   acl_ids;
1890	boolean_t	fuid_dirtied;
1891	boolean_t	waited = B_FALSE;
1892
1893	ASSERT(vap->va_type == VDIR);
1894
1895	/*
1896	 * If we have an ephemeral id, ACL, or XVATTR then
1897	 * make sure file system is at proper version
1898	 */
1899
1900	ksid = crgetsid(cr, KSID_OWNER);
1901	if (ksid)
1902		uid = ksid_getid(ksid);
1903	else
1904		uid = crgetuid(cr);
1905	if (zfsvfs->z_use_fuids == B_FALSE &&
1906	    (vsecp || (vap->va_mask & AT_XVATTR) ||
1907	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1908		return (SET_ERROR(EINVAL));
1909
1910	ZFS_ENTER(zfsvfs);
1911	ZFS_VERIFY_ZP(dzp);
1912	zilog = zfsvfs->z_log;
1913
1914	if (dzp->z_pflags & ZFS_XATTR) {
1915		ZFS_EXIT(zfsvfs);
1916		return (SET_ERROR(EINVAL));
1917	}
1918
1919	if (zfsvfs->z_utf8 && u8_validate(dirname,
1920	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1921		ZFS_EXIT(zfsvfs);
1922		return (SET_ERROR(EILSEQ));
1923	}
1924	if (flags & FIGNORECASE)
1925		zf |= ZCILOOK;
1926
1927	if (vap->va_mask & AT_XVATTR) {
1928		if ((error = secpolicy_xvattr((xvattr_t *)vap,
1929		    crgetuid(cr), cr, vap->va_type)) != 0) {
1930			ZFS_EXIT(zfsvfs);
1931			return (error);
1932		}
1933	}
1934
1935	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
1936	    vsecp, &acl_ids)) != 0) {
1937		ZFS_EXIT(zfsvfs);
1938		return (error);
1939	}
1940	/*
1941	 * First make sure the new directory doesn't exist.
1942	 *
1943	 * Existence is checked first to make sure we don't return
1944	 * EACCES instead of EEXIST which can cause some applications
1945	 * to fail.
1946	 */
1947top:
1948	*vpp = NULL;
1949
1950	if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
1951	    NULL, NULL)) {
1952		zfs_acl_ids_free(&acl_ids);
1953		ZFS_EXIT(zfsvfs);
1954		return (error);
1955	}
1956
1957	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
1958		zfs_acl_ids_free(&acl_ids);
1959		zfs_dirent_unlock(dl);
1960		ZFS_EXIT(zfsvfs);
1961		return (error);
1962	}
1963
1964	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1965		zfs_acl_ids_free(&acl_ids);
1966		zfs_dirent_unlock(dl);
1967		ZFS_EXIT(zfsvfs);
1968		return (SET_ERROR(EDQUOT));
1969	}
1970
1971	/*
1972	 * Add a new entry to the directory.
1973	 */
1974	tx = dmu_tx_create(zfsvfs->z_os);
1975	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1976	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
1977	fuid_dirtied = zfsvfs->z_fuid_dirty;
1978	if (fuid_dirtied)
1979		zfs_fuid_txhold(zfsvfs, tx);
1980	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1981		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
1982		    acl_ids.z_aclp->z_acl_bytes);
1983	}
1984
1985	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1986	    ZFS_SA_BASE_ATTR_SIZE);
1987
1988	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
1989	if (error) {
1990		zfs_dirent_unlock(dl);
1991		if (error == ERESTART) {
1992			waited = B_TRUE;
1993			dmu_tx_wait(tx);
1994			dmu_tx_abort(tx);
1995			goto top;
1996		}
1997		zfs_acl_ids_free(&acl_ids);
1998		dmu_tx_abort(tx);
1999		ZFS_EXIT(zfsvfs);
2000		return (error);
2001	}
2002
2003	/*
2004	 * Create new node.
2005	 */
2006	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2007
2008	if (fuid_dirtied)
2009		zfs_fuid_sync(zfsvfs, tx);
2010
2011	/*
2012	 * Now put new name in parent dir.
2013	 */
2014	(void) zfs_link_create(dl, zp, tx, ZNEW);
2015
2016	*vpp = ZTOV(zp);
2017
2018	txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
2019	if (flags & FIGNORECASE)
2020		txtype |= TX_CI;
2021	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
2022	    acl_ids.z_fuidp, vap);
2023
2024	zfs_acl_ids_free(&acl_ids);
2025
2026	dmu_tx_commit(tx);
2027
2028	zfs_dirent_unlock(dl);
2029
2030	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2031		zil_commit(zilog, 0);
2032
2033	ZFS_EXIT(zfsvfs);
2034	return (0);
2035}
2036
2037/*
2038 * Remove a directory subdir entry.  If the current working
2039 * directory is the same as the subdir to be removed, the
2040 * remove will fail.
2041 *
2042 *	IN:	dvp	- vnode of directory to remove from.
2043 *		name	- name of directory to be removed.
2044 *		cwd	- vnode of current working directory.
2045 *		cr	- credentials of caller.
2046 *		ct	- caller context
2047 *		flags	- case flags
2048 *
2049 *	RETURN:	0 on success, error code on failure.
2050 *
2051 * Timestamps:
2052 *	dvp - ctime|mtime updated
2053 */
2054/*ARGSUSED*/
2055static int
2056zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
2057    caller_context_t *ct, int flags)
2058{
2059	znode_t		*dzp = VTOZ(dvp);
2060	znode_t		*zp;
2061	vnode_t		*vp;
2062	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2063	zilog_t		*zilog;
2064	zfs_dirlock_t	*dl;
2065	dmu_tx_t	*tx;
2066	int		error;
2067	int		zflg = ZEXISTS;
2068	boolean_t	waited = B_FALSE;
2069
2070	ZFS_ENTER(zfsvfs);
2071	ZFS_VERIFY_ZP(dzp);
2072	zilog = zfsvfs->z_log;
2073
2074	if (flags & FIGNORECASE)
2075		zflg |= ZCILOOK;
2076top:
2077	zp = NULL;
2078
2079	/*
2080	 * Attempt to lock directory; fail if entry doesn't exist.
2081	 */
2082	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
2083	    NULL, NULL)) {
2084		ZFS_EXIT(zfsvfs);
2085		return (error);
2086	}
2087
2088	vp = ZTOV(zp);
2089
2090	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2091		goto out;
2092	}
2093
2094	if (vp->v_type != VDIR) {
2095		error = SET_ERROR(ENOTDIR);
2096		goto out;
2097	}
2098
2099	if (vp == cwd) {
2100		error = SET_ERROR(EINVAL);
2101		goto out;
2102	}
2103
2104	vnevent_rmdir(vp, dvp, name, ct);
2105
2106	/*
2107	 * Grab a lock on the directory to make sure that no one is
2108	 * trying to add (or look up) entries while we are removing it.
2109	 */
2110	rw_enter(&zp->z_name_lock, RW_WRITER);
2111
2112	/*
2113	 * Grab a lock on the parent pointer to make sure we play well
2114	 * with the treewalk and directory rename code.
2115	 */
2116	rw_enter(&zp->z_parent_lock, RW_WRITER);
2117
2118	tx = dmu_tx_create(zfsvfs->z_os);
2119	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2120	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2121	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2122	zfs_sa_upgrade_txholds(tx, zp);
2123	zfs_sa_upgrade_txholds(tx, dzp);
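	/*
	 * Mark the transaction as a net free of space, so that it can
	 * succeed even when the pool is running low on free space.
	 */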
2124	dmu_tx_mark_netfree(tx);
2125	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
2126	if (error) {
2127		rw_exit(&zp->z_parent_lock);
2128		rw_exit(&zp->z_name_lock);
2129		zfs_dirent_unlock(dl);
2130		VN_RELE(vp);
2131		if (error == ERESTART) {
2132			waited = B_TRUE;
2133			dmu_tx_wait(tx);
2134			dmu_tx_abort(tx);
2135			goto top;
2136		}
2137		dmu_tx_abort(tx);
2138		ZFS_EXIT(zfsvfs);
2139		return (error);
2140	}
2141
2142	error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
2143
2144	if (error == 0) {
2145		uint64_t txtype = TX_RMDIR;
2146		if (flags & FIGNORECASE)
2147			txtype |= TX_CI;
2148		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2149	}
2150
2151	dmu_tx_commit(tx);
2152
2153	rw_exit(&zp->z_parent_lock);
2154	rw_exit(&zp->z_name_lock);
2155out:
2156	zfs_dirent_unlock(dl);
2157
2158	VN_RELE(vp);
2159
2160	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2161		zil_commit(zilog, 0);
2162
2163	ZFS_EXIT(zfsvfs);
2164	return (error);
2165}
2166
2167/*
2168 * Read as many directory entries as will fit into the provided
2169 * buffer from the given directory cursor position (specified in
2170 * the uio structure).
2171 *
2172 *	IN:	vp	- vnode of directory to read.
2173 *		uio	- structure supplying read location, range info,
2174 *			  and return buffer.
2175 *		cr	- credentials of caller.
2176 *		ct	- caller context
2177 *		flags	- V_RDDIR_ENTFLAGS and/or V_RDDIR_ACCFILTER flags
2178 *
2179 *	OUT:	uio	- updated offset and range, buffer filled.
2180 *		eofp	- set to true if end-of-file detected.
2181 *
2182 *	RETURN:	0 on success, error code on failure.
2183 *
2184 * Timestamps:
2185 *	vp - atime updated
2186 *
2187 * Note that the low 4 bits of the cookie returned by zap are always zero.
2188 * This allows us to use the low range for "special" directory entries:
2189 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
2190 * we use the offset 2 for the '.zfs' directory.
2191 */
2192/* ARGSUSED */
2193static int
2194zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
2195    caller_context_t *ct, int flags)
2196{
2197	znode_t		*zp = VTOZ(vp);
2198	iovec_t		*iovp;
2199	edirent_t	*eodp;
2200	dirent64_t	*odp;
2201	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2202	objset_t	*os;
2203	caddr_t		outbuf;
2204	size_t		bufsize;
2205	zap_cursor_t	zc;
2206	zap_attribute_t	zap;
2207	uint_t		bytes_wanted;
2208	uint64_t	offset; /* must be unsigned; checks for < 1 */
2209	uint64_t	parent;
2210	int		local_eof;
2211	int		outcount;
2212	int		error;
2213	uint8_t		prefetch;
2214	boolean_t	check_sysattrs;
2215
2216	ZFS_ENTER(zfsvfs);
2217	ZFS_VERIFY_ZP(zp);
2218
2219	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2220	    &parent, sizeof (parent))) != 0) {
2221		ZFS_EXIT(zfsvfs);
2222		return (error);
2223	}
2224
2225	/*
2226	 * If we are not given an eof variable,
2227	 * use a local one.
2228	 */
2229	if (eofp == NULL)
2230		eofp = &local_eof;
2231
2232	/*
2233	 * Check for valid iov_len.
2234	 */
2235	if (uio->uio_iov->iov_len <= 0) {
2236		ZFS_EXIT(zfsvfs);
2237		return (SET_ERROR(EINVAL));
2238	}
2239
2240	/*
2241	 * Quit if the directory has been removed (POSIX).
2242	 */
2243	if ((*eofp = zp->z_unlinked) != 0) {
2244		ZFS_EXIT(zfsvfs);
2245		return (0);
2246	}
2247
2248	error = 0;
2249	os = zfsvfs->z_os;
2250	offset = uio->uio_loffset;
2251	prefetch = zp->z_zn_prefetch;
2252
2253	/*
2254	 * Initialize the iterator cursor.  Offsets 0 through 2 address the
	 * synthetic '.', '..', and (for the filesystem root) '.zfs' entries;
	 * larger offsets are serialized ZAP cursors.
2255	 */
2256	if (offset <= 3) {
2257		/*
2258		 * Start iteration from the beginning of the directory.
2259		 */
2260		zap_cursor_init(&zc, os, zp->z_id);
2261	} else {
2262		/*
2263		 * The offset is a serialized cursor.
2264		 */
2265		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2266	}
2267
2268	/*
2269	 * Get space to change directory entries into fs independent format.
	 * If the caller supplied a single kernel-space iovec, fill it
	 * directly; otherwise fill a kernel bounce buffer and uiomove()
	 * it out when done.
2270	 */
2271	iovp = uio->uio_iov;
2272	bytes_wanted = iovp->iov_len;
2273	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
2274		bufsize = bytes_wanted;
2275		outbuf = kmem_alloc(bufsize, KM_SLEEP);
2276		odp = (struct dirent64 *)outbuf;
2277	} else {
2278		bufsize = bytes_wanted;
2279		outbuf = NULL;
2280		odp = (struct dirent64 *)iovp->iov_base;
2281	}
2282	eodp = (struct edirent *)odp;
2283
2284	/*
2285	 * If this VFS supports the system attribute view interface, and
2286	 * we're looking at an extended attribute directory, and we care
2287	 * about normalization conflicts on this vfs, then we must check
2288	 * for normalization conflicts with the sysattr name space.
2289	 */
2290	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2291	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2292	    (flags & V_RDDIR_ENTFLAGS);
2293
2294	/*
2295	 * Transform to file-system independent format
2296	 */
2297	outcount = 0;
2298	while (outcount < bytes_wanted) {
2299		ino64_t objnum;
2300		ushort_t reclen;
2301		off64_t *next = NULL;
2302
2303		/*
2304		 * Special case `.', `..', and `.zfs'.
2305		 */
2306		if (offset == 0) {
2307			(void) strcpy(zap.za_name, ".");
2308			zap.za_normalization_conflict = 0;
2309			objnum = zp->z_id;
2310		} else if (offset == 1) {
2311			(void) strcpy(zap.za_name, "..");
2312			zap.za_normalization_conflict = 0;
2313			objnum = parent;
2314		} else if (offset == 2 && zfs_show_ctldir(zp)) {
2315			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2316			zap.za_normalization_conflict = 0;
2317			objnum = ZFSCTL_INO_ROOT;
2318		} else {
2319			/*
2320			 * Grab next entry.
2321			 */
2322			if (error = zap_cursor_retrieve(&zc, &zap)) {
2323				if ((*eofp = (error == ENOENT)) != 0)
2324					break;
2325				else
2326					goto update;
2327			}
2328
2329			if (zap.za_integer_length != 8 ||
2330			    zap.za_num_integers != 1) {
2331				cmn_err(CE_WARN, "zap_readdir: bad directory "
2332				    "entry, obj = %lld, offset = %lld\n",
2333				    (u_longlong_t)zp->z_id,
2334				    (u_longlong_t)offset);
2335				error = SET_ERROR(ENXIO);
2336				goto update;
2337			}
2338
2339			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2340			/*
2341			 * Mac OS X can extract the object type here, e.g.:
2342			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2343			 */
2344
2345			if (check_sysattrs && !zap.za_normalization_conflict) {
2346				zap.za_normalization_conflict =
2347				    xattr_sysattr_casechk(zap.za_name);
2348			}
2349		}
2350
2351		if (flags & V_RDDIR_ACCFILTER) {
2352			/*
2353			 * If we have no access at all, don't include
2354			 * this entry in the returned information
2355			 */
2356			znode_t	*ezp;
2357			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
2358				goto skip_entry;
2359			if (!zfs_has_access(ezp, cr)) {
2360				VN_RELE(ZTOV(ezp));
2361				goto skip_entry;
2362			}
2363			VN_RELE(ZTOV(ezp));
2364		}
2365
2366		if (flags & V_RDDIR_ENTFLAGS)
2367			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2368		else
2369			reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2370
2371		/*
2372		 * Will this entry fit in the buffer?
2373		 */
2374		if (outcount + reclen > bufsize) {
2375			/*
2376			 * Did we manage to fit anything in the buffer?
2377			 */
2378			if (!outcount) {
2379				error = SET_ERROR(EINVAL);
2380				goto update;
2381			}
2382			break;
2383		}
2384		if (flags & V_RDDIR_ENTFLAGS) {
2385			/*
2386			 * Add extended flag entry:
2387			 */
2388			eodp->ed_ino = objnum;
2389			eodp->ed_reclen = reclen;
2390			/* NOTE: ed_off is the offset for the *next* entry */
2391			next = &(eodp->ed_off);
2392			eodp->ed_eflags = zap.za_normalization_conflict ?
2393			    ED_CASE_CONFLICT : 0;
2394			(void) strncpy(eodp->ed_name, zap.za_name,
2395			    EDIRENT_NAMELEN(reclen));
2396			eodp = (edirent_t *)((intptr_t)eodp + reclen);
2397		} else {
2398			/*
2399			 * Add normal entry:
2400			 */
2401			odp->d_ino = objnum;
2402			odp->d_reclen = reclen;
2403			/* NOTE: d_off is the offset for the *next* entry */
2404			next = &(odp->d_off);
2405			(void) strncpy(odp->d_name, zap.za_name,
2406			    DIRENT64_NAMELEN(reclen));
2407			odp = (dirent64_t *)((intptr_t)odp + reclen);
2408		}
2409		outcount += reclen;
2410
2411		ASSERT(outcount <= bufsize);
2412
2413		/* Prefetch znode */
2414		if (prefetch)
2415			dmu_prefetch(os, objnum, 0, 0, 0,
2416			    ZIO_PRIORITY_SYNC_READ);
2417
2418	skip_entry:
2419		/*
2420		 * Move to the next entry, fill in the previous offset.
2421		 */
2422		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2423			zap_cursor_advance(&zc);
2424			offset = zap_cursor_serialize(&zc);
2425		} else {
2426			offset += 1;
2427		}
2428		if (next)
2429			*next = offset;
2430	}
2431	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2432
2433	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
2434		iovp->iov_base += outcount;
2435		iovp->iov_len -= outcount;
2436		uio->uio_resid -= outcount;
2437	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
2438		/*
2439		 * Reset the pointer.
2440		 */
2441		offset = uio->uio_loffset;
2442	}
2443
2444update:
2445	zap_cursor_fini(&zc);
2446	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
2447		kmem_free(outbuf, bufsize);
2448
2449	if (error == ENOENT)
2450		error = 0;
2451
2452	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2453
2454	uio->uio_loffset = offset;
2455	ZFS_EXIT(zfsvfs);
2456	return (error);
2457}
2458
2459ulong_t zfs_fsync_sync_cnt = 4;
2460
2461static int
2462zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2463{
2464	znode_t	*zp = VTOZ(vp);
2465	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2466
2467	/*
2468	 * Regardless of whether this is required for standards conformance,
2469	 * this is the logical behavior when fsync() is called on a file with
2470	 * dirty pages.  We use B_ASYNC since the ZIL transactions are already
2471	 * going to be pushed out as part of the zil_commit().
2472	 */
2473	if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) &&
2474	    (vp->v_type == VREG) && !(IS_SWAPVP(vp)))
2475		(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_ASYNC, cr, ct);
2476
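	/*
	 * Leave a hint in thread-specific data that this thread is an
	 * active fsync()er.  The write logging path consumes this count
	 * as a hint that writes from this thread are likely to be
	 * followed by another fsync() soon.
	 */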
2477	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2478
2479	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2480		ZFS_ENTER(zfsvfs);
2481		ZFS_VERIFY_ZP(zp);
2482		zil_commit(zfsvfs->z_log, zp->z_id);
2483		ZFS_EXIT(zfsvfs);
2484	}
2485	return (0);
2486}
2487
2488
2489/*
2490 * Get the requested file attributes and place them in the provided
2491 * vattr structure.
2492 *
2493 *	IN:	vp	- vnode of file.
2494 *		vap	- va_mask identifies requested attributes.
2495 *			  If AT_XVATTR set, then optional attrs are requested
2496 *		flags	- ATTR_NOACLCHECK (CIFS server context)
2497 *		cr	- credentials of caller.
2498 *		ct	- caller context
2499 *
2500 *	OUT:	vap	- attribute values.
2501 *
2502 *	RETURN:	0 (always succeeds).
2503 */
2504/* ARGSUSED */
2505static int
2506zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2507    caller_context_t *ct)
2508{
2509	znode_t *zp = VTOZ(vp);
2510	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2511	int	error = 0;
2512	uint64_t links;
2513	uint64_t mtime[2], ctime[2];
2514	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2515	xoptattr_t *xoap = NULL;
2516	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2517	sa_bulk_attr_t bulk[2];
2518	int count = 0;
2519
2520	ZFS_ENTER(zfsvfs);
2521	ZFS_VERIFY_ZP(zp);
2522
2523	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2524
2525	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
2526	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
2527
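	/*
	 * mtime and ctime are kept as system attributes (SA), so fetch
	 * both with a single bulk lookup rather than two SA calls.
	 */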
2528	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
2529		ZFS_EXIT(zfsvfs);
2530		return (error);
2531	}
2532
2533	/*
2534	 * If the ACL is trivial, don't bother looking for ACE_READ_ATTRIBUTES.
2535	 * Also, if we are the owner, don't bother, since the owner should
2536	 * always be allowed to read the basic attributes of a file.
2537	 */
2538	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
2539	    (vap->va_uid != crgetuid(cr))) {
2540		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2541		    skipaclchk, cr)) {
2542			ZFS_EXIT(zfsvfs);
2543			return (error);
2544		}
2545	}
2546
2547	/*
2548	 * Return all attributes.  It's cheaper to provide the answer
2549	 * than to determine whether we were asked the question.
2550	 */
2551
2552	mutex_enter(&zp->z_lock);
2553	vap->va_type = vp->v_type;
2554	vap->va_mode = zp->z_mode & MODEMASK;
2555	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
2556	vap->va_nodeid = zp->z_id;
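	/* Account for the virtual '.zfs' entry in the root's link count. */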
2557	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
2558		links = zp->z_links + 1;
2559	else
2560		links = zp->z_links;
2561	vap->va_nlink = MIN(links, UINT32_MAX);	/* nlink_t limit! */
2562	vap->va_size = zp->z_size;
2563	vap->va_rdev = vp->v_rdev;
2564	vap->va_seq = zp->z_seq;
2565
2566	/*
2567	 * Add in any requested optional attributes and the create time.
2568	 * Also set the corresponding bits in the returned attribute bitmap.
2569	 */
2570	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2571		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2572			xoap->xoa_archive =
2573			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
2574			XVA_SET_RTN(xvap, XAT_ARCHIVE);
2575		}
2576
2577		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2578			xoap->xoa_readonly =
2579			    ((zp->z_pflags & ZFS_READONLY) != 0);
2580			XVA_SET_RTN(xvap, XAT_READONLY);
2581		}
2582
2583		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2584			xoap->xoa_system =
2585			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
2586			XVA_SET_RTN(xvap, XAT_SYSTEM);
2587		}
2588
2589		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2590			xoap->xoa_hidden =
2591			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
2592			XVA_SET_RTN(xvap, XAT_HIDDEN);
2593		}
2594
2595		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2596			xoap->xoa_nounlink =
2597			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
2598			XVA_SET_RTN(xvap, XAT_NOUNLINK);
2599		}
2600
2601		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2602			xoap->xoa_immutable =
2603			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
2604			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2605		}
2606
2607		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2608			xoap->xoa_appendonly =
2609			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
2610			XVA_SET_RTN(xvap, XAT_APPENDONLY);
2611		}
2612
2613		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2614			xoap->xoa_nodump =
2615			    ((zp->z_pflags & ZFS_NODUMP) != 0);
2616			XVA_SET_RTN(xvap, XAT_NODUMP);
2617		}
2618
2619		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2620			xoap->xoa_opaque =
2621			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
2622			XVA_SET_RTN(xvap, XAT_OPAQUE);
2623		}
2624
2625		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2626			xoap->xoa_av_quarantined =
2627			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
2628			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2629		}
2630
2631		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2632			xoap->xoa_av_modified =
2633			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
2634			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2635		}
2636
2637		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2638		    vp->v_type == VREG) {
2639			zfs_sa_get_scanstamp(zp, xvap);
2640		}
2641
2642		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
2643			uint64_t times[2];
2644
2645			(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
2646			    times, sizeof (times));
2647			ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
2648			XVA_SET_RTN(xvap, XAT_CREATETIME);
2649		}
2650
2651		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2652			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
2653			XVA_SET_RTN(xvap, XAT_REPARSE);
2654		}
2655		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
2656			xoap->xoa_generation = zp->z_gen;
2657			XVA_SET_RTN(xvap, XAT_GEN);
2658		}
2659
2660		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
2661			xoap->xoa_offline =
2662			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
2663			XVA_SET_RTN(xvap, XAT_OFFLINE);
2664		}
2665
2666		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
2667			xoap->xoa_sparse =
2668			    ((zp->z_pflags & ZFS_SPARSE) != 0);
2669			XVA_SET_RTN(xvap, XAT_SPARSE);
2670		}
2671	}
2672
2673	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
2674	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2675	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2676
2677	mutex_exit(&zp->z_lock);
2678
2679	sa_object_size(zp->z_sa_hdl, &vap->va_blksize, &vap->va_nblocks);
2680
2681	if (zp->z_blksz == 0) {
2682		/*
2683		 * Block size hasn't been set; suggest maximal I/O transfers.
2684		 */
2685		vap->va_blksize = zfsvfs->z_max_blksz;
2686	}
2687
2688	ZFS_EXIT(zfsvfs);
2689	return (0);
2690}
2691
2692/*
2693 * Set the file attributes to the values contained in the
2694 * vattr structure.
2695 *
2696 *	IN:	vp	- vnode of file to be modified.
2697 *		vap	- new attribute values.
2698 *			  If AT_XVATTR set, then optional attrs are being set
2699 *		flags	- ATTR_UTIME set if non-default time values provided.
2700 *			- ATTR_NOACLCHECK (CIFS context only).
2701 *		cr	- credentials of caller.
2702 *		ct	- caller context
2703 *
2704 *	RETURN:	0 on success, error code on failure.
2705 *
2706 * Timestamps:
2707 *	vp - ctime updated, mtime updated if size changed.
2708 */
2709/* ARGSUSED */
2710static int
2711zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2712    caller_context_t *ct)
2713{
2714	znode_t		*zp = VTOZ(vp);
2715	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2716	zilog_t		*zilog;
2717	dmu_tx_t	*tx;
2718	vattr_t		oldva;
2719	xvattr_t	tmpxvattr;
2720	uint_t		mask = vap->va_mask;
2721	uint_t		saved_mask = 0;
2722	int		trim_mask = 0;
2723	uint64_t	new_mode;
2724	uint64_t	new_uid, new_gid;
2725	uint64_t	xattr_obj;
2726	uint64_t	mtime[2], ctime[2];
2727	znode_t		*attrzp;
2728	int		need_policy = FALSE;
2729	int		err, err2;
2730	zfs_fuid_info_t *fuidp = NULL;
2731	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2732	xoptattr_t	*xoap;
2733	zfs_acl_t	*aclp;
2734	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2735	boolean_t	fuid_dirtied = B_FALSE;
2736	sa_bulk_attr_t	bulk[7], xattr_bulk[7];
2737	int		count = 0, xattr_count = 0;
2738
2739	if (mask == 0)
2740		return (0);
2741
2742	if (mask & AT_NOSET)
2743		return (SET_ERROR(EINVAL));
2744
2745	ZFS_ENTER(zfsvfs);
2746	ZFS_VERIFY_ZP(zp);
2747
2748	zilog = zfsvfs->z_log;
2749
2750	/*
2751	 * If an ephemeral uid/gid or an xvattr is specified, make sure
2752	 * that the file system is at the proper version level.
2753	 */
2754
2755	if (zfsvfs->z_use_fuids == B_FALSE &&
2756	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2757	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2758	    (mask & AT_XVATTR))) {
2759		ZFS_EXIT(zfsvfs);
2760		return (SET_ERROR(EINVAL));
2761	}
2762
2763	if (mask & AT_SIZE && vp->v_type == VDIR) {
2764		ZFS_EXIT(zfsvfs);
2765		return (SET_ERROR(EISDIR));
2766	}
2767
2768	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2769		ZFS_EXIT(zfsvfs);
2770		return (SET_ERROR(EINVAL));
2771	}
2772
2773	/*
2774	 * If this is an xvattr_t, then get a pointer to the structure of
2775	 * optional attributes.  If this is NULL, then we have a vattr_t.
2776	 */
2777	xoap = xva_getxoptattr(xvap);
2778
2779	xva_init(&tmpxvattr);
2780
2781	/*
2782	 * For immutable files, only the immutable bit and atime may be altered.
2783	 */
2784	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2785	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2786	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2787		ZFS_EXIT(zfsvfs);
2788		return (SET_ERROR(EPERM));
2789	}
2790
2791	/*
2792	 * Note: ZFS_READONLY is handled in zfs_zaccess_common.
2793	 */
2794
2795	/*
2796	 * Verify that the timestamps don't overflow 32 bits.
2797	 * ZFS can handle large timestamps, but 32-bit syscalls can't
2798	 * handle times greater than 2039.  This check should be removed
2799	 * once large timestamps are fully supported.
2800	 */
2801	if (mask & (AT_ATIME | AT_MTIME)) {
2802		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2803		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2804			ZFS_EXIT(zfsvfs);
2805			return (SET_ERROR(EOVERFLOW));
2806		}
2807	}
2808
2809top:
2810	attrzp = NULL;
2811	aclp = NULL;
2812
2813	/* Can this be moved to before the top label? */
2814	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2815		ZFS_EXIT(zfsvfs);
2816		return (SET_ERROR(EROFS));
2817	}
2818
2819	/*
2820	 * First validate permissions
2821	 */
2822
2823	if (mask & AT_SIZE) {
2824		err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
2825		if (err) {
2826			ZFS_EXIT(zfsvfs);
2827			return (err);
2828		}
2829		/*
2830		 * XXX - Note, we are not providing any open
2831		 * mode flags here (like FNDELAY), so we may
2832		 * block if there are locks present... this
2833		 * should be addressed in openat().
2834		 */
2835		/* XXX - would it be OK to generate a log record here? */
2836		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2837		if (err) {
2838			ZFS_EXIT(zfsvfs);
2839			return (err);
2840		}
2841
2842		if (vap->va_size == 0)
2843			vnevent_truncate(ZTOV(zp), ct);
2844	}
2845
2846	if (mask & (AT_ATIME|AT_MTIME) ||
2847	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2848	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2849	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2850	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2851	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2852	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2853	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2854		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2855		    skipaclchk, cr);
2856	}
2857
2858	if (mask & (AT_UID|AT_GID)) {
2859		int	idmask = (mask & (AT_UID|AT_GID));
2860		int	take_owner;
2861		int	take_group;
2862
2863		/*
2864		 * NOTE: even if a new mode is being set,
2865		 * we may clear S_ISUID/S_ISGID bits.
2866		 */
2867
2868		if (!(mask & AT_MODE))
2869			vap->va_mode = zp->z_mode;
2870
2871		/*
2872		 * Take ownership or chgrp to group we are a member of
2873		 */
2874
2875		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
2876		take_group = (mask & AT_GID) &&
2877		    zfs_groupmember(zfsvfs, vap->va_gid, cr);
2878
2879		/*
2880		 * If both AT_UID and AT_GID are set then take_owner and
2881		 * take_group must both be set in order to allow taking
2882		 * ownership.
2883		 *
2884		 * Otherwise, send the check through secpolicy_vnode_setattr()
2885		 *
2886		 */
2887
2888		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
2889		    ((idmask == AT_UID) && take_owner) ||
2890		    ((idmask == AT_GID) && take_group)) {
2891			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2892			    skipaclchk, cr) == 0) {
2893				/*
2894				 * Remove setuid/setgid for non-privileged users
2895				 */
2896				secpolicy_setid_clear(vap, cr);
2897				trim_mask = (mask & (AT_UID|AT_GID));
2898			} else {
2899				need_policy = TRUE;
2900			}
2901		} else {
2902			need_policy = TRUE;
2903		}
2904	}
2905
2906	mutex_enter(&zp->z_lock);
2907	oldva.va_mode = zp->z_mode;
2908	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2909	if (mask & AT_XVATTR) {
2910		/*
2911		 * Update xvattr mask to include only those attributes
2912		 * that are actually changing.
2913		 *
2914		 * The bits will be restored prior to actually setting
2915		 * the attributes, so the caller thinks they were set.
2916		 */
2917		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2918			if (xoap->xoa_appendonly !=
2919			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
2920				need_policy = TRUE;
2921			} else {
2922				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
2923				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
2924			}
2925		}
2926
2927		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2928			if (xoap->xoa_nounlink !=
2929			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
2930				need_policy = TRUE;
2931			} else {
2932				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
2933				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
2934			}
2935		}
2936
2937		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2938			if (xoap->xoa_immutable !=
2939			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
2940				need_policy = TRUE;
2941			} else {
2942				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
2943				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
2944			}
2945		}
2946
2947		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2948			if (xoap->xoa_nodump !=
2949			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
2950				need_policy = TRUE;
2951			} else {
2952				XVA_CLR_REQ(xvap, XAT_NODUMP);
2953				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
2954			}
2955		}
2956
2957		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2958			if (xoap->xoa_av_modified !=
2959			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
2960				need_policy = TRUE;
2961			} else {
2962				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
2963				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
2964			}
2965		}
2966
2967		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2968			if ((vp->v_type != VREG &&
2969			    xoap->xoa_av_quarantined) ||
2970			    xoap->xoa_av_quarantined !=
2971			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
2972				need_policy = TRUE;
2973			} else {
2974				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
2975				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
2976			}
2977		}
2978
2979		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2980			mutex_exit(&zp->z_lock);
2981			ZFS_EXIT(zfsvfs);
2982			return (SET_ERROR(EPERM));
2983		}
2984
2985		if (need_policy == FALSE &&
2986		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
2987		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
2988			need_policy = TRUE;
2989		}
2990	}
2991
2992	mutex_exit(&zp->z_lock);
2993
2994	if (mask & AT_MODE) {
2995		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
2996			err = secpolicy_setid_setsticky_clear(vp, vap,
2997			    &oldva, cr);
2998			if (err) {
2999				ZFS_EXIT(zfsvfs);
3000				return (err);
3001			}
3002			trim_mask |= AT_MODE;
3003		} else {
3004			need_policy = TRUE;
3005		}
3006	}
3007
3008	if (need_policy) {
3009		 * If trim_mask is set, then take-ownership
3010		 * has been granted or write_acl is present and the user
3011		 * has the ability to modify the mode.  In that case remove
3012		 * UID|GID and/or MODE from the mask so that
3013		 * secpolicy_vnode_setattr() doesn't revoke it.
3014		 * secpolicy_vnode_setattr() doesn't revoke it.
3015		 */
3016
3017		if (trim_mask) {
3018			saved_mask = vap->va_mask;
3019			vap->va_mask &= ~trim_mask;
3020		}
3021		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
3022		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
3023		if (err) {
3024			ZFS_EXIT(zfsvfs);
3025			return (err);
3026		}
3027
3028		if (trim_mask)
3029			vap->va_mask |= saved_mask;
3030	}
3031
3032	/*
3033	 * secpolicy_vnode_setattr() or take-ownership may have
3034	 * changed va_mask.
3035	 */
3036	mask = vap->va_mask;
3037
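	/*
	 * An owner or group change must also be applied to the hidden
	 * extended attribute directory, if one exists; look up its znode
	 * (attrzp) and hold it for the duration of the update.
	 */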
3038	if ((mask & (AT_UID | AT_GID))) {
3039		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
3040		    &xattr_obj, sizeof (xattr_obj));
3041
3042		if (err == 0 && xattr_obj) {
3043			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
3044			if (err)
3045				goto out2;
3046		}
3047		if (mask & AT_UID) {
3048			new_uid = zfs_fuid_create(zfsvfs,
3049			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3050			if (new_uid != zp->z_uid &&
3051			    zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
3052				if (attrzp)
3053					VN_RELE(ZTOV(attrzp));
3054				err = SET_ERROR(EDQUOT);
3055				goto out2;
3056			}
3057		}
3058
3059		if (mask & AT_GID) {
3060			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
3061			    cr, ZFS_GROUP, &fuidp);
3062			if (new_gid != zp->z_gid &&
3063			    zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
3064				if (attrzp)
3065					VN_RELE(ZTOV(attrzp));
3066				err = SET_ERROR(EDQUOT);
3067				goto out2;
3068			}
3069		}
3070	}
3071	tx = dmu_tx_create(zfsvfs->z_os);
3072
3073	if (mask & AT_MODE) {
3074		uint64_t pmode = zp->z_mode;
3075		uint64_t acl_obj;
3076		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3077
3078		if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
3079		    !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
3080			err = SET_ERROR(EPERM);
3081			goto out;
3082		}
3083
3084		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
3085			goto out;
3086
3087		mutex_enter(&zp->z_lock);
3088		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3089			/*
3090			 * Are we upgrading ACL from old V0 format
3091			 * to V1 format?
3092			 */
3093			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3094			    zfs_znode_acl_version(zp) ==
3095			    ZFS_ACL_VERSION_INITIAL) {
3096				dmu_tx_hold_free(tx, acl_obj, 0,
3097				    DMU_OBJECT_END);
3098				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3099				    0, aclp->z_acl_bytes);
3100			} else {
3101				dmu_tx_hold_write(tx, acl_obj, 0,
3102				    aclp->z_acl_bytes);
3103			}
3104		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3105			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3106			    0, aclp->z_acl_bytes);
3107		}
3108		mutex_exit(&zp->z_lock);
3109		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3110	} else {
3111		if ((mask & AT_XVATTR) &&
3112		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3113			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3114		else
3115			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3116	}
3117
3118	if (attrzp) {
3119		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3120	}
3121
3122	fuid_dirtied = zfsvfs->z_fuid_dirty;
3123	if (fuid_dirtied)
3124		zfs_fuid_txhold(zfsvfs, tx);
3125
3126	zfs_sa_upgrade_txholds(tx, zp);
3127
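	/*
	 * Unlike the create and remove paths, no dirent locks are held
	 * at this point, so it is safe to block in dmu_tx_assign() with
	 * TXG_WAIT rather than looping on ERESTART.
	 */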
3128	err = dmu_tx_assign(tx, TXG_WAIT);
3129	if (err)
3130		goto out;
3131
3132	count = 0;
3133	/*
3134	 * Set each attribute requested.
3135	 * We group settings according to the locks they need to acquire.
3136	 *
3137	 * Note: you cannot set ctime directly, although it will be
3138	 * updated as a side-effect of calling this function.
3139	 */
3140
3141
3142	if (mask & (AT_UID|AT_GID|AT_MODE))
3143		mutex_enter(&zp->z_acl_lock);
3144	mutex_enter(&zp->z_lock);
3145
3146	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3147	    &zp->z_pflags, sizeof (zp->z_pflags));
3148
3149	if (attrzp) {
3150		if (mask & (AT_UID|AT_GID|AT_MODE))
3151			mutex_enter(&attrzp->z_acl_lock);
3152		mutex_enter(&attrzp->z_lock);
3153		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3154		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3155		    sizeof (attrzp->z_pflags));
3156	}
3157
3158	if (mask & (AT_UID|AT_GID)) {
3159
3160		if (mask & AT_UID) {
3161			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3162			    &new_uid, sizeof (new_uid));
3163			zp->z_uid = new_uid;
3164			if (attrzp) {
3165				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3166				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3167				    sizeof (new_uid));
3168				attrzp->z_uid = new_uid;
3169			}
3170		}
3171
3172		if (mask & AT_GID) {
3173			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3174			    NULL, &new_gid, sizeof (new_gid));
3175			zp->z_gid = new_gid;
3176			if (attrzp) {
3177				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3178				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3179				    sizeof (new_gid));
3180				attrzp->z_gid = new_gid;
3181			}
3182		}
3183		if (!(mask & AT_MODE)) {
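			/*
			 * SA_ADD_BULK_ATTR() records only the address of
			 * new_mode; the value is assigned just below and
			 * is not read until sa_bulk_update() runs.
			 */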
3184			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3185			    NULL, &new_mode, sizeof (new_mode));
3186			new_mode = zp->z_mode;
3187		}
3188		err = zfs_acl_chown_setattr(zp);
3189		ASSERT(err == 0);
3190		if (attrzp) {
3191			err = zfs_acl_chown_setattr(attrzp);
3192			ASSERT(err == 0);
3193		}
3194	}
3195
3196	if (mask & AT_MODE) {
3197		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3198		    &new_mode, sizeof (new_mode));
3199		zp->z_mode = new_mode;
3200		ASSERT3U((uintptr_t)aclp, !=, NULL);
3201		err = zfs_aclset_common(zp, aclp, cr, tx);
3202		ASSERT0(err);
3203		if (zp->z_acl_cached)
3204			zfs_acl_free(zp->z_acl_cached);
3205		zp->z_acl_cached = aclp;
3206		aclp = NULL;
3207	}
3208
3209
3210	if (mask & AT_ATIME) {
3211		ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3212		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3213		    &zp->z_atime, sizeof (zp->z_atime));
3214	}
3215
3216	if (mask & AT_MTIME) {
3217		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3218		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3219		    mtime, sizeof (mtime));
3220	}
3221
3222	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3223	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3224		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3225		    NULL, mtime, sizeof (mtime));
3226		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3227		    &ctime, sizeof (ctime));
3228		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3229		    B_TRUE);
3230	} else if (mask != 0) {
3231		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3232		    &ctime, sizeof (ctime));
3233		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3234		    B_TRUE);
3235		if (attrzp) {
3236			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3237			    SA_ZPL_CTIME(zfsvfs), NULL,
3238			    &ctime, sizeof (ctime));
3239			zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3240			    mtime, ctime, B_TRUE);
3241		}
3242	}
3243	/*
3244	 * Do this after setting the timestamps to prevent the timestamp
3245	 * update from toggling the bit.
3246	 */
3247
3248	if (xoap && (mask & AT_XVATTR)) {
3249
3250		/*
3251		 * Restore the trimmed-off masks
3252		 * so that return masks can be set for the caller.
3253		 */
3254
3255		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3256			XVA_SET_REQ(xvap, XAT_APPENDONLY);
3257		}
3258		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3259			XVA_SET_REQ(xvap, XAT_NOUNLINK);
3260		}
3261		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3262			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3263		}
3264		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3265			XVA_SET_REQ(xvap, XAT_NODUMP);
3266		}
3267		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3268			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3269		}
3270		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3271			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3272		}
3273
3274		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3275			ASSERT(vp->v_type == VREG);
3276
3277		zfs_xvattr_set(zp, xvap, tx);
3278	}
3279
3280	if (fuid_dirtied)
3281		zfs_fuid_sync(zfsvfs, tx);
3282
3283	if (mask != 0)
3284		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3285
3286	mutex_exit(&zp->z_lock);
3287	if (mask & (AT_UID|AT_GID|AT_MODE))
3288		mutex_exit(&zp->z_acl_lock);
3289
3290	if (attrzp) {
3291		if (mask & (AT_UID|AT_GID|AT_MODE))
3292			mutex_exit(&attrzp->z_acl_lock);
3293		mutex_exit(&attrzp->z_lock);
3294	}
3295out:
3296	if (err == 0 && attrzp) {
3297		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3298		    xattr_count, tx);
3299		ASSERT(err2 == 0);
3300	}
3301
3302	if (attrzp)
3303		VN_RELE(ZTOV(attrzp));
3304
3305	if (aclp)
3306		zfs_acl_free(aclp);
3307
3308	if (fuidp) {
3309		zfs_fuid_info_free(fuidp);
3310		fuidp = NULL;
3311	}
3312
3313	if (err) {
3314		dmu_tx_abort(tx);
3315		if (err == ERESTART)
3316			goto top;
3317	} else {
3318		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3319		dmu_tx_commit(tx);
3320	}
3321
3322out2:
3323	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3324		zil_commit(zilog, 0);
3325
3326	ZFS_EXIT(zfsvfs);
3327	return (err);
3328}
3329
3330typedef struct zfs_zlock {
3331	krwlock_t	*zl_rwlock;	/* lock we acquired */
3332	znode_t		*zl_znode;	/* znode we held */
3333	struct zfs_zlock *zl_next;	/* next in list */
3334} zfs_zlock_t;
3335
3336/*
3337 * Drop locks and release vnodes that were held by zfs_rename_lock().
3338 */
3339static void
3340zfs_rename_unlock(zfs_zlock_t **zlpp)
3341{
3342	zfs_zlock_t *zl;
3343
3344	while ((zl = *zlpp) != NULL) {
3345		if (zl->zl_znode != NULL)
3346			VN_RELE(ZTOV(zl->zl_znode));
3347		rw_exit(zl->zl_rwlock);
3348		*zlpp = zl->zl_next;
3349		kmem_free(zl, sizeof (*zl));
3350	}
3351}
3352
3353/*
3354 * Search back through the directory tree, using the ".." entries.
3355 * Lock each directory in the chain to prevent concurrent renames.
3356 * Fail any attempt to move a directory into one of its own descendants.
3357 * XXX - z_parent_lock can overlap with map or grow locks
3358 */
3359static int
3360zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
3361{
3362	zfs_zlock_t	*zl;
3363	znode_t		*zp = tdzp;
3364	uint64_t	rootid = zp->z_zfsvfs->z_root;
3365	uint64_t	oidp = zp->z_id;
3366	krwlock_t	*rwlp = &szp->z_parent_lock;
3367	krw_t		rw = RW_WRITER;
3368
3369	/*
3370	 * First pass write-locks szp and compares to zp->z_id.
3371	 * Later passes read-lock zp and compare to zp->z_parent.
3372	 */
3373	do {
3374		if (!rw_tryenter(rwlp, rw)) {
3375			/*
3376			 * Another thread is renaming in this path.
3377			 * Note that if we are a WRITER, we don't have any
3378			 * parent_locks held yet.
3379			 */
3380			if (rw == RW_READER && zp->z_id > szp->z_id) {
3381				/*
3382				 * Drop our locks and restart
3383				 */
3384				zfs_rename_unlock(&zl);
3385				*zlpp = NULL;
3386				zp = tdzp;
3387				oidp = zp->z_id;
3388				rwlp = &szp->z_parent_lock;
3389				rw = RW_WRITER;
3390				continue;
3391			} else {
3392				/*
3393				 * Wait for other thread to drop its locks
3394				 */
3395				rw_enter(rwlp, rw);
3396			}
3397		}
3398
3399		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
3400		zl->zl_rwlock = rwlp;
3401		zl->zl_znode = NULL;
3402		zl->zl_next = *zlpp;
3403		*zlpp = zl;
3404
3405		if (oidp == szp->z_id)		/* We're a descendant of szp */
3406			return (SET_ERROR(EINVAL));
3407
3408		if (oidp == rootid)		/* We've hit the top */
3409			return (0);
3410
3411		if (rw == RW_READER) {		/* i.e. not the first pass */
3412			int error = zfs_zget(zp->z_zfsvfs, oidp, &zp);
3413			if (error)
3414				return (error);
3415			zl->zl_znode = zp;
3416		}
3417		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs),
3418		    &oidp, sizeof (oidp));
3419		rwlp = &zp->z_parent_lock;
3420		rw = RW_READER;
3421
3422	} while (zp->z_id != sdzp->z_id);
3423
3424	return (0);
3425}
3426
3427/*
3428 * Move an entry from the provided source directory to the target
3429 * directory.  Change the entry name as indicated.
3430 *
3431 *	IN:	sdvp	- Source directory containing the "old entry".
3432 *		snm	- Old entry name.
3433 *		tdvp	- Target directory to contain the "new entry".
3434 *		tnm	- New entry name.
3435 *		cr	- credentials of caller.
3436 *		ct	- caller context
3437 *		flags	- case flags
3438 *
3439 *	RETURN:	0 on success, error code on failure.
3440 *
3441 * Timestamps:
3442 *	sdvp,tdvp - ctime|mtime updated
3443 */
3444/*ARGSUSED*/
3445static int
3446zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
3447    caller_context_t *ct, int flags)
3448{
3449	znode_t		*tdzp, *szp, *tzp;
3450	znode_t		*sdzp = VTOZ(sdvp);
3451	zfsvfs_t	*zfsvfs = sdzp->z_zfsvfs;
3452	zilog_t		*zilog;
3453	vnode_t		*realvp;
3454	zfs_dirlock_t	*sdl, *tdl;
3455	dmu_tx_t	*tx;
3456	zfs_zlock_t	*zl;
3457	int		cmp, serr, terr;
3458	int		error = 0, rm_err = 0;
3459	int		zflg = 0;
3460	boolean_t	waited = B_FALSE;
3461
3462	ZFS_ENTER(zfsvfs);
3463	ZFS_VERIFY_ZP(sdzp);
3464	zilog = zfsvfs->z_log;
3465
3466	/*
3467	 * Make sure we have the real vp for the target directory.
3468	 */
3469	if (VOP_REALVP(tdvp, &realvp, ct) == 0)
3470		tdvp = realvp;
3471
3472	tdzp = VTOZ(tdvp);
3473	ZFS_VERIFY_ZP(tdzp);
3474
3475	/*
3476	 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
3477	 * ctldir appear to have the same v_vfsp.
3478	 */
3479	if (tdzp->z_zfsvfs != zfsvfs || zfsctl_is_node(tdvp)) {
3480		ZFS_EXIT(zfsvfs);
3481		return (SET_ERROR(EXDEV));
3482	}
3483
3484	if (zfsvfs->z_utf8 && u8_validate(tnm,
3485	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3486		ZFS_EXIT(zfsvfs);
3487		return (SET_ERROR(EILSEQ));
3488	}
3489
3490	if (flags & FIGNORECASE)
3491		zflg |= ZCILOOK;
3492
3493top:
3494	szp = NULL;
3495	tzp = NULL;
3496	zl = NULL;
3497
3498	/*
3499	 * This is to prevent the creation of links into attribute space
3500	 * by renaming a linked file into/out of an attribute directory.
3501	 * See the comment in zfs_link() for why this is considered bad.
3502	 */
3503	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
3504		ZFS_EXIT(zfsvfs);
3505		return (SET_ERROR(EINVAL));
3506	}
3507
3508	/*
3509	 * Lock source and target directory entries.  To prevent deadlock,
3510	 * a lock ordering must be defined.  We lock the directory with
3511	 * the smallest object id first, or if it's a tie, the one with
3512	 * the lexically first name.
3513	 */
3514	if (sdzp->z_id < tdzp->z_id) {
3515		cmp = -1;
3516	} else if (sdzp->z_id > tdzp->z_id) {
3517		cmp = 1;
3518	} else {
3519		/*
3520		 * First compare the two name arguments without
3521		 * considering any case folding.
3522		 */
3523		int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
3524
3525		cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
3526		ASSERT(error == 0 || !zfsvfs->z_utf8);
3527		if (cmp == 0) {
3528			/*
3529			 * POSIX: "If the old argument and the new argument
3530			 * both refer to links to the same existing file,
3531			 * the rename() function shall return successfully
3532			 * and perform no other action."
3533			 */
3534			ZFS_EXIT(zfsvfs);
3535			return (0);
3536		}
3537		/*
3538		 * If the file system is case-folding, then we may
3539		 * have some more checking to do.  A case-folding file
3540		 * system is either supporting mixed case sensitivity
3541		 * access or is completely case-insensitive.  Note
3542		 * that the file system is always case preserving.
3543		 *
3544		 * In mixed sensitivity mode case sensitive behavior
3545		 * is the default.  FIGNORECASE must be used to
3546		 * explicitly request case insensitive behavior.
3547		 *
3548		 * If the source and target names provided differ only
3549		 * by case (e.g., a request to rename 'tim' to 'Tim'),
3550		 * we will treat this as a special case in the
3551		 * case-insensitive mode: as long as the source name
3552		 * is an exact match, we will allow this to proceed as
3553		 * a name-change request.
3554		 */
3555		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
3556		    (zfsvfs->z_case == ZFS_CASE_MIXED &&
3557		    flags & FIGNORECASE)) &&
3558		    u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
3559		    &error) == 0) {
3560			 * Case-preserving rename request; require exact
3561			 * name matches.
3562			 * name matches
3563			 */
3564			zflg |= ZCIEXACT;
3565			zflg &= ~ZCILOOK;
3566		}
3567	}
3568
3569	/*
3570	 * If the source and destination directories are the same, we should
3571	 * grab the z_name_lock of that directory only once.
3572	 */
3573	if (sdzp == tdzp) {
3574		zflg |= ZHAVELOCK;
3575		rw_enter(&sdzp->z_name_lock, RW_READER);
3576	}
3577
3578	if (cmp < 0) {
3579		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
3580		    ZEXISTS | zflg, NULL, NULL);
3581		terr = zfs_dirent_lock(&tdl,
3582		    tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
3583	} else {
3584		terr = zfs_dirent_lock(&tdl,
3585		    tdzp, tnm, &tzp, zflg, NULL, NULL);
3586		serr = zfs_dirent_lock(&sdl,
3587		    sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
3588		    NULL, NULL);
3589	}
3590
3591	if (serr) {
3592		/*
3593		 * Source entry invalid or not there.
3594		 */
3595		if (!terr) {
3596			zfs_dirent_unlock(tdl);
3597			if (tzp)
3598				VN_RELE(ZTOV(tzp));
3599		}
3600
3601		if (sdzp == tdzp)
3602			rw_exit(&sdzp->z_name_lock);
3603
3604		if (strcmp(snm, "..") == 0)
3605			serr = SET_ERROR(EINVAL);
3606		ZFS_EXIT(zfsvfs);
3607		return (serr);
3608	}
3609	if (terr) {
3610		zfs_dirent_unlock(sdl);
3611		VN_RELE(ZTOV(szp));
3612
3613		if (sdzp == tdzp)
3614			rw_exit(&sdzp->z_name_lock);
3615
3616		if (strcmp(tnm, "..") == 0)
3617			terr = SET_ERROR(EINVAL);
3618		ZFS_EXIT(zfsvfs);
3619		return (terr);
3620	}
3621
3622	/*
3623	 * Must have write access at the source to remove the old entry
3624	 * and write access at the target to create the new entry.
3625	 * Note that if target and source are the same, this can be
3626	 * done in a single check.
3627	 */
3628
3629	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
3630		goto out;
3631
3632	if (ZTOV(szp)->v_type == VDIR) {
3633		/*
3634		 * Check to make sure rename is valid.
3635		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3636		 */
3637		if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
3638			goto out;
3639	}
3640
3641	/*
3642	 * Does target exist?
3643	 */
3644	if (tzp) {
3645		/*
3646		 * Source and target must be the same type.
3647		 */
3648		if (ZTOV(szp)->v_type == VDIR) {
3649			if (ZTOV(tzp)->v_type != VDIR) {
3650				error = SET_ERROR(ENOTDIR);
3651				goto out;
3652			}
3653		} else {
3654			if (ZTOV(tzp)->v_type == VDIR) {
3655				error = SET_ERROR(EISDIR);
3656				goto out;
3657			}
3658		}
3659		/*
3660		 * POSIX dictates that when the source and target
3661		 * entries refer to the same file object, rename
3662		 * must do nothing and exit without error.
3663		 */
3664		if (szp->z_id == tzp->z_id) {
3665			error = 0;
3666			goto out;
3667		}
3668	}
3669
3670	vnevent_pre_rename_src(ZTOV(szp), sdvp, snm, ct);
3671	if (tzp)
3672		vnevent_pre_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
3673
3674	/*
3675	 * Notify the target directory if it is not the same
3676	 * as the source directory.
3677	 */
3678	if (tdvp != sdvp) {
3679		vnevent_pre_rename_dest_dir(tdvp, ZTOV(szp), tnm, ct);
3680	}
3681
3682	tx = dmu_tx_create(zfsvfs->z_os);
3683	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3684	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3685	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3686	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3687	if (sdzp != tdzp) {
3688		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
3689		zfs_sa_upgrade_txholds(tx, tdzp);
3690	}
3691	if (tzp) {
3692		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
3693		zfs_sa_upgrade_txholds(tx, tzp);
3694	}
3695
3696	zfs_sa_upgrade_txholds(tx, szp);
3697	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3698	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
3699	if (error) {
3700		if (zl != NULL)
3701			zfs_rename_unlock(&zl);
3702		zfs_dirent_unlock(sdl);
3703		zfs_dirent_unlock(tdl);
3704
3705		if (sdzp == tdzp)
3706			rw_exit(&sdzp->z_name_lock);
3707
3708		VN_RELE(ZTOV(szp));
3709		if (tzp)
3710			VN_RELE(ZTOV(tzp));
3711		if (error == ERESTART) {
3712			waited = B_TRUE;
3713			dmu_tx_wait(tx);
3714			dmu_tx_abort(tx);
3715			goto top;
3716		}
3717		dmu_tx_abort(tx);
3718		ZFS_EXIT(zfsvfs);
3719		return (error);
3720	}
3721
3722	if (tzp)	/* Attempt to remove the existing target */
3723		error = rm_err = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
3724
3725	if (error == 0) {
3726		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
3727		if (error == 0) {
3728			szp->z_pflags |= ZFS_AV_MODIFIED;
3729
3730			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3731			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3732			ASSERT0(error);
3733
3734			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
3735			if (error == 0) {
3736				zfs_log_rename(zilog, tx, TX_RENAME |
3737				    (flags & FIGNORECASE ? TX_CI : 0), sdzp,
3738				    sdl->dl_name, tdzp, tdl->dl_name, szp);
3739
3740				/*
3741				 * Update path information for the target vnode
3742				 */
3743				vn_renamepath(tdvp, ZTOV(szp), tnm,
3744				    strlen(tnm));
3745			} else {
3746				/*
3747				 * At this point, we have successfully created
3748				 * the target name, but have failed to remove
3749				 * the source name.  Since the create was done
3750				 * with the ZRENAMING flag, there are
3751				 * complications; for one, the link count is
3752				 * wrong.  The easiest way to deal with this
3753				 * is to remove the newly created target, and
3754				 * return the original error.  This must
3755				 * succeed; fortunately, it is very unlikely to
3756				 * fail, since we just created it.
3757				 */
3758				VERIFY3U(zfs_link_destroy(tdl, szp, tx,
3759				    ZRENAMING, NULL), ==, 0);
3760			}
3761		}
3762	}
3763
3764	dmu_tx_commit(tx);
3765
3766	if (tzp && rm_err == 0)
3767		vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
3768
3769	if (error == 0) {
3770		vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
3771		/* notify the target dir if it is not the same as source dir */
3772		if (tdvp != sdvp)
3773			vnevent_rename_dest_dir(tdvp, ct);
3774	}
3775out:
3776	if (zl != NULL)
3777		zfs_rename_unlock(&zl);
3778
3779	zfs_dirent_unlock(sdl);
3780	zfs_dirent_unlock(tdl);
3781
3782	if (sdzp == tdzp)
3783		rw_exit(&sdzp->z_name_lock);
3784
3785
3786	VN_RELE(ZTOV(szp));
3787	if (tzp)
3788		VN_RELE(ZTOV(tzp));
3789
3790	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3791		zil_commit(zilog, 0);
3792
3793	ZFS_EXIT(zfsvfs);
3794	return (error);
3795}
3796
3797/*
3798 * Insert the indicated symbolic reference entry into the directory.
3799 *
3800 *	IN:	dvp	- Directory to contain new symbolic link.
3801 *		name	- Name of the new symlink entry.
 *		link	- Path to which the new symlink will point.
3802 *		vap	- Attributes of new entry.
3803 *		cr	- credentials of caller.
3804 *		ct	- caller context
3805 *		flags	- case flags
3806 *
3807 *	RETURN:	0 on success, error code on failure.
3808 *
3809 * Timestamps:
3810 *	dvp - ctime|mtime updated
3811 */
3812/*ARGSUSED*/
3813static int
3814zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr,
3815    caller_context_t *ct, int flags)
3816{
3817	znode_t		*zp, *dzp = VTOZ(dvp);
3818	zfs_dirlock_t	*dl;
3819	dmu_tx_t	*tx;
3820	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
3821	zilog_t		*zilog;
3822	uint64_t	len = strlen(link);
3823	int		error;
3824	int		zflg = ZNEW;
3825	zfs_acl_ids_t	acl_ids;
3826	boolean_t	fuid_dirtied;
3827	uint64_t	txtype = TX_SYMLINK;
3828	boolean_t	waited = B_FALSE;
3829
3830	ASSERT(vap->va_type == VLNK);
3831
3832	ZFS_ENTER(zfsvfs);
3833	ZFS_VERIFY_ZP(dzp);
3834	zilog = zfsvfs->z_log;
3835
3836	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3837	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3838		ZFS_EXIT(zfsvfs);
3839		return (SET_ERROR(EILSEQ));
3840	}
3841	if (flags & FIGNORECASE)
3842		zflg |= ZCILOOK;
3843
3844	if (len > MAXPATHLEN) {
3845		ZFS_EXIT(zfsvfs);
3846		return (SET_ERROR(ENAMETOOLONG));
3847	}
3848
3849	if ((error = zfs_acl_ids_create(dzp, 0,
3850	    vap, cr, NULL, &acl_ids)) != 0) {
3851		ZFS_EXIT(zfsvfs);
3852		return (error);
3853	}
3854top:
3855	/*
3856	 * Attempt to lock directory; fail if entry already exists.
3857	 */
3858	error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
3859	if (error) {
3860		zfs_acl_ids_free(&acl_ids);
3861		ZFS_EXIT(zfsvfs);
3862		return (error);
3863	}
3864
3865	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
3866		zfs_acl_ids_free(&acl_ids);
3867		zfs_dirent_unlock(dl);
3868		ZFS_EXIT(zfsvfs);
3869		return (error);
3870	}
3871
3872	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
3873		zfs_acl_ids_free(&acl_ids);
3874		zfs_dirent_unlock(dl);
3875		ZFS_EXIT(zfsvfs);
3876		return (SET_ERROR(EDQUOT));
3877	}
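	/*
	 * The transaction needs room for the symlink target itself, the
	 * new directory entry, and the SA create sized to cover the ACL
	 * and base attributes plus the link contents.
	 */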
3878	tx = dmu_tx_create(zfsvfs->z_os);
3879	fuid_dirtied = zfsvfs->z_fuid_dirty;
3880	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
3881	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3882	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
3883	    ZFS_SA_BASE_ATTR_SIZE + len);
3884	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
3885	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3886		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
3887		    acl_ids.z_aclp->z_acl_bytes);
3888	}
3889	if (fuid_dirtied)
3890		zfs_fuid_txhold(zfsvfs, tx);
3891	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
3892	if (error) {
3893		zfs_dirent_unlock(dl);
3894		if (error == ERESTART) {
3895			waited = B_TRUE;
3896			dmu_tx_wait(tx);
3897			dmu_tx_abort(tx);
3898			goto top;
3899		}
3900		zfs_acl_ids_free(&acl_ids);
3901		dmu_tx_abort(tx);
3902		ZFS_EXIT(zfsvfs);
3903		return (error);
3904	}
3905
3906	/*
3907	 * Create a new object for the symlink.
3908	 * For version 4 ZPL datasets the symlink will be an SA attribute.
3909	 */
3910	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
3911
3912	if (fuid_dirtied)
3913		zfs_fuid_sync(zfsvfs, tx);
3914
3915	mutex_enter(&zp->z_lock);
3916	if (zp->z_is_sa)
3917		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
3918		    link, len, tx);
3919	else
3920		zfs_sa_symlink(zp, link, len, tx);
3921	mutex_exit(&zp->z_lock);
3922
3923	zp->z_size = len;
3924	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
3925	    &zp->z_size, sizeof (zp->z_size), tx);
3926	/*
3927	 * Insert the new object into the directory.
3928	 */
3929	(void) zfs_link_create(dl, zp, tx, ZNEW);
3930
3931	if (flags & FIGNORECASE)
3932		txtype |= TX_CI;
3933	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
3934
3935	zfs_acl_ids_free(&acl_ids);
3936
3937	dmu_tx_commit(tx);
3938
3939	zfs_dirent_unlock(dl);
3940
3941	VN_RELE(ZTOV(zp));
3942
3943	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3944		zil_commit(zilog, 0);
3945
3946	ZFS_EXIT(zfsvfs);
3947	return (error);
3948}
3949
3950/*
3951 * Return, in the buffer contained in the provided uio structure,
3952 * the symbolic path referred to by vp.
3953 *
3954 *	IN:	vp	- vnode of symbolic link.
3955 *		uio	- structure to contain the link path.
3956 *		cr	- credentials of caller.
3957 *		ct	- caller context
3958 *
3959 *	OUT:	uio	- structure containing the link path.
3960 *
3961 *	RETURN:	0 on success, error code on failure.
3962 *
3963 * Timestamps:
3964 *	vp - atime updated
3965 */
3966/* ARGSUSED */
3967static int
3968zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
3969{
3970	znode_t		*zp = VTOZ(vp);
3971	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
3972	int		error;
3973
3974	ZFS_ENTER(zfsvfs);
3975	ZFS_VERIFY_ZP(zp);
3976
3977	mutex_enter(&zp->z_lock);
3978	if (zp->z_is_sa)
3979		error = sa_lookup_uio(zp->z_sa_hdl,
3980		    SA_ZPL_SYMLINK(zfsvfs), uio);
3981	else
3982		error = zfs_sa_readlink(zp, uio);
3983	mutex_exit(&zp->z_lock);
3984
3985	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
3986
3987	ZFS_EXIT(zfsvfs);
3988	return (error);
3989}
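
/*
 * Illustrative userland sequence (a sketch, not part of the driver):
 * symlink(2) reaches zfs_symlink() via VOP_SYMLINK(), and readlink(2)
 * reaches zfs_readlink() above via VOP_READLINK():
 *
 *	char buf[MAXPATHLEN];
 *	ssize_t n;
 *
 *	(void) symlink("/some/target", "/pool/fs/link");
 *	n = readlink("/pool/fs/link", buf, sizeof (buf));
 *
 * Note that readlink(2) does not NUL-terminate buf.
 */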
3990
3991/*
3992 * Insert a new entry into directory tdvp referencing svp.
3993 *
3994 *	IN:	tdvp	- Directory to contain new entry.
3995 *		svp	- vnode of new entry.
3996 *		name	- name of new entry.
3997 *		cr	- credentials of caller.
3998 *		ct	- caller context
3999 *
4000 *	RETURN:	0 on success, error code on failure.
4001 *
4002 * Timestamps:
4003 *	tdvp - ctime|mtime updated
4004 *	 svp - ctime updated
4005 */
4006/* ARGSUSED */
4007static int
4008zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
4009    caller_context_t *ct, int flags)
4010{
4011	znode_t		*dzp = VTOZ(tdvp);
4012	znode_t		*tzp, *szp;
4013	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
4014	zilog_t		*zilog;
4015	zfs_dirlock_t	*dl;
4016	dmu_tx_t	*tx;
4017	vnode_t		*realvp;
4018	int		error;
4019	int		zf = ZNEW;
4020	uint64_t	parent;
4021	uid_t		owner;
4022	boolean_t	waited = B_FALSE;
4023
4024	ASSERT(tdvp->v_type == VDIR);
4025
4026	ZFS_ENTER(zfsvfs);
4027	ZFS_VERIFY_ZP(dzp);
4028	zilog = zfsvfs->z_log;
4029
4030	if (VOP_REALVP(svp, &realvp, ct) == 0)
4031		svp = realvp;
4032
4033	/*
4034	 * POSIX dictates that we return EPERM here, even though
4035	 * better choices would include ENOTSUP or EISDIR.
4036	 */
4037	if (svp->v_type == VDIR) {
4038		ZFS_EXIT(zfsvfs);
4039		return (SET_ERROR(EPERM));
4040	}
4041
4042	szp = VTOZ(svp);
4043	ZFS_VERIFY_ZP(szp);
4044
4045	/*
4046	 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
4047	 * ctldir appear to have the same v_vfsp.
4048	 */
4049	if (szp->z_zfsvfs != zfsvfs || zfsctl_is_node(svp)) {
4050		ZFS_EXIT(zfsvfs);
4051		return (SET_ERROR(EXDEV));
4052	}
4053
4054	/* Prevent links to .zfs/shares files */
4055
4056	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
4057	    &parent, sizeof (uint64_t))) != 0) {
4058		ZFS_EXIT(zfsvfs);
4059		return (error);
4060	}
4061	if (parent == zfsvfs->z_shares_dir) {
4062		ZFS_EXIT(zfsvfs);
4063		return (SET_ERROR(EPERM));
4064	}
4065
4066	if (zfsvfs->z_utf8 && u8_validate(name,
4067	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4068		ZFS_EXIT(zfsvfs);
4069		return (SET_ERROR(EILSEQ));
4070	}
4071	if (flags & FIGNORECASE)
4072		zf |= ZCILOOK;
4073
4074	/*
4075	 * We do not support links between attributes and non-attributes
4076	 * because of the potential security risk of creating links
4077	 * into "normal" file space in order to circumvent restrictions
4078	 * imposed in attribute space.
4079	 */
4080	if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
4081		ZFS_EXIT(zfsvfs);
4082		return (SET_ERROR(EINVAL));
4083	}
4084
4086	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
4087	if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
4088		ZFS_EXIT(zfsvfs);
4089		return (SET_ERROR(EPERM));
4090	}
4091
4092	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) != 0) {
4093		ZFS_EXIT(zfsvfs);
4094		return (error);
4095	}
4096
4097top:
4098	/*
4099	 * Attempt to lock directory; fail if entry already exists.
4100	 */
4101	error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
4102	if (error) {
4103		ZFS_EXIT(zfsvfs);
4104		return (error);
4105	}
4106
4107	tx = dmu_tx_create(zfsvfs->z_os);
4108	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4109	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4110	zfs_sa_upgrade_txholds(tx, szp);
4111	zfs_sa_upgrade_txholds(tx, dzp);
4112	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
4113	if (error) {
4114		zfs_dirent_unlock(dl);
4115		if (error == ERESTART) {
4116			waited = B_TRUE;
4117			dmu_tx_wait(tx);
4118			dmu_tx_abort(tx);
4119			goto top;
4120		}
4121		dmu_tx_abort(tx);
4122		ZFS_EXIT(zfsvfs);
4123		return (error);
4124	}
4125
4126	error = zfs_link_create(dl, szp, tx, 0);
4127
4128	if (error == 0) {
4129		uint64_t txtype = TX_LINK;
4130		if (flags & FIGNORECASE)
4131			txtype |= TX_CI;
4132		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
4133	}
4134
4135	dmu_tx_commit(tx);
4136
4137	zfs_dirent_unlock(dl);
4138
4139	if (error == 0) {
4140		vnevent_link(svp, ct);
4141	}
4142
4143	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4144		zil_commit(zilog, 0);
4145
4146	ZFS_EXIT(zfsvfs);
4147	return (error);
4148}
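
/*
 * Illustrative userland view (a sketch): link(2) reaches zfs_link()
 * above via VOP_LINK().  Hard-linking a directory fails with EPERM,
 * and linking across datasets (e.g. out of a snapshot) fails with
 * EXDEV:
 *
 *	if (link("/pool/fs/file", "/pool/fs/second-name") != 0)
 *		perror("link");
 */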
4149
4150/*
4151 * zfs_null_putapage() is used when the file system has been force
4152 * unmounted. It just drops the pages.
4153 */
4154/* ARGSUSED */
4155static int
4156zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4157    size_t *lenp, int flags, cred_t *cr)
4158{
4159	pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
4160	return (0);
4161}
4162
4163/*
4164 * Push a page out to disk, klustering if possible.
4165 *
4166 *	IN:	vp	- file to push page to.
4167 *		pp	- page to push.
4168 *		flags	- additional flags.
4169 *		cr	- credentials of caller.
4170 *
4171 *	OUT:	offp	- start of range pushed.
4172 *		lenp	- len of range pushed.
4173 *
4174 *	RETURN:	0 on success, error code on failure.
4175 *
4176 * NOTE: callers must have locked the page to be pushed.  On
4177 * exit, the page (and all other pages in the kluster) must be
4178 * unlocked.
4179 */
4180/* ARGSUSED */
4181static int
4182zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4183    size_t *lenp, int flags, cred_t *cr)
4184{
4185	znode_t		*zp = VTOZ(vp);
4186	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4187	dmu_tx_t	*tx;
4188	u_offset_t	off, koff;
4189	size_t		len, klen;
4190	int		err;
4191
4192	off = pp->p_offset;
4193	len = PAGESIZE;
4194	/*
4195	 * If our blocksize is bigger than the page size, try to kluster
4196	 * multiple pages so that we write a full block (thus avoiding
4197	 * a read-modify-write).
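	 * For example (a sketch), with a 128K block and 4K pages,
	 * klen is 128K and koff is `off' rounded down to a 128K
	 * boundary, so the whole block is written in one pass.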
4198	 */
4199	if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
4200		klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
4201		koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
4202		ASSERT(koff <= zp->z_size);
4203		if (koff + klen > zp->z_size)
4204			klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
4205		pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
4206	}
4207	ASSERT3U(btop(len), ==, btopr(len));
4208
4209	/*
4210	 * Can't push pages past end-of-file.
4211	 */
4212	if (off >= zp->z_size) {
4213		/* ignore all pages */
4214		err = 0;
4215		goto out;
4216	} else if (off + len > zp->z_size) {
4217		int npages = btopr(zp->z_size - off);
4218		page_t *trunc;
4219
4220		page_list_break(&pp, &trunc, npages);
4221		/* ignore pages past end of file */
4222		if (trunc)
4223			pvn_write_done(trunc, flags);
4224		len = zp->z_size - off;
4225	}
4226
4227	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
4228	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
4229		err = SET_ERROR(EDQUOT);
4230		goto out;
4231	}
4232	tx = dmu_tx_create(zfsvfs->z_os);
4233	dmu_tx_hold_write(tx, zp->z_id, off, len);
4234
4235	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4236	zfs_sa_upgrade_txholds(tx, zp);
4237	err = dmu_tx_assign(tx, TXG_WAIT);
4238	if (err != 0) {
4239		dmu_tx_abort(tx);
4240		goto out;
4241	}
4242
4243	if (zp->z_blksz <= PAGESIZE) {
4244		caddr_t va = zfs_map_page(pp, S_READ);
4245		ASSERT3U(len, <=, PAGESIZE);
4246		dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
4247		zfs_unmap_page(pp, va);
4248	} else {
4249		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
4250	}
4251
4252	if (err == 0) {
4253		uint64_t mtime[2], ctime[2];
4254		sa_bulk_attr_t bulk[3];
4255		int count = 0;
4256
4257		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4258		    &mtime, 16);
4259		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
4260		    &ctime, 16);
4261		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
4262		    &zp->z_pflags, 8);
4263		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
4264		    B_TRUE);
4265		err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
4266		ASSERT0(err);
4267		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
4268	}
4269	dmu_tx_commit(tx);
4270
4271out:
4272	pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
4273	if (offp)
4274		*offp = off;
4275	if (lenp)
4276		*lenp = len;
4277
4278	return (err);
4279}
4280
4281/*
4282 * Copy the portion of the file indicated from pages into the file.
4283 * The pages are stored in a page list attached to the file's vnode.
4284 *
4285 *	IN:	vp	- vnode of file to push page data to.
4286 *		off	- position in file to put data.
4287 *		len	- amount of data to write.
4288 *		flags	- flags to control the operation.
4289 *		cr	- credentials of caller.
4290 *		ct	- caller context.
4291 *
4292 *	RETURN:	0 on success, error code on failure.
4293 *
4294 * Timestamps:
4295 *	vp - ctime|mtime updated
4296 */
4297/*ARGSUSED*/
4298static int
4299zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
4300    caller_context_t *ct)
4301{
4302	znode_t		*zp = VTOZ(vp);
4303	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4304	page_t		*pp;
4305	size_t		io_len;
4306	u_offset_t	io_off;
4307	uint_t		blksz;
4308	rl_t		*rl;
4309	int		error = 0;
4310
4311	ZFS_ENTER(zfsvfs);
4312	ZFS_VERIFY_ZP(zp);
4313
4314	/*
4315	 * There's nothing to do if no data is cached.
4316	 */
4317	if (!vn_has_cached_data(vp)) {
4318		ZFS_EXIT(zfsvfs);
4319		return (0);
4320	}
4321
4322	/*
4323	 * Align this request to the file block size in case we kluster.
4324	 * XXX - this can result in pretty aggressive locking, which can
4325	 * impact simultaneous read/write access.  One option might be
4326	 * to break up long requests (len == 0) into block-by-block
4327	 * operations to get narrower locking.
4328	 */
4329	blksz = zp->z_blksz;
4330	if (ISP2(blksz))
4331		io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
4332	else
4333		io_off = 0;
4334	if (len > 0 && ISP2(blksz))
4335		io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
4336	else
4337		io_len = 0;
4338
4339	if (io_len == 0) {
4340		/*
4341		 * Search the entire vp list for pages >= io_off.
4342		 */
4343		rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
4344		error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
4345		goto out;
4346	}
4347	rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
4348
4349	if (off > zp->z_size) {
4350		/* past end of file */
4351		zfs_range_unlock(rl);
4352		ZFS_EXIT(zfsvfs);
4353		return (0);
4354	}
4355
4356	len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);
4357
4358	for (off = io_off; io_off < off + len; io_off += io_len) {
4359		if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
4360			pp = page_lookup(vp, io_off,
4361			    (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
4362		} else {
4363			pp = page_lookup_nowait(vp, io_off,
4364			    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
4365		}
4366
4367		if (pp != NULL && pvn_getdirty(pp, flags)) {
4368			int err;
4369
4370			/*
4371			 * Found a dirty page to push
4372			 */
4373			err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
4374			if (err)
4375				error = err;
4376		} else {
4377			io_len = PAGESIZE;
4378		}
4379	}
4380out:
4381	zfs_range_unlock(rl);
4382	if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4383		zil_commit(zfsvfs->z_log, zp->z_id);
4384	ZFS_EXIT(zfsvfs);
4385	return (error);
4386}
4387
4388/*ARGSUSED*/
4389void
4390zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4391{
4392	znode_t	*zp = VTOZ(vp);
4393	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4394	int error;
4395
4396	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4397	if (zp->z_sa_hdl == NULL) {
4398		/*
4399		 * The fs has been unmounted, or we did a
4400		 * suspend/resume and this file no longer exists.
4401		 */
4402		if (vn_has_cached_data(vp)) {
4403			(void) pvn_vplist_dirty(vp, 0, zfs_null_putapage,
4404			    B_INVAL, cr);
4405		}
4406
4407		mutex_enter(&zp->z_lock);
4408		mutex_enter(&vp->v_lock);
4409		ASSERT(vp->v_count == 1);
4410		VN_RELE_LOCKED(vp);
4411		mutex_exit(&vp->v_lock);
4412		mutex_exit(&zp->z_lock);
4413		rw_exit(&zfsvfs->z_teardown_inactive_lock);
4414		zfs_znode_free(zp);
4415		return;
4416	}
4417
4418	/*
4419	 * Attempt to push any data in the page cache.  If this fails
4420	 * we will get kicked out later in zfs_zinactive().
4421	 */
4422	if (vn_has_cached_data(vp)) {
4423		(void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL|B_ASYNC,
4424		    cr);
4425	}
4426
4427	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4428		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4429
4430		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4431		zfs_sa_upgrade_txholds(tx, zp);
4432		error = dmu_tx_assign(tx, TXG_WAIT);
4433		if (error) {
4434			dmu_tx_abort(tx);
4435		} else {
4436			mutex_enter(&zp->z_lock);
4437			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4438			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
4439			zp->z_atime_dirty = 0;
4440			mutex_exit(&zp->z_lock);
4441			dmu_tx_commit(tx);
4442		}
4443	}
4444
4445	zfs_zinactive(zp);
4446	rw_exit(&zfsvfs->z_teardown_inactive_lock);
4447}
4448
4449/*
4450 * Bounds-check the seek operation.
4451 *
4452 *	IN:	vp	- vnode seeking within
4453 *		ooff	- old file offset
4454 *		noffp	- pointer to new file offset
4455 *		ct	- caller context
4456 *
4457 *	RETURN:	0 on success, EINVAL if new offset invalid.
4458 */
4459/* ARGSUSED */
4460static int
4461zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
4462    caller_context_t *ct)
4463{
4464	if (vp->v_type == VDIR)
4465		return (0);
4466	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
4467}
4468
4469/*
4470 * Pre-filter the generic locking function to trap attempts to place
4471 * a mandatory lock on a memory mapped file.
4472 */
4473static int
4474zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
4475    flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
4476{
4477	znode_t *zp = VTOZ(vp);
4478	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4479
4480	ZFS_ENTER(zfsvfs);
4481	ZFS_VERIFY_ZP(zp);
4482
4483	/*
4484	 * We are following the UFS semantics with respect to mapcnt
4485	 * here: If we see that the file is mapped already, then we will
4486	 * return an error, but we don't worry about races between this
4487	 * function and zfs_map().
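	 *
	 * (As an aside, MANDMODE() is true when the setgid bit is set
	 * and group execute is clear, e.g. chmod 2644, the conventional
	 * way mandatory file locking is enabled.)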
4488	 */
4489	if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) {
4490		ZFS_EXIT(zfsvfs);
4491		return (SET_ERROR(EAGAIN));
4492	}
4493	ZFS_EXIT(zfsvfs);
4494	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
4495}
4496
4497/*
4498 * If we can't find a page in the cache, we will create a new page
4499 * and fill it with file data.  For efficiency, we may try to fill
4500 * multiple pages at once (klustering) to fill up the supplied page
4501 * list.  Note that the pages to be filled are held with an exclusive
4502 * lock to prevent access by other threads while they are being filled.
4503 */
4504static int
4505zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
4506    caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
4507{
4508	znode_t *zp = VTOZ(vp);
4509	page_t *pp, *cur_pp;
4510	objset_t *os = zp->z_zfsvfs->z_os;
4511	u_offset_t io_off, total;
4512	size_t io_len;
4513	int err;
4514
4515	if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
4516		/*
4517		 * We only have a single page; don't bother klustering.
4518		 */
4519		io_off = off;
4520		io_len = PAGESIZE;
4521		pp = page_create_va(vp, io_off, io_len,
4522		    PG_EXCL | PG_WAIT, seg, addr);
4523	} else {
4524		/*
4525		 * Try to find enough pages to fill the page list
4526		 */
4527		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
4528		    &io_len, off, plsz, 0);
4529	}
4530	if (pp == NULL) {
4531		/*
4532		 * The page already exists; nothing to do here.
4533		 */
4534		*pl = NULL;
4535		return (0);
4536	}
4537
4538	/*
4539	 * Fill the pages in the kluster.
4540	 */
4541	cur_pp = pp;
4542	for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
4543		caddr_t va;
4544
4545		ASSERT3U(io_off, ==, cur_pp->p_offset);
4546		va = zfs_map_page(cur_pp, S_WRITE);
4547		err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
4548		    DMU_READ_PREFETCH);
4549		zfs_unmap_page(cur_pp, va);
4550		if (err) {
4551			/* On error, toss the entire kluster */
4552			pvn_read_done(pp, B_ERROR);
4553			/* convert checksum errors into IO errors */
4554			if (err == ECKSUM)
4555				err = SET_ERROR(EIO);
4556			return (err);
4557		}
4558		cur_pp = cur_pp->p_next;
4559	}
4560
4561	/*
4562	 * Fill in the page list array from the kluster starting
4563	 * from the desired offset `off'.
4564	 * NOTE: the page list will always be null terminated.
4565	 */
4566	pvn_plist_init(pp, pl, plsz, off, io_len, rw);
4567	ASSERT(pl == NULL || (*pl)->p_offset == off);
4568
4569	return (0);
4570}
4571
4572/*
4573 * Return pointers to the pages for the file region [off, off + len]
4574 * in the pl array.  If plsz is greater than len, this function may
4575 * also return page pointers from after the specified region
4576 * (i.e. the region [off, off + plsz]).  These additional pages are
4577 * only returned if they are already in the cache, or were created as
4578 * part of a klustered read.
4579 *
4580 *	IN:	vp	- vnode of file to get data from.
4581 *		off	- position in file to get data from.
4582 *		len	- amount of data to retrieve.
4583 *		plsz	- length of provided page list.
4584 *		seg	- segment to obtain pages for.
4585 *		addr	- virtual address of fault.
4586 *		rw	- mode of created pages.
4587 *		cr	- credentials of caller.
4588 *		ct	- caller context.
4589 *
4590 *	OUT:	protp	- protection mode of created pages.
4591 *		pl	- list of pages created.
4592 *
4593 *	RETURN:	0 on success, error code on failure.
4594 *
4595 * Timestamps:
4596 *	vp - atime updated
4597 */
4598/* ARGSUSED */
4599static int
4600zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
4601    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
4602    enum seg_rw rw, cred_t *cr, caller_context_t *ct)
4603{
4604	znode_t		*zp = VTOZ(vp);
4605	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4606	page_t		**pl0 = pl;
4607	int		err = 0;
4608
4609	/* we do our own caching; faultahead is unnecessary */
4610	if (pl == NULL)
4611		return (0);
4612	else if (len > plsz)
4613		len = plsz;
4614	else
4615		len = P2ROUNDUP(len, PAGESIZE);
4616	ASSERT(plsz >= len);
4617
4618	ZFS_ENTER(zfsvfs);
4619	ZFS_VERIFY_ZP(zp);
4620
4621	if (protp)
4622		*protp = PROT_ALL;
4623
4624	/*
4625	 * Loop through the requested range [off, off + len) looking
4626	 * for pages.  If we don't find a page, we will need to create
4627	 * a new page and fill it with data from the file.
4628	 */
4629	while (len > 0) {
4630		if ((*pl = page_lookup(vp, off, SE_SHARED)) != NULL)
4631			*(pl+1) = NULL;
4632		else if ((err = zfs_fillpage(vp, off, seg, addr, pl,
		    plsz, rw)) != 0)
4633			goto out;
4634		while (*pl) {
4635			ASSERT3U((*pl)->p_offset, ==, off);
4636			off += PAGESIZE;
4637			addr += PAGESIZE;
4638			if (len > 0) {
4639				ASSERT3U(len, >=, PAGESIZE);
4640				len -= PAGESIZE;
4641			}
4642			ASSERT3U(plsz, >=, PAGESIZE);
4643			plsz -= PAGESIZE;
4644			pl++;
4645		}
4646	}
4647
4648	/*
4649	 * Fill out the page array with any pages already in the cache.
4650	 */
4651	while (plsz > 0 &&
4652	    (*pl++ = page_lookup_nowait(vp, off, SE_SHARED)) != NULL) {
4653		off += PAGESIZE;
4654		plsz -= PAGESIZE;
4655	}
4656out:
4657	if (err) {
4658		/*
4659		 * Release any pages we have previously locked.
4660		 */
4661		while (pl > pl0)
4662			page_unlock(*--pl);
4663	} else {
4664		ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4665	}
4666
4667	*pl = NULL;
4668
4669	ZFS_EXIT(zfsvfs);
4670	return (err);
4671}
4672
4673/*
4674 * Request a memory map for a section of a file.  This code interacts
4675 * with common code and the VM system as follows:
4676 *
4677 * - common code calls mmap(), which ends up in smmap_common()
4678 * - this calls VOP_MAP(), which takes you into (say) zfs
4679 * - zfs_map() calls as_map(), passing segvn_create() as the callback
4680 * - segvn_create() creates the new segment and calls VOP_ADDMAP()
4681 * - zfs_addmap() updates z_mapcnt
4682 */
4683/*ARGSUSED*/
4684static int
4685zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
4686    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4687    caller_context_t *ct)
4688{
4689	znode_t *zp = VTOZ(vp);
4690	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4691	segvn_crargs_t	vn_a;
4692	int		error;
4693
4694	ZFS_ENTER(zfsvfs);
4695	ZFS_VERIFY_ZP(zp);
4696
4697	/*
4698	 * Note: ZFS_READONLY is handled in zfs_zaccess_common.
4699	 */
4700
4701	if ((prot & PROT_WRITE) && (zp->z_pflags &
4702	    (ZFS_IMMUTABLE | ZFS_APPENDONLY))) {
4703		ZFS_EXIT(zfsvfs);
4704		return (SET_ERROR(EPERM));
4705	}
4706
4707	if ((prot & (PROT_READ | PROT_EXEC)) &&
4708	    (zp->z_pflags & ZFS_AV_QUARANTINED)) {
4709		ZFS_EXIT(zfsvfs);
4710		return (SET_ERROR(EACCES));
4711	}
4712
4713	if (vp->v_flag & VNOMAP) {
4714		ZFS_EXIT(zfsvfs);
4715		return (SET_ERROR(ENOSYS));
4716	}
4717
4718	if (off < 0 || len > MAXOFFSET_T - off) {
4719		ZFS_EXIT(zfsvfs);
4720		return (SET_ERROR(ENXIO));
4721	}
4722
4723	if (vp->v_type != VREG) {
4724		ZFS_EXIT(zfsvfs);
4725		return (SET_ERROR(ENODEV));
4726	}
4727
4728	/*
4729	 * If file is locked, disallow mapping.
4730	 */
4731	if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
4732		ZFS_EXIT(zfsvfs);
4733		return (SET_ERROR(EAGAIN));
4734	}
4735
4736	as_rangelock(as);
4737	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
4738	if (error != 0) {
4739		as_rangeunlock(as);
4740		ZFS_EXIT(zfsvfs);
4741		return (error);
4742	}
4743
4744	vn_a.vp = vp;
4745	vn_a.offset = (u_offset_t)off;
4746	vn_a.type = flags & MAP_TYPE;
4747	vn_a.prot = prot;
4748	vn_a.maxprot = maxprot;
4749	vn_a.cred = cr;
4750	vn_a.amp = NULL;
4751	vn_a.flags = flags & ~MAP_TYPE;
4752	vn_a.szc = 0;
4753	vn_a.lgrp_mem_policy_flags = 0;
4754
4755	error = as_map(as, *addrp, len, segvn_create, &vn_a);
4756
4757	as_rangeunlock(as);
4758	ZFS_EXIT(zfsvfs);
4759	return (error);
4760}
4761
4762/* ARGSUSED */
4763static int
4764zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4765    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4766    caller_context_t *ct)
4767{
4768	uint64_t pages = btopr(len);
4769
4770	atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
4771	return (0);
4772}
4773
4774/*
4775 * The reason we push dirty pages as part of zfs_delmap() is so that we get a
4776 * more accurate mtime for the associated file.  Since we don't have a way of
4777 * detecting when the data was actually modified, we have to resort to
4778 * heuristics.  If an explicit msync() is done, then we mark the mtime when the
4779 * last page is pushed.  The problem occurs when the msync() call is omitted,
4780 * which is by far the most common case:
4781 *
4782 *	open()
4783 *	mmap()
4784 *	<modify memory>
4785 *	munmap()
4786 *	close()
4787 *	<time lapse>
4788 *	putpage() via fsflush
4789 *
4790 * If we wait until fsflush to come along, we can have a modification time that
4791 * is some arbitrary point in the future.  In order to prevent this in the
4792 * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
4793 * torn down.
4794 */
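
/*
 * For instance (a sketch), the well-behaved variant of the sequence
 * above settles the mtime explicitly:
 *
 *	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	<modify memory>
 *	(void) msync(p, len, MS_SYNC);
 *	(void) munmap(p, len);
 *
 * The VOP_PUTPAGE() below only matters when the msync() is omitted.
 */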
4795/* ARGSUSED */
4796static int
4797zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4798    size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
4799    caller_context_t *ct)
4800{
4801	uint64_t pages = btopr(len);
4802
4803	ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
4804	atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
4805
4806	if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
4807	    vn_has_cached_data(vp))
4808		(void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
4809
4810	return (0);
4811}
4812
4813/*
4814 * Free or allocate space in a file.  Currently, this function only
4815 * supports the `F_FREESP' command.  However, this command is somewhat
4816 * misnamed, as its functionality includes the ability to allocate as
4817 * well as free space.
4818 *
4819 *	IN:	vp	- vnode of file to free data in.
4820 *		cmd	- action to take (only F_FREESP supported).
4821 *		bfp	- section of file to free/alloc.
4822 *		flag	- current file open mode flags.
4823 *		offset	- current file offset.
4824 *		cr	- credentials of caller [UNUSED].
4825 *		ct	- caller context.
4826 *
4827 *	RETURN:	0 on success, error code on failure.
4828 *
4829 * Timestamps:
4830 *	vp - ctime|mtime updated
4831 */
4832/* ARGSUSED */
4833static int
4834zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
4835    offset_t offset, cred_t *cr, caller_context_t *ct)
4836{
4837	znode_t		*zp = VTOZ(vp);
4838	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4839	uint64_t	off, len;
4840	int		error;
4841
4842	ZFS_ENTER(zfsvfs);
4843	ZFS_VERIFY_ZP(zp);
4844
4845	if (cmd != F_FREESP) {
4846		ZFS_EXIT(zfsvfs);
4847		return (SET_ERROR(EINVAL));
4848	}
4849
4850	/*
4851 * In the case where vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots), our
4852	 * callers might not be able to detect properly that we are read-only,
4853	 * so check it explicitly here.
4854	 */
4855	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
4856		ZFS_EXIT(zfsvfs);
4857		return (SET_ERROR(EROFS));
4858	}
4859
4860	if ((error = convoff(vp, bfp, 0, offset)) != 0) {
4861		ZFS_EXIT(zfsvfs);
4862		return (error);
4863	}
4864
4865	if (bfp->l_len < 0) {
4866		ZFS_EXIT(zfsvfs);
4867		return (SET_ERROR(EINVAL));
4868	}
4869
4870	off = bfp->l_start;
4871	len = bfp->l_len; /* 0 means from off to end of file */
4872
4873	error = zfs_freesp(zp, off, len, flag, TRUE);
4874
4875	if (error == 0 && off == 0 && len == 0)
4876		vnevent_truncate(ZTOV(zp), ct);
4877
4878	ZFS_EXIT(zfsvfs);
4879	return (error);
4880}
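
/*
 * Illustrative userland usage (a sketch): F_FREESP reaches zfs_space()
 * above via VOP_SPACE(); with l_len == 0 it truncates at l_start,
 * which is also how ftruncate(2) is implemented:
 *
 *	struct flock64 fl;
 *
 *	bzero(&fl, sizeof (fl));
 *	fl.l_whence = SEEK_SET;
 *	fl.l_start = 8192;
 *	fl.l_len = 0;		(0 means from l_start to end of file)
 *	(void) fcntl(fd, F_FREESP, &fl);
 */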
4881
4882/*ARGSUSED*/
4883static int
4884zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
4885{
4886	znode_t		*zp = VTOZ(vp);
4887	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4888	uint32_t	gen;
4889	uint64_t	gen64;
4890	uint64_t	object = zp->z_id;
4891	zfid_short_t	*zfid;
4892	int		size, i, error;
4893
4894	ZFS_ENTER(zfsvfs);
4895	ZFS_VERIFY_ZP(zp);
4896
4897	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
4898	    &gen64, sizeof (uint64_t))) != 0) {
4899		ZFS_EXIT(zfsvfs);
4900		return (error);
4901	}
4902
4903	gen = (uint32_t)gen64;
4904
4905	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
4906	if (fidp->fid_len < size) {
4907		fidp->fid_len = size;
4908		ZFS_EXIT(zfsvfs);
4909		return (SET_ERROR(ENOSPC));
4910	}
4911
4912	zfid = (zfid_short_t *)fidp;
4913
4914	zfid->zf_len = size;
4915
4916	for (i = 0; i < sizeof (zfid->zf_object); i++)
4917		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4918
4919	/* Must have a non-zero generation number to distinguish from .zfs */
4920	if (gen == 0)
4921		gen = 1;
4922	for (i = 0; i < sizeof (zfid->zf_gen); i++)
4923		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4924
4925	if (size == LONG_FID_LEN) {
4926		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
4927		zfid_long_t	*zlfid;
4928
4929		zlfid = (zfid_long_t *)fidp;
4930
4931		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
4932			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
4933
4934		/* XXX - this should be the generation number for the objset */
4935		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
4936			zlfid->zf_setgen[i] = 0;
4937	}
4938
4939	ZFS_EXIT(zfsvfs);
4940	return (0);
4941}
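
/*
 * A sketch of the short FID produced above, least significant byte
 * first: zf_object carries the 48-bit object number and zf_gen the
 * truncated 32-bit generation, so object 5 with gen 1 encodes as
 * 05 00 00 00 00 00 followed by 01 00 00 00.
 */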
4942
4943static int
4944zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
4945    caller_context_t *ct)
4946{
4947	znode_t		*zp, *xzp;
4948	zfsvfs_t	*zfsvfs;
4949	zfs_dirlock_t	*dl;
4950	int		error;
4951
4952	switch (cmd) {
4953	case _PC_LINK_MAX:
4954		*valp = ULONG_MAX;
4955		return (0);
4956
4957	case _PC_FILESIZEBITS:
4958		*valp = 64;
4959		return (0);
4960
4961	case _PC_XATTR_EXISTS:
4962		zp = VTOZ(vp);
4963		zfsvfs = zp->z_zfsvfs;
4964		ZFS_ENTER(zfsvfs);
4965		ZFS_VERIFY_ZP(zp);
4966		*valp = 0;
4967		error = zfs_dirent_lock(&dl, zp, "", &xzp,
4968		    ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
4969		if (error == 0) {
4970			zfs_dirent_unlock(dl);
4971			if (!zfs_dirempty(xzp))
4972				*valp = 1;
4973			VN_RELE(ZTOV(xzp));
4974		} else if (error == ENOENT) {
4975			/*
4976			 * If there aren't extended attributes, it's the
4977			 * same as having zero of them.
4978			 */
4979			error = 0;
4980		}
4981		ZFS_EXIT(zfsvfs);
4982		return (error);
4983
4984	case _PC_SATTR_ENABLED:
4985	case _PC_SATTR_EXISTS:
4986		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
4987		    (vp->v_type == VREG || vp->v_type == VDIR);
4988		return (0);
4989
4990	case _PC_ACCESS_FILTERING:
4991		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
4992		    vp->v_type == VDIR;
4993		return (0);
4994
4995	case _PC_ACL_ENABLED:
4996		*valp = _ACL_ACE_ENABLED;
4997		return (0);
4998
4999	case _PC_MIN_HOLE_SIZE:
5000		*valp = (ulong_t)SPA_MINBLOCKSIZE;
5001		return (0);
5002
5003	case _PC_TIMESTAMP_RESOLUTION:
5004		/* nanosecond timestamp resolution */
5005		*valp = 1L;
5006		return (0);
5007
5008	default:
5009		return (fs_pathconf(vp, cmd, valp, cr, ct));
5010	}
5011}
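
/*
 * Illustrative userland query (a sketch):
 *
 *	long bits = pathconf("/pool/fs/file", _PC_FILESIZEBITS);
 *
 * which returns 64 for any ZFS file, per the case above.
 */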
5012
5013/*ARGSUSED*/
5014static int
5015zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5016    caller_context_t *ct)
5017{
5018	znode_t *zp = VTOZ(vp);
5019	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5020	int error;
5021	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5022
5023	ZFS_ENTER(zfsvfs);
5024	ZFS_VERIFY_ZP(zp);
5025	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
5026	ZFS_EXIT(zfsvfs);
5027
5028	return (error);
5029}
5030
5031/*ARGSUSED*/
5032static int
5033zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5034    caller_context_t *ct)
5035{
5036	znode_t *zp = VTOZ(vp);
5037	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5038	int error;
5039	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5040	zilog_t	*zilog = zfsvfs->z_log;
5041
5042	ZFS_ENTER(zfsvfs);
5043	ZFS_VERIFY_ZP(zp);
5044
5045	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
5046
5047	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
5048		zil_commit(zilog, 0);
5049
5050	ZFS_EXIT(zfsvfs);
5051	return (error);
5052}
5053
5054/*
5055 * The smallest read for which we may consider loaning out an arcbuf.
5056 * This must be a power of 2.
5057 */
5058int zcr_blksz_min = (1 << 10);	/* 1K */
5059/*
5060 * If set to less than the file block size, allow loaning out of an
5061 * arcbuf for a partial block read.  This must be a power of 2.
5062 */
5063int zcr_blksz_max = (1 << 17);	/* 128K */
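
/*
 * Both tunables can be overridden at boot in the usual way (an
 * illustration, assuming the standard /etc/system mechanism for
 * module globals):
 *
 *	set zfs:zcr_blksz_min = 0x800
 */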
5064
5065/*ARGSUSED*/
5066static int
5067zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
5068    caller_context_t *ct)
5069{
5070	znode_t	*zp = VTOZ(vp);
5071	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5072	int max_blksz = zfsvfs->z_max_blksz;
5073	uio_t *uio = &xuio->xu_uio;
5074	ssize_t size = uio->uio_resid;
5075	offset_t offset = uio->uio_loffset;
5076	int blksz;
5077	int fullblk, i;
5078	arc_buf_t *abuf;
5079	ssize_t maxsize;
5080	int preamble, postamble;
5081
5082	if (xuio->xu_type != UIOTYPE_ZEROCOPY)
5083		return (SET_ERROR(EINVAL));
5084
5085	ZFS_ENTER(zfsvfs);
5086	ZFS_VERIFY_ZP(zp);
5087	switch (ioflag) {
5088	case UIO_WRITE:
5089		/*
5090		 * Loan out an arc_buf for write if write size is bigger than
5091		 * max_blksz, and the file's block size is also max_blksz.
5092		 */
5093		blksz = max_blksz;
5094		if (size < blksz || zp->z_blksz != blksz) {
5095			ZFS_EXIT(zfsvfs);
5096			return (SET_ERROR(EINVAL));
5097		}
5098		/*
5099		 * Caller requests buffers for write before knowing where the
5100		 * write offset might be (e.g. NFS TCP write).
5101		 */
5102		if (offset == -1) {
5103			preamble = 0;
5104		} else {
5105			preamble = P2PHASE(offset, blksz);
5106			if (preamble) {
5107				preamble = blksz - preamble;
5108				size -= preamble;
5109			}
5110		}
5111
5112		postamble = P2PHASE(size, blksz);
5113		size -= postamble;
5114
5115		fullblk = size / blksz;
5116		(void) dmu_xuio_init(xuio,
5117		    (preamble != 0) + fullblk + (postamble != 0));
5118		DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble,
5119		    int, postamble, int,
5120		    (preamble != 0) + fullblk + (postamble != 0));
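		/*
		 * E.g. (a sketch): blksz 128K, offset 64K, size 320K
		 * leaves a 64K preamble, two full blocks and no
		 * postamble, so three arc_bufs are loaned out below.
		 */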
5121
5122		/*
5123		 * Have to fix iov base/len for partial buffers.  They
5124		 * currently represent full arc_buf's.
5125		 */
5126		if (preamble) {
5127			/* data begins in the middle of the arc_buf */
5128			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5129			    blksz);
5130			ASSERT(abuf);
5131			(void) dmu_xuio_add(xuio, abuf,
5132			    blksz - preamble, preamble);
5133		}
5134
5135		for (i = 0; i < fullblk; i++) {
5136			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5137			    blksz);
5138			ASSERT(abuf);
5139			(void) dmu_xuio_add(xuio, abuf, 0, blksz);
5140		}
5141
5142		if (postamble) {
5143			/* data ends in the middle of the arc_buf */
5144			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5145			    blksz);
5146			ASSERT(abuf);
5147			(void) dmu_xuio_add(xuio, abuf, 0, postamble);
5148		}
5149		break;
5150	case UIO_READ:
5151		/*
5152		 * Loan out an arc_buf for read if the read size is larger than
5153		 * the current file block size.  Block alignment is not
5154		 * considered.  A partial arc_buf may be loaned out for the read.
5155		 */
5156		blksz = zp->z_blksz;
5157		if (blksz < zcr_blksz_min)
5158			blksz = zcr_blksz_min;
5159		if (blksz > zcr_blksz_max)
5160			blksz = zcr_blksz_max;
5161		/* avoid the complexity of a loaned buffer larger than max_blksz */
5162		if (blksz > max_blksz) {
5163			ZFS_EXIT(zfsvfs);
5164			return (SET_ERROR(EINVAL));
5165		}
5166
5167		maxsize = zp->z_size - uio->uio_loffset;
5168		if (size > maxsize)
5169			size = maxsize;
5170
5171		if (size < blksz || vn_has_cached_data(vp)) {
5172			ZFS_EXIT(zfsvfs);
5173			return (SET_ERROR(EINVAL));
5174		}
5175		break;
5176	default:
5177		ZFS_EXIT(zfsvfs);
5178		return (SET_ERROR(EINVAL));
5179	}
5180
5181	uio->uio_extflg = UIO_XUIO;
5182	XUIO_XUZC_RW(xuio) = ioflag;
5183	ZFS_EXIT(zfsvfs);
5184	return (0);
5185}
5186
5187/*ARGSUSED*/
5188static int
5189zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct)
5190{
5191	int i;
5192	arc_buf_t *abuf;
5193	int ioflag = XUIO_XUZC_RW(xuio);
5194
5195	ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
5196
5197	i = dmu_xuio_cnt(xuio);
5198	while (i-- > 0) {
5199		abuf = dmu_xuio_arcbuf(xuio, i);
5200		/*
5201		 * If abuf == NULL, it must be a write buffer
5202		 * that has been returned in zfs_write().
5203		 */
5204		if (abuf)
5205			dmu_return_arcbuf(abuf);
5206		ASSERT(abuf || ioflag == UIO_WRITE);
5207	}
5208
5209	dmu_xuio_fini(xuio);
5210	return (0);
5211}
5212
5213/*
5214 * Predeclare these here so that the compiler assumes that
5215 * this is an "old style" function declaration that does
5216 * not include arguments, which avoids type mismatch errors
5217 * in the initializations that follow.
5218 */
5219static int zfs_inval();
5220static int zfs_isdir();
5221
5222static int
5223zfs_inval()
5224{
5225	return (SET_ERROR(EINVAL));
5226}
5227
5228static int
5229zfs_isdir()
5230{
5231	return (SET_ERROR(EISDIR));
5232}
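
/*
 * The templates below are turned into live vnodeops_t tables at module
 * load (a sketch of the pattern, assuming the usual vn_make_ops()
 * registration in zfs_create_op_tables()):
 *
 *	error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
 *	    &zfs_dvnodeops);
 */
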
5233/*
5234 * Directory vnode operations template
5235 */
5236vnodeops_t *zfs_dvnodeops;
5237const fs_operation_def_t zfs_dvnodeops_template[] = {
5238	VOPNAME_OPEN,		{ .vop_open = zfs_open },
5239	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
5240	VOPNAME_READ,		{ .error = zfs_isdir },
5241	VOPNAME_WRITE,		{ .error = zfs_isdir },
5242	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
5243	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5244	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5245	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5246	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
5247	VOPNAME_CREATE,		{ .vop_create = zfs_create },
5248	VOPNAME_REMOVE,		{ .vop_remove = zfs_remove },
5249	VOPNAME_LINK,		{ .vop_link = zfs_link },
5250	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5251	VOPNAME_MKDIR,		{ .vop_mkdir = zfs_mkdir },
5252	VOPNAME_RMDIR,		{ .vop_rmdir = zfs_rmdir },
5253	VOPNAME_READDIR,	{ .vop_readdir = zfs_readdir },
5254	VOPNAME_SYMLINK,	{ .vop_symlink = zfs_symlink },
5255	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
5256	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5257	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5258	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
5259	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5260	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5261	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5262	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5263	NULL,			NULL
5264};
5265
5266/*
5267 * Regular file vnode operations template
5268 */
5269vnodeops_t *zfs_fvnodeops;
5270const fs_operation_def_t zfs_fvnodeops_template[] = {
5271	VOPNAME_OPEN,		{ .vop_open = zfs_open },
5272	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
5273	VOPNAME_READ,		{ .vop_read = zfs_read },
5274	VOPNAME_WRITE,		{ .vop_write = zfs_write },
5275	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
5276	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5277	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5278	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5279	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
5280	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5281	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
5282	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5283	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5284	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
5285	VOPNAME_FRLOCK,		{ .vop_frlock = zfs_frlock },
5286	VOPNAME_SPACE,		{ .vop_space = zfs_space },
5287	VOPNAME_GETPAGE,	{ .vop_getpage = zfs_getpage },
5288	VOPNAME_PUTPAGE,	{ .vop_putpage = zfs_putpage },
5289	VOPNAME_MAP,		{ .vop_map = zfs_map },
5290	VOPNAME_ADDMAP,		{ .vop_addmap = zfs_addmap },
5291	VOPNAME_DELMAP,		{ .vop_delmap = zfs_delmap },
5292	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5293	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5294	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5295	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5296	VOPNAME_REQZCBUF,	{ .vop_reqzcbuf = zfs_reqzcbuf },
5297	VOPNAME_RETZCBUF,	{ .vop_retzcbuf = zfs_retzcbuf },
5298	NULL,			NULL
5299};
5300
5301/*
5302 * Symbolic link vnode operations template
5303 */
5304vnodeops_t *zfs_symvnodeops;
5305const fs_operation_def_t zfs_symvnodeops_template[] = {
5306	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5307	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5308	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5309	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5310	VOPNAME_READLINK,	{ .vop_readlink = zfs_readlink },
5311	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5312	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5313	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5314	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5315	NULL,			NULL
5316};
5317
5318/*
5319 * Special share hidden files vnode operations template
5320 */
5321vnodeops_t *zfs_sharevnodeops;
5322const fs_operation_def_t zfs_sharevnodeops_template[] = {
5323	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5324	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5325	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5326	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5327	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5328	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5329	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5330	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5331	NULL,			NULL
5332};
5333
5334/*
5335 * Extended attribute directory vnode operations template
5336 *
5337 * This template is identical to the directory vnodes
5338 * operation template except for restricted operations:
5339 *	VOP_MKDIR()
5340 *	VOP_SYMLINK()
5341 *
5342 * Note that there are other restrictions embedded in:
5343 *	zfs_create()	- restrict type to VREG
5344 *	zfs_link()	- no links into/out of attribute space
5345 *	zfs_rename()	- no moves into/out of attribute space
5346 */
5347vnodeops_t *zfs_xdvnodeops;
5348const fs_operation_def_t zfs_xdvnodeops_template[] = {
5349	VOPNAME_OPEN,		{ .vop_open = zfs_open },
5350	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
5351	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
5352	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5353	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5354	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5355	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
5356	VOPNAME_CREATE,		{ .vop_create = zfs_create },
5357	VOPNAME_REMOVE,		{ .vop_remove = zfs_remove },
5358	VOPNAME_LINK,		{ .vop_link = zfs_link },
5359	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5360	VOPNAME_MKDIR,		{ .error = zfs_inval },
5361	VOPNAME_RMDIR,		{ .vop_rmdir = zfs_rmdir },
5362	VOPNAME_READDIR,	{ .vop_readdir = zfs_readdir },
5363	VOPNAME_SYMLINK,	{ .error = zfs_inval },
5364	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
5365	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5366	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5367	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
5368	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5369	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5370	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5371	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5372	NULL,			NULL
5373};
5374
5375/*
5376 * Error vnode operations template
5377 */
5378vnodeops_t *zfs_evnodeops;
5379const fs_operation_def_t zfs_evnodeops_template[] = {
5380	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5381	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5382	NULL,			NULL
5383};
5384