xref: /illumos-gate/usr/src/uts/common/fs/zfs/zfs_vnops.c (revision 99d0d3f5)
1fa9e4066Sahrens /*
2fa9e4066Sahrens  * CDDL HEADER START
3fa9e4066Sahrens  *
4fa9e4066Sahrens  * The contents of this file are subject to the terms of the
5736b9155Smarks  * Common Development and Distribution License (the "License").
6736b9155Smarks  * You may not use this file except in compliance with the License.
7fa9e4066Sahrens  *
8fa9e4066Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e4066Sahrens  * or http://www.opensolaris.org/os/licensing.
10fa9e4066Sahrens  * See the License for the specific language governing permissions
11fa9e4066Sahrens  * and limitations under the License.
12fa9e4066Sahrens  *
13fa9e4066Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
14fa9e4066Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e4066Sahrens  * If applicable, add the following below this CDDL HEADER, with the
16fa9e4066Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e4066Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e4066Sahrens  *
19fa9e4066Sahrens  * CDDL HEADER END
20fa9e4066Sahrens  */
211c17160aSKevin Crowe 
22fa9e4066Sahrens /*
23d39ee142SMark Shellenbaum  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24ade42b55SSebastien Roy  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
25c3d26abcSMatthew Ahrens  * Copyright (c) 2014 Integros [integros.com]
26aaa9aa59SJerry Jelinek  * Copyright 2020 Joyent, Inc.
271c17160aSKevin Crowe  * Copyright 2017 Nexenta Systems, Inc.
28fa9e4066Sahrens  */
3075c76197Speteh /* Portions Copyright 2007 Jeremy Teo */
3155da60b9SMark J Musante /* Portions Copyright 2010 Robert Milkowski */
33fa9e4066Sahrens #include <sys/types.h>
34fa9e4066Sahrens #include <sys/param.h>
35fa9e4066Sahrens #include <sys/time.h>
36fa9e4066Sahrens #include <sys/systm.h>
37fa9e4066Sahrens #include <sys/sysmacros.h>
38fa9e4066Sahrens #include <sys/resource.h>
39fa9e4066Sahrens #include <sys/vfs.h>
40aa59c4cbSrsb #include <sys/vfs_opreg.h>
41fa9e4066Sahrens #include <sys/vnode.h>
42fa9e4066Sahrens #include <sys/file.h>
43fa9e4066Sahrens #include <sys/stat.h>
44fa9e4066Sahrens #include <sys/kmem.h>
45fa9e4066Sahrens #include <sys/taskq.h>
46fa9e4066Sahrens #include <sys/uio.h>
47fa9e4066Sahrens #include <sys/vmsystm.h>
48fa9e4066Sahrens #include <sys/atomic.h>
4944eda4d7Smaybee #include <sys/vm.h>
50fa9e4066Sahrens #include <vm/seg_vn.h>
51fa9e4066Sahrens #include <vm/pvn.h>
52fa9e4066Sahrens #include <vm/as.h>
530fab61baSJonathan W Adams #include <vm/kpm.h>
540fab61baSJonathan W Adams #include <vm/seg_kpm.h>
55fa9e4066Sahrens #include <sys/mman.h>
56fa9e4066Sahrens #include <sys/pathname.h>
57fa9e4066Sahrens #include <sys/cmn_err.h>
58fa9e4066Sahrens #include <sys/errno.h>
59fa9e4066Sahrens #include <sys/unistd.h>
60fa9e4066Sahrens #include <sys/zfs_dir.h>
61fa9e4066Sahrens #include <sys/zfs_acl.h>
62fa9e4066Sahrens #include <sys/zfs_ioctl.h>
63fa9e4066Sahrens #include <sys/fs/zfs.h>
64fa9e4066Sahrens #include <sys/dmu.h>
6555da60b9SMark J Musante #include <sys/dmu_objset.h>
66fa9e4066Sahrens #include <sys/spa.h>
67fa9e4066Sahrens #include <sys/txg.h>
68fa9e4066Sahrens #include <sys/dbuf.h>
69fa9e4066Sahrens #include <sys/zap.h>
700a586ceaSMark Shellenbaum #include <sys/sa.h>
71fa9e4066Sahrens #include <sys/dirent.h>
72fa9e4066Sahrens #include <sys/policy.h>
73fa9e4066Sahrens #include <sys/sunddi.h>
74fa9e4066Sahrens #include <sys/filio.h>
75c1ce5987SMark Shellenbaum #include <sys/sid.h>
76fa9e4066Sahrens #include "fs/fs_subr.h"
77fa9e4066Sahrens #include <sys/zfs_ctldir.h>
78da6c28aaSamw #include <sys/zfs_fuid.h>
790a586ceaSMark Shellenbaum #include <sys/zfs_sa.h>
80033f9833Sek #include <sys/dnlc.h>
81104e2ed7Sperrin #include <sys/zfs_rlock.h>
82da6c28aaSamw #include <sys/extdirent.h>
83da6c28aaSamw #include <sys/kidmap.h>
8467dbe2beSCasper H.S. Dik #include <sys/cred.h>
85b38f0970Sck #include <sys/attr.h>
861271e4b1SPrakash Surya #include <sys/zil.h>
87f67950b2SNasf-Fan #include <sys/sa_impl.h>
88f67950b2SNasf-Fan #include <sys/zfs_project.h>
90fa9e4066Sahrens /*
91fa9e4066Sahrens  * Programming rules.
92fa9e4066Sahrens  *
93fa9e4066Sahrens  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
94fa9e4066Sahrens  * properly lock its in-core state, create a DMU transaction, do the work,
95fa9e4066Sahrens  * record this work in the intent log (ZIL), commit the DMU transaction,
96da6c28aaSamw  * and wait for the intent log to commit if it is a synchronous operation.
97da6c28aaSamw  * Moreover, the vnode ops must work in both normal and log replay context.
98fa9e4066Sahrens  * The ordering of events is important to avoid deadlocks and references
99fa9e4066Sahrens  * to freed memory.  The example below illustrates the following Big Rules:
100fa9e4066Sahrens  *
101f7170741SWill Andrews  *  (1)	A check must be made in each zfs thread for a mounted file system.
1023cb34c60Sahrens  *	This is done avoiding races using ZFS_ENTER(zfsvfs).
103f7170741SWill Andrews  *	A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
104f7170741SWill Andrews  *	must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
105f7170741SWill Andrews  *	can return EIO from the calling function.
106fa9e4066Sahrens  *
107fa9e4066Sahrens  *  (2)	VN_RELE() should always be the last thing except for zil_commit()
108b19a79ecSperrin  *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
109fa9e4066Sahrens  *	First, if it's the last reference, the vnode/znode
110fa9e4066Sahrens  *	can be freed, so the zp may point to freed memory.  Second, the last
111fa9e4066Sahrens  *	reference will call zfs_zinactive(), which may induce a lot of work --
112104e2ed7Sperrin  *	pushing cached pages (which acquires range locks) and syncing out
113fa9e4066Sahrens  *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
114fa9e4066Sahrens  *	which could deadlock the system if you were already holding one.
1159d3574bfSNeil Perrin  *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
116fa9e4066Sahrens  *
1177885c754Sperrin  *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
1187885c754Sperrin  *	as they can span dmu_tx_assign() calls.
1197885c754Sperrin  *
120e722410cSMatthew Ahrens  *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
121e722410cSMatthew Ahrens  *      dmu_tx_assign().  This is critical because we don't want to block
122e722410cSMatthew Ahrens  *      while holding locks.
123e722410cSMatthew Ahrens  *
124e722410cSMatthew Ahrens  *	If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
125e722410cSMatthew Ahrens  *	reduces lock contention and CPU usage when we must wait (note that if
126e722410cSMatthew Ahrens  *	throughput is constrained by the storage, nearly every transaction
127e722410cSMatthew Ahrens  *	must wait).
128e722410cSMatthew Ahrens  *
129e722410cSMatthew Ahrens  *      Note, in particular, that if a lock is sometimes acquired before
130e722410cSMatthew Ahrens  *      the tx assigns, and sometimes after (e.g. z_lock), then failing
131e722410cSMatthew Ahrens  *      to use a non-blocking assign can deadlock the system.  The scenario:
132fa9e4066Sahrens  *
133fa9e4066Sahrens  *	Thread A has grabbed a lock before calling dmu_tx_assign().
134fa9e4066Sahrens  *	Thread B is in an already-assigned tx, and blocks for this lock.
135fa9e4066Sahrens  *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
136fa9e4066Sahrens  *	forever, because the previous txg can't quiesce until B's tx commits.
137fa9e4066Sahrens  *
138fa9e4066Sahrens  *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
13969962b56SMatthew Ahrens  *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
140f864f99eSPrakash Surya  *	calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
14169962b56SMatthew Ahrens  *	to indicate that this operation has already called dmu_tx_wait().
14269962b56SMatthew Ahrens  *	This will ensure that we don't retry forever, waiting a short bit
14369962b56SMatthew Ahrens  *	each time.
144fa9e4066Sahrens  *
1457885c754Sperrin  *  (5)	If the operation succeeded, generate the intent log entry for it
146fa9e4066Sahrens  *	before dropping locks.  This ensures that the ordering of events
147fa9e4066Sahrens  *	in the intent log matches the order in which they actually occurred.
148f7170741SWill Andrews  *	During ZIL replay the zfs_log_* functions will update the sequence
1491209a471SNeil Perrin  *	number to indicate the zil transaction has replayed.
150fa9e4066Sahrens  *
1517885c754Sperrin  *  (6)	At the end of each vnode op, the DMU tx must always commit,
152fa9e4066Sahrens  *	regardless of whether there were any errors.
153fa9e4066Sahrens  *
1545002558fSNeil Perrin  *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
155fa9e4066Sahrens  *	to ensure that synchronous semantics are provided when necessary.
156fa9e4066Sahrens  *
157fa9e4066Sahrens  * In general, this is how things should be ordered in each vnode op:
158fa9e4066Sahrens  *
159fa9e4066Sahrens  *	ZFS_ENTER(zfsvfs);		// exit if unmounted
160fa9e4066Sahrens  * top:
161fa9e4066Sahrens  *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
162fa9e4066Sahrens  *	rw_enter(...);			// grab any other locks you need
163fa9e4066Sahrens  *	tx = dmu_tx_create(...);	// get DMU tx
164fa9e4066Sahrens  *	dmu_tx_hold_*();		// hold each object you might modify
165f864f99eSPrakash Surya  *	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
166fa9e4066Sahrens  *	if (error) {
167fa9e4066Sahrens  *		rw_exit(...);		// drop locks
168fa9e4066Sahrens  *		zfs_dirent_unlock(dl);	// unlock directory entry
169fa9e4066Sahrens  *		VN_RELE(...);		// release held vnodes
1701209a471SNeil Perrin  *		if (error == ERESTART) {
17169962b56SMatthew Ahrens  *			waited = B_TRUE;
1728a2f1b91Sahrens  *			dmu_tx_wait(tx);
1738a2f1b91Sahrens  *			dmu_tx_abort(tx);
174fa9e4066Sahrens  *			goto top;
175fa9e4066Sahrens  *		}
1768a2f1b91Sahrens  *		dmu_tx_abort(tx);	// abort DMU tx
177fa9e4066Sahrens  *		ZFS_EXIT(zfsvfs);	// finished in zfs
178fa9e4066Sahrens  *		return (error);		// really out of space
179fa9e4066Sahrens  *	}
180fa9e4066Sahrens  *	error = do_real_work();		// do whatever this VOP does
181fa9e4066Sahrens  *	if (error == 0)
182b19a79ecSperrin  *		zfs_log_*(...);		// on success, make ZIL entry
183fa9e4066Sahrens  *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
184fa9e4066Sahrens  *	rw_exit(...);			// drop locks
185fa9e4066Sahrens  *	zfs_dirent_unlock(dl);		// unlock directory entry
186fa9e4066Sahrens  *	VN_RELE(...);			// release held vnodes
1875002558fSNeil Perrin  *	zil_commit(zilog, foid);	// synchronous when necessary
188fa9e4066Sahrens  *	ZFS_EXIT(zfsvfs);		// finished in zfs
189fa9e4066Sahrens  *	return (error);			// done, report error
190fa9e4066Sahrens  */
192fa9e4066Sahrens /* ARGSUSED */
193fa9e4066Sahrens static int
zfs_open(vnode_t ** vpp,int flag,cred_t * cr,caller_context_t * ct)194da6c28aaSamw zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
195fa9e4066Sahrens {
19667bd71c6Sperrin 	znode_t	*zp = VTOZ(*vpp);
197b614fdaaSMark Shellenbaum 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
198b614fdaaSMark Shellenbaum 
199b614fdaaSMark Shellenbaum 	ZFS_ENTER(zfsvfs);
200b614fdaaSMark Shellenbaum 	ZFS_VERIFY_ZP(zp);
2020a586ceaSMark Shellenbaum 	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
203da6c28aaSamw 	    ((flag & FAPPEND) == 0)) {
204b614fdaaSMark Shellenbaum 		ZFS_EXIT(zfsvfs);
205be6fd75aSMatthew Ahrens 		return (SET_ERROR(EPERM));
206da6c28aaSamw 	}
208da6c28aaSamw 	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
209da6c28aaSamw 	    ZTOV(zp)->v_type == VREG &&
2100a586ceaSMark Shellenbaum 	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
211b614fdaaSMark Shellenbaum 		if (fs_vscan(*vpp, cr, 0) != 0) {
212b614fdaaSMark Shellenbaum 			ZFS_EXIT(zfsvfs);
213be6fd75aSMatthew Ahrens 			return (SET_ERROR(EACCES));
214b614fdaaSMark Shellenbaum 		}
215b614fdaaSMark Shellenbaum 	}
21767bd71c6Sperrin 	/* Keep a count of the synchronous opens in the znode */
21867bd71c6Sperrin 	if (flag & (FSYNC | FDSYNC))
21967bd71c6Sperrin 		atomic_inc_32(&zp->z_sync_cnt);
221b614fdaaSMark Shellenbaum 	ZFS_EXIT(zfsvfs);
222fa9e4066Sahrens 	return (0);
223fa9e4066Sahrens }
225fa9e4066Sahrens /* ARGSUSED */
226fa9e4066Sahrens static int
zfs_close(vnode_t * vp,int flag,int count,offset_t offset,cred_t * cr,caller_context_t * ct)227da6c28aaSamw zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
228da6c28aaSamw     caller_context_t *ct)
229fa9e4066Sahrens {
23067bd71c6Sperrin 	znode_t	*zp = VTOZ(vp);
231b614fdaaSMark Shellenbaum 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
232b614fdaaSMark Shellenbaum 
233ee8143cbSChris Kirby 	/*
234ee8143cbSChris Kirby 	 * Clean up any locks held by this process on the vp.
235ee8143cbSChris Kirby 	 */
236ee8143cbSChris Kirby 	cleanlocks(vp, ddi_get_pid(), 0);
237ee8143cbSChris Kirby 	cleanshares(vp, ddi_get_pid());
238ee8143cbSChris Kirby 
239b614fdaaSMark Shellenbaum 	ZFS_ENTER(zfsvfs);
240b614fdaaSMark Shellenbaum 	ZFS_VERIFY_ZP(zp);
24267bd71c6Sperrin 	/* Decrement the synchronous opens in the znode */
243ecb72030Sperrin 	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
24467bd71c6Sperrin 		atomic_dec_32(&zp->z_sync_cnt);
246da6c28aaSamw 	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
247da6c28aaSamw 	    ZTOV(zp)->v_type == VREG &&
2480a586ceaSMark Shellenbaum 	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
249da6c28aaSamw 		VERIFY(fs_vscan(vp, cr, 1) == 0);
251b614fdaaSMark Shellenbaum 	ZFS_EXIT(zfsvfs);
252fa9e4066Sahrens 	return (0);
253fa9e4066Sahrens }
255fa9e4066Sahrens /*
256fa9e4066Sahrens  * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
257fa9e4066Sahrens  * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
258fa9e4066Sahrens  */
259fa9e4066Sahrens static int
zfs_holey(vnode_t * vp,int cmd,offset_t * off)260fa9e4066Sahrens zfs_holey(vnode_t *vp, int cmd, offset_t *off)
261fa9e4066Sahrens {
262fa9e4066Sahrens 	znode_t	*zp = VTOZ(vp);
263fa9e4066Sahrens 	uint64_t noff = (uint64_t)*off; /* new offset */
264fa9e4066Sahrens 	uint64_t file_sz;
265fa9e4066Sahrens 	int error;
266fa9e4066Sahrens 	boolean_t hole;
2680a586ceaSMark Shellenbaum 	file_sz = zp->z_size;
269fa9e4066Sahrens 	if (noff >= file_sz)  {
270be6fd75aSMatthew Ahrens 		return (SET_ERROR(ENXIO));
271fa9e4066Sahrens 	}
273fa9e4066Sahrens 	if (cmd == _FIO_SEEK_HOLE)
274fa9e4066Sahrens 		hole = B_TRUE;
275fa9e4066Sahrens 	else
276fa9e4066Sahrens 		hole = B_FALSE;
278fa9e4066Sahrens 	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
2800fbc0cd0SMatthew Ahrens 	if (error == ESRCH)
281be6fd75aSMatthew Ahrens 		return (SET_ERROR(ENXIO));
2820fbc0cd0SMatthew Ahrens 
2830fbc0cd0SMatthew Ahrens 	/*
2840fbc0cd0SMatthew Ahrens 	 * We could find a hole that begins after the logical end-of-file,
2850fbc0cd0SMatthew Ahrens 	 * because dmu_offset_next() only works on whole blocks.  If the
2860fbc0cd0SMatthew Ahrens 	 * EOF falls mid-block, then indicate that the "virtual hole"
2870fbc0cd0SMatthew Ahrens 	 * at the end of the file begins at the logical EOF, rather than
2880fbc0cd0SMatthew Ahrens 	 * at the end of the last block.
2890fbc0cd0SMatthew Ahrens 	 */
2900fbc0cd0SMatthew Ahrens 	if (noff > file_sz) {
2910fbc0cd0SMatthew Ahrens 		ASSERT(hole);
2920fbc0cd0SMatthew Ahrens 		noff = file_sz;
293fa9e4066Sahrens 	}
295fa9e4066Sahrens 	if (noff < *off)
296fa9e4066Sahrens 		return (error);
297fa9e4066Sahrens 	*off = noff;
298fa9e4066Sahrens 	return (error);
299fa9e4066Sahrens }
301f67950b2SNasf-Fan static int
zfs_ioctl_getxattr(vnode_t * vp,intptr_t data,int flag,cred_t * cr,caller_context_t * ct)302f67950b2SNasf-Fan zfs_ioctl_getxattr(vnode_t *vp, intptr_t data, int flag, cred_t *cr,
303f67950b2SNasf-Fan     caller_context_t *ct)
304f67950b2SNasf-Fan {
305f67950b2SNasf-Fan 	zfsxattr_t fsx = { 0 };
306f67950b2SNasf-Fan 	znode_t *zp = VTOZ(vp);
308f67950b2SNasf-Fan 	if (zp->z_pflags & ZFS_PROJINHERIT)
309f67950b2SNasf-Fan 		fsx.fsx_xflags = ZFS_PROJINHERIT_FL;
310f67950b2SNasf-Fan 	if (zp->z_pflags & ZFS_PROJID)
311f67950b2SNasf-Fan 		fsx.fsx_projid = zp->z_projid;
312f67950b2SNasf-Fan 	if (ddi_copyout(&fsx, (void *)data, sizeof (fsx), flag))
313f67950b2SNasf-Fan 		return (SET_ERROR(EFAULT));
315f67950b2SNasf-Fan 	return (0);
316f67950b2SNasf-Fan }
318f67950b2SNasf-Fan static int zfs_setattr(vnode_t *, vattr_t *, int, cred_t *, caller_context_t *);
320f67950b2SNasf-Fan static int
zfs_ioctl_setxattr(vnode_t * vp,intptr_t data,int flags,cred_t * cr,caller_context_t * ct)321f67950b2SNasf-Fan zfs_ioctl_setxattr(vnode_t *vp, intptr_t data, int flags, cred_t *cr,
322f67950b2SNasf-Fan     caller_context_t *ct)
323f67950b2SNasf-Fan {
324f67950b2SNasf-Fan 	znode_t *zp = VTOZ(vp);
325f67950b2SNasf-Fan 	zfsxattr_t fsx;
326f67950b2SNasf-Fan 	xvattr_t xva;
327f67950b2SNasf-Fan 	xoptattr_t *xoap;
328f67950b2SNasf-Fan 	int err;
330f67950b2SNasf-Fan 	if (ddi_copyin((void *)data, &fsx, sizeof (fsx), flags))
331f67950b2SNasf-Fan 		return (SET_ERROR(EFAULT));
333f67950b2SNasf-Fan 	if (!zpl_is_valid_projid(fsx.fsx_projid))
334f67950b2SNasf-Fan 		return (SET_ERROR(EINVAL));
336f67950b2SNasf-Fan 	if (fsx.fsx_xflags & ~ZFS_PROJINHERIT_FL)
337f67950b2SNasf-Fan 		return (SET_ERROR(EOPNOTSUPP));
339f67950b2SNasf-Fan 	xva_init(&xva);
340f67950b2SNasf-Fan 	xoap = xva_getxoptattr(&xva);
342f67950b2SNasf-Fan 	XVA_SET_REQ(&xva, XAT_PROJINHERIT);
343f67950b2SNasf-Fan 	if (fsx.fsx_xflags & ZFS_PROJINHERIT_FL)
344f67950b2SNasf-Fan 		xoap->xoa_projinherit = B_TRUE;
346f67950b2SNasf-Fan 	XVA_SET_REQ(&xva, XAT_PROJID);
347f67950b2SNasf-Fan 	xoap->xoa_projid = fsx.fsx_projid;
349f67950b2SNasf-Fan 	return (zfs_setattr(vp, (vattr_t *)&xva, flags, cr, ct));
350f67950b2SNasf-Fan }
/*
 * zfs_ioctl() - ZPL entry point for ioctl(2) on a ZFS vnode.
 *
 *	IN:	vp	- vnode of the file the ioctl targets.
 *		com	- ioctl command code.
 *		data	- user-space argument address (in/out per command).
 *		flag	- flags passed through to ddi_copyin()/ddi_copyout().
 *		cred	- credentials of caller.
 *		rvalp	- return-value pointer (unused by these commands).
 *		ct	- caller context.
 *
 *	RETURN:	0 on success, error code on failure; ENOTTY for
 *		unrecognized commands.
 */
/* ARGSUSED */
static int
zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
    int *rvalp, caller_context_t *ct)
{
	offset_t off;
	offset_t ndata;
	dmu_object_info_t doi;
	int error;
	zfsvfs_t *zfsvfs;
	znode_t *zp;

	switch (com) {
	case _FIOFFS:
	{
		/* Flush the file system. */
		return (zfs_sync(vp->v_vfsp, 0, cred));

		/*
		 * The following two ioctls are used by bfu.  Faking out,
		 * necessary to avoid bfu errors.
		 */
	}
	case _FIOGDIO:
	case _FIOSDIO:
	{
		return (0);
	}

	case _FIODIRECTIO:
	{
		/*
		 * ZFS inherently provides the basic semantics for directio.
		 * This is the summary from the ZFS on Linux support for
		 * O_DIRECT, which is the common form of directio, and required
		 * no changes to ZFS.
		 *
		 * 1. Minimize cache effects of the I/O.
		 *
		 *    By design the ARC is already scan-resistant, which helps
		 *    mitigate the need for special O_DIRECT handling.
		 *
		 * 2. O_DIRECT _MAY_ impose restrictions on IO alignment and
		 *    length.
		 *
		 *    No additional alignment or length restrictions are
		 *    imposed by ZFS.
		 *
		 * 3. O_DIRECT _MAY_ perform unbuffered IO operations directly
		 *    between user memory and block device.
		 *
		 *    No unbuffered IO operations are currently supported. In
		 *    order to support features such as compression, encryption,
		 *    and checksumming a copy must be made to transform the
		 *    data.
		 *
		 * 4. O_DIRECT _MAY_ imply O_DSYNC (XFS).
		 *
		 *    O_DIRECT does not imply O_DSYNC for ZFS.
		 *
		 * 5. O_DIRECT _MAY_ disable file locking that serializes IO
		 *    operations.
		 *
		 *    All I/O in ZFS is locked for correctness and this locking
		 *    is not disabled by O_DIRECT.
		 */
		return (0);
	}

	case _FIO_SEEK_DATA:
	case _FIO_SEEK_HOLE:
	{
		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
			return (SET_ERROR(EFAULT));

		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/* offset parameter is in/out */
		error = zfs_holey(vp, com, &off);
		ZFS_EXIT(zfsvfs);
		if (error)
			return (error);
		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
		return (0);
	}
	case _FIO_COUNT_FILLED:
	{
		/*
		 * _FIO_COUNT_FILLED adds a new ioctl command which
		 * exposes the number of filled blocks in a
		 * ZFS object.
		 */
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/*
		 * Wait for all dirty blocks for this object
		 * to get synced out to disk, and the DMU info
		 * updated.
		 */
		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Retrieve fill count from DMU object.
		 */
		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		ndata = doi.doi_fill_count;

		ZFS_EXIT(zfsvfs);
		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
			return (SET_ERROR(EFAULT));
		return (0);
	}
	case ZFS_IOC_FSGETXATTR:
		return (zfs_ioctl_getxattr(vp, data, flag, cred, ct));
	case ZFS_IOC_FSSETXATTR:
		return (zfs_ioctl_setxattr(vp, data, flag, cred, ct));
	}
	return (SET_ERROR(ENOTTY));
}
4870fab61baSJonathan W Adams /*
4880fab61baSJonathan W Adams  * Utility functions to map and unmap a single physical page.  These
4890fab61baSJonathan W Adams  * are used to manage the mappable copies of ZFS file data, and therefore
4900fab61baSJonathan W Adams  * do not update ref/mod bits.
4910fab61baSJonathan W Adams  */
4920fab61baSJonathan W Adams caddr_t
zfs_map_page(page_t * pp,enum seg_rw rw)4930fab61baSJonathan W Adams zfs_map_page(page_t *pp, enum seg_rw rw)
4940fab61baSJonathan W Adams {
4950fab61baSJonathan W Adams 	if (kpm_enable)
4960fab61baSJonathan W Adams 		return (hat_kpm_mapin(pp, 0));
4970fab61baSJonathan W Adams 	ASSERT(rw == S_READ || rw == S_WRITE);
4980fab61baSJonathan W Adams 	return (ppmapin(pp, PROT_READ | ((rw == S_WRITE) ? PROT_WRITE : 0),
4990fab61baSJonathan W Adams 	    (caddr_t)-1));
5000fab61baSJonathan W Adams }
5010fab61baSJonathan W Adams 
5020fab61baSJonathan W Adams void
zfs_unmap_page(page_t * pp,caddr_t addr)5030fab61baSJonathan W Adams zfs_unmap_page(page_t *pp, caddr_t addr)
5040fab61baSJonathan W Adams {
5050fab61baSJonathan W Adams 	if (kpm_enable) {
5060fab61baSJonathan W Adams 		hat_kpm_mapout(pp, 0, addr);
5070fab61baSJonathan W Adams 	} else {
5080fab61baSJonathan W Adams 		ppmapout(addr);
5090fab61baSJonathan W Adams 	}
5100fab61baSJonathan W Adams }
5110fab61baSJonathan W Adams 
512fa9e4066Sahrens /*
513fa9e4066Sahrens  * When a file is memory mapped, we must keep the IO data synchronized
514fa9e4066Sahrens  * between the DMU cache and the memory mapped pages.  What this means:
515fa9e4066Sahrens  *
516fa9e4066Sahrens  * On Write:	If we find a memory mapped page, we write to *both*
517fa9e4066Sahrens  *		the page and the dmu buffer.
518fa9e4066Sahrens  */
519ac05c741SMark Maybee static void
update_pages(vnode_t * vp,int64_t start,int len,objset_t * os,uint64_t oid)520ac05c741SMark Maybee update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid)
521fa9e4066Sahrens {
522ac05c741SMark Maybee 	int64_t	off;
524fa9e4066Sahrens 	off = start & PAGEOFFSET;
525fa9e4066Sahrens 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
526fa9e4066Sahrens 		page_t *pp;
527ac05c741SMark Maybee 		uint64_t nbytes = MIN(PAGESIZE - off, len);
529fa9e4066Sahrens 		if (pp = page_lookup(vp, start, SE_SHARED)) {
530fa9e4066Sahrens 			caddr_t va;
5320fab61baSJonathan W Adams 			va = zfs_map_page(pp, S_WRITE);
5337bfdf011SNeil Perrin 			(void) dmu_read(os, oid, start+off, nbytes, va+off,
5347bfdf011SNeil Perrin 			    DMU_READ_PREFETCH);
5350fab61baSJonathan W Adams 			zfs_unmap_page(pp, va);
536fa9e4066Sahrens 			page_unlock(pp);
537fa9e4066Sahrens 		}
538ac05c741SMark Maybee 		len -= nbytes;
539fa9e4066Sahrens 		off = 0;
540fa9e4066Sahrens 	}
541fa9e4066Sahrens }
543fa9e4066Sahrens /*
544fa9e4066Sahrens  * When a file is memory mapped, we must keep the IO data synchronized
545fa9e4066Sahrens  * between the DMU cache and the memory mapped pages.  What this means:
546fa9e4066Sahrens  *
547fa9e4066Sahrens  * On Read:	We "read" preferentially from memory mapped pages,
548fa9e4066Sahrens  *		else we default from the dmu buffer.
549fa9e4066Sahrens  *
550fa9e4066Sahrens  * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
551f7170741SWill Andrews  *	 the file is memory mapped.
552fa9e4066Sahrens  */
553fa9e4066Sahrens static int
mappedread(vnode_t * vp,int nbytes,uio_t * uio)554feb08c6bSbillm mappedread(vnode_t *vp, int nbytes, uio_t *uio)
555fa9e4066Sahrens {
556feb08c6bSbillm 	znode_t *zp = VTOZ(vp);
557feb08c6bSbillm 	int64_t	start, off;
558fa9e4066Sahrens 	int len = nbytes;
559fa9e4066Sahrens 	int error = 0;
561fa9e4066Sahrens 	start = uio->uio_loffset;
562fa9e4066Sahrens 	off = start & PAGEOFFSET;
563fa9e4066Sahrens 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
564fa9e4066Sahrens 		page_t *pp;
565feb08c6bSbillm 		uint64_t bytes = MIN(PAGESIZE - off, len);
567fa9e4066Sahrens 		if (pp = page_lookup(vp, start, SE_SHARED)) {
568fa9e4066Sahrens 			caddr_t va;
5700fab61baSJonathan W Adams 			va = zfs_map_page(pp, S_READ);
571fa9e4066Sahrens 			error = uiomove(va + off, bytes, UIO_READ, uio);
5720fab61baSJonathan W Adams 			zfs_unmap_page(pp, va);
573fa9e4066Sahrens 			page_unlock(pp);
574fa9e4066Sahrens 		} else {
575f8554bb9SMatthew Ahrens 			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
576f8554bb9SMatthew Ahrens 			    uio, bytes);
577fa9e4066Sahrens 		}
578fa9e4066Sahrens 		len -= bytes;
579fa9e4066Sahrens 		off = 0;
580fa9e4066Sahrens 		if (error)
581fa9e4066Sahrens 			break;
582fa9e4066Sahrens 	}
583fa9e4066Sahrens 	return (error);
584fa9e4066Sahrens }
586feb08c6bSbillm offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
588fa9e4066Sahrens /*
589fa9e4066Sahrens  * Read bytes from specified file into supplied buffer.
590fa9e4066Sahrens  *
591fa9e4066Sahrens  *	IN:	vp	- vnode of file to be read from.
592fa9e4066Sahrens  *		uio	- structure supplying read location, range info,
593fa9e4066Sahrens  *			  and return buffer.
594fa9e4066Sahrens  *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
595fa9e4066Sahrens  *		cr	- credentials of caller.
596da6c28aaSamw  *		ct	- caller context
597fa9e4066Sahrens  *
598fa9e4066Sahrens  *	OUT:	uio	- updated offset and range, buffer filled.
599fa9e4066Sahrens  *
600f7170741SWill Andrews  *	RETURN:	0 on success, error code on failure.
601fa9e4066Sahrens  *
602fa9e4066Sahrens  * Side Effects:
603fa9e4066Sahrens  *	vp - atime updated if byte count > 0
604fa9e4066Sahrens  */
605fa9e4066Sahrens /* ARGSUSED */
606fa9e4066Sahrens static int
zfs_read(vnode_t * vp,uio_t * uio,int ioflag,cred_t * cr,caller_context_t * ct)607fa9e4066Sahrens zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
608fa9e4066Sahrens {
609fa9e4066Sahrens 	znode_t		*zp = VTOZ(vp);
610fa9e4066Sahrens 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
611feb08c6bSbillm 	ssize_t		n, nbytes;
612d5285caeSGeorge Wilson 	int		error = 0;
613da7753c4SGeorge Wilson 	boolean_t	frsync = B_FALSE;
614c242f9a0Schunli zhang - Sun Microsystems - Irvine United States 	xuio_t		*xuio = NULL;
6163cb34c60Sahrens 	ZFS_ENTER(zfsvfs);
6173cb34c60Sahrens 	ZFS_VERIFY_ZP(zp);
6190a586ceaSMark Shellenbaum 	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
6200616c50eSmarks 		ZFS_EXIT(zfsvfs);
621be6fd75aSMatthew Ahrens 		return (SET_ERROR(EACCES));
6220616c50eSmarks 	}
624fa9e4066Sahrens 	/*
625fa9e4066Sahrens 	 * Validate file offset
626fa9e4066Sahrens 	 */
627fa9e4066Sahrens 	if (uio->uio_loffset < (offset_t)0) {
628fa9e4066Sahrens 		ZFS_EXIT(zfsvfs);
629be6fd75aSMatthew Ahrens 		return (SET_ERROR(EINVAL));
630fa9e4066Sahrens 	}
632fa9e4066Sahrens 	/*
633fa9e4066Sahrens 	 * Fasttrack empty reads
634fa9e4066Sahrens 	 */
635fa9e4066Sahrens 	if (uio->uio_resid == 0) {
636fa9e4066Sahrens 		ZFS_EXIT(zfsvfs);
637fa9e4066Sahrens 		return (0);
638fa9e4066Sahrens 	}
640fa9e4066Sahrens 	/*
641104e2ed7Sperrin 	 * Check for mandatory locks
642fa9e4066Sahrens 	 */
6430a586ceaSMark Shellenbaum 	if (MANDMODE(zp->z_mode)) {
644fa9e4066Sahrens 		if (error = chklock(vp, FREAD,
645fa9e4066Sahrens 		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
646fa9e4066Sahrens 			ZFS_EXIT(zfsvfs);
647fa9e4066Sahrens 			return (error);
648fa9e4066Sahrens 		}
649fa9e4066Sahrens 	}
651da7753c4SGeorge Wilson #ifdef FRSYNC
652fa9e4066Sahrens 	/*
653fa9e4066Sahrens 	 * If we're in FRSYNC mode, sync out this znode before reading it.
654da7753c4SGeorge Wilson 	 * Only do this for non-snapshots.
655da7753c4SGeorge Wilson 	 *
656da7753c4SGeorge Wilson 	 * Some platforms do not support FRSYNC and instead map it
657da7753c4SGeorge Wilson 	 * to FSYNC, which results in unnecessary calls to zil_commit. We
658da7753c4SGeorge Wilson 	 * only honor FRSYNC requests on platforms which support it.
659fa9e4066Sahrens 	 */
660da7753c4SGeorge Wilson 	frsync = !!(ioflag & FRSYNC);
661da7753c4SGeorge Wilson #endif
662da7753c4SGeorge Wilson 
663da7753c4SGeorge Wilson 	if (zfsvfs->z_log &&
664da7753c4SGeorge Wilson 	    (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
6655002558fSNeil Perrin 		zil_commit(zfsvfs->z_log, zp->z_id);
667fa9e4066Sahrens 	/*
668104e2ed7Sperrin 	 * Lock the range against changes.
669fa9e4066Sahrens 	 */
67079315247SMatthew Ahrens 	locked_range_t *lr = rangelock_enter(&zp->z_rangelock,
67179315247SMatthew Ahrens 	    uio->uio_loffset, uio->uio_resid, RL_READER);