/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2020 Joyent, Inc.
 * Copyright 2017 Nexenta Systems, Inc.
 */

/* Portions Copyright 2007 Jeremy Teo */
/* Portions Copyright 2010 Robert Milkowski */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/kmem.h>
#include <sys/taskq.h>
#include <sys/uio.h>
#include <sys/vmsystm.h>
#include <sys/atomic.h>
#include <sys/vm.h>
#include <vm/seg_vn.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/kpm.h>
#include <vm/seg_kpm.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/unistd.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/sa.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/sunddi.h>
#include <sys/filio.h>
#include <sys/sid.h>
#include "fs/fs_subr.h"
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
#include <sys/zfs_sa.h>
#include <sys/dnlc.h>
#include <sys/zfs_rlock.h>
#include <sys/extdirent.h>
#include <sys/kidmap.h>
#include <sys/cred.h>
#include <sys/attr.h>
#include <sys/zil.h>
#include <sys/sa_impl.h>
#include <sys/zfs_project.h>

/*
 * Programming rules.
 *
 * Each vnode op performs some logical unit of work.  To do this, the ZPL must
 * properly lock its in-core state, create a DMU transaction, do the work,
 * record this work in the intent log (ZIL), commit the DMU transaction,
 * and wait for the intent log to commit if it is a synchronous operation.
 * Moreover, the vnode ops must work in both normal and log replay context.
 * The ordering of events is important to avoid deadlocks and references
 * to freed memory.  The example below illustrates the following Big Rules:
 *
 *  (1)	A check must be made in each zfs thread for a mounted file system.
 *	This is done, avoiding races, using ZFS_ENTER(zfsvfs).
 *	A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
 *	must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
 *	can return EIO from the calling function.
 *
 *  (2)	VN_RELE() should always be the last thing except for zil_commit()
 *	(if necessary) and ZFS_EXIT().  This is for 3 reasons:
 *	First, if it's the last reference, the vnode/znode
 *	can be freed, so the zp may point to freed memory.  Second, the last
 *	reference will call zfs_zinactive(), which may induce a lot of work --
 *	pushing cached pages (which acquires range locks) and syncing out
 *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
 *	which could deadlock the system if you were already holding one.
 *	If you must call VN_RELE() within a tx, then use VN_RELE_ASYNC().
 *
 *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
 *	as they can span dmu_tx_assign() calls.
 *
 *  (4)	If ZPL locks are held, pass TXG_NOWAIT as the second argument to
 *	dmu_tx_assign().  This is critical because we don't want to block
 *	while holding locks.
 *
 *	If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
 *	reduces lock contention and CPU usage when we must wait (note that if
 *	throughput is constrained by the storage, nearly every transaction
 *	must wait).
 *
 *	Note, in particular, that if a lock is sometimes acquired before
 *	the tx assigns, and sometimes after (e.g. z_lock), then failing
 *	to use a non-blocking assign can deadlock the system.  The scenario:
 *
 *	Thread A has grabbed a lock before calling dmu_tx_assign().
 *	Thread B is in an already-assigned tx, and blocks for this lock.
 *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 *	forever, because the previous txg can't quiesce until B's tx commits.
 *
 *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
 *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
 *	calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
 *	to indicate that this operation has already called dmu_tx_wait().
 *	This will ensure that we don't retry forever, waiting a short bit
 *	each time.
 *
 *  (5)	If the operation succeeded, generate the intent log entry for it
 *	before dropping locks.  This ensures that the ordering of events
 *	in the intent log matches the order in which they actually occurred.
 *	During ZIL replay the zfs_log_* functions will update the sequence
 *	number to indicate the zil transaction has replayed.
 *
 *  (6)	At the end of each vnode op, the DMU tx must always commit,
 *	regardless of whether there were any errors.
 *
 *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
 *	to ensure that synchronous semantics are provided when necessary.
 *
 * In general, this is how things should be ordered in each vnode op:
 *
 *	ZFS_ENTER(zfsvfs);		// exit if unmounted
 * top:
 *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
 *	rw_enter(...);			// grab any other locks you need
 *	tx = dmu_tx_create(...);	// get DMU tx
 *	dmu_tx_hold_*();		// hold each object you might modify
 *	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 *	if (error) {
 *		rw_exit(...);		// drop locks
 *		zfs_dirent_unlock(dl);	// unlock directory entry
 *		VN_RELE(...);		// release held vnodes
 *		if (error == ERESTART) {
 *			waited = B_TRUE;
 *			dmu_tx_wait(tx);
 *			dmu_tx_abort(tx);
 *			goto top;
 *		}
 *		dmu_tx_abort(tx);	// abort DMU tx
 *		ZFS_EXIT(zfsvfs);	// finished in zfs
 *		return (error);		// really out of space
 *	}
 *	error = do_real_work();		// do whatever this VOP does
 *	if (error == 0)
 *		zfs_log_*(...);		// on success, make ZIL entry
 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
 *	rw_exit(...);			// drop locks
 *	zfs_dirent_unlock(dl);		// unlock directory entry
 *	VN_RELE(...);			// release held vnodes
 *	zil_commit(zilog, foid);	// synchronous when necessary
 *	ZFS_EXIT(zfsvfs);		// finished in zfs
 *	return (error);			// done, report error
 */

/* ARGSUSED */
static int
zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(*vpp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
	    ((flag & FAPPEND) == 0)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
		if (fs_vscan(*vpp, cr, 0) != 0) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EACCES));
		}
	}

	/* Keep a count of the synchronous opens in the znode */
	if (flag & (FSYNC | FDSYNC))
		atomic_inc_32(&zp->z_sync_cnt);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/* ARGSUSED */
static int
zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	/*
	 * Clean up any locks held by this process on the vp.
	 */
	cleanlocks(vp, ddi_get_pid(), 0);
	cleanshares(vp, ddi_get_pid());

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Decrement the synchronous opens in the znode */
	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
		atomic_dec_32(&zp->z_sync_cnt);

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
		VERIFY(fs_vscan(vp, cr, 1) == 0);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
 * data (cmd == _FIO_SEEK_DATA).  "off" is an in/out parameter.
 */
static int
zfs_holey(vnode_t *vp, int cmd, offset_t *off)
{
	znode_t	*zp = VTOZ(vp);
	uint64_t noff = (uint64_t)*off; /* new offset */
	uint64_t file_sz;
	int error;
	boolean_t hole;

	file_sz = zp->z_size;
	if (noff >= file_sz) {
		return (SET_ERROR(ENXIO));
	}

	if (cmd == _FIO_SEEK_HOLE)
		hole = B_TRUE;
	else
		hole = B_FALSE;

	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);

	if (error == ESRCH)
		return (SET_ERROR(ENXIO));

	/*
	 * We could find a hole that begins after the logical end-of-file,
	 * because dmu_offset_next() only works on whole blocks.  If the
	 * EOF falls mid-block, then indicate that the "virtual hole"
	 * at the end of the file begins at the logical EOF, rather than
	 * at the end of the last block.
	 */
	if (noff > file_sz) {
		ASSERT(hole);
		noff = file_sz;
	}

	if (noff < *off)
		return (error);
	*off = noff;
	return (error);
}
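
/*
 * Illustrative userland sketch (not part of this file): the seek ioctls
 * handled above back lseek(2)'s SEEK_HOLE/SEEK_DATA whence values, so an
 * application can walk a sparse file without reading it.  This assumes
 * SEEK_HOLE is available to the application; error handling is abbreviated.
 *
 *	#include <unistd.h>
 *	#include <fcntl.h>
 *
 *	off_t
 *	first_hole(int fd)
 *	{
 *		// lseek() fails with ENXIO when the starting offset is at
 *		// or beyond EOF; a file that ends mid-block reports a
 *		// "virtual hole" at the logical EOF (see zfs_holey() above).
 *		return (lseek(fd, 0, SEEK_HOLE));
 *	}
 */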

static int
zfs_ioctl_getxattr(vnode_t *vp, intptr_t data, int flag, cred_t *cr,
    caller_context_t *ct)
{
	zfsxattr_t fsx = { 0 };
	znode_t *zp = VTOZ(vp);

	if (zp->z_pflags & ZFS_PROJINHERIT)
		fsx.fsx_xflags = ZFS_PROJINHERIT_FL;
	if (zp->z_pflags & ZFS_PROJID)
		fsx.fsx_projid = zp->z_projid;
	if (ddi_copyout(&fsx, (void *)data, sizeof (fsx), flag))
		return (SET_ERROR(EFAULT));

	return (0);
}

static int zfs_setattr(vnode_t *, vattr_t *, int, cred_t *,
    caller_context_t *);

static int
zfs_ioctl_setxattr(vnode_t *vp, intptr_t data, int flags, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsxattr_t fsx;
	xvattr_t xva;
	xoptattr_t *xoap;
	int err;

	if (ddi_copyin((void *)data, &fsx, sizeof (fsx), flags))
		return (SET_ERROR(EFAULT));

	if (!zpl_is_valid_projid(fsx.fsx_projid))
		return (SET_ERROR(EINVAL));

	if (fsx.fsx_xflags & ~ZFS_PROJINHERIT_FL)
		return (SET_ERROR(EOPNOTSUPP));

	xva_init(&xva);
	xoap = xva_getxoptattr(&xva);

	XVA_SET_REQ(&xva, XAT_PROJINHERIT);
	if (fsx.fsx_xflags & ZFS_PROJINHERIT_FL)
		xoap->xoa_projinherit = B_TRUE;

	XVA_SET_REQ(&xva, XAT_PROJID);
	xoap->xoa_projid = fsx.fsx_projid;

	return (zfs_setattr(vp, (vattr_t *)&xva, flags, cr, ct));
}
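
/*
 * Illustrative userland sketch (not part of this file): how a caller might
 * use the ZFS_IOC_FSGETXATTR/ZFS_IOC_FSSETXATTR handlers above to mark a
 * directory so new children inherit its project ID.  Assumes zfsxattr_t and
 * the ioctl numbers are visible to the application; "fd" and the project ID
 * are examples only, and error handling is omitted.
 *
 *	zfsxattr_t fsx;
 *
 *	(void) ioctl(fd, ZFS_IOC_FSGETXATTR, &fsx);
 *	fsx.fsx_xflags |= ZFS_PROJINHERIT_FL;
 *	fsx.fsx_projid = 42;		// hypothetical project ID
 *	(void) ioctl(fd, ZFS_IOC_FSSETXATTR, &fsx);
 */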

/* ARGSUSED */
static int
zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
    int *rvalp, caller_context_t *ct)
{
	offset_t off;
	offset_t ndata;
	dmu_object_info_t doi;
	int error;
	zfsvfs_t *zfsvfs;
	znode_t *zp;

	switch (com) {
	case _FIOFFS:
	{
		return (zfs_sync(vp->v_vfsp, 0, cred));

		/*
		 * The following two ioctls are used by bfu.  Faking out,
		 * necessary to avoid bfu errors.
		 */
	}
	case _FIOGDIO:
	case _FIOSDIO:
	{
		return (0);
	}

	case _FIODIRECTIO:
	{
		/*
		 * ZFS inherently provides the basic semantics for directio.
		 * This is the summary from the ZFS on Linux support for
		 * O_DIRECT, which is the common form of directio, and
		 * required no changes to ZFS.
		 *
		 * 1. Minimize cache effects of the I/O.
		 *
		 *    By design the ARC is already scan-resistant, which helps
		 *    mitigate the need for special O_DIRECT handling.
		 *
		 * 2. O_DIRECT _MAY_ impose restrictions on IO alignment and
		 *    length.
		 *
		 *    No additional alignment or length restrictions are
		 *    imposed by ZFS.
		 *
		 * 3. O_DIRECT _MAY_ perform unbuffered IO operations directly
		 *    between user memory and block device.
		 *
		 *    No unbuffered IO operations are currently supported.  In
		 *    order to support features such as compression,
		 *    encryption, and checksumming a copy must be made to
		 *    transform the data.
		 *
		 * 4. O_DIRECT _MAY_ imply O_DSYNC (XFS).
		 *
		 *    O_DIRECT does not imply O_DSYNC for ZFS.
		 *
		 * 5. O_DIRECT _MAY_ disable file locking that serializes IO
		 *    operations.
		 *
		 *    All I/O in ZFS is locked for correctness and this
		 *    locking is not disabled by O_DIRECT.
		 */
		return (0);
	}

	case _FIO_SEEK_DATA:
	case _FIO_SEEK_HOLE:
	{
		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
			return (SET_ERROR(EFAULT));

		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/* offset parameter is in/out */
		error = zfs_holey(vp, com, &off);
		ZFS_EXIT(zfsvfs);
		if (error)
			return (error);
		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
		return (0);
	}
	case _FIO_COUNT_FILLED:
	{
		/*
		 * _FIO_COUNT_FILLED adds a new ioctl command which
		 * exposes the number of filled blocks in a
		 * ZFS object.
		 */
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/*
		 * Wait for all dirty blocks for this object
		 * to get synced out to disk, and the DMU info
		 * updated.
		 */
		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Retrieve fill count from DMU object.
		 */
		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		ndata = doi.doi_fill_count;

		ZFS_EXIT(zfsvfs);
		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
			return (SET_ERROR(EFAULT));
		return (0);
	}
	case ZFS_IOC_FSGETXATTR:
		return (zfs_ioctl_getxattr(vp, data, flag, cred, ct));
	case ZFS_IOC_FSSETXATTR:
		return (zfs_ioctl_setxattr(vp, data, flag, cred, ct));
	}
	return (SET_ERROR(ENOTTY));
}
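
/*
 * Illustrative userland sketch (not part of this file): reading the fill
 * count exposed by the _FIO_COUNT_FILLED case above.  Assumes the ioctl
 * number is visible to the application (e.g. via <sys/filio.h>); error
 * handling is abbreviated.
 *
 *	offset_t filled;
 *
 *	if (ioctl(fd, _FIO_COUNT_FILLED, &filled) == 0)
 *		(void) printf("%lld filled blocks\n", (long long)filled);
 */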

/*
 * Utility functions to map and unmap a single physical page.  These
 * are used to manage the mappable copies of ZFS file data, and therefore
 * do not update ref/mod bits.
 */
caddr_t
zfs_map_page(page_t *pp, enum seg_rw rw)
{
	if (kpm_enable)
		return (hat_kpm_mapin(pp, 0));
	ASSERT(rw == S_READ || rw == S_WRITE);
	return (ppmapin(pp, PROT_READ | ((rw == S_WRITE) ? PROT_WRITE : 0),
	    (caddr_t)-1));
}

void
zfs_unmap_page(page_t *pp, caddr_t addr)
{
	if (kpm_enable) {
		hat_kpm_mapout(pp, 0, addr);
	} else {
		ppmapout(addr);
	}
}
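
/*
 * Usage pattern for the helpers above, as followed by update_pages() and
 * mappedread() below (a minimal sketch; it assumes "pp" is a locked page
 * and that "off" + "nbytes" stay within the page):
 *
 *	caddr_t va = zfs_map_page(pp, S_WRITE);
 *	bcopy(buf, va + off, nbytes);	// touch the mapped copy
 *	zfs_unmap_page(pp, va);
 *	page_unlock(pp);
 */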

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Write:	If we find a memory mapped page, we write to *both*
 *		the page and the dmu buffer.
 */
static void
update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid)
{
	int64_t	off;

	off = start & PAGEOFFSET;
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		page_t *pp;
		uint64_t nbytes = MIN(PAGESIZE - off, len);

		if (pp = page_lookup(vp, start, SE_SHARED)) {
			caddr_t va;

			va = zfs_map_page(pp, S_WRITE);
			(void) dmu_read(os, oid, start+off, nbytes, va+off,
			    DMU_READ_PREFETCH);
			zfs_unmap_page(pp, va);
			page_unlock(pp);
		}
		len -= nbytes;
		off = 0;
	}
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Read:	We "read" preferentially from memory mapped pages,
 *		else we default from the dmu buffer.
 *
 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 *	 the file is memory mapped.
 */
static int
mappedread(vnode_t *vp, int nbytes, uio_t *uio)
{
	znode_t *zp = VTOZ(vp);
	int64_t	start, off;
	int len = nbytes;
	int error = 0;

	start = uio->uio_loffset;
	off = start & PAGEOFFSET;
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		page_t *pp;
		uint64_t bytes = MIN(PAGESIZE - off, len);

		if (pp = page_lookup(vp, start, SE_SHARED)) {
			caddr_t va;

			va = zfs_map_page(pp, S_READ);
			error = uiomove(va + off, bytes, UIO_READ, uio);
			zfs_unmap_page(pp, va);
			page_unlock(pp);
		} else {
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, bytes);
		}
		len -= bytes;
		off = 0;
		if (error)
			break;
	}
	return (error);
}

offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */

/*
 * Read bytes from specified file into supplied buffer.
 *
 *	IN:	vp	- vnode of file to be read from.
 *		uio	- structure supplying read location, range info,
 *			  and return buffer.
 *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *
 *	OUT:	uio	- updated offset and range, buffer filled.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Side Effects:
 *	vp - atime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	ssize_t		n, nbytes;
	int		error = 0;
	boolean_t	frsync = B_FALSE;
	xuio_t		*xuio = NULL;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EACCES));
	}

	/*
	 * Validate file offset
	 */
	if (uio->uio_loffset < (offset_t)0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Fasttrack empty reads
	 */
	if (uio->uio_resid == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE(zp->z_mode)) {
		if (error = chklock(vp, FREAD,
		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

#ifdef FRSYNC
	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 * Only do this for non-snapshots.
	 *
	 * Some platforms do not support FRSYNC and instead map it
	 * to FSYNC, which results in unnecessary calls to zil_commit.  We
	 * only honor FRSYNC requests on platforms which support it.
	 */
	frsync = !!(ioflag & FRSYNC);
#endif

	if (zfsvfs->z_log &&
	    (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
		zil_commit(zfsvfs->z_log, zp->z_id);

	/*
	 * Lock the range against changes.
	 */
	locked_range_t *lr = rangelock_enter(&zp->z_rangelock,
	    uio->uio_loffset, uio->uio_resid, RL_READER);
