17c478bdstevel@tonic-gate/*
27c478bdstevel@tonic-gate * CDDL HEADER START
37c478bdstevel@tonic-gate *
47c478bdstevel@tonic-gate * The contents of this file are subject to the terms of the
55b024a5batschul * Common Development and Distribution License (the "License").
65b024a5batschul * You may not use this file except in compliance with the License.
77c478bdstevel@tonic-gate *
87c478bdstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bdstevel@tonic-gate * or http://www.opensolaris.org/os/licensing.
107c478bdstevel@tonic-gate * See the License for the specific language governing permissions
117c478bdstevel@tonic-gate * and limitations under the License.
127c478bdstevel@tonic-gate *
137c478bdstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each
147c478bdstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bdstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the
167c478bdstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying
177c478bdstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bdstevel@tonic-gate *
197c478bdstevel@tonic-gate * CDDL HEADER END
207c478bdstevel@tonic-gate */
21342440ePrasad Singamsetty
227c478bdstevel@tonic-gate/*
239b5097eOwen Roberts * Copyright (c) 1984, 2010, Oracle and/or its affiliates. All rights reserved.
24284ce98Patrick Mooney * Copyright 2018 Joyent, Inc.
2548bbca8Daniel Hoffman * Copyright (c) 2016 by Delphix. All rights reserved.
267c478bdstevel@tonic-gate */
277c478bdstevel@tonic-gate
287c478bdstevel@tonic-gate/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
292575b44Toomas Soome/*	  All Rights Reserved	*/
307c478bdstevel@tonic-gate
317c478bdstevel@tonic-gate/*
327c478bdstevel@tonic-gate * Portions of this source code were derived from Berkeley 4.3 BSD
337c478bdstevel@tonic-gate * under license from the Regents of the University of California.
347c478bdstevel@tonic-gate */
357c478bdstevel@tonic-gate
367c478bdstevel@tonic-gate#include <sys/types.h>
377c478bdstevel@tonic-gate#include <sys/t_lock.h>
387c478bdstevel@tonic-gate#include <sys/ksynch.h>
397c478bdstevel@tonic-gate#include <sys/param.h>
407c478bdstevel@tonic-gate#include <sys/time.h>
417c478bdstevel@tonic-gate#include <sys/systm.h>
427c478bdstevel@tonic-gate#include <sys/sysmacros.h>
437c478bdstevel@tonic-gate#include <sys/resource.h>
447c478bdstevel@tonic-gate#include <sys/signal.h>
457c478bdstevel@tonic-gate#include <sys/cred.h>
467c478bdstevel@tonic-gate#include <sys/user.h>
477c478bdstevel@tonic-gate#include <sys/buf.h>
487c478bdstevel@tonic-gate#include <sys/vfs.h>
49aa59c4crsb#include <sys/vfs_opreg.h>
507c478bdstevel@tonic-gate#include <sys/vnode.h>
517c478bdstevel@tonic-gate#include <sys/proc.h>
527c478bdstevel@tonic-gate#include <sys/disp.h>
537c478bdstevel@tonic-gate#include <sys/file.h>
547c478bdstevel@tonic-gate#include <sys/fcntl.h>
557c478bdstevel@tonic-gate#include <sys/flock.h>
56bc69f43aguzovsk#include <sys/atomic.h>
577c478bdstevel@tonic-gate#include <sys/kmem.h>
587c478bdstevel@tonic-gate#include <sys/uio.h>
597c478bdstevel@tonic-gate#include <sys/dnlc.h>
607c478bdstevel@tonic-gate#include <sys/conf.h>
617c478bdstevel@tonic-gate#include <sys/mman.h>
627c478bdstevel@tonic-gate#include <sys/pathname.h>
637c478bdstevel@tonic-gate#include <sys/debug.h>
647c478bdstevel@tonic-gate#include <sys/vmsystm.h>
657c478bdstevel@tonic-gate#include <sys/cmn_err.h>
667c478bdstevel@tonic-gate#include <sys/filio.h>
677c478bdstevel@tonic-gate#include <sys/policy.h>
687c478bdstevel@tonic-gate
697c478bdstevel@tonic-gate#include <sys/fs/ufs_fs.h>
707c478bdstevel@tonic-gate#include <sys/fs/ufs_lockfs.h>
717c478bdstevel@tonic-gate#include <sys/fs/ufs_filio.h>
727c478bdstevel@tonic-gate#include <sys/fs/ufs_inode.h>
737c478bdstevel@tonic-gate#include <sys/fs/ufs_fsdir.h>
747c478bdstevel@tonic-gate#include <sys/fs/ufs_quota.h>
757c478bdstevel@tonic-gate#include <sys/fs/ufs_log.h>
767c478bdstevel@tonic-gate#include <sys/fs/ufs_snap.h>
777c478bdstevel@tonic-gate#include <sys/fs/ufs_trans.h>
787c478bdstevel@tonic-gate#include <sys/fs/ufs_panic.h>
797c478bdstevel@tonic-gate#include <sys/fs/ufs_bio.h>
807c478bdstevel@tonic-gate#include <sys/dirent.h>		/* must be AFTER <sys/fs/fsdir.h>! */
817c478bdstevel@tonic-gate#include <sys/errno.h>
827c478bdstevel@tonic-gate#include <sys/fssnap_if.h>
837c478bdstevel@tonic-gate#include <sys/unistd.h>
847c478bdstevel@tonic-gate#include <sys/sunddi.h>
857c478bdstevel@tonic-gate
867c478bdstevel@tonic-gate#include <sys/filio.h>		/* _FIOIO */
877c478bdstevel@tonic-gate
887c478bdstevel@tonic-gate#include <vm/hat.h>
897c478bdstevel@tonic-gate#include <vm/page.h>
907c478bdstevel@tonic-gate#include <vm/pvn.h>
917c478bdstevel@tonic-gate#include <vm/as.h>
927c478bdstevel@tonic-gate#include <vm/seg.h>
937c478bdstevel@tonic-gate#include <vm/seg_map.h>
947c478bdstevel@tonic-gate#include <vm/seg_vn.h>
957c478bdstevel@tonic-gate#include <vm/seg_kmem.h>
967c478bdstevel@tonic-gate#include <vm/rm.h>
977c478bdstevel@tonic-gate#include <sys/swap.h>
987c478bdstevel@tonic-gate
997c478bdstevel@tonic-gate#include <fs/fs_subr.h>
1007c478bdstevel@tonic-gate
101986fd29setje#include <sys/fs/decomp.h>
102986fd29setje
1037c478bdstevel@tonic-gatestatic struct instats ins;
1047c478bdstevel@tonic-gate
1052575b44Toomas Soomestatic	int ufs_getpage_ra(struct vnode *, u_offset_t, struct seg *, caddr_t);
1067c478bdstevel@tonic-gatestatic	int ufs_getpage_miss(struct vnode *, u_offset_t, size_t, struct seg *,
1077c478bdstevel@tonic-gate		caddr_t, struct page **, size_t, enum seg_rw, int);
108da6c28aamwstatic	int ufs_open(struct vnode **, int, struct cred *, caller_context_t *);
109da6c28aamwstatic	int ufs_close(struct vnode *, int, int, offset_t, struct cred *,
110da6c28aamw		caller_context_t *);
1117c478bdstevel@tonic-gatestatic	int ufs_read(struct vnode *, struct uio *, int, struct cred *,
112da6c28aamw		struct caller_context *);
1137c478bdstevel@tonic-gatestatic	int ufs_write(struct vnode *, struct uio *, int, struct cred *,
114da6c28aamw		struct caller_context *);
115da6c28aamwstatic	int ufs_ioctl(struct vnode *, int, intptr_t, int, struct cred *,
116da6c28aamw		int *, caller_context_t *);
117da6c28aamwstatic	int ufs_getattr(struct vnode *, struct vattr *, int, struct cred *,
118da6c28aamw		caller_context_t *);
1197c478bdstevel@tonic-gatestatic	int ufs_setattr(struct vnode *, struct vattr *, int, struct cred *,
120da6c28aamw		caller_context_t *);
121da6c28aamwstatic	int ufs_access(struct vnode *, int, int, struct cred *,
122da6c28aamw		caller_context_t *);
1237c478bdstevel@tonic-gatestatic	int ufs_lookup(struct vnode *, char *, struct vnode **,
124da6c28aamw		struct pathname *, int, struct vnode *, struct cred *,
125da6c28aamw		caller_context_t *, int *, pathname_t *);
1267c478bdstevel@tonic-gatestatic	int ufs_create(struct vnode *, char *, struct vattr *, enum vcexcl,
127da6c28aamw		int, struct vnode **, struct cred *, int,
128da6c28aamw		caller_context_t *, vsecattr_t  *);
129da6c28aamwstatic	int ufs_remove(struct vnode *, char *, struct cred *,
130da6c28aamw		caller_context_t *, int);
131da6c28aamwstatic	int ufs_link(struct vnode *, struct vnode *, char *, struct cred *,
132da6c28aamw		caller_context_t *, int);
1337c478bdstevel@tonic-gatestatic	int ufs_rename(struct vnode *, char *, struct vnode *, char *,
134da6c28aamw		struct cred *, caller_context_t *, int);
1357c478bdstevel@tonic-gatestatic	int ufs_mkdir(struct vnode *, char *, struct vattr *, struct vnode **,
136da6c28aamw		struct cred *, caller_context_t *, int, vsecattr_t *);
137da6c28aamwstatic	int ufs_rmdir(struct vnode *, char *, struct vnode *, struct cred *,
138da6c28aamw		caller_context_t *, int);
139da6c28aamwstatic	int ufs_readdir(struct vnode *, struct uio *, struct cred *, int *,
140da6c28aamw		caller_context_t *, int);
1417c478bdstevel@tonic-gatestatic	int ufs_symlink(struct vnode *, char *, struct vattr *, char *,
142da6c28aamw		struct cred *, caller_context_t *, int);
143da6c28aamwstatic	int ufs_readlink(struct vnode *, struct uio *, struct cred *,
144da6c28aamw		caller_context_t *);
145da6c28aamwstatic	int ufs_fsync(struct vnode *, int, struct cred *, caller_context_t *);
146da6c28aamwstatic	void ufs_inactive(struct vnode *, struct cred *, caller_context_t *);
147da6c28aamwstatic	int ufs_fid(struct vnode *, struct fid *, caller_context_t *);
1487c478bdstevel@tonic-gatestatic	int ufs_rwlock(struct vnode *, int, caller_context_t *);
1497c478bdstevel@tonic-gatestatic	void ufs_rwunlock(struct vnode *, int, caller_context_t *);
150da6c28aamwstatic	int ufs_seek(struct vnode *, offset_t, offset_t *, caller_context_t *);
1517c478bdstevel@tonic-gatestatic	int ufs_frlock(struct vnode *, int, struct flock64 *, int, offset_t,
152da6c28aamw		struct flk_callback *, struct cred *,
153da6c28aamw		caller_context_t *);
1547c478bdstevel@tonic-gatestatic  int ufs_space(struct vnode *, int, struct flock64 *, int, offset_t,
1557c478bdstevel@tonic-gate		cred_t *, caller_context_t *);
1567c478bdstevel@tonic-gatestatic	int ufs_getpage(struct vnode *, offset_t, size_t, uint_t *,
1577c478bdstevel@tonic-gate		struct page **, size_t, struct seg *, caddr_t,
158da6c28aamw		enum seg_rw, struct cred *, caller_context_t *);
159da6c28aamwstatic	int ufs_putpage(struct vnode *, offset_t, size_t, int, struct cred *,
160da6c28aamw		caller_context_t *);
1617c478bdstevel@tonic-gatestatic	int ufs_putpages(struct vnode *, offset_t, size_t, int, struct cred *);
1627c478bdstevel@tonic-gatestatic	int ufs_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t,
163da6c28aamw		uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
1647c478bdstevel@tonic-gatestatic	int ufs_addmap(struct vnode *, offset_t, struct as *, caddr_t,  size_t,
165da6c28aamw		uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
1667c478bdstevel@tonic-gatestatic	int ufs_delmap(struct vnode *, offset_t, struct as *, caddr_t,  size_t,
167da6c28aamw		uint_t, uint_t, uint_t, struct cred *, caller_context_t *);
168da6c28aamwstatic	int ufs_poll(vnode_t *, short, int, short *, struct pollhead **,
169da6c28aamw		caller_context_t *);
170d7334e5rmstatic	int ufs_dump(vnode_t *, caddr_t, offset_t, offset_t,
171d7334e5rm    caller_context_t *);
172da6c28aamwstatic	int ufs_l_pathconf(struct vnode *, int, ulong_t *, struct cred *,
173da6c28aamw		caller_context_t *);
1747c478bdstevel@tonic-gatestatic	int ufs_pageio(struct vnode *, struct page *, u_offset_t, size_t, int,
175da6c28aamw		struct cred *, caller_context_t *);
176d7334e5rmstatic	int ufs_dumpctl(vnode_t *, int, offset_t *, caller_context_t *);
1777c478bdstevel@tonic-gatestatic	daddr32_t *save_dblks(struct inode *, struct ufsvfs *, daddr32_t *,
178da6c28aamw		daddr32_t *, int, int);
179da6c28aamwstatic	int ufs_getsecattr(struct vnode *, vsecattr_t *, int, struct cred *,
180da6c28aamw		caller_context_t *);
181da6c28aamwstatic	int ufs_setsecattr(struct vnode *, vsecattr_t *, int, struct cred *,
182da6c28aamw		caller_context_t *);
18360c8e82Frank Batschulatstatic	int ufs_priv_access(void *, int, struct cred *);
1849b5097eOwen Robertsstatic	int ufs_eventlookup(struct vnode *, char *, struct cred *,
1859b5097eOwen Roberts    struct vnode **);
18602ff05avsakar
1877c478bdstevel@tonic-gate/*
1887c478bdstevel@tonic-gate * For lockfs: ulockfs begin/end is now inlined in the ufs_xxx functions.
1897c478bdstevel@tonic-gate *
1907c478bdstevel@tonic-gate * XXX - ULOCKFS in fs_pathconf and ufs_ioctl is not inlined yet.
1917c478bdstevel@tonic-gate */
1927c478bdstevel@tonic-gatestruct vnodeops *ufs_vnodeops;
1937c478bdstevel@tonic-gate
194aa59c4crsb/* NOTE: "not blkd" below  means that the operation isn't blocked by lockfs */
1957c478bdstevel@tonic-gateconst fs_operation_def_t ufs_vnodeops_template[] = {
196aa59c4crsb	VOPNAME_OPEN,		{ .vop_open = ufs_open },	/* not blkd */
197aa59c4crsb	VOPNAME_CLOSE,		{ .vop_close = ufs_close },	/* not blkd */
198aa59c4crsb	VOPNAME_READ,		{ .vop_read = ufs_read },
199aa59c4crsb	VOPNAME_WRITE,		{ .vop_write = ufs_write },
200aa59c4crsb	VOPNAME_IOCTL,		{ .vop_ioctl = ufs_ioctl },
201aa59c4crsb	VOPNAME_GETATTR,	{ .vop_getattr = ufs_getattr },
202aa59c4crsb	VOPNAME_SETATTR,	{ .vop_setattr = ufs_setattr },
203aa59c4crsb	VOPNAME_ACCESS,		{ .vop_access = ufs_access },
204aa59c4crsb	VOPNAME_LOOKUP,		{ .vop_lookup = ufs_lookup },
205aa59c4crsb	VOPNAME_CREATE,		{ .vop_create = ufs_create },
206aa59c4crsb	VOPNAME_REMOVE,		{ .vop_remove = ufs_remove },
207aa59c4crsb	VOPNAME_LINK,		{ .vop_link = ufs_link },
208aa59c4crsb	VOPNAME_RENAME,		{ .vop_rename = ufs_rename },
209aa59c4crsb	VOPNAME_MKDIR,		{ .vop_mkdir = ufs_mkdir },
210aa59c4crsb	VOPNAME_RMDIR,		{ .vop_rmdir = ufs_rmdir },
211aa59c4crsb	VOPNAME_READDIR,	{ .vop_readdir = ufs_readdir },
212aa59c4crsb	VOPNAME_SYMLINK,	{ .vop_symlink = ufs_symlink },
213aa59c4crsb	VOPNAME_READLINK,	{ .vop_readlink = ufs_readlink },
214aa59c4crsb	VOPNAME_FSYNC,		{ .vop_fsync = ufs_fsync },
215aa59c4crsb	VOPNAME_INACTIVE,	{ .vop_inactive = ufs_inactive }, /* not blkd */
216aa59c4crsb	VOPNAME_FID,		{ .vop_fid = ufs_fid },
217aa59c4crsb	VOPNAME_RWLOCK,		{ .vop_rwlock = ufs_rwlock },	/* not blkd */
218aa59c4crsb	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = ufs_rwunlock }, /* not blkd */
219aa59c4crsb	VOPNAME_SEEK,		{ .vop_seek = ufs_seek },
220aa59c4crsb	VOPNAME_FRLOCK,		{ .vop_frlock = ufs_frlock },
221aa59c4crsb	VOPNAME_SPACE,		{ .vop_space = ufs_space },
222aa59c4crsb	VOPNAME_GETPAGE,	{ .vop_getpage = ufs_getpage },
223aa59c4crsb	VOPNAME_PUTPAGE,	{ .vop_putpage = ufs_putpage },
224aa59c4crsb	VOPNAME_MAP,		{ .vop_map = ufs_map },
225aa59c4crsb	VOPNAME_ADDMAP,		{ .vop_addmap = ufs_addmap },	/* not blkd */
226aa59c4crsb	VOPNAME_DELMAP,		{ .vop_delmap = ufs_delmap },	/* not blkd */
227aa59c4crsb	VOPNAME_POLL,		{ .vop_poll = ufs_poll },	/* not blkd */
228aa59c4crsb	VOPNAME_DUMP,		{ .vop_dump = ufs_dump },
229aa59c4crsb	VOPNAME_PATHCONF,	{ .vop_pathconf = ufs_l_pathconf },
230aa59c4crsb	VOPNAME_PAGEIO,		{ .vop_pageio = ufs_pageio },
231aa59c4crsb	VOPNAME_DUMPCTL,	{ .vop_dumpctl = ufs_dumpctl },
232aa59c4crsb	VOPNAME_GETSECATTR,	{ .vop_getsecattr = ufs_getsecattr },
233aa59c4crsb	VOPNAME_SETSECATTR,	{ .vop_setsecattr = ufs_setsecattr },
234aa59c4crsb	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
235aa59c4crsb	NULL,			NULL
2367c478bdstevel@tonic-gate};
2377c478bdstevel@tonic-gate
2387c478bdstevel@tonic-gate#define	MAX_BACKFILE_COUNT	9999
2397c478bdstevel@tonic-gate
2407c478bdstevel@tonic-gate/*
2417c478bdstevel@tonic-gate * Created by ufs_dumpctl() to store a file's disk block info into memory.
2427c478bdstevel@tonic-gate * Used by ufs_dump() to dump data to disk directly.
2437c478bdstevel@tonic-gate */
2447c478bdstevel@tonic-gatestruct dump {
2457c478bdstevel@tonic-gate	struct inode	*ip;		/* the file we contain */
2467c478bdstevel@tonic-gate	daddr_t		fsbs;		/* number of blocks stored */
2477c478bdstevel@tonic-gate	struct timeval32 time;		/* time stamp for the struct */
2482575b44Toomas Soome	daddr32_t	dblk[1];	/* place holder for block info */
2497c478bdstevel@tonic-gate};
2507c478bdstevel@tonic-gate
2517c478bdstevel@tonic-gatestatic struct dump *dump_info = NULL;
2527c478bdstevel@tonic-gate
2537c478bdstevel@tonic-gate/*
2547c478bdstevel@tonic-gate * Previously there was no special action required for ordinary files.
2557c478bdstevel@tonic-gate * (Devices are handled through the device file system.)
2567c478bdstevel@tonic-gate * Now we support Large Files and Large File API requires open to
2577c478bdstevel@tonic-gate * fail if file is large.
2587c478bdstevel@tonic-gate * We could take care to prevent data corruption
2597c478bdstevel@tonic-gate * by doing an atomic check of size and truncate if file is opened with
2607c478bdstevel@tonic-gate * FTRUNC flag set but traditionally this is being done by the vfs/vnode
2617c478bdstevel@tonic-gate * layers. So taking care of truncation here is a change in the existing
2627c478bdstevel@tonic-gate * semantics of VOP_OPEN and therefore we chose not to implement any thing
2637c478bdstevel@tonic-gate * here. The check for the size of the file > 2GB is being done at the
2647c478bdstevel@tonic-gate * vfs layer in routine vn_open().
2657c478bdstevel@tonic-gate */
2667c478bdstevel@tonic-gate
2677c478bdstevel@tonic-gate/* ARGSUSED */
2687c478bdstevel@tonic-gatestatic int
269da6c28aamwufs_open(struct vnode **vpp, int flag, struct cred *cr, caller_context_t *ct)
2707c478bdstevel@tonic-gate{
2717c478bdstevel@tonic-gate	return (0);
2727c478bdstevel@tonic-gate}
2737c478bdstevel@tonic-gate
2747c478bdstevel@tonic-gate/*ARGSUSED*/
2757c478bdstevel@tonic-gatestatic int
2767c478bdstevel@tonic-gateufs_close(struct vnode *vp, int flag, int count, offset_t offset,
27780d5689Patrick Mooney    struct cred *cr, caller_context_t *ct)
2787c478bdstevel@tonic-gate{
2797c478bdstevel@tonic-gate	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
2807c478bdstevel@tonic-gate	cleanshares(vp, ttoproc(curthread)->p_pid);
2817c478bdstevel@tonic-gate
2827c478bdstevel@tonic-gate	/*
2837c478bdstevel@tonic-gate	 * Push partially filled cluster at last close.
2847c478bdstevel@tonic-gate	 * ``last close'' is approximated because the dnlc
2857c478bdstevel@tonic-gate	 * may have a hold on the vnode.
2867c478bdstevel@tonic-gate	 * Checking for VBAD here will also act as a forced umount check.
2877c478bdstevel@tonic-gate	 */
2887c478bdstevel@tonic-gate	if (vp->v_count <= 2 && vp->v_type != VBAD) {
2897c478bdstevel@tonic-gate		struct inode *ip = VTOI(vp);
2907c478bdstevel@tonic-gate		if (ip->i_delaylen) {
2917c478bdstevel@tonic-gate			ins.in_poc.value.ul++;
2927c478bdstevel@tonic-gate			(void) ufs_putpages(vp, ip->i_delayoff, ip->i_delaylen,
29380d3443frankho			    B_ASYNC | B_FREE, cr);
2947c478bdstevel@tonic-gate			ip->i_delaylen = 0;
2957c478bdstevel@tonic-gate		}
2967c478bdstevel@tonic-gate	}
2977c478bdstevel@tonic-gate
2987c478bdstevel@tonic-gate	return (0);
2997c478bdstevel@tonic-gate}
3007c478bdstevel@tonic-gate
3017c478bdstevel@tonic-gate/*ARGSUSED*/
3027c478bdstevel@tonic-gatestatic int
3037c478bdstevel@tonic-gateufs_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr,
30480d5689Patrick Mooney    struct caller_context *ct)
3057c478bdstevel@tonic-gate{
3067c478bdstevel@tonic-gate	struct inode *ip = VTOI(vp);
3077c478bdstevel@tonic-gate	struct ufsvfs *ufsvfsp;
3087c478bdstevel@tonic-gate	struct ulockfs *ulp = NULL;
3097c478bdstevel@tonic-gate	int error = 0;
3107c478bdstevel@tonic-gate	int intrans = 0;
3117c478bdstevel@tonic-gate
3127c478bdstevel@tonic-gate	ASSERT(RW_READ_HELD(&ip->i_rwlock));
3137c478bdstevel@tonic-gate
3147c478bdstevel@tonic-gate	/*
3157c478bdstevel@tonic-gate	 * Mandatory locking needs to be done before ufs_lockfs_begin()
3167c478bdstevel@tonic-gate	 * and TRANS_BEGIN_SYNC() calls since mandatory locks can sleep.
3177c478bdstevel@tonic-gate	 */
3187c478bdstevel@tonic-gate	if (MANDLOCK(vp, ip->i_mode)) {
3197c478bdstevel@tonic-gate		/*
3207c478bdstevel@tonic-gate		 * ufs_getattr ends up being called by chklock
3217c478bdstevel@tonic-gate		 */
3227c478bdstevel@tonic-gate		error = chklock(vp, FREAD, uiop->uio_loffset,
32380d3443frankho		    uiop->uio_resid, uiop->uio_fmode, ct);
3247c478bdstevel@tonic-gate		if (error)
3257c478bdstevel@tonic-gate			goto out;
3267c478bdstevel@tonic-gate	}
3277c478bdstevel@tonic-gate
3287c478bdstevel@tonic-gate	ufsvfsp = ip->i_ufsvfs;
3297c478bdstevel@tonic-gate	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READ_MASK);
3307c478bdstevel@tonic-gate	if (error)
3317c478bdstevel@tonic-gate		goto out;
3327c478bdstevel@tonic-gate
3337c478bdstevel@tonic-gate	/*
3347c478bdstevel@tonic-gate	 * In the case that a directory is opened for reading as a file
3357c478bdstevel@tonic-gate	 * (eg "cat .") with the  O_RSYNC, O_SYNC and O_DSYNC flags set.
3367c478bdstevel@tonic-gate	 * The locking order had to be changed to avoid a deadlock with
3377c478bdstevel@tonic-gate	 * an update taking place on that directory at the same time.
3387c478bdstevel@tonic-gate	 */
3397c478bdstevel@tonic-gate	if ((ip->i_mode & IFMT) == IFDIR) {
3407c478bdstevel@tonic-gate
3417c478bdstevel@tonic-gate		rw_enter(&ip->i_contents, RW_READER);
3427c478bdstevel@tonic-gate		error = rdip(ip, uiop, ioflag, cr);
3437c478bdstevel@tonic-gate		rw_exit(&ip->i_contents);
3447c478bdstevel@tonic-gate
3457c478bdstevel@tonic-gate		if (error) {
3467c478bdstevel@tonic-gate			if (ulp)
3477c478bdstevel@tonic-gate				ufs_lockfs_end(ulp);
3487c478bdstevel@tonic-gate			goto out;
3497c478bdstevel@tonic-gate		}
3507c478bdstevel@tonic-gate
3517c478bdstevel@tonic-gate		if (ulp && (ioflag & FRSYNC) && (ioflag & (FSYNC | FDSYNC)) &&
3527c478bdstevel@tonic-gate		    TRANS_ISTRANS(ufsvfsp)) {
3537c478bdstevel@tonic-gate			rw_exit(&ip->i_rwlock);
3547c478bdstevel@tonic-gate			TRANS_BEGIN_SYNC(ufsvfsp, TOP_READ_SYNC, TOP_READ_SIZE,
3557c478bdstevel@tonic-gate			    error);
3567c478bdstevel@tonic-gate			ASSERT(!error);
3577c478bdstevel@tonic-gate			TRANS_END_SYNC(ufsvfsp, error, TOP_READ_SYNC,
3587c478bdstevel@tonic-gate			    TOP_READ_SIZE);
3597c478bdstevel@tonic-gate			rw_enter(&ip->i_rwlock, RW_READER);
3607c478bdstevel@tonic-gate		}
3617c478bdstevel@tonic-gate	} else {
3627c478bdstevel@tonic-gate		/*
3637c478bdstevel@tonic-gate		 * Only transact reads to files opened for sync-read and
3647c478bdstevel@tonic-gate		 * sync-write on a file system that is not write locked.
3657c478bdstevel@tonic-gate		 *
3667c478bdstevel@tonic-gate		 * The ``not write locked'' check prevents problems with
3677c478bdstevel@tonic-gate		 * enabling/disabling logging on a busy file system.  E.g.,
3687c478bdstevel@tonic-gate		 * logging exists at the beginning of the read but does not
3697c478bdstevel@tonic-gate		 * at the end.
3707c478bdstevel@tonic-gate		 *
3717c478bdstevel@tonic-gate		 */
3727c478bdstevel@tonic-gate		if (ulp && (ioflag & FRSYNC) && (ioflag & (FSYNC | FDSYNC)) &&
3737c478bdstevel@tonic-gate		    TRANS_ISTRANS(ufsvfsp)) {
3747c478bdstevel@tonic-gate			TRANS_BEGIN_SYNC(ufsvfsp, TOP_READ_SYNC, TOP_READ_SIZE,
3757c478bdstevel@tonic-gate			    error);
3767c478bdstevel@tonic-gate			ASSERT(!error);
3777c478bdstevel@tonic-gate			intrans = 1;
3787c478bdstevel@tonic-gate		}
3797c478bdstevel@tonic-gate
3807c478bdstevel@tonic-gate		rw_enter(&ip->i_contents, RW_READER);
3817c478bdstevel@tonic-gate		error = rdip(ip, uiop, ioflag, cr);
3827c478bdstevel@tonic-gate		rw_exit(&ip->i_contents);
3837c478bdstevel@tonic-gate
3847c478bdstevel@tonic-gate		if (intrans) {
3857c478bdstevel@tonic-gate			TRANS_END_SYNC(ufsvfsp, error, TOP_READ_SYNC,
3867c478bdstevel@tonic-gate			    TOP_READ_SIZE);
3877c478bdstevel@tonic-gate		}
3887c478bdstevel@tonic-gate	}
3897c478bdstevel@tonic-gate
3907c478bdstevel@tonic-gate	if (ulp) {
3917c478bdstevel@tonic-gate		ufs_lockfs_end(ulp);
3927c478bdstevel@tonic-gate	}
3937c478bdstevel@tonic-gateout:
3947c478bdstevel@tonic-gate
3957c478bdstevel@tonic-gate	return (error);
3967c478bdstevel@tonic-gate}
3977c478bdstevel@tonic-gate
3987c478bdstevel@tonic-gateextern	int	ufs_HW;		/* high water mark */
3997c478bdstevel@tonic-gateextern	int	ufs_LW;		/* low water mark */
4007c478bdstevel@tonic-gateint	ufs_WRITES = 1;		/* XXX - enable/disable */
4017c478bdstevel@tonic-gateint	ufs_throttles = 0;	/* throttling count */
4027c478bdstevel@tonic-gateint	ufs_allow_shared_writes = 1;	/* directio shared writes */
4037c478bdstevel@tonic-gate
4047c478bdstevel@tonic-gatestatic int
4057c478bdstevel@tonic-gateufs_check_rewrite(struct inode *ip, struct uio *uiop, int ioflag)
4067c478bdstevel@tonic-gate{
407f90bab2swilcox	int	shared_write;
408f90bab2swilcox
409f90bab2swilcox	/*
410f90bab2swilcox	 * If the FDSYNC flag is set then ignore the global
411f90bab2swilcox	 * ufs_allow_shared_writes in this case.
412f90bab2swilcox	 */
413f90bab2swilcox	shared_write = (ioflag & FDSYNC) | ufs_allow_shared_writes;
4147c478bdstevel@tonic-gate
4157c478bdstevel@tonic-gate	/*
4167c478bdstevel@tonic-gate	 * Filter to determine if this request is suitable as a
4177c478bdstevel@tonic-gate	 * concurrent rewrite. This write must not allocate blocks
4187c478bdstevel@tonic-gate	 * by extending the file or filling in holes. No use trying
4197c478bdstevel@tonic-gate	 * through FSYNC descriptors as the inode will be synchronously
4207c478bdstevel@tonic-gate	 * updated after the write. The uio structure has not yet been
4217c478bdstevel@tonic-gate	 * checked for sanity, so assume nothing.
4227c478bdstevel@tonic-gate	 */
4237c478bdstevel@tonic-gate	return (((ip->i_mode & IFMT) == IFREG) && !(ioflag & FAPPEND) &&
42480d3443frankho	    (uiop->uio_loffset >= (offset_t)0) &&
42580d3443frankho	    (uiop->uio_loffset < ip->i_size) && (uiop->uio_resid > 0) &&
42680d3443frankho	    ((ip->i_size - uiop->uio_loffset) >= uiop->uio_resid) &&
42780d3443frankho	    !(ioflag & FSYNC) && !bmap_has_holes(ip) &&
42880d3443frankho	    shared_write);
4297c478bdstevel@tonic-gate}
4307c478bdstevel@tonic-gate
4317c478bdstevel@tonic-gate/*ARGSUSED*/
4327c478bdstevel@tonic-gatestatic int
4337c478bdstevel@tonic-gateufs_write(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cr,
43480d5689Patrick Mooney    caller_context_t *ct)
4357c478bdstevel@tonic-gate{
4367c478bdstevel@tonic-gate	struct inode *ip = VTOI(vp);
4377c478bdstevel@tonic-gate	struct ufsvfs *ufsvfsp;
4387c478bdstevel@tonic-gate	struct ulockfs *ulp;
4397c478bdstevel@tonic-gate	int retry = 1;
4407c478bdstevel@tonic-gate	int error, resv, resid = 0;
4417c478bdstevel@tonic-gate	int directio_status;
4427c478bdstevel@tonic-gate	int exclusive;
443f90bab2swilcox	int rewriteflg;
4447c478bdstevel@tonic-gate	long start_resid = uiop->uio_resid;
4457c478bdstevel@tonic-gate
4467c478bdstevel@tonic-gate	ASSERT(RW_LOCK_HELD(&ip->i_rwlock));
4477c478bdstevel@tonic-gate
4487c478bdstevel@tonic-gateretry_mandlock:
4497c478bdstevel@tonic-gate	/*
4507c478bdstevel@tonic-gate	 * Mandatory locking needs to be done before ufs_lockfs_begin()
4517c478bdstevel@tonic-gate	 * and TRANS_BEGIN_[A]SYNC() calls since mandatory locks can sleep.
4527c478bdstevel@tonic-gate	 * Check for forced unmounts normally done in ufs_lockfs_begin().
4537c478bdstevel@tonic-gate	 */
4547c478bdstevel@tonic-gate	if ((ufsvfsp = ip->i_ufsvfs) == NULL) {
4557c478bdstevel@tonic-gate		error = EIO;
4567c478bdstevel@tonic-gate		goto out;
4577c478bdstevel@tonic-gate	}
4587c478bdstevel@tonic-gate	if (MANDLOCK(vp, ip->i_mode)) {
4597c478bdstevel@tonic-gate
4607c478bdstevel@tonic-gate		ASSERT(RW_WRITE_HELD(&ip->i_rwlock));
4617c478bdstevel@tonic-gate
4627c478bdstevel@tonic-gate		/*
4637c478bdstevel@tonic-gate		 * ufs_getattr ends up being called by chklock
4647c478bdstevel@tonic-gate		 */
4657c478bdstevel@tonic-gate		error = chklock(vp, FWRITE, uiop->uio_loffset,
46680d3443frankho		    uiop->uio_resid, uiop->uio_fmode, ct);
4677c478bdstevel@tonic-gate		if (error)
4687c478bdstevel@tonic-gate			goto out;
4697c478bdstevel@tonic-gate	}
4707c478bdstevel@tonic-gate
4717c478bdstevel@tonic-gate	/* i_rwlock can change in chklock */
4727c478bdstevel@tonic-gate	exclusive = rw_write_held(&ip->i_rwlock);
473f90bab2swilcox	rewriteflg = ufs_check_rewrite(ip, uiop, ioflag);
4747c478bdstevel@tonic-gate
4757c478bdstevel@tonic-gate	/*
4767c478bdstevel@tonic-gate	 * Check for fast-path special case of directio re-writes.
4777c478bdstevel@tonic-gate	 */
4787c478bdstevel@tonic-gate	if ((ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) &&
479f90bab2swilcox	    !exclusive && rewriteflg) {
4807c478bdstevel@tonic-gate
4817c478bdstevel@tonic-gate		error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK);
4827c478bdstevel@tonic-gate		if (error)
4837c478bdstevel@tonic-gate			goto out;
4847c478bdstevel@tonic-gate
4857c478bdstevel@tonic-gate		rw_enter(&ip->i_contents, RW_READER);
4867c478bdstevel@tonic-gate		error = ufs_directio_write(ip, uiop, ioflag, 1, cr,
48780d3443frankho		    &directio_status);
4887c478bdstevel@tonic-gate		if (directio_status == DIRECTIO_SUCCESS) {
4897c478bdstevel@tonic-gate			uint_t i_flag_save;
4907c478bdstevel@tonic-gate
4917c478bdstevel@tonic-gate			if (start_resid != uiop->uio_resid)
4927c478bdstevel@tonic-gate				error = 0;
4937c478bdstevel@tonic-gate			/*
4947c478bdstevel@tonic-gate			 * Special treatment of access times for re-writes.
4957c478bdstevel@tonic-gate			 * If IMOD is not already set, then convert it
4967c478bdstevel@tonic-gate			 * to IMODACC for this operation. This defers
4977c478bdstevel@tonic-gate			 * entering a delta into the log until the inode
4987c478bdstevel@tonic-gate			 * is flushed. This mimics what is done for read
4997c478bdstevel@tonic-gate			 * operations and inode access time.
5007c478bdstevel@tonic-gate			 */
5017c478bdstevel@tonic-gate			mutex_enter(&ip->i_tlock);
5027c478bdstevel@tonic-gate			i_flag_save = ip->i_flag;
5037c478bdstevel@tonic-gate			ip->i_flag |= IUPD | ICHG;
5047c478bdstevel@tonic-gate			ip->i_seq++;
5057c478bdstevel@tonic-gate			ITIMES_NOLOCK(ip);
5067c478bdstevel@tonic-gate			if ((i_flag_save & IMOD) == 0) {
5077c478bdstevel@tonic-gate				ip->i_flag &= ~IMOD;
5087c478bdstevel@tonic-gate				ip->i_flag |= IMODACC;
5097c478bdstevel@tonic-gate			}
5107c478bdstevel@tonic-gate			mutex_exit(&ip->i_tlock);
5117c478bdstevel@tonic-gate			rw_exit(&ip->i_contents);
5127c478bdstevel@tonic-gate			if (ulp)
5137c478bdstevel@tonic-gate				ufs_lockfs_end(ulp);
5147c478bdstevel@tonic-gate			goto out;
5157c478bdstevel@tonic-gate		}
5167c478bdstevel@tonic-gate		rw_exit(&ip->i_contents);
5177c478bdstevel@tonic-gate		if (ulp)
5187c478bdstevel@tonic-gate			ufs_lockfs_end(ulp);
5197c478bdstevel@tonic-gate	}
5207c478bdstevel@tonic-gate
5217c478bdstevel@tonic-gate	if (!exclusive && !rw_tryupgrade(&ip->i_rwlock)) {
5227c478bdstevel@tonic-gate		rw_exit(&ip->i_rwlock);
5237c478bdstevel@tonic-gate		rw_enter(&ip->i_rwlock, RW_WRITER);
5247c478bdstevel@tonic-gate		/*
5257c478bdstevel@tonic-gate		 * Mandatory locking could have been enabled
5267c478bdstevel@tonic-gate		 * after dropping the i_rwlock.
5277c478bdstevel@tonic-gate		 */
5287c478bdstevel@tonic-gate		if (MANDLOCK(vp, ip->i_mode))
5297c478bdstevel@tonic-gate			goto retry_mandlock;
5307c478bdstevel@tonic-gate	}
5317c478bdstevel@tonic-gate
5327c478bdstevel@tonic-gate	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK);
5337c478bdstevel@tonic-gate	if (error)
5347c478bdstevel@tonic-gate		goto out;
5357c478bdstevel@tonic-gate
5367c478bdstevel@tonic-gate	/*
5377c478bdstevel@tonic-gate	 * Amount of log space needed for this write
5387c478bdstevel@tonic-gate	 */
539f90bab2swilcox	if (!rewriteflg || !(ioflag & FDSYNC))
540f90bab2swilcox		TRANS_WRITE_RESV(ip, uiop, ulp, &resv, &resid);
5417c478bdstevel@tonic-gate
5427c478bdstevel@tonic-gate	/*
5437c478bdstevel@tonic-gate	 * Throttle writes.
5447c478bdstevel@tonic-gate	 */
5457c478bdstevel@tonic-gate	if (ufs_WRITES && (ip->i_writes > ufs_HW)) {
5467c478bdstevel@tonic-gate		mutex_enter(&ip->i_tlock);
5477c478bdstevel@tonic-gate		while (ip->i_writes > ufs_HW) {
5487c478bdstevel@tonic-gate			ufs_throttles++;
5497c478bdstevel@tonic-gate			cv_wait(&ip->i_wrcv, &ip->i_tlock);
5507c478bdstevel@tonic-gate		}
5517c478bdstevel@tonic-gate		mutex_exit(&ip->i_tlock);
5527c478bdstevel@tonic-gate	}
5537c478bdstevel@tonic-gate
5547c478bdstevel@tonic-gate	/*
5557c478bdstevel@tonic-gate	 * Enter Transaction
556f90bab2swilcox	 *
557f90bab2swilcox	 * If the write is a rewrite there is no need to open a transaction
558f90bab2swilcox	 * if the FDSYNC flag is set and not the FSYNC.  In this case just
559f90bab2swilcox	 * set the IMODACC flag to modify do the update at a later time
560f90bab2swilcox	 * thus avoiding the overhead of the logging transaction that is
561f90bab2swilcox	 * not required.
5627c478bdstevel@tonic-gate	 */
5637c478bdstevel@tonic-gate	if (ioflag & (FSYNC|FDSYNC)) {
5647c478bdstevel@tonic-gate		if (ulp) {
565f90bab2swilcox			if (rewriteflg) {
566f90bab2swilcox				uint_t i_flag_save;
567f90bab2swilcox
568f90bab2swilcox				rw_enter(&ip->i_contents, RW_READER);
569f90bab2swilcox				mutex_enter(&ip->i_tlock);
570f90bab2swilcox				i_flag_save = ip->i_flag;
571f90bab2swilcox				ip->i_flag |= IUPD | ICHG;
572f90bab2swilcox				ip->i_seq++;
573f90bab2swilcox				ITIMES_NOLOCK(ip);
574f90bab2swilcox				if ((i_flag_save & IMOD) == 0) {
575f90bab2swilcox					ip->i_flag &= ~IMOD;
576f90bab2swilcox					ip->i_flag |= IMODACC;
577f90bab2swilcox				}
578f90bab2swilcox				mutex_exit(&ip->i_tlock);
579f90bab2swilcox				rw_exit(&ip->i_contents);
580f90bab2swilcox			} else {
581f90bab2swilcox				int terr = 0;
582f90bab2swilcox				TRANS_BEGIN_SYNC(ufsvfsp, TOP_WRITE_SYNC, resv,
583f90bab2swilcox				    terr);
584f90bab2swilcox				ASSERT(!terr);
585f90bab2swilcox			}
5867c478bdstevel@tonic-gate		}
5877c478bdstevel@tonic-gate	} else {
5887c478bdstevel@tonic-gate		if (ulp)
5897c478bdstevel@tonic-gate			TRANS_BEGIN_ASYNC(ufsvfsp, TOP_WRITE, resv);
5907c478bdstevel@tonic-gate	}
5917c478bdstevel@tonic-gate
5927c478bdstevel@tonic-gate	/*
5937c478bdstevel@tonic-gate	 * Write the file
5947c478bdstevel@tonic-gate	 */
5957c478bdstevel@tonic-gate	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
5967c478bdstevel@tonic-gate	rw_enter(&ip->i_contents, RW_WRITER);
5977c478bdstevel@tonic-gate	if ((ioflag & FAPPEND) != 0 && (ip->i_mode & IFMT) == IFREG) {
5987c478bdstevel@tonic-gate		/*
5997c478bdstevel@tonic-gate		 * In append mode start at end of file.
6007c478bdstevel@tonic-gate		 */
6017c478bdstevel@tonic-gate		uiop->uio_loffset = ip->i_size;
6027c478bdstevel@tonic-gate	}
6037c478bdstevel@tonic-gate
6047c478bdstevel@tonic-gate	/*
6057c478bdstevel@tonic-gate	 * Mild optimisation, don't call ufs_trans_write() unless we have to
6067c478bdstevel@tonic-gate	 * Also, suppress file system full messages if we will retry.
6077c478bdstevel@tonic-gate	 */
6087c478bdstevel@tonic-gate	if (retry)
6097c478bdstevel@tonic-gate		ip->i_flag |= IQUIET;
6107c478bdstevel@tonic-gate	if (resid) {
6117c478bdstevel@tonic-gate		TRANS_WRITE(ip, uiop, ioflag, error, ulp, cr, resv, resid);
6127c478bdstevel@tonic-gate	} else {
6137c478bdstevel@tonic-gate		error = wrip(ip, uiop, ioflag, cr);
6147c478bdstevel@tonic-gate	}
6157c478bdstevel@tonic-gate	ip->i_flag &= ~IQUIET;
6167c478bdstevel@tonic-gate
6177c478bdstevel@tonic-gate	rw_exit(&ip->i_contents);
6187c478bdstevel@tonic-gate	rw_exit(&ufsvfsp->vfs_dqrwlock);
6197c478bdstevel@tonic-gate
6207c478bdstevel@tonic-gate	/*
6217c478bdstevel@tonic-gate	 * Leave Transaction
6227c478bdstevel@tonic-gate	 */
6237c478bdstevel@tonic-gate	if (ulp) {
6247c478bdstevel@tonic-gate		if (ioflag & (FSYNC|FDSYNC)) {
625f90bab2swilcox			if (!rewriteflg) {
626f90bab2swilcox				int terr = 0;
627f90bab2swilcox
628f90bab2swilcox				TRANS_END_SYNC(ufsvfsp, terr, TOP_WRITE_SYNC,
62980d3443frankho				    resv);
630f90bab2swilcox				if (error == 0)
631f90bab2swilcox					error = terr;
632f90bab2swilcox			}
6337c478bdstevel@tonic-gate		} else {
6347c478bdstevel@tonic-gate			TRANS_END_ASYNC(ufsvfsp, TOP_WRITE, resv);
6357c478bdstevel@tonic-gate		}
6367c478bdstevel@tonic-gate		ufs_lockfs_end(ulp);
6377c478bdstevel@tonic-gate	}
6387c478bdstevel@tonic-gateout:
6397c478bdstevel@tonic-gate	if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
6407c478bdstevel@tonic-gate		/*
6417c478bdstevel@tonic-gate		 * Any blocks tied up in pending deletes?
6427c478bdstevel@tonic-gate		 */
6437c478bdstevel@tonic-gate		ufs_delete_drain_wait(ufsvfsp, 1);
6447c478bdstevel@tonic-gate		retry = 0;
6457c478bdstevel@tonic-gate		goto retry_mandlock;
6467c478bdstevel@tonic-gate	}
6477c478bdstevel@tonic-gate
6487c478bdstevel@tonic-gate	if (error == ENOSPC && (start_resid != uiop->uio_resid))
6497c478bdstevel@tonic-gate		error = 0;
6507c478bdstevel@tonic-gate
6517c478bdstevel@tonic-gate	return (error);
6527c478bdstevel@tonic-gate}
6537c478bdstevel@tonic-gate
6547c478bdstevel@tonic-gate/*
6557c478bdstevel@tonic-gate * Don't cache write blocks to files with the sticky bit set.
6567c478bdstevel@tonic-gate * Used to keep swap files from blowing the page cache on a server.
6577c478bdstevel@tonic-gate */
6587c478bdstevel@tonic-gateint stickyhack = 1;
6597c478bdstevel@tonic-gate
6607c478bdstevel@tonic-gate/*
6617c478bdstevel@tonic-gate * wrip does the real work of write requests for ufs.
6627c478bdstevel@tonic-gate */
6637c478bdstevel@tonic-gateint
6647c478bdstevel@tonic-gatewrip(struct inode *ip, struct uio *uio, int ioflag, struct cred *cr)
6657c478bdstevel@tonic-gate{
6667c478bdstevel@tonic-gate	rlim64_t limit = uio->uio_llimit;
6677c478bdstevel@tonic-gate	u_offset_t off;
6687c478bdstevel@tonic-gate	u_offset_t old_i_size;
6697c478bdstevel@tonic-gate	struct fs *fs;
6707c478bdstevel@tonic-gate	struct vnode *vp;
6717c478bdstevel@tonic-gate	struct ufsvfs *ufsvfsp;
6727c478bdstevel@tonic-gate	caddr_t base;
6737c478bdstevel@tonic-gate	long start_resid = uio->uio_resid;	/* save starting resid */
6747c478bdstevel@tonic-gate	long premove_resid;			/* resid before uiomove() */
6757c478bdstevel@tonic-gate	uint_t flags;
6767c478bdstevel@tonic-gate	int newpage;
6777c478bdstevel@tonic-gate	int iupdat_flag, directio_status;
6787c478bdstevel@tonic-gate	int n, on, mapon;
6797c478bdstevel@tonic-gate	int error, pagecreate;
6807c478bdstevel@tonic-gate	int do_dqrwlock;		/* drop/reacquire vfs_dqrwlock */
6817c478bdstevel@tonic-gate	int32_t	iblocks;
6827c478bdstevel@tonic-gate	int	new_iblocks;
6837c478bdstevel@tonic-gate
6847c478bdstevel@tonic-gate	/*
6857c478bdstevel@tonic-gate	 * ip->i_size is incremented before the uiomove
6867c478bdstevel@tonic-gate	 * is done on a write.  If the move fails (bad user
6877c478bdstevel@tonic-gate	 * address) reset ip->i_size.
6887c478bdstevel@tonic-gate	 * The better way would be to increment ip->i_size
6897c478bdstevel@tonic-gate	 * only if the uiomove succeeds.
6907c478bdstevel@tonic-gate	 */
6917c478bdstevel@tonic-gate	int i_size_changed = 0;
6927c478bdstevel@tonic-gate	o_mode_t type;
6937c478bdstevel@tonic-gate	int i_seq_needed = 0;
6947c478bdstevel@tonic-gate
6957c478bdstevel@tonic-gate	vp = ITOV(ip);
6967c478bdstevel@tonic-gate
6977c478bdstevel@tonic-gate	/*
6987c478bdstevel@tonic-gate	 * check for forced unmount - should not happen as
6997c478bdstevel@tonic-gate	 * the request passed the lockfs checks.
7007c478bdstevel@tonic-gate	 */
7017c478bdstevel@tonic-gate	if ((ufsvfsp = ip->i_ufsvfs) == NULL)
7027c478bdstevel@tonic-gate		return (EIO);
7037c478bdstevel@tonic-gate
7047c478bdstevel@tonic-gate	fs = ip->i_fs;
7057c478bdstevel@tonic-gate
7067c478bdstevel@tonic-gate	ASSERT(RW_WRITE_HELD(&ip->i_contents));
7077c478bdstevel@tonic-gate
7087c478bdstevel@tonic-gate	/* check for valid filetype */
7097c478bdstevel@tonic-gate	type = ip->i_mode & IFMT;
7107c478bdstevel@tonic-gate	if ((type != IFREG) && (type != IFDIR) && (type != IFATTRDIR) &&
7117c478bdstevel@tonic-gate	    (type != IFLNK) && (type != IFSHAD)) {
7127c478bdstevel@tonic-gate		return (EIO);
7137c478bdstevel@tonic-gate	}
7147c478bdstevel@tonic-gate
7157c478bdstevel@tonic-gate	/*
7167c478bdstevel@tonic-gate	 * the actual limit of UFS file size
7177c478bdstevel@tonic-gate	 * is UFS_MAXOFFSET_T
7187c478bdstevel@tonic-gate	 */
7197c478bdstevel@tonic-gate	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
7207c478bdstevel@tonic-gate		limit = MAXOFFSET_T;
7217c478bdstevel@tonic-gate
7227c478bdstevel@tonic-gate	if (uio->uio_loffset >= limit) {
7237c478bdstevel@tonic-gate		proc_t *p = ttoproc(curthread);
7247c478bdstevel@tonic-gate
7257c478bdstevel@tonic-gate		mutex_enter(&p->p_lock);
7267c478bdstevel@tonic-gate		(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls,
7277c478bdstevel@tonic-gate		    p, RCA_UNSAFE_SIGINFO);
7287c478bdstevel@tonic-gate		mutex_exit(&p->p_lock);
7297c478bdstevel@tonic-gate		return (EFBIG);
7307c478bdstevel@tonic-gate	}
7317c478bdstevel@tonic-gate
7327c478bdstevel@tonic-gate	/*
7337c478bdstevel@tonic-gate	 * if largefiles are disallowed, the limit is
7347c478bdstevel@tonic-gate	 * the pre-largefiles value of 2GB
7357c478bdstevel@tonic-gate	 */
7367c478bdstevel@tonic-gate	if (ufsvfsp->vfs_lfflags & UFS_LARGEFILES)
7377c478bdstevel@tonic-gate		limit = MIN(UFS_MAXOFFSET_T, limit);
7387c478bdstevel@tonic-gate	else
7397c478bdstevel@tonic-gate		limit = MIN(MAXOFF32_T, limit);
7407c478bdstevel@tonic-gate
7417c478bdstevel@tonic-gate	if (uio->uio_loffset < (offset_t)0) {
7427c478bdstevel@tonic-gate		return (EINVAL);
7437c478bdstevel@tonic-gate	}
7447c478bdstevel@tonic-gate	if (uio->uio_resid == 0) {
7457c478bdstevel@tonic-gate		return (0);
7467c478bdstevel@tonic-gate	}
7477c478bdstevel@tonic-gate
7487c478bdstevel@tonic-gate	if (uio->uio_loffset >= limit)
7497c478bdstevel@tonic-gate		return (EFBIG);
7507c478bdstevel@tonic-gate
7517c478bdstevel@tonic-gate	ip->i_flag |= INOACC;	/* don't update ref time in getpage */
7527c478bdstevel@tonic-gate
7537c478bdstevel@tonic-gate	if (ioflag & (FSYNC|FDSYNC)) {
7547c478bdstevel@tonic-gate		ip->i_flag |= ISYNC;
7557c478bdstevel@tonic-gate		iupdat_flag = 1;
7567c478bdstevel@tonic-gate	}
7577c478bdstevel@tonic-gate	/*
7587c478bdstevel@tonic-gate	 * Try to go direct
7597c478bdstevel@tonic-gate	 */
7607c478bdstevel@tonic-gate	if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) {
7617c478bdstevel@tonic-gate		uio->uio_llimit = limit;
7627c478bdstevel@tonic-gate		error = ufs_directio_write(ip, uio, ioflag, 0, cr,
76380d3443frankho		    &directio_status);
7647c478bdstevel@tonic-gate		/*
7657c478bdstevel@tonic-gate		 * If ufs_directio wrote to the file or set the flags,
7667c478bdstevel@tonic-gate		 * we need to update i_seq, but it may be deferred.
7677c478bdstevel@tonic-gate		 */
7687c478bdstevel@tonic-gate		if (start_resid != uio->uio_resid ||
76980d3443frankho		    (ip->i_flag & (ICHG|IUPD))) {
7707c478bdstevel@tonic-gate			i_seq_needed = 1;
7717c478bdstevel@tonic-gate			ip->i_flag |= ISEQ;
7727c478bdstevel@tonic-gate		}
7737c478bdstevel@tonic-gate		if (directio_status == DIRECTIO_SUCCESS)
7747c478bdstevel@tonic-gate			goto out;
7757c478bdstevel@tonic-gate	}
7767c478bdstevel@tonic-gate
7777c478bdstevel@tonic-gate	/*
7787c478bdstevel@tonic-gate	 * Behavior with respect to dropping/reacquiring vfs_dqrwlock:
7797c478bdstevel@tonic-gate	 *
7807c478bdstevel@tonic-gate	 * o shadow inodes: vfs_dqrwlock is not held at all
7817c478bdstevel@tonic-gate	 * o quota updates: vfs_dqrwlock is read or write held
7827c478bdstevel@tonic-gate	 * o other updates: vfs_dqrwlock is read held
7837c478bdstevel@tonic-gate	 *
7847c478bdstevel@tonic-gate	 * The first case is the only one where we do not hold
7857c478bdstevel@tonic-gate	 * vfs_dqrwlock at all while entering wrip().
7867c478bdstevel@tonic-gate	 * We must make sure not to downgrade/drop vfs_dqrwlock if we
7877c478bdstevel@tonic-gate	 * have it as writer, i.e. if we are updating the quota inode.
7887c478bdstevel@tonic-gate	 * There is no potential deadlock scenario in this case as
7897c478bdstevel@tonic-gate	 * ufs_getpage() takes care of this and avoids reacquiring
7907c478bdstevel@tonic-gate	 * vfs_dqrwlock in that case.
7917c478bdstevel@tonic-gate	 *
7927c478bdstevel@tonic-gate	 * This check is done here since the above conditions do not change
7937c478bdstevel@tonic-gate	 * and we possibly loop below, so save a few cycles.
7947c478bdstevel@tonic-gate	 */
7957c478bdstevel@tonic-gate	if ((type == IFSHAD) ||
79680d3443frankho	    (rw_owner(&ufsvfsp->vfs_dqrwlock) == curthread)) {
797986fd29setje		do_dqrwlock = 0;
7987c478bdstevel@tonic-gate	} else {
7997c478bdstevel@tonic-gate		do_dqrwlock = 1;
8007c478bdstevel@tonic-gate	}
8017c478bdstevel@tonic-gate
8027c478bdstevel@tonic-gate	/*
8037c478bdstevel@tonic-gate	 * Large Files: We cast MAXBMASK to offset_t
8047c478bdstevel@tonic-gate	 * inorder to mask out the higher bits. Since offset_t
8057c478bdstevel@tonic-gate	 * is a signed value, the high order bit set in MAXBMASK
8067c478bdstevel@tonic-gate	 * value makes it do the right thing by having all bits 1
8077c478bdstevel@tonic-gate	 * in the higher word. May be removed for _SOLARIS64_.
8087c478bdstevel@tonic-gate	 */
8097c478bdstevel@tonic-gate
8107c478bdstevel@tonic-gate	fs = ip->i_fs;
8117c478bdstevel@tonic-gate	do {
8127c478bdstevel@tonic-gate		u_offset_t uoff = uio->uio_loffset;
8137c478bdstevel@tonic-gate		off = uoff & (offset_t)MAXBMASK;
8147c478bdstevel@tonic-gate		mapon = (int)(uoff & (offset_t)MAXBOFFSET);
8157c478bdstevel@tonic-gate		on = (int)blkoff(fs, uoff);
8167c478bdstevel@tonic-gate		n = (int)MIN(fs->fs_bsize - on, uio->uio_resid);
8177c478bdstevel@tonic-gate		new_iblocks = 1;
8187c478bdstevel@tonic-gate
8197c478bdstevel@tonic-gate		if (type == IFREG && uoff + n >= limit) {
8207c478bdstevel@tonic-gate			if (uoff >= limit) {
8217c478bdstevel@tonic-gate				error = EFBIG;
8227c478bdstevel@tonic-gate				goto out;
8237c478bdstevel@tonic-gate			}
8247c478bdstevel@tonic-gate			/*
8257c478bdstevel@tonic-gate			 * since uoff + n >= limit,
8267c478bdstevel@tonic-gate			 * therefore n >= limit - uoff, and n is an int
8277c478bdstevel@tonic-gate			 * so it is safe to cast it to an int
8287c478bdstevel@tonic-gate			 */
8297c478bdstevel@tonic-gate			n = (int)(limit - (rlim64_t)uoff);
8307c478bdstevel@tonic-gate		}
8317c478bdstevel@tonic-gate		if (uoff + n > ip->i_size) {
8327c478bdstevel@tonic-gate			/*
8337c478bdstevel@tonic-gate			 * We are extending the length of the file.
8347c478bdstevel@tonic-gate			 * bmap is used so that we are sure that
8357c478bdstevel@tonic-gate			 * if we need to allocate new blocks, that it
8367c478bdstevel@tonic-gate			 * is done here before we up the file size.
8377c478bdstevel@tonic-gate			 */
8387c478bdstevel@tonic-gate			error = bmap_write(ip, uoff, (int)(on + n),
839303bf60sdebnath			    mapon == 0, NULL, cr);
8407c478bdstevel@tonic-gate			/*
8417c478bdstevel@tonic-gate			 * bmap_write never drops i_contents so if
8427c478bdstevel@tonic-gate			 * the flags are set it changed the file.
8437c478bdstevel@tonic-gate			 */
8447c478bdstevel@tonic-gate			if (ip->i_flag & (ICHG|IUPD)) {
8457c478bdstevel@tonic-gate				i_seq_needed = 1;
8467c478bdstevel@tonic-gate				ip->i_flag |= ISEQ;
8477c478bdstevel@tonic-gate			}
8487c478bdstevel@tonic-gate			if (error)
8497c478bdstevel@tonic-gate				break;
8507c478bdstevel@tonic-gate			/*
8517c478bdstevel@tonic-gate			 * There is a window of vulnerability here.
8527c478bdstevel@tonic-gate			 * The sequence of operations: allocate file
8537c478bdstevel@tonic-gate			 * system blocks, uiomove the data into pages,
8547c478bdstevel@tonic-gate			 * and then update the size of the file in the
8557c478bdstevel@tonic-gate			 * inode, must happen atomically.  However, due
8567c478bdstevel@tonic-gate			 * to current locking constraints, this can not
8577c478bdstevel@tonic-gate			 * be done.
8587c478bdstevel@tonic-gate			 */
8597c478bdstevel@tonic-gate			ASSERT(ip->i_writer == NULL);
8607c478bdstevel@tonic-gate			ip->i_writer = curthread;
8617c478bdstevel@tonic-gate			i_size_changed = 1;
8627c478bdstevel@tonic-gate			/*
8637c478bdstevel@tonic-gate			 * If we are writing from the beginning of
8647c478bdstevel@tonic-gate			 * the mapping, we can just create the
8657c478bdstevel@tonic-gate			 * pages without having to read them.
8667c478bdstevel@tonic-gate			 */
8677c478bdstevel@tonic-gate			pagecreate = (mapon == 0);
8687c478bdstevel@tonic-gate		} else if (n == MAXBSIZE) {
8697c478bdstevel@tonic-gate			/*
8707c478bdstevel@tonic-gate			 * Going to do a whole mappings worth,
8717c478bdstevel@tonic-gate			 * so we can just create the pages w/o
8727c478bdstevel@tonic-gate			 * having to read them in.  But before
8737c478bdstevel@tonic-gate			 * we do that, we need to make sure any
8747c478bdstevel@tonic-gate			 * needed blocks are allocated first.
8757c478bdstevel@tonic-gate			 */
8767c478bdstevel@tonic-gate			iblocks = ip->i_blocks;
877303bf60sdebnath			error = bmap_write(ip, uoff, (int)(on + n),
878303bf60sdebnath			    BI_ALLOC_ONLY, NULL, cr);
8797c478bdstevel@tonic-gate			/*
8807c478bdstevel@tonic-gate			 * bmap_write never drops i_contents so if
8817c478bdstevel@tonic-gate			 * the flags are set it changed the file.
8827c478bdstevel@tonic-gate			 */
8837c478bdstevel@tonic-gate			if (ip->i_flag & (ICHG|IUPD)) {
8847c478bdstevel@tonic-gate				i_seq_needed = 1;
8857c478bdstevel@tonic-gate				ip->i_flag |= ISEQ;
8867c478bdstevel@tonic-gate			}
8877c478bdstevel@tonic-gate			if (error)
8887c478bdstevel@tonic-gate				break;
8897c478bdstevel@tonic-gate			pagecreate = 1;
8907c478bdstevel@tonic-gate			/*
8917c478bdstevel@tonic-gate			 * check if the new created page needed the
8927c478bdstevel@tonic-gate			 * allocation of new disk blocks.
8937c478bdstevel@tonic-gate			 */
8947c478bdstevel@tonic-gate			if (iblocks == ip->i_blocks)
8957c478bdstevel@tonic-gate				new_iblocks = 0; /* no new blocks allocated */
8967c478bdstevel@tonic-gate		} else {
8977c478bdstevel@tonic-gate			pagecreate = 0;
8987c478bdstevel@tonic-gate			/*
8997c478bdstevel@tonic-gate			 * In sync mode flush the indirect blocks which
9007c478bdstevel@tonic-gate			 * may have been allocated and not written on
9017c478bdstevel@tonic-gate			 * disk. In above cases bmap_write will allocate
9027c478bdstevel@tonic-gate			 * in sync mode.
9037c478bdstevel@tonic-gate			 */
9047c478bdstevel@tonic-gate			if (ioflag & (FSYNC|FDSYNC)) {
9057c478bdstevel@tonic-gate				error = ufs_indirblk_sync(ip, uoff);
9067c478bdstevel@tonic-gate				if (error)
9077c478bdstevel@tonic-gate					break;
9087c478bdstevel@tonic-gate			}
9097c478bdstevel@tonic-gate		}
9107c478bdstevel@tonic-gate
9117c478bdstevel@tonic-gate		/*
9127c478bdstevel@tonic-gate		 * At this point we can enter ufs_getpage() in one
9137c478bdstevel@tonic-gate		 * of two ways:
9147c478bdstevel@tonic-gate		 * 1) segmap_getmapflt() calls ufs_getpage() when the
9157c478bdstevel@tonic-gate		 *    forcefault parameter is true (pagecreate == 0)
9167c478bdstevel@tonic-gate		 * 2) uiomove() causes a page fault.
9177c478bdstevel@tonic-gate		 *
9187c478bdstevel@tonic-gate		 * We have to drop the contents lock to prevent the VM
919da6c28aamw		 * system from trying to reacquire it in ufs_getpage()
9207c478bdstevel@tonic-gate		 * should the uiomove cause a pagefault.
9217c478bdstevel@tonic-gate		 *
9227c478bdstevel@tonic-gate		 * We have to drop the reader vfs_dqrwlock here as well.
9237c478bdstevel@tonic-gate		 */
9247c478bdstevel@tonic-gate		rw_exit(&ip->i_contents);
9257c478bdstevel@tonic-gate		if (do_dqrwlock) {
9267c478bdstevel@tonic-gate			ASSERT(RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock));
9277c478bdstevel@tonic-gate			ASSERT(!(RW_WRITE_HELD(&ufsvfsp->vfs_dqrwlock)));
9287c478bdstevel@tonic-gate			rw_exit(&ufsvfsp->vfs_dqrwlock);
9297c478bdstevel@tonic-gate		}
9307c478bdstevel@tonic-gate
931a565276praks		newpage = 0;
932a565276praks		premove_resid = uio->uio_resid;
9336f5f1c6Donghai Qiao
9346f5f1c6Donghai Qiao		/*
9356f5f1c6Donghai Qiao		 * Touch the page and fault it in if it is not in core
9366f5f1c6Donghai Qiao		 * before segmap_getmapflt or vpm_data_copy can lock it.
9376f5f1c6Donghai Qiao		 * This is to avoid the deadlock if the buffer is mapped
9386f5f1c6Donghai Qiao		 * to the same file through mmap which we want to write.
9396f5f1c6Donghai Qiao		 */
9406f5f1c6Donghai Qiao		uio_prefaultpages((long)n, uio);
9416f5f1c6Donghai Qiao
942a565276praks		if (vpm_enable) {
943a565276praks			/*
944a565276praks			 * Copy data. If new pages are created, part of
945a565276praks			 * the page that is not written will be initizliazed
946a565276praks			 * with zeros.
947a565276praks			 */
948a565276praks			error = vpm_data_copy(vp, (off + mapon), (uint_t)n,
94980d3443frankho			    uio, !pagecreate, &newpage, 0, S_WRITE);
950a565276praks		} else {
951a565276praks
952a565276praks			base = segmap_getmapflt(segkmap, vp, (off + mapon),
95380d3443frankho			    (uint_t)n, !pagecreate, S_WRITE);
9547c478bdstevel@tonic-gate
955a565276praks			/*
956a565276praks			 * segmap_pagecreate() returns 1 if it calls
957a565276praks			 * page_create_va() to allocate any pages.
958a565276praks			 */
9597c478bdstevel@tonic-gate
960a565276praks			if (pagecreate)
961a565276praks				newpage = segmap_pagecreate(segkmap, base,
962a565276praks				    (size_t)n, 0);
9637c478bdstevel@tonic-gate
964a565276praks			error = uiomove(base + mapon, (long)n, UIO_WRITE, uio);
965a565276praks		}
9667c478bdstevel@tonic-gate
9677c478bdstevel@tonic-gate		/*
9687c478bdstevel@tonic-gate		 * If "newpage" is set, then a new page was created and it
9697c478bdstevel@tonic-gate		 * does not contain valid data, so it needs to be initialized
9707c478bdstevel@tonic-gate		 * at this point.
9717c478bdstevel@tonic-gate		 * Otherwise the page contains old data, which was overwritten
9727c478bdstevel@tonic-gate		 * partially or as a whole in uiomove.
9737c478bdstevel@tonic-gate		 * If there is only one iovec structure within uio, then
9747c478bdstevel@tonic-gate		 * on error uiomove will not be able to update uio->uio_loffset
9757c478bdstevel@tonic-gate		 * and we would zero the whole page here!
9767c478bdstevel@tonic-gate		 *
9777c478bdstevel@tonic-gate		 * If uiomove fails because of an error, the old valid data
9787c478bdstevel@tonic-gate		 * is kept instead of filling the rest of the page with zero's.
9797c478bdstevel@tonic-gate		 */
980a565276praks		if (!vpm_enable && newpage &&
9817c478bdstevel@tonic-gate		    uio->uio_loffset < roundup(off + mapon + n, PAGESIZE)) {
9827c478bdstevel@tonic-gate			/*
9837c478bdstevel@tonic-gate			 * We created pages w/o initializing them completely,
9847c478bdstevel@tonic-gate			 * thus we need to zero the part that wasn't set up.
9857c478bdstevel@tonic-gate			 * This happens on most EOF write cases and if
9867c478bdstevel@tonic-gate			 * we had some sort of error during the uiomove.
9877c478bdstevel@tonic-gate			 */
9887c478bdstevel@tonic-gate			int nzero, nmoved;
9897c478bdstevel@tonic-gate
9907c478bdstevel@tonic-gate			nmoved = (int)(uio->uio_loffset - (off + mapon));
9917c478bdstevel@tonic-gate			ASSERT(nmoved >= 0 && nmoved <= n);
9927c478bdstevel@tonic-gate			nzero = roundup(on + n, PAGESIZE) - nmoved;
9937c478bdstevel@tonic-gate			ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE);
9947c478bdstevel@tonic-gate			(void) kzero(base + mapon + nmoved, (uint_t)nzero);
9957c478bdstevel@tonic-gate		}
9967c478bdstevel@tonic-gate
9977c478bdstevel@tonic-gate		/*
9987c478bdstevel@tonic-gate		 * Unlock the pages allocated by page_create_va()
9997c478bdstevel@tonic-gate		 * in segmap_pagecreate()
10007c478bdstevel@tonic-gate		 */
1001a565276praks		if (!vpm_enable && newpage)
10027c478bdstevel@tonic-gate			segmap_pageunlock(segkmap, base, (size_t)n, S_WRITE);
10037c478bdstevel@tonic-gate
10047c478bdstevel@tonic-gate		/*
10057c478bdstevel@tonic-gate		 * If the size of the file changed, then update the
10067c478bdstevel@tonic-gate		 * size field in the inode now.  This can't be done
10077c478bdstevel@tonic-gate		 * before the call to segmap_pageunlock or there is
10087c478bdstevel@tonic-gate		 * a potential deadlock with callers to ufs_putpage().
10097c478bdstevel@tonic-gate		 * They will be holding i_contents and trying to lock
10107c478bdstevel@tonic-gate		 * a page, while this thread is holding a page locked
10117c478bdstevel@tonic-gate		 * and trying to acquire i_contents.
10127c478bdstevel@tonic-gate		 */
10137c478bdstevel@tonic-gate		if (i_size_changed) {
10147c478bdstevel@tonic-gate			rw_enter(&ip->i_contents, RW_WRITER);
10157c478bdstevel@tonic-gate			old_i_size = ip->i_size;
10167c478bdstevel@tonic-gate			UFS_SET_ISIZE(uoff + n, ip);
10177c478bdstevel@tonic-gate			TRANS_INODE(ufsvfsp, ip);
10187c478bdstevel@tonic-gate			/*
10197c478bdstevel@tonic-gate			 * file has grown larger than 2GB. Set flag
10207c478bdstevel@tonic-gate			 * in superblock to indicate this, if it
10217c478bdstevel@tonic-gate			 * is not already set.
10227c478bdstevel@tonic-gate			 */
10237c478bdstevel@tonic-gate			if ((ip->i_size > MAXOFF32_T) &&
10247c478bdstevel@tonic-gate			    !(fs->fs_flags & FSLARGEFILES)) {
10257c478bdstevel@tonic-gate				ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
10267c478bdstevel@tonic-gate				mutex_enter(&ufsvfsp->vfs_lock);
10277c478bdstevel@tonic-gate				fs->fs_flags |= FSLARGEFILES;
10287c478bdstevel@tonic-gate				ufs_sbwrite(ufsvfsp);
10297c478bdstevel@tonic-gate				mutex_exit(&ufsvfsp->vfs_lock);
10307c478bdstevel@tonic-gate			}
10317c478bdstevel@tonic-gate			mutex_enter(&ip->i_tlock);
10327c478bdstevel@tonic-gate			ip->i_writer = NULL;
10337c478bdstevel@tonic-gate			cv_broadcast(&ip->i_wrcv);
10347c478bdstevel@tonic-gate			mutex_exit(&ip->i_tlock);
10357c478bdstevel@tonic-gate			rw_exit(&ip->i_contents);
10367c478bdstevel@tonic-gate		}
10377c478bdstevel@tonic-gate
10387c478bdstevel@tonic-gate		if (error) {
10397c478bdstevel@tonic-gate			/*
10407c478bdstevel@tonic-gate			 * If we failed on a write, we may have already
10417c478bdstevel@tonic-gate			 * allocated file blocks as well as pages.  It's
10427c478bdstevel@tonic-gate			 * hard to undo the block allocation, but we must
10437c478bdstevel@tonic-gate			 * be sure to invalidate any pages that may have
10447c478bdstevel@tonic-gate			 * been allocated.
10457c478bdstevel@tonic-gate			 *
10467c478bdstevel@tonic-gate			 * If the page was created without initialization
10477c478bdstevel@tonic-gate			 * then we must check if it should be possible
10487c478bdstevel@tonic-gate			 * to destroy the new page and to keep the old data
10497c478bdstevel@tonic-gate			 * on the disk.
10507c478bdstevel@tonic-gate			 *
10517c478bdstevel@tonic-gate			 * It is possible to destroy the page without
10527c478bdstevel@tonic-gate			 * having to write back its contents only when
10537c478bdstevel@tonic-gate			 * - the size of the file keeps unchanged
10547c478bdstevel@tonic-gate			 * - bmap_write() did not allocate new disk blocks
10557c478bdstevel@tonic-gate			 *   it is possible to create big files using "seek" and
10567c478bdstevel@tonic-gate			 *   write to the end of the file. A "write" to a
10577c478bdstevel@tonic-gate			 *   position before the end of the file would not
10587c478bdstevel@tonic-gate			 *   change the size of the file but it would allocate
10597c478bdstevel@tonic-gate			 *   new disk blocks.
10607c478bdstevel@tonic-gate			 * - uiomove intended to overwrite the whole page.
10617c478bdstevel@tonic-gate			 * - a new page was created (newpage == 1).
10627c478bdstevel@tonic-gate			 */
10637c478bdstevel@tonic-gate
10647c478bdstevel@tonic-gate			if (i_size_changed == 0 && new_iblocks == 0 &&
10657c478bdstevel@tonic-gate			    newpage) {
10667c478bdstevel@tonic-gate
10677c478bdstevel@tonic-gate				/* unwind what uiomove eventually last did */
10687c478bdstevel@tonic-gate				uio->uio_resid = premove_resid;
10697c478bdstevel@tonic-gate
10707c478bdstevel@tonic-gate				/*
10717c478bdstevel@tonic-gate				 * destroy the page, do not write ambiguous
10727c478bdstevel@tonic-gate				 * data to the disk.
10737c478bdstevel@tonic-gate				 */
10747c478bdstevel@tonic-gate				flags = SM_DESTROY;
10757c478bdstevel@tonic-gate			} else {
10767c478bdstevel@tonic-gate				/*
10777c478bdstevel@tonic-gate				 * write the page back to the disk, if dirty,
10787c478bdstevel@tonic-gate				 * and remove the page from the cache.
10797c478bdstevel@tonic-gate				 */
10807c478bdstevel@tonic-gate				flags = SM_INVAL;
10817c478bdstevel@tonic-gate			}
1082a565276praks
1083a565276praks			if (vpm_enable) {
1084a565276praks				/*
1085a565276praks				 *  Flush pages.
1086a565276praks				 */
1087a565276praks				(void) vpm_sync_pages(vp, off, n, flags);
1088a565276praks			} else {
1089a565276praks				(void) segmap_release(segkmap, base, flags);
1090a565276praks			}
10917c478bdstevel@tonic-gate		} else {
10927c478bdstevel@tonic-gate			flags = 0;
10937c478bdstevel@tonic-gate			/*
10947c478bdstevel@tonic-gate			 * Force write back for synchronous write cases.
10957c478bdstevel@tonic-gate			 */
10967c478bdstevel@tonic-gate			if ((ioflag & (FSYNC|FDSYNC)) || type == IFDIR) {
10977c478bdstevel@tonic-gate				/*
10987c478bdstevel@tonic-gate				 * If the sticky bit is set but the
10997c478bdstevel@tonic-gate				 * execute bit is not set, we do a
11007c478bdstevel@tonic-gate				 * synchronous write back and free
11017c478bdstevel@tonic-gate				 * the page when done.  We set up swap
11027c478bdstevel@tonic-gate				 * files to be handled this way to
11037c478bdstevel@tonic-gate				 * prevent servers from keeping around
11047c478bdstevel@tonic-gate				 * the client's swap pages too long.
11057c478bdstevel@tonic-gate				 * XXX - there ought to be a better way.
11067c478bdstevel@tonic-gate				 */
11077c478bdstevel@tonic-gate				if (IS_SWAPVP(vp)) {
11087c478bdstevel@tonic-gate					flags = SM_WRITE | SM_FREE |
11097c478bdstevel@tonic-gate					    SM_DONTNEED;
11107c478bdstevel@tonic-gate					iupdat_flag = 0;
11117c478bdstevel@tonic-gate				} else {
11127c478bdstevel@tonic-gate					flags = SM_WRITE;
11137c478bdstevel@tonic-gate				}
11147c478bdstevel@tonic-gate			} else if (n + on == MAXBSIZE || IS_SWAPVP(vp)) {
11157c478bdstevel@tonic-gate				/*
11167c478bdstevel@tonic-gate				 * Have written a whole block.
11177c478bdstevel@tonic-gate				 * Start an asynchronous write and
11187c478bdstevel@tonic-gate				 * mark the buffer to indicate that
11197c478bdstevel@tonic-gate				 * it won't be needed again soon.
11207c478bdstevel@tonic-gate				 */
11217c478bdstevel@tonic-gate				flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
11227c478bdstevel@tonic-gate			}
1123a565276praks			if (vpm_enable) {
1124a565276praks				/*
1125a565276praks				 * Flush pages.
1126a565276praks				 */
11273bd1497praks				error = vpm_sync_pages(vp, off, n, flags);
1128a565276praks			} else {
11293bd1497praks				error = segmap_release(segkmap, base, flags);
1130a565276praks			}
11317c478bdstevel@tonic-gate			/*
11327c478bdstevel@tonic-gate			 * If the operation failed and is synchronous,
11337c478bdstevel@tonic-gate			 * then we need to unwind what uiomove() last
11347c478bdstevel@tonic-gate			 * did so we can potentially return an error to
11357c478bdstevel@tonic-gate			 * the caller.  If this write operation was
11367c478bdstevel@tonic-gate			 * done in two pieces and the first succeeded,
11377c478bdstevel@tonic-gate			 * then we won't return an error for the second
11387c478bdstevel@tonic-gate			 * piece that failed.  However, we only want to
11397c478bdstevel@tonic-gate			 * return a resid value that reflects what was
11407c478bdstevel@tonic-gate			 * really done.
11417c478bdstevel@tonic-gate			 *
11427c478bdstevel@tonic-gate			 * Failures for non-synchronous operations can
11437c478bdstevel@tonic-gate			 * be ignored since the page subsystem will
11447c478bdstevel@tonic-gate			 * retry the operation until it succeeds or the
11457c478bdstevel@tonic-gate			 * file system is unmounted.
11467c478bdstevel@tonic-gate			 */
11477c478bdstevel@tonic-gate			if (error) {
11487c478bdstevel@tonic-gate				if ((ioflag & (FSYNC | FDSYNC)) ||
11497c478bdstevel@tonic-gate				    type == IFDIR) {
11507c478bdstevel@tonic-gate					uio->uio_resid = premove_resid;
11517c478bdstevel@tonic-gate				} else {
11527c478bdstevel@tonic-gate					error = 0;
11537c478bdstevel@tonic-gate				}
11547c478bdstevel@tonic-gate			}
11557c478bdstevel@tonic-gate		}
11567c478bdstevel@tonic-gate
11577c478bdstevel@tonic-gate		/*
11587c478bdstevel@tonic-gate		 * Re-acquire contents lock.
11597c478bdstevel@tonic-gate		 * If it was dropped, reacquire reader vfs_dqrwlock as well.
11607c478bdstevel@tonic-gate		 */
11617c478bdstevel@tonic-gate		if (do_dqrwlock)
11627c478bdstevel@tonic-gate			rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
11637c478bdstevel@tonic-gate		rw_enter(&ip->i_contents, RW_WRITER);
11647c478bdstevel@tonic-gate
11657c478bdstevel@tonic-gate		/*
11667c478bdstevel@tonic-gate		 * If the uiomove() failed or if a synchronous
11677c478bdstevel@tonic-gate		 * page push failed, fix up i_size.
11687c478bdstevel@tonic-gate		 */
11697c478bdstevel@tonic-gate		if (error) {
11707c478bdstevel@tonic-gate			if (i_size_changed) {
11717c478bdstevel@tonic-gate				/*
11727c478bdstevel@tonic-gate				 * The uiomove failed, and we
11737c478bdstevel@tonic-gate				 * allocated blocks,so get rid
11747c478bdstevel@tonic-gate				 * of them.
11757c478bdstevel@tonic-gate				 */
11767c478bdstevel@tonic-gate				(void) ufs_itrunc(ip, old_i_size, 0, cr);
11777c478bdstevel@tonic-gate			}
11787c478bdstevel@tonic-gate		} else {
11797c478bdstevel@tonic-gate			/*
11807c478bdstevel@tonic-gate			 * XXX - Can this be out of the loop?
11817c478bdstevel@tonic-gate			 */
11827c478bdstevel@tonic-gate			ip->i_flag |= IUPD | ICHG;
11837c478bdstevel@tonic-gate			/*
11847c478bdstevel@tonic-gate			 * Only do one increase of i_seq for multiple
11857c478bdstevel@tonic-gate			 * pieces.  Because we drop locks, record
11867c478bdstevel@tonic-gate			 * the fact that we changed the timestamp and
11877c478bdstevel@tonic-gate			 * are deferring the increase in case another thread
11887c478bdstevel@tonic-gate			 * pushes our timestamp update.
11897c478bdstevel@tonic-gate			 */
11907c478bdstevel@tonic-gate			i_seq_needed = 1;
11917c478bdstevel@tonic-gate			ip->i_flag |= ISEQ;
11927c478bdstevel@tonic-gate			if (i_size_changed)
11937c478bdstevel@tonic-gate				ip->i_flag |= IATTCHG;
11947c478bdstevel@tonic-gate			if ((ip->i_mode & (IEXEC | (IEXEC >> 3) |
11957c478bdstevel@tonic-gate			    (IEXEC >> 6))) != 0 &&
11967c478bdstevel@tonic-gate			    (ip->i_mode & (ISUID | ISGID)) != 0 &&
11977c478bdstevel@tonic-gate			    secpolicy_vnode_setid_retain(cr,
11987c478bdstevel@tonic-gate			    (ip->i_mode & ISUID) != 0 && ip->i_uid == 0) != 0) {
11997c478bdstevel@tonic-gate				/*
12007c478bdstevel@tonic-gate				 * Clear Set-UID & Set-GID bits on
12017c478bdstevel@tonic-gate				 * successful write if not privileged
12027c478bdstevel@tonic-gate				 * and at least one of the execute bits
12037c478bdstevel@tonic-gate				 * is set.  If we always clear Set-GID,
12047c478bdstevel@tonic-gate				 * mandatory file and record locking is
12057c478bdstevel@tonic-gate				 * unuseable.
12067c478bdstevel@tonic-gate				 */
12077c478bdstevel@tonic-gate				ip->i_mode &= ~(ISUID | ISGID);
12087c478bdstevel@tonic-gate			}
12097c478bdstevel@tonic-gate		}
1210f90bab2swilcox		/*
1211f90bab2swilcox		 * In the case the FDSYNC flag is set and this is a
1212f90bab2swilcox		 * "rewrite" we won't log a delta.
1213f90bab2swilcox		 * The FSYNC flag overrides all cases.
1214f90bab2swilcox		 */
1215f90bab2swilcox		if (!ufs_check_rewrite(ip, uio, ioflag) || !(ioflag & FDSYNC)) {
1216f90bab2swilcox			TRANS_INODE(ufsvfsp, ip);
1217f90bab2swilcox		}
12187c478bdstevel@tonic-gate	} while (error == 0 && uio->uio_resid > 0 && n != 0);
12197c478bdstevel@tonic-gate
12207c478bdstevel@tonic-gateout:
12217c478bdstevel@tonic-gate	/*
12227c478bdstevel@tonic-gate	 * Make sure i_seq is increased at least once per write
12237c478bdstevel@tonic-gate	 */
12247c478bdstevel@tonic-gate	if (i_seq_needed) {
12257c478bdstevel@tonic-gate		ip->i_seq++;
12267c478bdstevel@tonic-gate		ip->i_flag &= ~ISEQ;	/* no longer deferred */
12277c478bdstevel@tonic-gate	}
12287c478bdstevel@tonic-gate
12297c478bdstevel@tonic-gate	/*
12307c478bdstevel@tonic-gate	 * Inode is updated according to this table -
12317c478bdstevel@tonic-gate	 *
12327c478bdstevel@tonic-gate	 *   FSYNC	  FDSYNC(posix.4)
12337c478bdstevel@tonic-gate	 *   --------------------------
12347c478bdstevel@tonic-gate	 *   always@	  IATTCHG|IBDWRITE
12357c478bdstevel@tonic-gate	 *
12362575b44Toomas Soome	 * @ -	If we are doing synchronous write the only time we should
12377c478bdstevel@tonic-gate	 *	not be sync'ing the ip here is if we have the stickyhack
12387c478bdstevel@tonic-gate	 *	activated, the file is marked with the sticky bit and
12397c478bdstevel@tonic-gate	 *	no exec bit, the file length has not been changed and
12407c478bdstevel@tonic-gate	 *	no new blocks have been allocated during this write.
12417c478bdstevel@tonic-gate	 */
12427c478bdstevel@tonic-gate
12437c478bdstevel@tonic-gate	if ((ip->i_flag & ISYNC) != 0) {
12447c478bdstevel@tonic-gate		/*
12457c478bdstevel@tonic-gate		 * we have eliminated nosync
12467c478bdstevel@tonic-gate		 */
12477c478bdstevel@tonic-gate		if ((ip->i_flag & (IATTCHG|IBDWRITE)) ||
124880d3443frankho		    ((ioflag & FSYNC) && iupdat_flag)) {
12497c478bdstevel@tonic-gate			ufs_iupdat(ip, 1);
12507c478bdstevel@tonic-gate		}
12517c478bdstevel@tonic-gate	}
12527c478bdstevel@tonic-gate
12537c478bdstevel@tonic-gate	/*
12547c478bdstevel@tonic-gate	 * If we've already done a partial-write, terminate
12557c478bdstevel@tonic-gate	 * the write but return no error unless the error is ENOSPC
12567c478bdstevel@tonic-gate	 * because the caller can detect this and free resources and
12577c478bdstevel@tonic-gate	 * try again.
12587c478bdstevel@tonic-gate	 */
12597c478bdstevel@tonic-gate	if ((start_resid != uio->uio_resid) && (error != ENOSPC))
12607c478bdstevel@tonic-gate		error = 0;
12617c478bdstevel@tonic-gate
12627c478bdstevel@tonic-gate	ip->i_flag &= ~(INOACC | ISYNC);
12637c478bdstevel@tonic-gate	ITIMES_NOLOCK(ip);
12647c478bdstevel@tonic-gate	return (error);
12657c478bdstevel@tonic-gate}
12667c478bdstevel@tonic-gate
12677c478bdstevel@tonic-gate/*
12687c478bdstevel@tonic-gate * rdip does the real work of read requests for ufs.
12697c478bdstevel@tonic-gate */
/*
 * rdip() copies data from the file into the caller's uio, one
 * file-system block at a time, going through either the VPM interface
 * or a segmap mapping plus uiomove().  A direct-I/O fast path is tried
 * first when enabled on the inode or forced on the file system.
 *
 * ip     - inode to read; caller holds i_contents (reader or writer).
 * uio    - describes the destination buffer, offset, and residual count.
 * ioflag - FSYNC/FDSYNC/FRSYNC flags controlling page flush and inode
 *          update behavior on synchronous reads.
 * cr     - credentials, passed through to ufs_directio_read().
 *
 * Returns 0 on success (including a partial read), otherwise an errno.
 */
int
rdip(struct inode *ip, struct uio *uio, int ioflag, cred_t *cr)
{
	u_offset_t off;		/* MAXBSIZE-aligned segmap window offset */
	caddr_t base;		/* kernel address of the segmap mapping */
	struct fs *fs;
	struct ufsvfs *ufsvfsp;
	struct vnode *vp;
	long oresid = uio->uio_resid;	/* resid at entry; detects partial reads */
	u_offset_t n, on, mapon;
	int error = 0;
	int doupdate = 1;
	uint_t flags;
	int directio_status;
	krw_t rwtype;
	o_mode_t type;

	vp = ITOV(ip);

	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	ufsvfsp = ip->i_ufsvfs;

	/* A forcibly unmounted file system has no vfs; fail the read. */
	if (ufsvfsp == NULL)
		return (EIO);

	fs = ufsvfsp->vfs_fs;

	/* check for valid filetype */
	type = ip->i_mode & IFMT;
	if ((type != IFREG) && (type != IFDIR) && (type != IFATTRDIR) &&
	    (type != IFLNK) && (type != IFSHAD)) {
		return (EIO);
	}

	/*
	 * A read starting beyond the maximum supported offset is treated
	 * as EOF: success with nothing transferred.
	 */
	if (uio->uio_loffset > UFS_MAXOFFSET_T) {
		error = 0;
		goto out;
	}
	if (uio->uio_loffset < (offset_t)0) {
		return (EINVAL);
	}
	if (uio->uio_resid == 0) {
		return (0);
	}

	/*
	 * Mark the inode for an access-time update unless atime updates
	 * are suppressed (lockfs NOIACC, read-only mount, or noatime).
	 * The actual timestamp is applied later via ITIMES().
	 */
	if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) && (fs->fs_ronly == 0) &&
	    (!ufsvfsp->vfs_noatime)) {
		mutex_enter(&ip->i_tlock);
		ip->i_flag |= IACC;
		mutex_exit(&ip->i_tlock);
	}
	/*
	 * Try to go direct.  Only DIRECTIO_SUCCESS short-circuits the
	 * cached path below; any other status falls through and the
	 * read is retried through the page cache.
	 */
	if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) {
		error = ufs_directio_read(ip, uio, cr, &directio_status);
		if (directio_status == DIRECTIO_SUCCESS)
			goto out;
	}

	/*
	 * Remember how i_contents is held so we can drop and re-acquire
	 * it the same way around the copy (see the deadlock note below).
	 */
	rwtype = (rw_write_held(&ip->i_contents)?RW_WRITER:RW_READER);

	do {
		offset_t diff;
		u_offset_t uoff = uio->uio_loffset;
		/*
		 * Split the file offset into a MAXBSIZE-aligned segmap
		 * window (off), the offset within that window (mapon),
		 * and the offset within the file-system block (on).
		 * Transfer at most the remainder of this fs block.
		 */
		off = uoff & (offset_t)MAXBMASK;
		mapon = (u_offset_t)(uoff & (offset_t)MAXBOFFSET);
		on = (u_offset_t)blkoff(fs, uoff);
		n = MIN((u_offset_t)fs->fs_bsize - on,
		    (u_offset_t)uio->uio_resid);

		/* Bytes remaining before EOF; <= 0 means we are at/past it. */
		diff = ip->i_size - uoff;

		if (diff <= (offset_t)0) {
			error = 0;
			goto out;
		}
		if (diff < (offset_t)n)
			n = (int)diff;	/* clamp the transfer at EOF */

		/*
		 * At this point we can enter ufs_getpage() in one of two
		 * ways:
		 * 1) segmap_getmapflt() calls ufs_getpage() when the
		 *    forcefault parameter is true (value of 1 is passed)
		 * 2) uiomove() causes a page fault.
		 *
		 * We cannot hold onto an i_contents reader lock without
		 * risking deadlock in ufs_getpage() so drop a reader lock.
		 * The ufs_getpage() dolock logic already allows for a
		 * thread holding i_contents as writer to work properly
		 * so we keep a writer lock.
		 */
		if (rwtype == RW_READER)
			rw_exit(&ip->i_contents);

		if (vpm_enable) {
			/*
			 * Copy data through the VPM (kernel page mapping)
			 * interface; no explicit segmap mapping needed.
			 */
			error = vpm_data_copy(vp, (off + mapon), (uint_t)n,
			    uio, 1, NULL, 0, S_READ);
		} else {
			/*
			 * Map the window (faulting the pages in, forcefault
			 * == 1) and copy out of the mapping.
			 */
			base = segmap_getmapflt(segkmap, vp, (off + mapon),
			    (uint_t)n, 1, S_READ);
			error = uiomove(base + mapon, (long)n, UIO_READ, uio);
		}

		flags = 0;
		if (!error) {
			/*
			 * In POSIX SYNC (FSYNC and FDSYNC) read mode,
			 * we want to make sure that the page which has
			 * been read, is written on disk if it is dirty.
			 * And corresponding indirect blocks should also
			 * be flushed out.
			 */
			if ((ioflag & FRSYNC) && (ioflag & (FSYNC|FDSYNC))) {
				flags |= SM_WRITE;
			}
			if (vpm_enable) {
				error = vpm_sync_pages(vp, off, n, flags);
			} else {
				error = segmap_release(segkmap, base, flags);
			}
		} else {
			/*
			 * The copy failed; still release the mapping, but
			 * ignore the release status so the copy error is
			 * the one reported.
			 */
			if (vpm_enable) {
				(void) vpm_sync_pages(vp, off, n, flags);
			} else {
				(void) segmap_release(segkmap, base, flags);
			}
		}

		/* Re-acquire i_contents if we dropped it above. */
		if (rwtype == RW_READER)
			rw_enter(&ip->i_contents, rwtype);
	} while (error == 0 && uio->uio_resid > 0 && n != 0);
out:
	/*
	 * Inode is updated according to this table if FRSYNC is set.
	 *
	 *   FSYNC	  FDSYNC(posix.4)
	 *   --------------------------
	 *   always	  IATTCHG|IBDWRITE
	 */
	/*
	 * The inode is not updated if we're logging and the inode is a
	 * directory with FRSYNC, FSYNC and FDSYNC flags set.
	 */
	if (ioflag & FRSYNC) {
		if (TRANS_ISTRANS(ufsvfsp) && ((ip->i_mode & IFMT) == IFDIR)) {
			doupdate = 0;
		}
		if (doupdate) {
			if ((ioflag & FSYNC) ||
			    ((ioflag & FDSYNC) &&
			    (ip->i_flag & (IATTCHG|IBDWRITE)))) {
				ufs_iupdat(ip, 1);
			}
		}
	}
	/*
	 * If we've already done a partial read, terminate
	 * the read but return no error.
	 */
	if (oresid != uio->uio_resid)
		error = 0;
	/* Apply any pending timestamp updates (e.g. IACC set above). */
	ITIMES(ip);

	return (error);
}
14417c478bdstevel@tonic-gate
14427c478bdstevel@tonic-gate/* ARGSUSED */
14437c478bdstevel@tonic-gatestatic int
14447c478bdstevel@tonic-gateufs_ioctl(
14457c478bdstevel@tonic-gate	struct vnode	*vp,
14467c478bdstevel@tonic-gate	int		cmd,
14477c478bdstevel@tonic-gate	intptr_t	arg,
14487c478bdstevel@tonic-gate	int		flag,
14497c478bdstevel@tonic-gate	struct cred	*cr,
1450da6c28aamw	int		*rvalp,
1451da6c28aamw	caller_context_t *ct)
14527c478bdstevel@tonic-gate{
14537c478bdstevel@tonic-gate	struct lockfs	lockfs, lockfs_out;
14547c478bdstevel@tonic-gate	struct ufsvfs	*ufsvfsp = VTOI(vp)->i_ufsvfs;
14557c478bdstevel@tonic-gate	char		*comment, *original_comment;
14567c478bdstevel@tonic-gate	struct fs	*fs;
14577c478bdstevel@tonic-gate	struct ulockfs	*ulp;