/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved  	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/ksynch.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/dnlc.h>
#include <sys/conf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/vtrace.h>
#include <sys/policy.h>

#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_lockfs.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_snap.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_panic.h>
#include <sys/fs/ufs_bio.h>
#include <sys/dirent.h>		/* must be AFTER <sys/fs/ufs_fsdir.h>! */
#include <sys/errno.h>
#include <sys/fssnap_if.h>
#include <sys/unistd.h>
#include <sys/sunddi.h>

#include <sys/filio.h>		/* _FIOIO */

#include <vm/hat.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>
#include <vm/rm.h>
#include <sys/swap.h>

#include <fs/fs_subr.h>

static struct instats ins;

static	int ufs_getpage_ra(struct vnode *, u_offset_t, struct seg *, caddr_t);
static	int ufs_getpage_miss(struct vnode *, u_offset_t, size_t, struct seg *,
		caddr_t, struct page **, size_t, enum seg_rw, int);
static	int ufs_open(struct vnode **, int, struct cred *);
static	int ufs_close(struct vnode *, int, int, offset_t, struct cred *);
static	int ufs_read(struct vnode *, struct uio *, int, struct cred *,
			struct caller_context *);
static	int ufs_write(struct vnode *, struct uio *, int, struct cred *,
			struct caller_context *);
static	int ufs_ioctl(struct vnode *, int, intptr_t, int, struct cred *, int *);
static	int ufs_getattr(struct vnode *, struct vattr *, int, struct cred *);
static	int ufs_setattr(struct vnode *, struct vattr *, int, struct cred *,
			caller_context_t *);
static	int ufs_access(struct vnode *, int, int, struct cred *);
static	int ufs_lookup(struct vnode *, char *, struct vnode **,
		struct pathname *, int, struct vnode *, struct cred *);
static	int ufs_create(struct vnode *, char *, struct vattr *, enum vcexcl,
			int, struct vnode **, struct cred *, int);
static	int ufs_remove(struct vnode *, char *, struct cred *);
static	int ufs_link(struct vnode *, struct vnode *, char *, struct cred *);
static	int ufs_rename(struct vnode *, char *, struct vnode *, char *,
			struct cred *);
static	int ufs_mkdir(struct vnode *, char *, struct vattr *, struct vnode **,
			struct cred *);
static	int ufs_rmdir(struct vnode *, char *, struct vnode *, struct cred *);
static	int ufs_readdir(struct vnode *, struct uio *, struct cred *, int *);
static	int ufs_symlink(struct vnode *, char *, struct vattr *, char *,
			struct cred *);
static	int ufs_readlink(struct vnode *, struct uio *, struct cred *);
static	int ufs_fsync(struct vnode *, int, struct cred *);
static	void ufs_inactive(struct vnode *, struct cred *);
static	int ufs_fid(struct vnode *, struct fid *);
static	int ufs_rwlock(struct vnode *, int, caller_context_t *);
static	void ufs_rwunlock(struct vnode *, int, caller_context_t *);
static	int ufs_seek(struct vnode *, offset_t, offset_t *);
static	int ufs_frlock(struct vnode *, int, struct flock64 *, int, offset_t,
			struct flk_callback *, struct cred *);
static	int ufs_space(struct vnode *, int, struct flock64 *, int, offset_t,
		cred_t *, caller_context_t *);
static	int ufs_getpage(struct vnode *, offset_t, size_t, uint_t *,
		struct page **, size_t, struct seg *, caddr_t,
		enum seg_rw, struct cred *);
static	int ufs_putpage(struct vnode *, offset_t, size_t, int, struct cred *);
static	int ufs_putpages(struct vnode *, offset_t, size_t, int, struct cred *);
static	int ufs_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t,
			uchar_t, uchar_t, uint_t, struct cred *);
static	int ufs_addmap(struct vnode *, offset_t, struct as *, caddr_t,  size_t,
			uchar_t, uchar_t, uint_t, struct cred *);
static	int ufs_delmap(struct vnode *, offset_t, struct as *, caddr_t,  size_t,
			uint_t, uint_t, uint_t, struct cred *);
static	int ufs_poll(vnode_t *, short, int, short *, struct pollhead **);
static	int ufs_dump(vnode_t *, caddr_t, int, int);
static	int ufs_l_pathconf(struct vnode *, int, ulong_t *, struct cred *);
static	int ufs_pageio(struct vnode *, struct page *, u_offset_t, size_t, int,
			struct cred *);
static	int ufs_dumpctl(vnode_t *, int, int *);
static	daddr32_t *save_dblks(struct inode *, struct ufsvfs *, daddr32_t *,
			daddr32_t *, int, int);
static	int ufs_getsecattr(struct vnode *, vsecattr_t *, int, struct cred *);
static	int ufs_setsecattr(struct vnode *, vsecattr_t *, int, struct cred *);

/*
 * For lockfs: ulockfs begin/end is now inlined in the ufs_xxx functions.
 *
 * XXX - ULOCKFS in fs_pathconf and ufs_ioctl is not inlined yet.
 */
struct vnodeops *ufs_vnodeops;

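/*
 * This template pairs each VOPNAME_* operation with its UFS handler.
 * At file system initialization (outside this file) it is handed to
 * vn_make_ops() to construct ufs_vnodeops, the operations vector that
 * is installed on every UFS vnode.
 */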
const fs_operation_def_t ufs_vnodeops_template[] = {
	VOPNAME_OPEN, ufs_open,	/* will not be blocked by lockfs */
	VOPNAME_CLOSE, ufs_close,	/* will not be blocked by lockfs */
	VOPNAME_READ, ufs_read,
	VOPNAME_WRITE, ufs_write,
	VOPNAME_IOCTL, ufs_ioctl,
	VOPNAME_GETATTR, ufs_getattr,
	VOPNAME_SETATTR, ufs_setattr,
	VOPNAME_ACCESS, ufs_access,
	VOPNAME_LOOKUP, ufs_lookup,
	VOPNAME_CREATE, ufs_create,
	VOPNAME_REMOVE, ufs_remove,
	VOPNAME_LINK, ufs_link,
	VOPNAME_RENAME, ufs_rename,
	VOPNAME_MKDIR, ufs_mkdir,
	VOPNAME_RMDIR, ufs_rmdir,
	VOPNAME_READDIR, ufs_readdir,
	VOPNAME_SYMLINK, ufs_symlink,
	VOPNAME_READLINK, ufs_readlink,
	VOPNAME_FSYNC, ufs_fsync,
	VOPNAME_INACTIVE, (fs_generic_func_p) ufs_inactive,  /* not blocked */
	VOPNAME_FID, ufs_fid,
	VOPNAME_RWLOCK, ufs_rwlock, /* not blocked */
	VOPNAME_RWUNLOCK, (fs_generic_func_p) ufs_rwunlock,  /* not blocked */
	VOPNAME_SEEK, ufs_seek,
	VOPNAME_FRLOCK, ufs_frlock,
	VOPNAME_SPACE, ufs_space,
	VOPNAME_GETPAGE, ufs_getpage,
	VOPNAME_PUTPAGE, ufs_putpage,
	VOPNAME_MAP, (fs_generic_func_p) ufs_map,
	VOPNAME_ADDMAP, (fs_generic_func_p) ufs_addmap,	/* not blocked */
	VOPNAME_DELMAP, ufs_delmap,	/* will not be blocked by lockfs */
	VOPNAME_POLL, (fs_generic_func_p) ufs_poll,	/* not blocked */
	VOPNAME_DUMP, ufs_dump,
	VOPNAME_PATHCONF, ufs_l_pathconf,
	VOPNAME_PAGEIO, ufs_pageio,
	VOPNAME_DUMPCTL, ufs_dumpctl,
	VOPNAME_GETSECATTR, ufs_getsecattr,
	VOPNAME_SETSECATTR, ufs_setsecattr,
	VOPNAME_VNEVENT, fs_vnevent_support,
	NULL, NULL
};

#define	MAX_BACKFILE_COUNT	9999

/*
 * Created by ufs_dumpctl() to store a file's disk block info into memory.
 * Used by ufs_dump() to dump data to disk directly.
 */
struct dump {
	struct inode	*ip;		/* the file we contain */
	daddr_t		fsbs;		/* number of blocks stored */
	struct timeval32 time;		/* time stamp for the struct */
	daddr32_t	dblk[1];	/* place holder for block info */
};
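/*
 * Note that dblk[1] is the old C idiom for a variable-length trailing
 * array: the struct is presumably allocated with enough extra space
 * for the file's full list of block numbers, which save_dblks() then
 * fills in.
 */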

static struct dump *dump_info = NULL;

/*
 * Previously there was no special action required for ordinary files.
 * (Devices are handled through the device file system.)
 * Now we support large files, and the large file API requires open to
 * fail if the file is large.
 * We could take care to prevent data corruption by doing an atomic
 * check of size and truncate if the file is opened with the FTRUNC
 * flag set, but traditionally this has been done by the vfs/vnode
 * layers.  Taking care of truncation here would change the existing
 * semantics of VOP_OPEN, so we chose not to implement anything here.
 * The check for file size > 2GB is done at the vfs layer in routine
 * vn_open().
 */

/* ARGSUSED */
static int
ufs_open(struct vnode **vpp, int flag, struct cred *cr)
{
	TRACE_1(TR_FAC_UFS, TR_UFS_OPEN, "ufs_open:vpp %p", vpp);
	return (0);
}

/*ARGSUSED*/
static int
ufs_close(struct vnode *vp, int flag, int count, offset_t offset,
	struct cred *cr)
{
	TRACE_1(TR_FAC_UFS, TR_UFS_CLOSE, "ufs_close:vp %p", vp);

	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
	cleanshares(vp, ttoproc(curthread)->p_pid);

	/*
	 * Push partially filled cluster at last close.
	 * ``last close'' is approximated because the dnlc
	 * may have a hold on the vnode.
	 * Checking for VBAD here will also act as a forced umount check.
	 */
	if (vp->v_count <= 2 && vp->v_type != VBAD) {
		struct inode *ip = VTOI(vp);
		if (ip->i_delaylen) {
			ins.in_poc.value.ul++;
			(void) ufs_putpages(vp, ip->i_delayoff, ip->i_delaylen,
					B_ASYNC | B_FREE, cr);
			ip->i_delaylen = 0;
		}
	}

	return (0);
}

/*ARGSUSED*/
static int
ufs_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr,
	struct caller_context *ct)
{
	struct inode *ip = VTOI(vp);
	struct ufsvfs *ufsvfsp;
	struct ulockfs *ulp = NULL;
	int error = 0;
	int intrans = 0;

	ASSERT(RW_READ_HELD(&ip->i_rwlock));
	TRACE_3(TR_FAC_UFS, TR_UFS_READ_START,
		"ufs_read_start:vp %p uiop %p ioflag %x",
		vp, uiop, ioflag);

	/*
	 * Mandatory locking needs to be done before ufs_lockfs_begin()
	 * and TRANS_BEGIN_SYNC() calls since mandatory locks can sleep.
	 */
	if (MANDLOCK(vp, ip->i_mode)) {
		/*
		 * ufs_getattr ends up being called by chklock
		 */
		error = chklock(vp, FREAD, uiop->uio_loffset,
				uiop->uio_resid, uiop->uio_fmode, ct);
		if (error)
			goto out;
	}

	ufsvfsp = ip->i_ufsvfs;
	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READ_MASK);
	if (error)
		goto out;

	/*
	 * When a directory is opened for reading as a file
	 * (eg "cat .") with the O_RSYNC, O_SYNC and O_DSYNC flags set,
	 * the locking order had to be changed to avoid a deadlock with
	 * an update taking place on that directory at the same time.
	 */
	if ((ip->i_mode & IFMT) == IFDIR) {

		rw_enter(&ip->i_contents, RW_READER);
		error = rdip(ip, uiop, ioflag, cr);
		rw_exit(&ip->i_contents);

		if (error) {
			if (ulp)
				ufs_lockfs_end(ulp);
			goto out;
		}

		if (ulp && (ioflag & FRSYNC) && (ioflag & (FSYNC | FDSYNC)) &&
		    TRANS_ISTRANS(ufsvfsp)) {
			rw_exit(&ip->i_rwlock);
			TRANS_BEGIN_SYNC(ufsvfsp, TOP_READ_SYNC, TOP_READ_SIZE,
			    error);
			ASSERT(!error);
			TRANS_END_SYNC(ufsvfsp, error, TOP_READ_SYNC,
			    TOP_READ_SIZE);
			rw_enter(&ip->i_rwlock, RW_READER);
		}
	} else {
		/*
		 * Only transact reads to files opened for sync-read and
		 * sync-write on a file system that is not write locked.
		 *
		 * The ``not write locked'' check prevents problems with
		 * enabling/disabling logging on a busy file system.  E.g.,
		 * logging exists at the beginning of the read but does not
		 * at the end.
		 */
		if (ulp && (ioflag & FRSYNC) && (ioflag & (FSYNC | FDSYNC)) &&
		    TRANS_ISTRANS(ufsvfsp)) {
			TRANS_BEGIN_SYNC(ufsvfsp, TOP_READ_SYNC, TOP_READ_SIZE,
			    error);
			ASSERT(!error);
			intrans = 1;
		}

		rw_enter(&ip->i_contents, RW_READER);
		error = rdip(ip, uiop, ioflag, cr);
		rw_exit(&ip->i_contents);

		if (intrans) {
			TRANS_END_SYNC(ufsvfsp, error, TOP_READ_SYNC,
			    TOP_READ_SIZE);
		}
	}

	if (ulp) {
		ufs_lockfs_end(ulp);
	}
out:

	TRACE_2(TR_FAC_UFS, TR_UFS_READ_END,
		"ufs_read_end:vp %p error %d", vp, error);
	return (error);
}

extern	int	ufs_HW;		/* high water mark */
extern	int	ufs_LW;		/* low water mark */
int	ufs_WRITES = 1;		/* XXX - enable/disable */
int	ufs_throttles = 0;	/* throttling count */
int	ufs_allow_shared_writes = 1;	/* directio shared writes */

static int
ufs_check_rewrite(struct inode *ip, struct uio *uiop, int ioflag)
{
	/*
	 * Filter to determine if this request is suitable as a
	 * concurrent rewrite. This write must not allocate blocks
	 * by extending the file or filling in holes. There is no point
	 * trying this through FSYNC descriptors, as the inode will be
	 * synchronously updated after the write. The uio structure has
	 * not yet been checked for sanity, so assume nothing.
	 */
	return (((ip->i_mode & IFMT) == IFREG) && !(ioflag & FAPPEND) &&
		(uiop->uio_loffset >= (offset_t)0) &&
		(uiop->uio_loffset < ip->i_size) && (uiop->uio_resid > 0) &&
		((ip->i_size - uiop->uio_loffset) >= uiop->uio_resid) &&
		!(ioflag & FSYNC) && !bmap_has_holes(ip) &&
		ufs_allow_shared_writes);
}

/*ARGSUSED*/
static int
ufs_write(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cr,
	caller_context_t *ct)
{
	struct inode *ip = VTOI(vp);
	struct ufsvfs *ufsvfsp;
	struct ulockfs *ulp;
	int retry = 1;
	int error, resv, resid = 0;
	int directio_status;
	int exclusive;
	long start_resid = uiop->uio_resid;

	TRACE_3(TR_FAC_UFS, TR_UFS_WRITE_START,
		"ufs_write_start:vp %p uiop %p ioflag %x",
		vp, uiop, ioflag);

	ASSERT(RW_LOCK_HELD(&ip->i_rwlock));

retry_mandlock:
	/*
	 * Mandatory locking needs to be done before ufs_lockfs_begin()
	 * and TRANS_BEGIN_[A]SYNC() calls since mandatory locks can sleep.
	 * Check for forced unmounts normally done in ufs_lockfs_begin().
	 */
	if ((ufsvfsp = ip->i_ufsvfs) == NULL) {
		error = EIO;
		goto out;
	}
	if (MANDLOCK(vp, ip->i_mode)) {

		ASSERT(RW_WRITE_HELD(&ip->i_rwlock));

		/*
		 * ufs_getattr ends up being called by chklock
		 */
		error = chklock(vp, FWRITE, uiop->uio_loffset,
				uiop->uio_resid, uiop->uio_fmode, ct);
		if (error)
			goto out;
	}

	/* i_rwlock can change in chklock */
	exclusive = rw_write_held(&ip->i_rwlock);

	/*
	 * Check for fast-path special case of directio re-writes.
	 */
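	/*
	 * Such a rewrite may proceed with i_rwlock shared and i_contents
	 * held only as reader: ufs_check_rewrite() has guaranteed that it
	 * neither extends the file nor fills holes, so no block allocation
	 * or size change can occur; only the timestamps need updating,
	 * which is done under i_tlock below.
	 */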
	if ((ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) &&
	    !exclusive && ufs_check_rewrite(ip, uiop, ioflag)) {

		error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK);
		if (error)
			goto out;

		rw_enter(&ip->i_contents, RW_READER);
		error = ufs_directio_write(ip, uiop, ioflag, 1, cr,
			&directio_status);
		if (directio_status == DIRECTIO_SUCCESS) {
			uint_t i_flag_save;

			if (start_resid != uiop->uio_resid)
				error = 0;
			/*
			 * Special treatment of access times for re-writes.
			 * If IMOD is not already set, then convert it
			 * to IMODACC for this operation. This defers
			 * entering a delta into the log until the inode
			 * is flushed. This mimics what is done for read
			 * operations and inode access time.
			 */
			mutex_enter(&ip->i_tlock);
			i_flag_save = ip->i_flag;
			ip->i_flag |= IUPD | ICHG;
			ip->i_seq++;
			ITIMES_NOLOCK(ip);
			if ((i_flag_save & IMOD) == 0) {
				ip->i_flag &= ~IMOD;
				ip->i_flag |= IMODACC;
			}
			mutex_exit(&ip->i_tlock);
			rw_exit(&ip->i_contents);
			if (ulp)
				ufs_lockfs_end(ulp);
			goto out;
		}
		rw_exit(&ip->i_contents);
		if (ulp)
			ufs_lockfs_end(ulp);
	}

	if (!exclusive && !rw_tryupgrade(&ip->i_rwlock)) {
		rw_exit(&ip->i_rwlock);
		rw_enter(&ip->i_rwlock, RW_WRITER);
		/*
		 * Mandatory locking could have been enabled
		 * after dropping the i_rwlock.
		 */
		if (MANDLOCK(vp, ip->i_mode))
			goto retry_mandlock;
	}

	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK);
	if (error)
		goto out;

	/*
	 * Amount of log space needed for this write
	 */
	TRANS_WRITE_RESV(ip, uiop, ulp, &resv, &resid);
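	/*
	 * TRANS_WRITE_RESV() sizes the log reservation: resv is the space
	 * to reserve per transaction and, if the request is too large for
	 * a single transaction, resid is set non-zero so the write is
	 * chunked through TRANS_WRITE() below.
	 */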

	/*
	 * Throttle writes.
	 */
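	/*
	 * ip->i_writes tracks the bytes of outstanding delayed/async
	 * writes on this inode.  Once it climbs past the ufs_HW high
	 * water mark, writers sleep on i_wrcv; the i/o completion path
	 * is expected to broadcast once the outstanding bytes drain
	 * below the ufs_LW low water mark.
	 */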
	if (ufs_WRITES && (ip->i_writes > ufs_HW)) {
		mutex_enter(&ip->i_tlock);
		while (ip->i_writes > ufs_HW) {
			ufs_throttles++;
			cv_wait(&ip->i_wrcv, &ip->i_tlock);
		}
		mutex_exit(&ip->i_tlock);
	}

	/*
	 * Enter Transaction
	 */
	if (ioflag & (FSYNC|FDSYNC)) {
		if (ulp) {
			int terr = 0;
			TRANS_BEGIN_SYNC(ufsvfsp, TOP_WRITE_SYNC, resv, terr);
			ASSERT(!terr);
		}
	} else {
		if (ulp)
			TRANS_BEGIN_ASYNC(ufsvfsp, TOP_WRITE, resv);
	}

	/*
	 * Write the file
	 */
	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	rw_enter(&ip->i_contents, RW_WRITER);
	if ((ioflag & FAPPEND) != 0 && (ip->i_mode & IFMT) == IFREG) {
		/*
		 * In append mode start at end of file.
		 */
		uiop->uio_loffset = ip->i_size;
	}

	/*
	 * Mild optimisation: don't call ufs_trans_write() unless we have to.
	 * Also, suppress file system full messages if we will retry.
	 */
	if (retry)
		ip->i_flag |= IQUIET;
	if (resid) {
		TRANS_WRITE(ip, uiop, ioflag, error, ulp, cr, resv, resid);
	} else {
		error = wrip(ip, uiop, ioflag, cr);
	}
	ip->i_flag &= ~IQUIET;

	rw_exit(&ip->i_contents);
	rw_exit(&ufsvfsp->vfs_dqrwlock);

	/*
	 * Leave Transaction
	 */
	if (ulp) {
		if (ioflag & (FSYNC|FDSYNC)) {
			int terr = 0;
			TRANS_END_SYNC(ufsvfsp, terr, TOP_WRITE_SYNC, resv);
			if (error == 0)
				error = terr;
		} else {
			TRANS_END_ASYNC(ufsvfsp, TOP_WRITE, resv);
		}
		ufs_lockfs_end(ulp);
	}
out:
	if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
		/*
		 * Any blocks tied up in pending deletes?
		 */
		ufs_delete_drain_wait(ufsvfsp, 1);
		retry = 0;
		goto retry_mandlock;
	}

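	/*
	 * A partial write counts as success: per write(2) semantics, if
	 * any bytes were transferred before hitting ENOSPC, report the
	 * short count rather than the error.
	 */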
	if (error == ENOSPC && (start_resid != uiop->uio_resid))
		error = 0;

	TRACE_2(TR_FAC_UFS, TR_UFS_WRITE_END,
		"ufs_write_end:vp %p error %d", vp, error);
	return (error);
}

/*
 * Don't cache write blocks to files with the sticky bit set.
 * Used to keep swap files from blowing the page cache on a server.
 */
int stickyhack = 1;

/*
 * Free behind hacks.  The pager is busted.
 * XXX - need to pass the information down to writedone() in a flag like B_SEQ
 * or B_FREE_IF_TIGHT_ON_MEMORY.
 */
int	freebehind = 1;
int	smallfile = 32 * 1024;
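/*
 * With freebehind set, pages read sequentially from files larger than
 * smallfile are, by design, released cheaply after use, on the theory
 * that sequentially streamed data will not be re-read soon and should
 * not displace more valuable cached pages.
 */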

/*
 * While we should, in most cases, cache the pages for write, we
 * may also want to cache the pages for read as long as they are
 * frequently re-usable.
 *
 * If cache_read_ahead = 1, the pages for read will go to the tail
 * of the cache list when they are released, otherwise go to the head.
 */
int	cache_read_ahead = 0;

/*
 * wrip does the real work of write requests for ufs.
 */
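/*
 * In outline (for the common buffered path): loop over MAXBSIZE-aligned
 * segmap windows, allocating any needed disk blocks with bmap_write(),
 * mapping the window with segmap_getmapflt(), copying user data in with
 * uiomove(), zero-filling any freshly created but unwritten tail, and
 * updating i_size once the data is safely in the page cache.
 */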
int
wrip(struct inode *ip, struct uio *uio, int ioflag, struct cred *cr)
{
	rlim64_t limit = uio->uio_llimit;
	u_offset_t off;
	u_offset_t old_i_size;
	struct fs *fs;
	struct vnode *vp;
	struct ufsvfs *ufsvfsp;
	caddr_t base;
	long start_resid = uio->uio_resid;	/* save starting resid */
	long premove_resid;			/* resid before uiomove() */
	uint_t flags;
	int newpage;
	int iupdat_flag, directio_status;
	int n, on, mapon;
	int error, pagecreate;
	int do_dqrwlock;		/* drop/reacquire vfs_dqrwlock */
	int32_t	iblocks;
	int	new_iblocks;

	/*
	 * ip->i_size is incremented before the uiomove
	 * is done on a write.  If the move fails (bad user
	 * address) reset ip->i_size.
	 * The better way would be to increment ip->i_size
	 * only if the uiomove succeeds.
	 */
	int i_size_changed = 0;
	o_mode_t type;
	int i_seq_needed = 0;

	vp = ITOV(ip);

	/*
	 * check for forced unmount - should not happen as
	 * the request passed the lockfs checks.
	 */
	if ((ufsvfsp = ip->i_ufsvfs) == NULL)
		return (EIO);

	fs = ip->i_fs;

	TRACE_1(TR_FAC_UFS, TR_UFS_RWIP_START,
		"ufs_wrip_start:vp %p", vp);

	ASSERT(RW_WRITE_HELD(&ip->i_contents));

	/* check for valid filetype */
	type = ip->i_mode & IFMT;
	if ((type != IFREG) && (type != IFDIR) && (type != IFATTRDIR) &&
	    (type != IFLNK) && (type != IFSHAD)) {
		return (EIO);
	}

	/*
	 * the actual limit of UFS file size
	 * is UFS_MAXOFFSET_T
	 */
	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
		limit = MAXOFFSET_T;

	if (uio->uio_loffset >= limit) {
		proc_t *p = ttoproc(curthread);

		TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END,
			"ufs_wrip_end:vp %p error %d", vp, EINVAL);

		mutex_enter(&p->p_lock);
		(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls,
		    p, RCA_UNSAFE_SIGINFO);
		mutex_exit(&p->p_lock);
		return (EFBIG);
	}

	/*
	 * if largefiles are disallowed, the limit is
	 * the pre-largefiles value of 2GB
	 */
	if (ufsvfsp->vfs_lfflags & UFS_LARGEFILES)
		limit = MIN(UFS_MAXOFFSET_T, limit);
	else
		limit = MIN(MAXOFF32_T, limit);

	if (uio->uio_loffset < (offset_t)0) {
		TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END,
			"ufs_wrip_end:vp %p error %d", vp, EINVAL);
		return (EINVAL);
	}
	if (uio->uio_resid == 0) {
		TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END,
			"ufs_wrip_end:vp %p error %d", vp, 0);
		return (0);
	}

	if (uio->uio_loffset >= limit)
		return (EFBIG);

	ip->i_flag |= INOACC;	/* don't update ref time in getpage */

	if (ioflag & (FSYNC|FDSYNC)) {
		ip->i_flag |= ISYNC;
		iupdat_flag = 1;
	}
	/*
	 * Try to go direct
	 */
	if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) {
		uio->uio_llimit = limit;
		error = ufs_directio_write(ip, uio, ioflag, 0, cr,
			&directio_status);
		/*
		 * If ufs_directio wrote to the file or set the flags,
		 * we need to update i_seq, but it may be deferred.
		 */
		if (start_resid != uio->uio_resid ||
		    (ip->i_flag & (ICHG|IUPD))) {
			i_seq_needed = 1;
			ip->i_flag |= ISEQ;
		}
		if (directio_status == DIRECTIO_SUCCESS)
			goto out;
	}

	/*
	 * Behavior with respect to dropping/reacquiring vfs_dqrwlock:
	 *
	 * o shadow inodes: vfs_dqrwlock is not held at all
	 * o quota updates: vfs_dqrwlock is read or write held
	 * o other updates: vfs_dqrwlock is read held
	 *
	 * The first case is the only one where we do not hold
	 * vfs_dqrwlock at all while entering wrip().
	 * We must make sure not to downgrade/drop vfs_dqrwlock if we
	 * have it as writer, i.e. if we are updating the quota inode.
	 * There is no potential deadlock scenario in this case as
	 * ufs_getpage() takes care of this and avoids reacquiring
	 * vfs_dqrwlock in that case.
	 *
	 * This check is done here since the above conditions do not change
	 * and we possibly loop below, so save a few cycles.
	 */
	if ((type == IFSHAD) ||
	    (rw_owner(&ufsvfsp->vfs_dqrwlock) == curthread)) {
		do_dqrwlock = 0;
	} else {
		do_dqrwlock = 1;
	}

	/*
	 * Large Files: We cast MAXBMASK to offset_t
	 * in order to mask out the higher bits. Since offset_t
	 * is a signed value, the high order bit set in MAXBMASK
	 * value makes it do the right thing by having all bits 1
	 * in the higher word. May be removed for _SOLARIS64_.
	 */

	fs = ip->i_fs;
	do {
		u_offset_t uoff = uio->uio_loffset;
		off = uoff & (offset_t)MAXBMASK;
		mapon = (int)(uoff & (offset_t)MAXBOFFSET);
		on = (int)blkoff(fs, uoff);
		n = (int)MIN(fs->fs_bsize - on, uio->uio_resid);
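		/*
		 * Worked example (assuming MAXBSIZE == fs_bsize == 8192):
		 * for uoff == 12345, off == 8192 (the segmap window base),
		 * mapon == 4153 (offset within the window), on == 4153
		 * (offset within the fs block), and n == min(8192 - 4153,
		 * uio_resid) bytes are handled this iteration.
		 */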
		new_iblocks = 1;

		if (type == IFREG && uoff + n >= limit) {
			if (uoff >= limit) {
				error = EFBIG;
				goto out;
			}
			/*
			 * Since uoff + n >= limit, n >= limit - uoff;
			 * n is an int, so limit - uoff fits in an int
			 * and the cast is safe.
			 */
			n = (int)(limit - (rlim64_t)uoff);
		}
		if (uoff + n > ip->i_size) {
			/*
			 * We are extending the length of the file.
			 * bmap is used so that we are sure that
			 * if we need to allocate new blocks, that it
			 * is done here before we up the file size.
			 */
			error = bmap_write(ip, uoff, (int)(on + n),
							mapon == 0, cr);
			/*
			 * bmap_write never drops i_contents so if
			 * the flags are set it changed the file.
			 */
			if (ip->i_flag & (ICHG|IUPD)) {
				i_seq_needed = 1;
				ip->i_flag |= ISEQ;
			}
			if (error)
				break;
			/*
			 * There is a window of vulnerability here.
			 * The sequence of operations: allocate file
			 * system blocks, uiomove the data into pages,
			 * and then update the size of the file in the
			 * inode, must happen atomically.  However, due
			 * to current locking constraints, this can not
			 * be done.
			 */
			ASSERT(ip->i_writer == NULL);
			ip->i_writer = curthread;
			i_size_changed = 1;
			/*
			 * If we are writing from the beginning of
			 * the mapping, we can just create the
			 * pages without having to read them.
			 */
			pagecreate = (mapon == 0);
		} else if (n == MAXBSIZE) {
			/*
			 * Going to do a whole mapping's worth,
			 * so we can just create the pages w/o
			 * having to read them in.  But before
			 * we do that, we need to make sure any
			 * needed blocks are allocated first.
			 */
			iblocks = ip->i_blocks;
			error = bmap_write(ip, uoff, (int)(on + n), 1, cr);
			/*
			 * bmap_write never drops i_contents so if
			 * the flags are set it changed the file.
			 */
			if (ip->i_flag & (ICHG|IUPD)) {
				i_seq_needed = 1;
				ip->i_flag |= ISEQ;
			}
			if (error)
				break;
			pagecreate = 1;
			/*
			 * Check whether the newly created page required
			 * the allocation of new disk blocks.
			 */
			if (iblocks == ip->i_blocks)
				new_iblocks = 0; /* no new blocks allocated */
		} else {
			pagecreate = 0;
			/*
			 * In sync mode flush the indirect blocks which
			 * may have been allocated and not written on
			 * disk. In the above cases bmap_write will
			 * allocate in sync mode.
			 */
			if (ioflag & (FSYNC|FDSYNC)) {
				error = ufs_indirblk_sync(ip, uoff);
				if (error)
					break;
			}
		}

		/*
		 * At this point we can enter ufs_getpage() in one
		 * of two ways:
		 * 1) segmap_getmapflt() calls ufs_getpage() when the
		 *    forcefault parameter is true (pagecreate == 0)
		 * 2) uiomove() causes a page fault.
		 *
		 * We have to drop the contents lock to prevent the VM
		 * system from trying to reacquire it in ufs_getpage()
		 * should the uiomove cause a pagefault.
		 *
		 * We have to drop the reader vfs_dqrwlock here as well.
		 */
		rw_exit(&ip->i_contents);
		if (do_dqrwlock) {
			ASSERT(RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock));
			ASSERT(!(RW_WRITE_HELD(&ufsvfsp->vfs_dqrwlock)));
			rw_exit(&ufsvfsp->vfs_dqrwlock);
		}

		base = segmap_getmapflt(segkmap, vp, (off + mapon),
					(uint_t)n, !pagecreate, S_WRITE);

		/*
		 * segmap_pagecreate() returns 1 if it calls
		 * page_create_va() to allocate any pages.
		 */
		newpage = 0;

		if (pagecreate)
			newpage = segmap_pagecreate(segkmap, base,
			    (size_t)n, 0);

		premove_resid = uio->uio_resid;
		error = uiomove(base + mapon, (long)n, UIO_WRITE, uio);

		/*
		 * If "newpage" is set, then a new page was created and it
		 * does not contain valid data, so it needs to be initialized
		 * at this point.
		 * Otherwise the page contains old data, which was overwritten
		 * partially or as a whole in uiomove.
		 * If there is only one iovec structure within uio, then
		 * on error uiomove will not be able to update uio->uio_loffset
		 * and we would zero the whole page here!
		 *
		 * If uiomove fails because of an error, the old valid data
		 * is kept instead of filling the rest of the page with zero's.
		 */
		if (newpage &&
		    uio->uio_loffset < roundup(off + mapon + n, PAGESIZE)) {
			/*
			 * We created pages w/o initializing them completely,
			 * thus we need to zero the part that wasn't set up.
			 * This happens on most EOF write cases and if
			 * we had some sort of error during the uiomove.
			 */
			int nzero, nmoved;

			nmoved = (int)(uio->uio_loffset - (off + mapon));
			ASSERT(nmoved >= 0 && nmoved <= n);
			nzero = roundup(on + n, PAGESIZE) - nmoved;
			ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE);
			(void) kzero(base + mapon + nmoved, (uint_t)nzero);
		}

957*7c478bd9Sstevel@tonic-gate 		/*
958*7c478bd9Sstevel@tonic-gate 		 * Unlock the pages allocated by page_create_va()
959*7c478bd9Sstevel@tonic-gate 		 * in segmap_pagecreate()
960*7c478bd9Sstevel@tonic-gate 		 */
961*7c478bd9Sstevel@tonic-gate 		if (newpage)
962*7c478bd9Sstevel@tonic-gate 			segmap_pageunlock(segkmap, base, (size_t)n, S_WRITE);
963*7c478bd9Sstevel@tonic-gate 
964*7c478bd9Sstevel@tonic-gate 		/*
965*7c478bd9Sstevel@tonic-gate 		 * If the size of the file changed, then update the
966*7c478bd9Sstevel@tonic-gate 		 * size field in the inode now.  This can't be done
967*7c478bd9Sstevel@tonic-gate 		 * before the call to segmap_pageunlock or there is
968*7c478bd9Sstevel@tonic-gate 		 * a potential deadlock with callers to ufs_putpage().
969*7c478bd9Sstevel@tonic-gate 		 * They will be holding i_contents and trying to lock
970*7c478bd9Sstevel@tonic-gate 		 * a page, while this thread is holding a page locked
971*7c478bd9Sstevel@tonic-gate 		 * and trying to acquire i_contents.
972*7c478bd9Sstevel@tonic-gate 		 */
973*7c478bd9Sstevel@tonic-gate 		if (i_size_changed) {
974*7c478bd9Sstevel@tonic-gate 			rw_enter(&ip->i_contents, RW_WRITER);
975*7c478bd9Sstevel@tonic-gate 			old_i_size = ip->i_size;
976*7c478bd9Sstevel@tonic-gate 			UFS_SET_ISIZE(uoff + n, ip);
977*7c478bd9Sstevel@tonic-gate 			TRANS_INODE(ufsvfsp, ip);
978*7c478bd9Sstevel@tonic-gate 			/*
979*7c478bd9Sstevel@tonic-gate 			 * file has grown larger than 2GB. Set flag
980*7c478bd9Sstevel@tonic-gate 			 * in superblock to indicate this, if it
981*7c478bd9Sstevel@tonic-gate 			 * is not already set.
982*7c478bd9Sstevel@tonic-gate 			 */
983*7c478bd9Sstevel@tonic-gate 			if ((ip->i_size > MAXOFF32_T) &&
984*7c478bd9Sstevel@tonic-gate 			    !(fs->fs_flags & FSLARGEFILES)) {
985*7c478bd9Sstevel@tonic-gate 				ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
986*7c478bd9Sstevel@tonic-gate 				mutex_enter(&ufsvfsp->vfs_lock);
987*7c478bd9Sstevel@tonic-gate 				fs->fs_flags |= FSLARGEFILES;
988*7c478bd9Sstevel@tonic-gate 				ufs_sbwrite(ufsvfsp);
989*7c478bd9Sstevel@tonic-gate 				mutex_exit(&ufsvfsp->vfs_lock);
990*7c478bd9Sstevel@tonic-gate 			}
991*7c478bd9Sstevel@tonic-gate 			mutex_enter(&ip->i_tlock);
992*7c478bd9Sstevel@tonic-gate 			ip->i_writer = NULL;
993*7c478bd9Sstevel@tonic-gate 			cv_broadcast(&ip->i_wrcv);
994*7c478bd9Sstevel@tonic-gate 			mutex_exit(&ip->i_tlock);
995*7c478bd9Sstevel@tonic-gate 			rw_exit(&ip->i_contents);
996*7c478bd9Sstevel@tonic-gate 		}
997*7c478bd9Sstevel@tonic-gate 
998*7c478bd9Sstevel@tonic-gate 		if (error) {
999*7c478bd9Sstevel@tonic-gate 			/*
1000*7c478bd9Sstevel@tonic-gate 			 * If we failed on a write, we may have already
1001*7c478bd9Sstevel@tonic-gate 			 * allocated file blocks as well as pages.  It's
1002*7c478bd9Sstevel@tonic-gate 			 * hard to undo the block allocation, but we must
1003*7c478bd9Sstevel@tonic-gate 			 * be sure to invalidate any pages that may have
1004*7c478bd9Sstevel@tonic-gate 			 * been allocated.
1005*7c478bd9Sstevel@tonic-gate 			 *
1006*7c478bd9Sstevel@tonic-gate 			 * If the page was created without initialization
1007*7c478bd9Sstevel@tonic-gate 			 * then we must check if it should be possible
1008*7c478bd9Sstevel@tonic-gate 			 * to destroy the new page and to keep the old data
1009*7c478bd9Sstevel@tonic-gate 			 * on the disk.
1010*7c478bd9Sstevel@tonic-gate 			 *
1011*7c478bd9Sstevel@tonic-gate 			 * It is possible to destroy the page without
1012*7c478bd9Sstevel@tonic-gate 			 * having to write back its contents only when
1013*7c478bd9Sstevel@tonic-gate 			 * - the size of the file remains unchanged
1014*7c478bd9Sstevel@tonic-gate 			 * - bmap_write() did not allocate new disk blocks
1015*7c478bd9Sstevel@tonic-gate 			 *   (it is possible to create big files using "seek"
1016*7c478bd9Sstevel@tonic-gate 			 *   and writing at the end of the file; a "write" to
1017*7c478bd9Sstevel@tonic-gate 			 *   a position before the end of the file would not
1018*7c478bd9Sstevel@tonic-gate 			 *   change the size of the file but would allocate
1019*7c478bd9Sstevel@tonic-gate 			 *   new disk blocks).
1020*7c478bd9Sstevel@tonic-gate 			 * - uiomove intended to overwrite the whole page.
1021*7c478bd9Sstevel@tonic-gate 			 * - a new page was created (newpage == 1).
1022*7c478bd9Sstevel@tonic-gate 			 */
1023*7c478bd9Sstevel@tonic-gate 
1024*7c478bd9Sstevel@tonic-gate 			if (i_size_changed == 0 && new_iblocks == 0 &&
1025*7c478bd9Sstevel@tonic-gate 			    newpage) {
1026*7c478bd9Sstevel@tonic-gate 
1027*7c478bd9Sstevel@tonic-gate 				/* unwind whatever uiomove last did */
1028*7c478bd9Sstevel@tonic-gate 				uio->uio_resid = premove_resid;
1029*7c478bd9Sstevel@tonic-gate 
1030*7c478bd9Sstevel@tonic-gate 				/*
1031*7c478bd9Sstevel@tonic-gate 				 * destroy the page, do not write ambiguous
1032*7c478bd9Sstevel@tonic-gate 				 * data to the disk.
1033*7c478bd9Sstevel@tonic-gate 				 */
1034*7c478bd9Sstevel@tonic-gate 				flags = SM_DESTROY;
1035*7c478bd9Sstevel@tonic-gate 			} else {
1036*7c478bd9Sstevel@tonic-gate 				/*
1037*7c478bd9Sstevel@tonic-gate 				 * write the page back to the disk, if dirty,
1038*7c478bd9Sstevel@tonic-gate 				 * and remove the page from the cache.
1039*7c478bd9Sstevel@tonic-gate 				 */
1040*7c478bd9Sstevel@tonic-gate 				flags = SM_INVAL;
1041*7c478bd9Sstevel@tonic-gate 			}
1042*7c478bd9Sstevel@tonic-gate 			(void) segmap_release(segkmap, base, flags);
1043*7c478bd9Sstevel@tonic-gate 		} else {
1044*7c478bd9Sstevel@tonic-gate 			flags = 0;
1045*7c478bd9Sstevel@tonic-gate 			/*
1046*7c478bd9Sstevel@tonic-gate 			 * Force write back for synchronous write cases.
1047*7c478bd9Sstevel@tonic-gate 			 */
1048*7c478bd9Sstevel@tonic-gate 			if ((ioflag & (FSYNC|FDSYNC)) || type == IFDIR) {
1049*7c478bd9Sstevel@tonic-gate 				/*
1050*7c478bd9Sstevel@tonic-gate 				 * If the sticky bit is set but the
1051*7c478bd9Sstevel@tonic-gate 				 * execute bit is not set, we do a
1052*7c478bd9Sstevel@tonic-gate 				 * synchronous write back and free
1053*7c478bd9Sstevel@tonic-gate 				 * the page when done.  We set up swap
1054*7c478bd9Sstevel@tonic-gate 				 * files to be handled this way to
1055*7c478bd9Sstevel@tonic-gate 				 * prevent servers from keeping around
1056*7c478bd9Sstevel@tonic-gate 				 * the client's swap pages too long.
1057*7c478bd9Sstevel@tonic-gate 				 * XXX - there ought to be a better way.
1058*7c478bd9Sstevel@tonic-gate 				 */
1059*7c478bd9Sstevel@tonic-gate 				if (IS_SWAPVP(vp)) {
1060*7c478bd9Sstevel@tonic-gate 					flags = SM_WRITE | SM_FREE |
1061*7c478bd9Sstevel@tonic-gate 					    SM_DONTNEED;
1062*7c478bd9Sstevel@tonic-gate 					iupdat_flag = 0;
1063*7c478bd9Sstevel@tonic-gate 				} else {
1064*7c478bd9Sstevel@tonic-gate 					flags = SM_WRITE;
1065*7c478bd9Sstevel@tonic-gate 				}
1066*7c478bd9Sstevel@tonic-gate 			} else if (n + on == MAXBSIZE || IS_SWAPVP(vp)) {
1067*7c478bd9Sstevel@tonic-gate 				/*
1068*7c478bd9Sstevel@tonic-gate 				 * Have written a whole block.
1069*7c478bd9Sstevel@tonic-gate 				 * Start an asynchronous write and
1070*7c478bd9Sstevel@tonic-gate 				 * mark the buffer to indicate that
1071*7c478bd9Sstevel@tonic-gate 				 * it won't be needed again soon.
1072*7c478bd9Sstevel@tonic-gate 				 */
1073*7c478bd9Sstevel@tonic-gate 				flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
1074*7c478bd9Sstevel@tonic-gate 			}
1075*7c478bd9Sstevel@tonic-gate 			error = segmap_release(segkmap, base, flags);
1076*7c478bd9Sstevel@tonic-gate 			/*
1077*7c478bd9Sstevel@tonic-gate 			 * If the operation failed and is synchronous,
1078*7c478bd9Sstevel@tonic-gate 			 * then we need to unwind what uiomove() last
1079*7c478bd9Sstevel@tonic-gate 			 * did so we can potentially return an error to
1080*7c478bd9Sstevel@tonic-gate 			 * the caller.  If this write operation was
1081*7c478bd9Sstevel@tonic-gate 			 * done in two pieces and the first succeeded,
1082*7c478bd9Sstevel@tonic-gate 			 * then we won't return an error for the second
1083*7c478bd9Sstevel@tonic-gate 			 * piece that failed.  However, we only want to
1084*7c478bd9Sstevel@tonic-gate 			 * return a resid value that reflects what was
1085*7c478bd9Sstevel@tonic-gate 			 * really done.
1086*7c478bd9Sstevel@tonic-gate 			 *
1087*7c478bd9Sstevel@tonic-gate 			 * Failures for non-synchronous operations can
1088*7c478bd9Sstevel@tonic-gate 			 * be ignored since the page subsystem will
1089*7c478bd9Sstevel@tonic-gate 			 * retry the operation until it succeeds or the
1090*7c478bd9Sstevel@tonic-gate 			 * file system is unmounted.
1091*7c478bd9Sstevel@tonic-gate 			 */
1092*7c478bd9Sstevel@tonic-gate 			if (error) {
1093*7c478bd9Sstevel@tonic-gate 				if ((ioflag & (FSYNC | FDSYNC)) ||
1094*7c478bd9Sstevel@tonic-gate 				    type == IFDIR) {
1095*7c478bd9Sstevel@tonic-gate 					uio->uio_resid = premove_resid;
1096*7c478bd9Sstevel@tonic-gate 				} else {
1097*7c478bd9Sstevel@tonic-gate 					error = 0;
1098*7c478bd9Sstevel@tonic-gate 				}
1099*7c478bd9Sstevel@tonic-gate 			}
1100*7c478bd9Sstevel@tonic-gate 		}
1101*7c478bd9Sstevel@tonic-gate 
1102*7c478bd9Sstevel@tonic-gate 		/*
1103*7c478bd9Sstevel@tonic-gate 		 * Re-acquire contents lock.
1104*7c478bd9Sstevel@tonic-gate 		 * If it was dropped, reacquire reader vfs_dqrwlock as well.
1105*7c478bd9Sstevel@tonic-gate 		 */
1106*7c478bd9Sstevel@tonic-gate 		if (do_dqrwlock)
1107*7c478bd9Sstevel@tonic-gate 			rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1108*7c478bd9Sstevel@tonic-gate 		rw_enter(&ip->i_contents, RW_WRITER);
1109*7c478bd9Sstevel@tonic-gate 
1110*7c478bd9Sstevel@tonic-gate 		/*
1111*7c478bd9Sstevel@tonic-gate 		 * If the uiomove() failed or if a synchronous
1112*7c478bd9Sstevel@tonic-gate 		 * page push failed, fix up i_size.
1113*7c478bd9Sstevel@tonic-gate 		 */
1114*7c478bd9Sstevel@tonic-gate 		if (error) {
1115*7c478bd9Sstevel@tonic-gate 			if (i_size_changed) {
1116*7c478bd9Sstevel@tonic-gate 				/*
1117*7c478bd9Sstevel@tonic-gate 				 * The uiomove failed, and we
1118*7c478bd9Sstevel@tonic-gate 				 * allocated blocks, so get rid
1119*7c478bd9Sstevel@tonic-gate 				 * of them.
1120*7c478bd9Sstevel@tonic-gate 				 */
1121*7c478bd9Sstevel@tonic-gate 				(void) ufs_itrunc(ip, old_i_size, 0, cr);
1122*7c478bd9Sstevel@tonic-gate 			}
1123*7c478bd9Sstevel@tonic-gate 		} else {
1124*7c478bd9Sstevel@tonic-gate 			/*
1125*7c478bd9Sstevel@tonic-gate 			 * XXX - Can this be out of the loop?
1126*7c478bd9Sstevel@tonic-gate 			 */
1127*7c478bd9Sstevel@tonic-gate 			ip->i_flag |= IUPD | ICHG;
1128*7c478bd9Sstevel@tonic-gate 			/*
1129*7c478bd9Sstevel@tonic-gate 			 * Only do one increase of i_seq for multiple
1130*7c478bd9Sstevel@tonic-gate 			 * pieces.  Because we drop locks, record
1131*7c478bd9Sstevel@tonic-gate 			 * the fact that we changed the timestamp and
1132*7c478bd9Sstevel@tonic-gate 			 * are deferring the increase in case another thread
1133*7c478bd9Sstevel@tonic-gate 			 * pushes our timestamp update.
1134*7c478bd9Sstevel@tonic-gate 			 */
1135*7c478bd9Sstevel@tonic-gate 			i_seq_needed = 1;
1136*7c478bd9Sstevel@tonic-gate 			ip->i_flag |= ISEQ;
1137*7c478bd9Sstevel@tonic-gate 			if (i_size_changed)
1138*7c478bd9Sstevel@tonic-gate 				ip->i_flag |= IATTCHG;
1139*7c478bd9Sstevel@tonic-gate 			if ((ip->i_mode & (IEXEC | (IEXEC >> 3) |
1140*7c478bd9Sstevel@tonic-gate 			    (IEXEC >> 6))) != 0 &&
1141*7c478bd9Sstevel@tonic-gate 			    (ip->i_mode & (ISUID | ISGID)) != 0 &&
1142*7c478bd9Sstevel@tonic-gate 			    secpolicy_vnode_setid_retain(cr,
1143*7c478bd9Sstevel@tonic-gate 			    (ip->i_mode & ISUID) != 0 && ip->i_uid == 0) != 0) {
1144*7c478bd9Sstevel@tonic-gate 				/*
1145*7c478bd9Sstevel@tonic-gate 				 * Clear Set-UID & Set-GID bits on
1146*7c478bd9Sstevel@tonic-gate 				 * successful write if not privileged
1147*7c478bd9Sstevel@tonic-gate 				 * and at least one of the execute bits
1148*7c478bd9Sstevel@tonic-gate 				 * is set.  If we always clear Set-GID,
1149*7c478bd9Sstevel@tonic-gate 				 * mandatory file and record locking is
1150*7c478bd9Sstevel@tonic-gate 				 * unusable.
1151*7c478bd9Sstevel@tonic-gate 				 */
1152*7c478bd9Sstevel@tonic-gate 				ip->i_mode &= ~(ISUID | ISGID);
1153*7c478bd9Sstevel@tonic-gate 			}
1154*7c478bd9Sstevel@tonic-gate 		}
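		/*
		 * Note the inode change with the logging code; this is
		 * a no-op when the file system is not logging.
		 */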
1155*7c478bd9Sstevel@tonic-gate 		TRANS_INODE(ufsvfsp, ip);
1156*7c478bd9Sstevel@tonic-gate 	} while (error == 0 && uio->uio_resid > 0 && n != 0);
1157*7c478bd9Sstevel@tonic-gate 
1158*7c478bd9Sstevel@tonic-gate out:
1159*7c478bd9Sstevel@tonic-gate 	/*
1160*7c478bd9Sstevel@tonic-gate 	 * Make sure i_seq is increased at least once per write
1161*7c478bd9Sstevel@tonic-gate 	 */
1162*7c478bd9Sstevel@tonic-gate 	if (i_seq_needed) {
1163*7c478bd9Sstevel@tonic-gate 		ip->i_seq++;
1164*7c478bd9Sstevel@tonic-gate 		ip->i_flag &= ~ISEQ;	/* no longer deferred */
1165*7c478bd9Sstevel@tonic-gate 	}
1166*7c478bd9Sstevel@tonic-gate 
1167*7c478bd9Sstevel@tonic-gate 	/*
1168*7c478bd9Sstevel@tonic-gate 	 * Inode is updated according to this table -
1169*7c478bd9Sstevel@tonic-gate 	 *
1170*7c478bd9Sstevel@tonic-gate 	 *   FSYNC	  FDSYNC(posix.4)
1171*7c478bd9Sstevel@tonic-gate 	 *   --------------------------
1172*7c478bd9Sstevel@tonic-gate 	 *   always@	  IATTCHG|IBDWRITE
1173*7c478bd9Sstevel@tonic-gate 	 *
1174*7c478bd9Sstevel@tonic-gate 	 * @ - 	If we are doing a synchronous write, the only time we should
1175*7c478bd9Sstevel@tonic-gate 	 *	not be sync'ing the ip here is if we have the stickyhack
1176*7c478bd9Sstevel@tonic-gate 	 *	activated, the file is marked with the sticky bit and
1177*7c478bd9Sstevel@tonic-gate 	 *	no exec bit, the file length has not been changed and
1178*7c478bd9Sstevel@tonic-gate 	 *	no new blocks have been allocated during this write.
1179*7c478bd9Sstevel@tonic-gate 	 */
1180*7c478bd9Sstevel@tonic-gate 
1181*7c478bd9Sstevel@tonic-gate 	if ((ip->i_flag & ISYNC) != 0) {
1182*7c478bd9Sstevel@tonic-gate 		/*
1183*7c478bd9Sstevel@tonic-gate 		 * we have eliminated nosync
1184*7c478bd9Sstevel@tonic-gate 		 */
1185*7c478bd9Sstevel@tonic-gate 		if ((ip->i_flag & (IATTCHG|IBDWRITE)) ||
1186*7c478bd9Sstevel@tonic-gate 			((ioflag & FSYNC) && iupdat_flag)) {
1187*7c478bd9Sstevel@tonic-gate 			ufs_iupdat(ip, 1);
1188*7c478bd9Sstevel@tonic-gate 		}
1189*7c478bd9Sstevel@tonic-gate 	}
1190*7c478bd9Sstevel@tonic-gate 
1191*7c478bd9Sstevel@tonic-gate 	/*
1192*7c478bd9Sstevel@tonic-gate 	 * If we've already done a partial-write, terminate
1193*7c478bd9Sstevel@tonic-gate 	 * the write but return no error unless the error is ENOSPC
1194*7c478bd9Sstevel@tonic-gate 	 * because the caller can detect this and free resources and
1195*7c478bd9Sstevel@tonic-gate 	 * try again.
1196*7c478bd9Sstevel@tonic-gate 	 */
1197*7c478bd9Sstevel@tonic-gate 	if ((start_resid != uio->uio_resid) && (error != ENOSPC))
1198*7c478bd9Sstevel@tonic-gate 		error = 0;
1199*7c478bd9Sstevel@tonic-gate 
1200*7c478bd9Sstevel@tonic-gate 	ip->i_flag &= ~(INOACC | ISYNC);
1201*7c478bd9Sstevel@tonic-gate 	ITIMES_NOLOCK(ip);
1202*7c478bd9Sstevel@tonic-gate 	TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END,
1203*7c478bd9Sstevel@tonic-gate 		"ufs_wrip_end:vp %p error %d", vp, error);
1204*7c478bd9Sstevel@tonic-gate 	return (error);
1205*7c478bd9Sstevel@tonic-gate }
1206*7c478bd9Sstevel@tonic-gate 
1207*7c478bd9Sstevel@tonic-gate /*
1208*7c478bd9Sstevel@tonic-gate  * rdip does the real work of read requests for ufs.
1209*7c478bd9Sstevel@tonic-gate  */
1210*7c478bd9Sstevel@tonic-gate int
1211*7c478bd9Sstevel@tonic-gate rdip(struct inode *ip, struct uio *uio, int ioflag, cred_t *cr)
1212*7c478bd9Sstevel@tonic-gate {
1213*7c478bd9Sstevel@tonic-gate 	u_offset_t off;
1214*7c478bd9Sstevel@tonic-gate 	caddr_t base;
1215*7c478bd9Sstevel@tonic-gate 	struct fs *fs;
1216*7c478bd9Sstevel@tonic-gate 	struct ufsvfs *ufsvfsp;
1217*7c478bd9Sstevel@tonic-gate 	struct vnode *vp;
1218*7c478bd9Sstevel@tonic-gate 	long oresid = uio->uio_resid;
1219*7c478bd9Sstevel@tonic-gate 	u_offset_t n, on, mapon;
1220*7c478bd9Sstevel@tonic-gate 	int error = 0;
1221*7c478bd9Sstevel@tonic-gate 	int doupdate = 1;
1222*7c478bd9Sstevel@tonic-gate 	uint_t flags, cachemode;
1223*7c478bd9Sstevel@tonic-gate 	int dofree, directio_status;
1224*7c478bd9Sstevel@tonic-gate 	krw_t rwtype;
1225*7c478bd9Sstevel@tonic-gate 	o_mode_t type;
1226*7c478bd9Sstevel@tonic-gate 
1227*7c478bd9Sstevel@tonic-gate 	vp = ITOV(ip);
1228*7c478bd9Sstevel@tonic-gate 
1229*7c478bd9Sstevel@tonic-gate 	TRACE_1(TR_FAC_UFS, TR_UFS_RWIP_START,
1230*7c478bd9Sstevel@tonic-gate 		"ufs_rdip_start:vp %p", vp);
1231*7c478bd9Sstevel@tonic-gate 
1232*7c478bd9Sstevel@tonic-gate 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
1233*7c478bd9Sstevel@tonic-gate 
1234*7c478bd9Sstevel@tonic-gate 	ufsvfsp = ip->i_ufsvfs;
1235*7c478bd9Sstevel@tonic-gate 
1236*7c478bd9Sstevel@tonic-gate 	if (ufsvfsp == NULL)
1237*7c478bd9Sstevel@tonic-gate 		return (EIO);
1238*7c478bd9Sstevel@tonic-gate 
1239*7c478bd9Sstevel@tonic-gate 	fs = ufsvfsp->vfs_fs;
1240*7c478bd9Sstevel@tonic-gate 
1241*7c478bd9Sstevel@tonic-gate 	/* check for valid filetype */
1242*7c478bd9Sstevel@tonic-gate 	type = ip->i_mode & IFMT;
1243*7c478bd9Sstevel@tonic-gate 	if ((type != IFREG) && (type != IFDIR) && (type != IFATTRDIR) &&
1244*7c478bd9Sstevel@tonic-gate 	    (type != IFLNK) && (type != IFSHAD)) {
1245*7c478bd9Sstevel@tonic-gate 		return (EIO);
1246*7c478bd9Sstevel@tonic-gate 	}
1247*7c478bd9Sstevel@tonic-gate 
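	/*
	 * A read starting beyond the maximum offset quietly returns
	 * success with nothing transferred; a negative offset is invalid.
	 */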
1248*7c478bd9Sstevel@tonic-gate 	if (uio->uio_loffset > UFS_MAXOFFSET_T) {
1249*7c478bd9Sstevel@tonic-gate 		TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END,
1250*7c478bd9Sstevel@tonic-gate 			"ufs_rdip_end:vp %p error %d", vp, EINVAL);
1251*7c478bd9Sstevel@tonic-gate 		error = 0;
1252*7c478bd9Sstevel@tonic-gate 		goto out;
1253*7c478bd9Sstevel@tonic-gate 	}
1254*7c478bd9Sstevel@tonic-gate 	if (uio->uio_loffset < (offset_t)0) {
1255*7c478bd9Sstevel@tonic-gate 		TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END,
1256*7c478bd9Sstevel@tonic-gate 			"ufs_rdip_end:vp %p error %d", vp, EINVAL);
1257*7c478bd9Sstevel@tonic-gate 		return (EINVAL);
1258*7c478bd9Sstevel@tonic-gate 	}
1259*7c478bd9Sstevel@tonic-gate 	if (uio->uio_resid == 0) {
1260*7c478bd9Sstevel@tonic-gate 		TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END,
1261*7c478bd9Sstevel@tonic-gate 			"ufs_rdip_end:vp %p error %d", vp, 0);
1262*7c478bd9Sstevel@tonic-gate 		return (0);
1263*7c478bd9Sstevel@tonic-gate 	}
1264*7c478bd9Sstevel@tonic-gate 
1265*7c478bd9Sstevel@tonic-gate 	if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) && (fs->fs_ronly == 0) &&
1266*7c478bd9Sstevel@tonic-gate 		(!ufsvfsp->vfs_noatime)) {
1267*7c478bd9Sstevel@tonic-gate 		mutex_enter(&ip->i_tlock);
1268*7c478bd9Sstevel@tonic-gate 		ip->i_flag |= IACC;
1269*7c478bd9Sstevel@tonic-gate 		mutex_exit(&ip->i_tlock);
1270*7c478bd9Sstevel@tonic-gate 	}
1271*7c478bd9Sstevel@tonic-gate 	/*
1272*7c478bd9Sstevel@tonic-gate 	 * Try to go direct
1273*7c478bd9Sstevel@tonic-gate 	 */
1274*7c478bd9Sstevel@tonic-gate 	if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) {
1275*7c478bd9Sstevel@tonic-gate 		error = ufs_directio_read(ip, uio, cr, &directio_status);
1276*7c478bd9Sstevel@tonic-gate 		if (directio_status == DIRECTIO_SUCCESS)
1277*7c478bd9Sstevel@tonic-gate 			goto out;
1278*7c478bd9Sstevel@tonic-gate 	}
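	/*
	 * If directio was not attempted or did not succeed, fall
	 * through to the cached (segmap) path below.
	 */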
1279*7c478bd9Sstevel@tonic-gate 
1280*7c478bd9Sstevel@tonic-gate 	rwtype = (rw_write_held(&ip->i_contents) ? RW_WRITER : RW_READER);
1281*7c478bd9Sstevel@tonic-gate 
1282*7c478bd9Sstevel@tonic-gate 	/*
1283*7c478bd9Sstevel@tonic-gate 	 * If cache_read_ahead is enabled, we will
1284*7c478bd9Sstevel@tonic-gate 	 * release the pages at the tail of the cache
1285*7c478bd9Sstevel@tonic-gate 	 * list, otherwise we will put them at the head.
1286*7c478bd9Sstevel@tonic-gate 	 */
1287*7c478bd9Sstevel@tonic-gate 	if (cache_read_ahead)
1288*7c478bd9Sstevel@tonic-gate 		cachemode = SM_FREE | SM_ASYNC;
1289*7c478bd9Sstevel@tonic-gate 	else
1290*7c478bd9Sstevel@tonic-gate 		cachemode = SM_FREE | SM_DONTNEED | SM_ASYNC;
1291*7c478bd9Sstevel@tonic-gate 
1292*7c478bd9Sstevel@tonic-gate 	do {
1293*7c478bd9Sstevel@tonic-gate 		offset_t diff;
1294*7c478bd9Sstevel@tonic-gate 		u_offset_t uoff = uio->uio_loffset;
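		/*
		 * off/mapon split uoff on a MAXBSIZE boundary for the
		 * segmap window; "on" is the offset within the current
		 * fs block, and "n" limits this pass to the remainder
		 * of that block or of the request, whichever is smaller.
		 */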
1295*7c478bd9Sstevel@tonic-gate 		off = uoff & (offset_t)MAXBMASK;
1296*7c478bd9Sstevel@tonic-gate 		mapon = (u_offset_t)(uoff & (offset_t)MAXBOFFSET);
1297*7c478bd9Sstevel@tonic-gate 		on = (u_offset_t)blkoff(fs, uoff);
1298*7c478bd9Sstevel@tonic-gate 		n = MIN((u_offset_t)fs->fs_bsize - on,
1299*7c478bd9Sstevel@tonic-gate 			(u_offset_t)uio->uio_resid);
1300*7c478bd9Sstevel@tonic-gate 
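		/*
		 * diff is the distance to EOF: at or past EOF there is
		 * nothing left to read, and a request that straddles
		 * EOF is clamped to it.
		 */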
1301*7c478bd9Sstevel@tonic-gate 		diff = ip->i_size - uoff;
1302*7c478bd9Sstevel@tonic-gate 
1303*7c478bd9Sstevel@tonic-gate 		if (diff <= (offset_t)0) {
1304*7c478bd9Sstevel@tonic-gate 			error = 0;
1305*7c478bd9Sstevel@tonic-gate 			goto out;
1306*7c478bd9Sstevel@tonic-gate 		}
1307*7c478bd9Sstevel@tonic-gate 		if (diff < (offset_t)n)
1308*7c478bd9Sstevel@tonic-gate 			n = (int)diff;
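		/*
		 * Free-behind applies only when the pattern looks
		 * sequential (i_nextr matches this window) and we are
		 * already more than smallfile bytes into the file.
		 */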
1309*7c478bd9Sstevel@tonic-gate 		dofree = freebehind &&
1310*7c478bd9Sstevel@tonic-gate 		    ip->i_nextr == (off & PAGEMASK) && off > smallfile;
1311*7c478bd9Sstevel@tonic-gate 
1312*7c478bd9Sstevel@tonic-gate 		/*
1313*7c478bd9Sstevel@tonic-gate 		 * At this point we can enter ufs_getpage() in one of two
1314*7c478bd9Sstevel@tonic-gate 		 * ways:
1315*7c478bd9Sstevel@tonic-gate 		 * 1) segmap_getmapflt() calls ufs_getpage() when the
1316*7c478bd9Sstevel@tonic-gate 		 *    forcefault parameter is true (value of 1 is passed)
1317*7c478bd9Sstevel@tonic-gate 		 * 2) uiomove() causes a page fault.
1318*7c478bd9Sstevel@tonic-gate 		 *
1319*7c478bd9Sstevel@tonic-gate 		 * We cannot hold onto an i_contents reader lock without
1320*7c478bd9Sstevel@tonic-gate 		 * risking deadlock in ufs_getpage() so drop a reader lock.
1321*7c478bd9Sstevel@tonic-gate 		 * The ufs_getpage() dolock logic already allows for a
1322*7c478bd9Sstevel@tonic-gate 		 * thread holding i_contents as writer to work properly
1323*7c478bd9Sstevel@tonic-gate 		 * so we keep a writer lock.
1324*7c478bd9Sstevel@tonic-gate 		 */
1325*7c478bd9Sstevel@tonic-gate 		if (rwtype == RW_READER)
1326*7c478bd9Sstevel@tonic-gate 			rw_exit(&ip->i_contents);
1327*7c478bd9Sstevel@tonic-gate 		base = segmap_getmapflt(segkmap, vp, (off + mapon),
1328*7c478bd9Sstevel@tonic-gate 					(uint_t)n, 1, S_READ);
1329*7c478bd9Sstevel@tonic-gate 
1330*7c478bd9Sstevel@tonic-gate 		error = uiomove(base + mapon, (long)n, UIO_READ, uio);
1331*7c478bd9Sstevel@tonic-gate 
1332*7c478bd9Sstevel@tonic-gate 		flags = 0;
1333*7c478bd9Sstevel@tonic-gate 		if (!error) {
1334*7c478bd9Sstevel@tonic-gate 			/*
1335*7c478bd9Sstevel@tonic-gate 			 * If reading sequentially, we won't need
1336*7c478bd9Sstevel@tonic-gate 			 * this buffer again soon.
1337*7c478bd9Sstevel@tonic-gate 			 */
1338*7c478bd9Sstevel@tonic-gate 			if (dofree) {
1339*7c478bd9Sstevel@tonic-gate 				flags = cachemode;
1340*7c478bd9Sstevel@tonic-gate 			}
1341*7c478bd9Sstevel@tonic-gate 			/*
1342*7c478bd9Sstevel@tonic-gate 			 * In POSIX SYNC (FSYNC and FDSYNC) read mode,
1343*7c478bd9Sstevel@tonic-gate 			 * we want to make sure that the page which has
1344*7c478bd9Sstevel@tonic-gate 			 * been read is written to disk if it is dirty,
1345*7c478bd9Sstevel@tonic-gate 			 * and that the corresponding indirect blocks are
1346*7c478bd9Sstevel@tonic-gate 			 * flushed out as well.
1347*7c478bd9Sstevel@tonic-gate 			 */
1348*7c478bd9Sstevel@tonic-gate 			if ((ioflag & FRSYNC) && (ioflag & (FSYNC|FDSYNC))) {
1349*7c478bd9Sstevel@tonic-gate 				flags &= ~SM_ASYNC;
1350*7c478bd9Sstevel@tonic-gate 				flags |= SM_WRITE;
1351*7c478bd9Sstevel@tonic-gate 			}
1352*7c478bd9Sstevel@tonic-gate 			error = segmap_release(segkmap, base, flags);
1353*7c478bd9Sstevel@tonic-gate 		} else
1354*7c478bd9Sstevel@tonic-gate 			(void) segmap_release(segkmap, base, flags);
1355*7c478bd9Sstevel@tonic-gate 
1356*7c478bd9Sstevel@tonic-gate 		if (rwtype == RW_READER)
1357*7c478bd9Sstevel@tonic-gate 			rw_enter(&ip->i_contents, rwtype);
1358*7c478bd9Sstevel@tonic-gate 	} while (error == 0 && uio->uio_resid > 0 && n != 0);
1359*7c478bd9Sstevel@tonic-gate out:
1360*7c478bd9Sstevel@tonic-gate 	/*
1361*7c478bd9Sstevel@tonic-gate 	 * Inode is updated according to this table if FRSYNC is set.
1362*7c478bd9Sstevel@tonic-gate 	 *
1363*7c478bd9Sstevel@tonic-gate 	 *   FSYNC	  FDSYNC(posix.4)
1364*7c478bd9Sstevel@tonic-gate 	 *   --------------------------
1365*7c478bd9Sstevel@tonic-gate 	 *   always	  IATTCHG|IBDWRITE
1366*7c478bd9Sstevel@tonic-gate 	 */
1367*7c478bd9Sstevel@tonic-gate 	/*
1368*7c478bd9Sstevel@tonic-gate 	 * The inode is not updated if we're logging and the inode is a
1369*7c478bd9Sstevel@tonic-gate 	 * directory with FRSYNC, FSYNC and FDSYNC flags set.
1370*7c478bd9Sstevel@tonic-gate 	 */
1371*7c478bd9Sstevel@tonic-gate 	if (ioflag & FRSYNC) {
1372*7c478bd9Sstevel@tonic-gate 		if (TRANS_ISTRANS(ufsvfsp) && ((ip->i_mode & IFMT) == IFDIR)) {
1373*7c478bd9Sstevel@tonic-gate 			doupdate = 0;
1374*7c478bd9Sstevel@tonic-gate 		}
1375*7c478bd9Sstevel@tonic-gate 		if (doupdate) {
1376*7c478bd9Sstevel@tonic-gate 			if ((ioflag & FSYNC) ||
1377*7c478bd9Sstevel@tonic-gate 			    ((ioflag & FDSYNC) &&
1378*7c478bd9Sstevel@tonic-gate 			    (ip->i_flag & (IATTCHG|IBDWRITE)))) {
1379*7c478bd9Sstevel@tonic-gate 				ufs_iupdat(ip, 1);
1380*7c478bd9Sstevel@tonic-gate 			}
1381*7c478bd9Sstevel@tonic-gate 		}
1382*7c478bd9Sstevel@tonic-gate 	}
1383*7c478bd9Sstevel@tonic-gate 	/*
1384*7c478bd9Sstevel@tonic-gate 	 * If we've already done a partial read, terminate
1385*7c478bd9Sstevel@tonic-gate 	 * the read but return no error.
1386*7c478bd9Sstevel@tonic-gate 	 */
1387*7c478bd9Sstevel@tonic-gate 	if (oresid != uio->uio_resid)
1388*7c478bd9Sstevel@tonic-gate 		error = 0;
1389*7c478bd9Sstevel@tonic-gate 	ITIMES(ip);
1390*7c478bd9Sstevel@tonic-gate 
1391*7c478bd9Sstevel@tonic-gate 	TRACE_2(TR_FAC_UFS, TR_UFS_RWIP_END,
1392*7c478bd9Sstevel@tonic-gate 		"ufs_rdip_end:vp %p error %d", vp, error);
1393*7c478bd9Sstevel@tonic-gate 	return (error);
1394*7c478bd9Sstevel@tonic-gate }
1395*7c478bd9Sstevel@tonic-gate 
1396*7c478bd9Sstevel@tonic-gate /* ARGSUSED */
1397*7c478bd9Sstevel@tonic-gate static int
1398*7c478bd9Sstevel@tonic-gate ufs_ioctl(
1399*7c478bd9Sstevel@tonic-gate 	struct vnode	*vp,
1400*7c478bd9Sstevel@tonic-gate 	int		cmd,
1401*7c478bd9Sstevel@tonic-gate 	intptr_t	arg,
1402*7c478bd9Sstevel@tonic-gate 	int		flag,
1403*7c478bd9Sstevel@tonic-gate 	struct cred	*cr,
1404*7c478bd9Sstevel@tonic-gate 	int		*rvalp)
1405*7c478bd9Sstevel@tonic-gate {
1406*7c478bd9Sstevel@tonic-gate 	struct lockfs	lockfs, lockfs_out;
1407*7c478bd9Sstevel@tonic-gate 	struct ufsvfs	*ufsvfsp = VTOI(vp)->i_ufsvfs;
1408*7c478bd9Sstevel@tonic-gate 	char		*comment, *original_comment;
1409*7c478bd9Sstevel@tonic-gate 	struct fs	*fs;
1410*7c478bd9Sstevel@tonic-gate 	struct ulockfs	*ulp;
1411*7c478bd9Sstevel@tonic-gate 	offset_t	off;
1412*7c478bd9Sstevel@tonic-gate 	extern int	maxphys;
1413*7c478bd9Sstevel@tonic-gate 	int		error;
1414*7c478bd9Sstevel@tonic-gate 	int		issync;
1415*7c478bd9Sstevel@tonic-gate 	int		trans_size;
1416*7c478bd9Sstevel@tonic-gate 
1417*7c478bd9Sstevel@tonic-gate 
1418*7c478bd9Sstevel@tonic-gate 	/*
1419*7c478bd9Sstevel@tonic-gate 	 * forcibly unmounted
1420*7c478bd9Sstevel@tonic-gate 	 */
1421*7c478bd9Sstevel@tonic-gate 	if (ufsvfsp == NULL) {
1422*7c478bd9Sstevel@tonic-gate 		return (EIO);
1423*7c478bd9Sstevel@tonic-gate 	}
1424*7c478bd9Sstevel@tonic-gate 
1425*7c478bd9Sstevel@tonic-gate 	fs = ufsvfsp->vfs_fs;
1426*7c478bd9Sstevel@tonic-gate 
1427*7c478bd9Sstevel@tonic-gate 	if (cmd == Q_QUOTACTL) {
1428*7c478bd9Sstevel@tonic-gate 		error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_QUOTA_MASK);
1429*7c478bd9Sstevel@tonic-gate 		if (error)
1430*7c478bd9Sstevel@tonic-gate 			return (error);
1431*7c478bd9Sstevel@tonic-gate 
1432*7c478bd9Sstevel@tonic-gate 		if (ulp) {
1433*7c478bd9Sstevel@tonic-gate 			TRANS_BEGIN_ASYNC(ufsvfsp, TOP_QUOTA,
1434*7c478bd9Sstevel@tonic-gate 						TOP_SETQUOTA_SIZE(fs));
1435*7c478bd9Sstevel@tonic-gate 		}
1436*7c478bd9Sstevel@tonic-gate 
1437*7c478bd9Sstevel@tonic-gate 		error = quotactl(vp, arg, flag, cr);
1438*7c478bd9Sstevel@tonic-gate 
1439*7c478bd9Sstevel@tonic-gate 		if (ulp) {
1440*7c478bd9Sstevel@tonic-gate 			TRANS_END_ASYNC(ufsvfsp, TOP_QUOTA,
1441*7c478bd9Sstevel@tonic-gate 						TOP_SETQUOTA_SIZE(fs));
1442*7c478bd9Sstevel@tonic-gate 			ufs_lockfs_end(ulp);
1443*7c478bd9Sstevel@tonic-gate 		}
1444*7c478bd9Sstevel@tonic-gate 		return (error);
1445*7c478bd9Sstevel@tonic-gate 	}
1446*7c478bd9Sstevel@tonic-gate 
1447*7c478bd9Sstevel@tonic-gate 	switch (cmd) {
1448*7c478bd9Sstevel@tonic-gate 		case _FIOLFS:
1449*7c478bd9Sstevel@tonic-gate 			/*
1450*7c478bd9Sstevel@tonic-gate 			 * file system locking
1451*7c478bd9Sstevel@tonic-gate 			 */
1452*7c478bd9Sstevel@tonic-gate 			if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1453*7c478bd9Sstevel@tonic-gate 				return (EPERM);
1454*7c478bd9Sstevel@tonic-gate 
1455*7c478bd9Sstevel@tonic-gate 			if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
1456*7c478bd9Sstevel@tonic-gate 				if (copyin((caddr_t)arg, &lockfs,
1457*7c478bd9Sstevel@tonic-gate 						sizeof (struct lockfs)))
1458*7c478bd9Sstevel@tonic-gate 					return (EFAULT);
1459*7c478bd9Sstevel@tonic-gate 			}
1460*7c478bd9Sstevel@tonic-gate #ifdef _SYSCALL32_IMPL
1461*7c478bd9Sstevel@tonic-gate 			else {
1462*7c478bd9Sstevel@tonic-gate 				struct lockfs32	lockfs32;
1463*7c478bd9Sstevel@tonic-gate 				/* Translate ILP32 lockfs to LP64 lockfs */
1464*7c478bd9Sstevel@tonic-gate 				if (copyin((caddr_t)arg, &lockfs32,
1465*7c478bd9Sstevel@tonic-gate 				    sizeof (struct lockfs32)))
1466*7c478bd9Sstevel@tonic-gate 					return (EFAULT);
1467*7c478bd9Sstevel@tonic-gate 				lockfs.lf_lock = (ulong_t)lockfs32.lf_lock;
1468*7c478bd9Sstevel@tonic-gate 				lockfs.lf_flags = (ulong_t)lockfs32.lf_flags;
1469*7c478bd9Sstevel@tonic-gate 				lockfs.lf_key = (ulong_t)lockfs32.lf_key;
1470*7c478bd9Sstevel@tonic-gate 				lockfs.lf_comlen = (ulong_t)lockfs32.lf_comlen;
1471*7c478bd9Sstevel@tonic-gate 				lockfs.lf_comment =
1472*7c478bd9Sstevel@tonic-gate 					(caddr_t)(uintptr_t)lockfs32.lf_comment;
1473*7c478bd9Sstevel@tonic-gate 			}
1474*7c478bd9Sstevel@tonic-gate #endif /* _SYSCALL32_IMPL */
1475*7c478bd9Sstevel@tonic-gate 
			/*
			 * Remember the user's comment pointer so it can
			 * be restored before the struct is copied back
			 * out, whether or not a comment was supplied.
			 */
			original_comment = lockfs.lf_comment;
1476*7c478bd9Sstevel@tonic-gate 			if (lockfs.lf_comlen) {
1477*7c478bd9Sstevel@tonic-gate 				if (lockfs.lf_comlen > LOCKFS_MAXCOMMENTLEN)
1478*7c478bd9Sstevel@tonic-gate 					return (ENAMETOOLONG);
1479*7c478bd9Sstevel@tonic-gate 				comment = kmem_alloc(lockfs.lf_comlen,
1480*7c478bd9Sstevel@tonic-gate 						KM_SLEEP);
1481*7c478bd9Sstevel@tonic-gate 				if (copyin(lockfs.lf_comment, comment,
1482*7c478bd9Sstevel@tonic-gate 					lockfs.lf_comlen)) {
1483*7c478bd9Sstevel@tonic-gate 					kmem_free(comment, lockfs.lf_comlen);
1484*7c478bd9Sstevel@tonic-gate 					return (EFAULT);
1485*7c478bd9Sstevel@tonic-gate 				}
1487*7c478bd9Sstevel@tonic-gate 				lockfs.lf_comment = comment;
1488*7c478bd9Sstevel@tonic-gate 			}
1489*7c478bd9Sstevel@tonic-gate 			if ((error = ufs_fiolfs(vp, &lockfs, 0)) == 0) {
1490*7c478bd9Sstevel@tonic-gate 				lockfs.lf_comment = original_comment;
1491*7c478bd9Sstevel@tonic-gate 
1492*7c478bd9Sstevel@tonic-gate 				if ((flag & DATAMODEL_MASK) ==
1493*7c478bd9Sstevel@tonic-gate 				    DATAMODEL_NATIVE) {
1494*7c478bd9Sstevel@tonic-gate 					(void) copyout(&lockfs, (caddr_t)arg,
1495*7c478bd9Sstevel@tonic-gate 					    sizeof (struct lockfs));
1496*7c478bd9Sstevel@tonic-gate 				}
1497*7c478bd9Sstevel@tonic-gate #ifdef _SYSCALL32_IMPL
1498*7c478bd9Sstevel@tonic-gate 				else {
1499*7c478bd9Sstevel@tonic-gate 					struct lockfs32	lockfs32;
1500*7c478bd9Sstevel@tonic-gate 					/* Translate LP64 to ILP32 lockfs */
1501*7c478bd9Sstevel@tonic-gate 					lockfs32.lf_lock =
1502*7c478bd9Sstevel@tonic-gate 					    (uint32_t)lockfs.lf_lock;
1503*7c478bd9Sstevel@tonic-gate 					lockfs32.lf_flags =
1504*7c478bd9Sstevel@tonic-gate 					    (uint32_t)lockfs.lf_flags;
1505*7c478bd9Sstevel@tonic-gate 					lockfs32.lf_key =
1506*7c478bd9Sstevel@tonic-gate 					    (uint32_t)lockfs.lf_key;
1507*7c478bd9Sstevel@tonic-gate 					lockfs32.lf_comlen =
1508*7c478bd9Sstevel@tonic-gate 					    (uint32_t)lockfs.lf_comlen;
1509*7c478bd9Sstevel@tonic-gate 					lockfs32.lf_comment =
1510*7c478bd9Sstevel@tonic-gate 					(uint32_t)(uintptr_t)lockfs.lf_comment;
1511*7c478bd9Sstevel@tonic-gate 					(void) copyout(&lockfs32, (caddr_t)arg,
1512*7c478bd9Sstevel@tonic-gate 					    sizeof (struct lockfs32));
1513*7c478bd9Sstevel@tonic-gate 				}
1514*7c478bd9Sstevel@tonic-gate #endif /* _SYSCALL32_IMPL */
1515*7c478bd9Sstevel@tonic-gate 
1516*7c478bd9Sstevel@tonic-gate 			} else {
1517*7c478bd9Sstevel@tonic-gate 				if (lockfs.lf_comlen)
1518*7c478bd9Sstevel@tonic-gate 					kmem_free(comment, lockfs.lf_comlen);
1519*7c478bd9Sstevel@tonic-gate 			}
1520*7c478bd9Sstevel@tonic-gate 			return (error);
1521*7c478bd9Sstevel@tonic-gate 
1522*7c478bd9Sstevel@tonic-gate 		case _FIOLFSS:
1523*7c478bd9Sstevel@tonic-gate 			/*
1524*7c478bd9Sstevel@tonic-gate 			 * get file system locking status
1525*7c478bd9Sstevel@tonic-gate 			 */
1526*7c478bd9Sstevel@tonic-gate 
1527*7c478bd9Sstevel@tonic-gate 			if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
1528*7c478bd9Sstevel@tonic-gate 				if (copyin((caddr_t)arg, &lockfs,
1529*7c478bd9Sstevel@tonic-gate 						sizeof (struct lockfs)))
1530*7c478bd9Sstevel@tonic-gate 					return (EFAULT);
1531*7c478bd9Sstevel@tonic-gate 			}
1532*7c478bd9Sstevel@tonic-gate #ifdef _SYSCALL32_IMPL
1533*7c478bd9Sstevel@tonic-gate 			else {
1534*7c478bd9Sstevel@tonic-gate 				struct lockfs32	lockfs32;
1535*7c478bd9Sstevel@tonic-gate 				/* Translate ILP32 lockfs to LP64 lockfs */
1536*7c478bd9Sstevel@tonic-gate 				if (copyin((caddr_t)arg, &lockfs32,
1537*7c478bd9Sstevel@tonic-gate 						sizeof (struct lockfs32)))
1538*7c478bd9Sstevel@tonic-gate 					return (EFAULT);
1539*7c478bd9Sstevel@tonic-gate 				lockfs.lf_lock = (ulong_t)lockfs32.lf_lock;
1540*7c478bd9Sstevel@tonic-gate 				lockfs.lf_flags = (ulong_t)lockfs32.lf_flags;
1541*7c478bd9Sstevel@tonic-gate 				lockfs.lf_key = (ulong_t)lockfs32.lf_key;
1542*7c478bd9Sstevel@tonic-gate 				lockfs.lf_comlen = (ulong_t)lockfs32.lf_comlen;
1543*7c478bd9Sstevel@tonic-gate 				lockfs.lf_comment =
1544*7c478bd9Sstevel@tonic-gate 					(caddr_t)(uintptr_t)lockfs32.lf_comment;
1545*7c478bd9Sstevel@tonic-gate 			}
1546*7c478bd9Sstevel@tonic-gate #endif /* _SYSCALL32_IMPL */
1547*7c478bd9Sstevel@tonic-gate 
1548*7c478bd9Sstevel@tonic-gate 			if (error = ufs_fiolfss(vp, &lockfs_out))
1549*7c478bd9Sstevel@tonic-gate 				return (error);
1550*7c478bd9Sstevel@tonic-gate 			lockfs.lf_lock = lockfs_out.lf_lock;
1551*7c478bd9Sstevel@tonic-gate 			lockfs.lf_key = lockfs_out.lf_key;
1552*7c478bd9Sstevel@tonic-gate 			lockfs.lf_flags = lockfs_out.lf_flags;
1553*7c478bd9Sstevel@tonic-gate 			lockfs.lf_comlen = MIN(lockfs.lf_comlen,
1554*7c478bd9Sstevel@tonic-gate 				lockfs_out.lf_comlen);
1555*7c478bd9Sstevel@tonic-gate 
1556*7c478bd9Sstevel@tonic-gate 			if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
1557*7c478bd9Sstevel@tonic-gate 				if (copyout(&lockfs, (caddr_t)arg,
1558*7c478bd9Sstevel@tonic-gate 						sizeof (struct lockfs)))
1559*7c478bd9Sstevel@tonic-gate 					return (EFAULT);
1560*7c478bd9Sstevel@tonic-gate 			}
1561*7c478bd9Sstevel@tonic-gate #ifdef _SYSCALL32_IMPL
1562*7c478bd9Sstevel@tonic-gate 			else {
1563*7c478bd9Sstevel@tonic-gate 				/* Translate LP64 to ILP32 lockfs */
1564*7c478bd9Sstevel@tonic-gate 				struct lockfs32	lockfs32;
1565*7c478bd9Sstevel@tonic-gate 				lockfs32.lf_lock = (uint32_t)lockfs.lf_lock;
1566*7c478bd9Sstevel@tonic-gate 				lockfs32.lf_flags = (uint32_t)lockfs.lf_flags;
1567*7c478bd9Sstevel@tonic-gate 				lockfs32.lf_key = (uint32_t)lockfs.lf_key;
1568*7c478bd9Sstevel@tonic-gate 				lockfs32.lf_comlen = (uint32_t)lockfs.lf_comlen;
1569*7c478bd9Sstevel@tonic-gate 				lockfs32.lf_comment =
1570*7c478bd9Sstevel@tonic-gate 					(uint32_t)(uintptr_t)lockfs.lf_comment;
1571*7c478bd9Sstevel@tonic-gate 				if (copyout(&lockfs32, (caddr_t)arg,
1572*7c478bd9Sstevel@tonic-gate 					    sizeof (struct lockfs32)))
1573*7c478bd9Sstevel@tonic-gate 					return (EFAULT);
1574*7c478bd9Sstevel@tonic-gate 			}
1575*7c478bd9Sstevel@tonic-gate #endif /* _SYSCALL32_IMPL */
1576*7c478bd9Sstevel@tonic-gate 
1577*7c478bd9Sstevel@tonic-gate 			if (lockfs.lf_comlen &&
1578*7c478bd9Sstevel@tonic-gate 			    lockfs.lf_comment && lockfs_out.lf_comment)
1579*7c478bd9Sstevel@tonic-gate 				if (copyout(lockfs_out.lf_comment,
1580*7c478bd9Sstevel@tonic-gate 					lockfs.lf_comment,
1581*7c478bd9Sstevel@tonic-gate 					lockfs.lf_comlen))
1582*7c478bd9Sstevel@tonic-gate 					return (EFAULT);
1583*7c478bd9Sstevel@tonic-gate 			return (0);
1584*7c478bd9Sstevel@tonic-gate 
1585*7c478bd9Sstevel@tonic-gate 		case _FIOSATIME:
1586*7c478bd9Sstevel@tonic-gate 			/*
1587*7c478bd9Sstevel@tonic-gate 			 * set access time
1588*7c478bd9Sstevel@tonic-gate 			 */
1589*7c478bd9Sstevel@tonic-gate 
1590*7c478bd9Sstevel@tonic-gate 			/*
1591*7c478bd9Sstevel@tonic-gate 			 * if mounted w/o atime, return quietly.
1592*7c478bd9Sstevel@tonic-gate 			 * I briefly thought about returning ENOSYS, but
1593*7c478bd9Sstevel@tonic-gate 			 * figured that most apps would consider this fatal,
1594*7c478bd9Sstevel@tonic-gate 			 * but the idea is to make this as seamless as possible.
1595*7c478bd9Sstevel@tonic-gate 			 */
1596*7c478bd9Sstevel@tonic-gate 			if (ufsvfsp->vfs_noatime)
1597*7c478bd9Sstevel@tonic-gate 				return (0);
1598*7c478bd9Sstevel@tonic-gate 
1599*7c478bd9Sstevel@tonic-gate 			error = ufs_lockfs_begin(ufsvfsp, &ulp,
1600*7c478bd9Sstevel@tonic-gate 					ULOCKFS_SETATTR_MASK);
1601*7c478bd9Sstevel@tonic-gate 			if (error)
1602*7c478bd9Sstevel@tonic-gate 				return (error);
1603*7c478bd9Sstevel@tonic-gate 
1604*7c478bd9Sstevel@tonic-gate 			if (ulp) {
1605*7c478bd9Sstevel@tonic-gate 				trans_size = (int)TOP_SETATTR_SIZE(VTOI(vp));
1606*7c478bd9Sstevel@tonic-gate 				TRANS_BEGIN_CSYNC(ufsvfsp, issync,
1607*7c478bd9Sstevel@tonic-gate 						TOP_SETATTR, trans_size);
1608*7c478bd9Sstevel@tonic-gate 			}
1609*7c478bd9Sstevel@tonic-gate 
1610*7c478bd9Sstevel@tonic-gate 			error = ufs_fiosatime(vp, (struct timeval *)arg,
1611*7c478bd9Sstevel@tonic-gate 					flag, cr);
1612*7c478bd9Sstevel@tonic-gate 
1613*7c478bd9Sstevel@tonic-gate 			if (ulp) {
1614*7c478bd9Sstevel@tonic-gate 				TRANS_END_CSYNC(ufsvfsp, error, issync,
1615*7c478bd9Sstevel@tonic-gate 						TOP_SETATTR, trans_size);
1616*7c478bd9Sstevel@tonic-gate 				ufs_lockfs_end(ulp);
1617*7c478bd9Sstevel@tonic-gate 			}
1618*7c478bd9Sstevel@tonic-gate 			return (error);
1619*7c478bd9Sstevel@tonic-gate 
1620*7c478bd9Sstevel@tonic-gate 		case _FIOSDIO:
1621*7c478bd9Sstevel@tonic-gate 			/*
1622*7c478bd9Sstevel@tonic-gate 			 * set delayed-io
1623*7c478bd9Sstevel@tonic-gate 			 */
1624*7c478bd9Sstevel@tonic-gate 			return (ufs_fiosdio(vp, (uint_t *)arg, flag, cr));
1625*7c478bd9Sstevel@tonic-gate 
1626*7c478bd9Sstevel@tonic-gate 		case _FIOGDIO:
1627*7c478bd9Sstevel@tonic-gate 			/*
1628*7c478bd9Sstevel@tonic-gate 			 * get delayed-io
1629*7c478bd9Sstevel@tonic-gate 			 */
1630*7c478bd9Sstevel@tonic-gate 			return (ufs_fiogdio(vp, (uint_t *)arg, flag, cr));
1631*7c478bd9Sstevel@tonic-gate 
1632*7c478bd9Sstevel@tonic-gate 		case _FIOIO:
1633*7c478bd9Sstevel@tonic-gate 			/*
1634*7c478bd9Sstevel@tonic-gate 			 * inode open
1635*7c478bd9Sstevel@tonic-gate 			 */
1636*7c478bd9Sstevel@tonic-gate 			error = ufs_lockfs_begin(ufsvfsp, &ulp,
1637*7c478bd9Sstevel@tonic-gate 					ULOCKFS_VGET_MASK);
1638*7c478bd9Sstevel@tonic-gate 			if (error)
1639*7c478bd9Sstevel@tonic-gate 				return (error);
1640*7c478bd9Sstevel@tonic-gate 
1641*7c478bd9Sstevel@tonic-gate 			error = ufs_fioio(vp, (struct fioio *)arg, flag, cr);
1642*7c478bd9Sstevel@tonic-gate 
1643*7c478bd9Sstevel@tonic-gate 			if (ulp) {
1644*7c478bd9Sstevel@tonic-gate 				ufs_lockfs_end(ulp);
1645*7c478bd9Sstevel@tonic-gate 			}
1646*7c478bd9Sstevel@tonic-gate 			return (error);
1647*7c478bd9Sstevel@tonic-gate 
1648*7c478bd9Sstevel@tonic-gate 		case _FIOFFS:
1649*7c478bd9Sstevel@tonic-gate 			/*
1650*7c478bd9Sstevel@tonic-gate 			 * file system flush (push w/invalidate)
1651*7c478bd9Sstevel@tonic-gate 			 */
1652*7c478bd9Sstevel@tonic-gate 			if ((caddr_t)arg != NULL)
1653*7c478bd9Sstevel@tonic-gate 				return (EINVAL);
1654*7c478bd9Sstevel@tonic-gate 			return (ufs_fioffs(vp, NULL, cr));
1655*7c478bd9Sstevel@tonic-gate 
1656*7c478bd9Sstevel@tonic-gate 		case _FIOISBUSY:
1657*7c478bd9Sstevel@tonic-gate 			/*
1658*7c478bd9Sstevel@tonic-gate 			 * Contract-private interface for Legato.
1659*7c478bd9Sstevel@tonic-gate 			 * Purge this vnode from the DNLC and decide
1660*7c478bd9Sstevel@tonic-gate 			 * if this vnode is busy (*arg == 1) or not
1661*7c478bd9Sstevel@tonic-gate 			 * (*arg == 0).
1662*7c478bd9Sstevel@tonic-gate 			 */
1663*7c478bd9Sstevel@tonic-gate 			if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1664*7c478bd9Sstevel@tonic-gate 				return (EPERM);
1665*7c478bd9Sstevel@tonic-gate 			error = ufs_fioisbusy(vp, (int *)arg, cr);
1666*7c478bd9Sstevel@tonic-gate 			return (error);
1667*7c478bd9Sstevel@tonic-gate 
1668*7c478bd9Sstevel@tonic-gate 		case _FIODIRECTIO:
1669*7c478bd9Sstevel@tonic-gate 			return (ufs_fiodirectio(vp, (int)arg, cr));
1670*7c478bd9Sstevel@tonic-gate 
1671*7c478bd9Sstevel@tonic-gate 		case _FIOTUNE:
1672*7c478bd9Sstevel@tonic-gate 			/*
1673*7c478bd9Sstevel@tonic-gate 			 * Tune the file system (aka setting fs attributes)
1674*7c478bd9Sstevel@tonic-gate 			 */
1675*7c478bd9Sstevel@tonic-gate 			error = ufs_lockfs_begin(ufsvfsp, &ulp,
1676*7c478bd9Sstevel@tonic-gate 					ULOCKFS_SETATTR_MASK);
1677*7c478bd9Sstevel@tonic-gate 			if (error)
1678*7c478bd9Sstevel@tonic-gate 				return (error);
1679*7c478bd9Sstevel@tonic-gate 
1680*7c478bd9Sstevel@tonic-gate 			error = ufs_fiotune(vp, (struct fiotune *)arg, cr);
1681*7c478bd9Sstevel@tonic-gate 
1682*7c478bd9Sstevel@tonic-gate 			if (ulp)
1683*7c478bd9Sstevel@tonic-gate 				ufs_lockfs_end(ulp);
1684*7c478bd9Sstevel@tonic-gate 			return (error);
1685*7c478bd9Sstevel@tonic-gate 
1686*7c478bd9Sstevel@tonic-gate 		case _FIOLOGENABLE:
1687*7c478bd9Sstevel@tonic-gate 			if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1688*7c478bd9Sstevel@tonic-gate 				return (EPERM);
1689*7c478bd9Sstevel@tonic-gate 			return (ufs_fiologenable(vp, (void *)arg, cr, flag));
1690*7c478bd9Sstevel@tonic-gate 
1691*7c478bd9Sstevel@tonic-gate 		case _FIOLOGDISABLE:
1692*7c478bd9Sstevel@tonic-gate 			if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1693*7c478bd9Sstevel@tonic-gate 				return (EPERM);
1694*7c478bd9Sstevel@tonic-gate 			return (ufs_fiologdisable(vp, (void *)arg, cr, flag));
1695*7c478bd9Sstevel@tonic-gate 
1696*7c478bd9Sstevel@tonic-gate 		case _FIOISLOG:
1697*7c478bd9Sstevel@tonic-gate 			return (ufs_fioislog(vp, (void *)arg, cr, flag));
1698*7c478bd9Sstevel@tonic-gate 
1699*7c478bd9Sstevel@tonic-gate 		case _FIOSNAPSHOTCREATE_MULTI:
1700*7c478bd9Sstevel@tonic-gate 		{
1701*7c478bd9Sstevel@tonic-gate 			struct fiosnapcreate_multi	fc, *fcp;
1702*7c478bd9Sstevel@tonic-gate 			size_t	fcm_size;
1703*7c478bd9Sstevel@tonic-gate 
1704*7c478bd9Sstevel@tonic-gate 			if (copyin((void *)arg, &fc, sizeof (fc)))
1705*7c478bd9Sstevel@tonic-gate 				return (EFAULT);
1706*7c478bd9Sstevel@tonic-gate 			if (fc.backfilecount > MAX_BACKFILE_COUNT)
1707*7c478bd9Sstevel@tonic-gate 				return (EINVAL);
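			/*
			 * The structure declares room for one backfile
			 * descriptor inline, hence the allocation adds
			 * only backfilecount - 1 extra ints.
			 */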
1708*7c478bd9Sstevel@tonic-gate 			fcm_size = sizeof (struct fiosnapcreate_multi) +
1709*7c478bd9Sstevel@tonic-gate 			    (fc.backfilecount - 1) * sizeof (int);
1710*7c478bd9Sstevel@tonic-gate 			fcp = (struct fiosnapcreate_multi *)
1711*7c478bd9Sstevel@tonic-gate 			    kmem_alloc(fcm_size, KM_SLEEP);
1712*7c478bd9Sstevel@tonic-gate 			if (copyin((void *)arg, fcp, fcm_size)) {
1713*7c478bd9Sstevel@tonic-gate 				kmem_free(fcp, fcm_size);
1714*7c478bd9Sstevel@tonic-gate 				return (EFAULT);
1715*7c478bd9Sstevel@tonic-gate 			}
1716*7c478bd9Sstevel@tonic-gate 			error = ufs_snap_create(vp, fcp, cr);
1717*7c478bd9Sstevel@tonic-gate 			if (!error && copyout(fcp, (void *)arg, fcm_size))
1718*7c478bd9Sstevel@tonic-gate 				error = EFAULT;
1719*7c478bd9Sstevel@tonic-gate 			kmem_free(fcp, fcm_size);
1720*7c478bd9Sstevel@tonic-gate 			return (error);
1721*7c478bd9Sstevel@tonic-gate 		}
1722*7c478bd9Sstevel@tonic-gate 
1723*7c478bd9Sstevel@tonic-gate 		case _FIOSNAPSHOTDELETE:
1724*7c478bd9Sstevel@tonic-gate 		{
1725*7c478bd9Sstevel@tonic-gate 			struct fiosnapdelete	fc;
1726*7c478bd9Sstevel@tonic-gate 
1727*7c478bd9Sstevel@tonic-gate 			if (copyin((void *)arg, &fc, sizeof (fc)))
1728*7c478bd9Sstevel@tonic-gate 				return (EFAULT);
1729*7c478bd9Sstevel@tonic-gate 			error = ufs_snap_delete(vp, &fc, cr);
1730*7c478bd9Sstevel@tonic-gate 			if (!error && copyout(&fc, (void *)arg, sizeof (fc)))
1731*7c478bd9Sstevel@tonic-gate 				error = EFAULT;
1732*7c478bd9Sstevel@tonic-gate 			return (error);
1733*7c478bd9Sstevel@tonic-gate 		}
1734*7c478bd9Sstevel@tonic-gate 
1735*7c478bd9Sstevel@tonic-gate 		case _FIOGETSUPERBLOCK:
1736*7c478bd9Sstevel@tonic-gate 			if (copyout(fs, (void *)arg, SBSIZE))
1737*7c478bd9Sstevel@tonic-gate 				return (EFAULT);
1738*7c478bd9Sstevel@tonic-gate 			return (0);
1739*7c478bd9Sstevel@tonic-gate 
1740*7c478bd9Sstevel@tonic-gate 		case _FIOGETMAXPHYS:
1741*7c478bd9Sstevel@tonic-gate 			if (copyout(&maxphys, (void *)arg, sizeof (maxphys)))
1742*7c478bd9Sstevel@tonic-gate 				return (EFAULT);
1743*7c478bd9Sstevel@tonic-gate 			return (0);
1744*7c478bd9Sstevel@tonic-gate 
1745*7c478bd9Sstevel@tonic-gate 		/*
1746*7c478bd9Sstevel@tonic-gate 		 * The following 3 ioctls are for TSufs support,
1747*7c478bd9Sstevel@tonic-gate 		 * although they could potentially be used elsewhere
1748*7c478bd9Sstevel@tonic-gate 		 */
1749*7c478bd9Sstevel@tonic-gate 		case _FIO_SET_LUFS_DEBUG:
1750*7c478bd9Sstevel@tonic-gate 			if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1751*7c478bd9Sstevel@tonic-gate 				return (EPERM);
1752*7c478bd9Sstevel@tonic-gate 			lufs_debug = (uint32_t)arg;
1753*7c478bd9Sstevel@tonic-gate 			return (0);
1754*7c478bd9Sstevel@tonic-gate 
1755*7c478bd9Sstevel@tonic-gate 		case _FIO_SET_LUFS_ERROR:
1756*7c478bd9Sstevel@tonic-gate 			if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1757*7c478bd9Sstevel@tonic-gate 				return (EPERM);
1758*7c478bd9Sstevel@tonic-gate 			TRANS_SETERROR(ufsvfsp);
1759*7c478bd9Sstevel@tonic-gate 			return (0);
1760*7c478bd9Sstevel@tonic-gate 
1761*7c478bd9Sstevel@tonic-gate 		case _FIO_GET_TOP_STATS:
1762*7c478bd9Sstevel@tonic-gate 		{
1763*7c478bd9Sstevel@tonic-gate 			fio_lufs_stats_t *ls;
1764*7c478bd9Sstevel@tonic-gate 			ml_unit_t *ul = ufsvfsp->vfs_log;
1765*7c478bd9Sstevel@tonic-gate 
1766*7c478bd9Sstevel@tonic-gate 			ls = kmem_zalloc(sizeof (*ls), KM_SLEEP);
1767*7c478bd9Sstevel@tonic-gate 			ls->ls_debug = ul->un_debug; /* return debug value */
1768*7c478bd9Sstevel@tonic-gate 			/* Copy structure if statistics are being kept */
1769*7c478bd9Sstevel@tonic-gate 			if (ul->un_logmap->mtm_tops) {
1770*7c478bd9Sstevel@tonic-gate 				ls->ls_topstats = *(ul->un_logmap->mtm_tops);
1771*7c478bd9Sstevel@tonic-gate 			}
1772*7c478bd9Sstevel@tonic-gate 			error = 0;
1773*7c478bd9Sstevel@tonic-gate 			if (copyout(ls, (void *)arg, sizeof (*ls)))
1774*7c478bd9Sstevel@tonic-gate 				error = EFAULT;
1775*7c478bd9Sstevel@tonic-gate 			kmem_free(ls, sizeof (*ls));
1776*7c478bd9Sstevel@tonic-gate 			return (error);
1777*7c478bd9Sstevel@tonic-gate 		}
1778*7c478bd9Sstevel@tonic-gate 
1779*7c478bd9Sstevel@tonic-gate 		case _FIO_SEEK_DATA:
1780*7c478bd9Sstevel@tonic-gate 		case _FIO_SEEK_HOLE:
1781*7c478bd9Sstevel@tonic-gate 			if (ddi_copyin((void *)arg, &off, sizeof (off), flag))
1782*7c478bd9Sstevel@tonic-gate 				return (EFAULT);
1783*7c478bd9Sstevel@tonic-gate 			/* offset parameter is in/out */
1784*7c478bd9Sstevel@tonic-gate 			error = ufs_fio_holey(vp, cmd, &off);
1785*7c478bd9Sstevel@tonic-gate 			if (error)
1786*7c478bd9Sstevel@tonic-gate 				return (error);
1787*7c478bd9Sstevel@tonic-gate 			if (ddi_copyout(&off, (void *)arg, sizeof (off), flag))
1788*7c478bd9Sstevel@tonic-gate 				return (EFAULT);
1789*7c478bd9Sstevel@tonic-gate 			return (0);
1790*7c478bd9Sstevel@tonic-gate 
1791*7c478bd9Sstevel@tonic-gate 		default:
1792*7c478bd9Sstevel@tonic-gate 			return (ENOTTY);
1793*7c478bd9Sstevel@tonic-gate 	}
1794*7c478bd9Sstevel@tonic-gate }
1795*7c478bd9Sstevel@tonic-gate 
1796*7c478bd9Sstevel@tonic-gate /* ARGSUSED */
1797*7c478bd9Sstevel@tonic-gate static int
1798*7c478bd9Sstevel@tonic-gate ufs_getattr(struct vnode *vp, struct vattr *vap, int flags,
1799*7c478bd9Sstevel@tonic-gate 	struct cred *cr)
1800*7c478bd9Sstevel@tonic-gate {
1801*7c478bd9Sstevel@tonic-gate 	struct inode *ip = VTOI(vp);
1802*7c478bd9Sstevel@tonic-gate 	struct ufsvfs *ufsvfsp;
1803*7c478bd9Sstevel@tonic-gate 	int err;
1804*7c478bd9Sstevel@tonic-gate 
1805*7c478bd9Sstevel@tonic-gate 	TRACE_2(TR_FAC_UFS, TR_UFS_GETATTR_START,
1806*7c478bd9Sstevel@tonic-gate 		"ufs_getattr_start:vp %p flags %x", vp, flags);
1807*7c478bd9Sstevel@tonic-gate 
1808*7c478bd9Sstevel@tonic-gate 	if (vap->va_mask == AT_SIZE) {
1809*7c478bd9Sstevel@tonic-gate 		/*
1810*7c478bd9Sstevel@tonic-gate 		 * for performance, if only the size is requested, don't bother
1811*7c478bd9Sstevel@tonic-gate 		 * with anything else.
1812*7c478bd9Sstevel@tonic-gate 		 */
1813*7c478bd9Sstevel@tonic-gate 		UFS_GET_ISIZE(&vap->va_size, ip);
1814*7c478bd9Sstevel@tonic-gate 		TRACE_1(TR_FAC_UFS, TR_UFS_GETATTR_END,
1815*7c478bd9Sstevel@tonic-gate 			"ufs_getattr_end:vp %p", vp);
1816*7c478bd9Sstevel@tonic-gate 		return (0);
1817*7c478bd9Sstevel@tonic-gate 	}
1818*7c478bd9Sstevel@tonic-gate 
1819*7c478bd9Sstevel@tonic-gate 	/*
1820*7c478bd9Sstevel@tonic-gate 	 * inlined lockfs checks
1821*7c478bd9Sstevel@tonic-gate 	 */
1822*7c478bd9Sstevel@tonic-gate 	ufsvfsp = ip->i_ufsvfs;
1823*7c478bd9Sstevel@tonic-gate 	if ((ufsvfsp == NULL) || ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs)) {
1824*7c478bd9Sstevel@tonic-gate 		err = EIO;
1825*7c478bd9Sstevel@tonic-gate 		goto out;
1826*7c478bd9Sstevel@tonic-gate 	}
1827*7c478bd9Sstevel@tonic-gate 
1828*7c478bd9Sstevel@tonic-gate 	rw_enter(&ip->i_contents, RW_READER);
1829*7c478bd9Sstevel@tonic-gate 	/*
1830*7c478bd9Sstevel@tonic-gate 	 * Return all the attributes.  This should be refined so
1831*7c478bd9Sstevel@tonic-gate 	 * that it only returns what's asked for.
1832*7c478bd9Sstevel@tonic-gate 	 */
1833*7c478bd9Sstevel@tonic-gate 
1834*7c478bd9Sstevel@tonic-gate 	/*
1835*7c478bd9Sstevel@tonic-gate 	 * Copy from inode table.
1836*7c478bd9Sstevel@tonic-gate 	 */
1837*7c478bd9Sstevel@tonic-gate 	vap->va_type = vp->v_type;
1838*7c478bd9Sstevel@tonic-gate 	vap->va_mode = ip->i_mode & MODEMASK;
1839*7c478bd9Sstevel@tonic-gate 	/*
1840*7c478bd9Sstevel@tonic-gate 	 * If there is an ACL and there is a mask entry, then do the
1841*7c478bd9Sstevel@tonic-gate 	 * extra work that completes the equivalent of an acltomode(3)
1842*7c478bd9Sstevel@tonic-gate 	 * call.  According to POSIX P1003.1e, the acl mask should be
1843*7c478bd9Sstevel@tonic-gate 	 * returned in the group permissions field.
1844*7c478bd9Sstevel@tonic-gate 	 *
1845*7c478bd9Sstevel@tonic-gate 	 * - start with the original permission and mode bits (from above)
1846*7c478bd9Sstevel@tonic-gate 	 * - clear the group owner bits
1847*7c478bd9Sstevel@tonic-gate 	 * - add in the mask bits.
1848*7c478bd9Sstevel@tonic-gate 	 */
1849*7c478bd9Sstevel@tonic-gate 	if (ip->i_ufs_acl && ip->i_ufs_acl->aclass.acl_ismask) {
1850*7c478bd9Sstevel@tonic-gate 		vap->va_mode &= ~((VREAD | VWRITE | VEXEC) >> 3);
1851*7c478bd9Sstevel@tonic-gate 		vap->va_mode |=
1852*7c478bd9Sstevel@tonic-gate 		    (ip->i_ufs_acl->aclass.acl_maskbits & PERMMASK) << 3;
1853*7c478bd9Sstevel@tonic-gate 	}
1854*7c478bd9Sstevel@tonic-gate 	vap->va_uid = ip->i_uid;
1855*7c478bd9Sstevel@tonic-gate 	vap->va_gid = ip->i_gid;
1856*7c478bd9Sstevel@tonic-gate 	vap->va_fsid = ip->i_dev;
1857*7c478bd9Sstevel@tonic-gate 	vap->va_nodeid = (ino64_t)ip->i_number;
1858*7c478bd9Sstevel@tonic-gate 	vap->va_nlink = ip->i_nlink;
1859*7c478bd9Sstevel@tonic-gate 	vap->va_size = ip->i_size;
1860*7c478bd9Sstevel@tonic-gate 	if (vp->v_type == VCHR || vp->v_type == VBLK)
1861*7c478bd9Sstevel@tonic-gate 		vap->va_rdev = ip->i_rdev;
1862*7c478bd9Sstevel@tonic-gate 	else
1863*7c478bd9Sstevel@tonic-gate 		vap->va_rdev = 0;	/* not a b/c spec. */
1864*7c478bd9Sstevel@tonic-gate 	mutex_enter(&ip->i_tlock);
1865*7c478bd9Sstevel@tonic-gate 	ITIMES_NOLOCK(ip);	/* mark correct time in inode */
1866*7c478bd9Sstevel@tonic-gate 	vap->va_seq = ip->i_seq;
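	/* on-disk timestamps carry microseconds; vattr wants nanoseconds */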
1867*7c478bd9Sstevel@tonic-gate 	vap->va_atime.tv_sec = (time_t)ip->i_atime.tv_sec;
1868*7c478bd9Sstevel@tonic-gate 	vap->va_atime.tv_nsec = ip->i_atime.tv_usec*1000;
1869*7c478bd9Sstevel@tonic-gate 	vap->va_mtime.tv_sec = (time_t)ip->i_mtime.tv_sec;
1870*7c478bd9Sstevel@tonic-gate 	vap->va_mtime.tv_nsec = ip->i_mtime.tv_usec*1000;
1871*7c478bd9Sstevel@tonic-gate 	vap->va_ctime.tv_sec = (time_t)ip->i_ctime.tv_sec;
1872*7c478bd9Sstevel@tonic-gate 	vap->va_ctime.tv_nsec = ip->i_ctime.tv_usec*1000;
1873*7c478bd9Sstevel@tonic-gate 	mutex_exit(&ip->i_tlock);
1874*7c478bd9Sstevel@tonic-gate 
1875*7c478bd9Sstevel@tonic-gate 	switch (ip->i_mode & IFMT) {
1876*7c478bd9Sstevel@tonic-gate 
1877*7c478bd9Sstevel@tonic-gate 	case IFBLK:
1878*7c478bd9Sstevel@tonic-gate 		vap->va_blksize = MAXBSIZE;		/* was BLKDEV_IOSIZE */
1879*7c478bd9Sstevel@tonic-gate 		break;
1880*7c478bd9Sstevel@tonic-gate 
1881*7c478bd9Sstevel@tonic-gate 	case IFCHR:
1882*7c478bd9Sstevel@tonic-gate 		vap->va_blksize = MAXBSIZE;
1883*7c478bd9Sstevel@tonic-gate 		break;
1884*7c478bd9Sstevel@tonic-gate 
1885*7c478bd9Sstevel@tonic-gate 	default:
1886*7c478bd9Sstevel@tonic-gate 		vap->va_blksize = ip->i_fs->fs_bsize;
1887*7c478bd9Sstevel@tonic-gate 		break;
1888*7c478bd9Sstevel@tonic-gate 	}
1889*7c478bd9Sstevel@tonic-gate 	vap->va_nblocks = (fsblkcnt64_t)ip->i_blocks;
1890*7c478bd9Sstevel@tonic-gate 	rw_exit(&ip->i_contents);
1891*7c478bd9Sstevel@tonic-gate 	err = 0;
1892*7c478bd9Sstevel@tonic-gate 
1893*7c478bd9Sstevel@tonic-gate out:
1894*7c478bd9Sstevel@tonic-gate 	TRACE_1(TR_FAC_UFS, TR_UFS_GETATTR_END, "ufs_getattr_end:vp %p", vp);
1895*7c478bd9Sstevel@tonic-gate 
1896*7c478bd9Sstevel@tonic-gate 	return (err);
1897*7c478bd9Sstevel@tonic-gate }
1898*7c478bd9Sstevel@tonic-gate 
1899*7c478bd9Sstevel@tonic-gate /*ARGSUSED4*/
1900*7c478bd9Sstevel@tonic-gate static int
1901*7c478bd9Sstevel@tonic-gate ufs_setattr(
1902*7c478bd9Sstevel@tonic-gate 	struct vnode *vp,
1903*7c478bd9Sstevel@tonic-gate 	struct vattr *vap,
1904*7c478bd9Sstevel@tonic-gate 	int flags,
1905*7c478bd9Sstevel@tonic-gate 	struct cred *cr,
1906*7c478bd9Sstevel@tonic-gate 	caller_context_t *ct)
1907*7c478bd9Sstevel@tonic-gate {
1908*7c478bd9Sstevel@tonic-gate 	struct inode *ip = VTOI(vp);
1909*7c478bd9Sstevel@tonic-gate 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
1910*7c478bd9Sstevel@tonic-gate 	struct fs *fs;
1911*7c478bd9Sstevel@tonic-gate 	struct ulockfs *ulp;
1912*7c478bd9Sstevel@tonic-gate 	char *errmsg1;
1913*7c478bd9Sstevel@tonic-gate 	char *errmsg2;
1914*7c478bd9Sstevel@tonic-gate 	long blocks;
1915*7c478bd9Sstevel@tonic-gate 	long int mask = vap->va_mask;
1916*7c478bd9Sstevel@tonic-gate 	size_t len1, len2;
1917*7c478bd9Sstevel@tonic-gate 	int issync;
1918*7c478bd9Sstevel@tonic-gate 	int trans_size;
1919*7c478bd9Sstevel@tonic-gate 	int dotrans;
1920*7c478bd9Sstevel@tonic-gate 	int dorwlock;
1921*7c478bd9Sstevel@tonic-gate 	int error;
1922*7c478bd9Sstevel@tonic-gate 	int owner_change;
1923*7c478bd9Sstevel@tonic-gate 	int dodqlock;
1924*7c478bd9Sstevel@tonic-gate 	timestruc_t now;
1925*7c478bd9Sstevel@tonic-gate 	vattr_t oldva;
1926*7c478bd9Sstevel@tonic-gate 	int retry = 1;
1927*7c478bd9Sstevel@tonic-gate 
1928*7c478bd9Sstevel@tonic-gate 	TRACE_2(TR_FAC_UFS, TR_UFS_SETATTR_START,
1929*7c478bd9Sstevel@tonic-gate 		"ufs_setattr_start:vp %p flags %x", vp, flags);
1930*7c478bd9Sstevel@tonic-gate 
1931*7c478bd9Sstevel@tonic-gate 	/*
1932*7c478bd9Sstevel@tonic-gate 	 * Cannot set these attributes.
1933*7c478bd9Sstevel@tonic-gate 	 */
1934*7c478bd9Sstevel@tonic-gate 	if (mask & AT_NOSET) {
1935*7c478bd9Sstevel@tonic-gate 		error = EINVAL;
1936*7c478bd9Sstevel@tonic-gate 		goto out;
1937*7c478bd9Sstevel@tonic-gate 	}
1938*7c478bd9Sstevel@tonic-gate 
1939*7c478bd9Sstevel@tonic-gate 	/*
1940*7c478bd9Sstevel@tonic-gate 	 * check for forced unmount
1941*7c478bd9Sstevel@tonic-gate 	 */
1942*7c478bd9Sstevel@tonic-gate 	if (ufsvfsp == NULL)
1943*7c478bd9Sstevel@tonic-gate 		return (EIO);
1944*7c478bd9Sstevel@tonic-gate 
1945*7c478bd9Sstevel@tonic-gate 	fs = ufsvfsp->vfs_fs;
1946*7c478bd9Sstevel@tonic-gate 	if (fs->fs_ronly != 0)
1947*7c478bd9Sstevel@tonic-gate 		return (EROFS);
1948*7c478bd9Sstevel@tonic-gate 
1949*7c478bd9Sstevel@tonic-gate again:
1950*7c478bd9Sstevel@tonic-gate 	errmsg1 = NULL;
1951*7c478bd9Sstevel@tonic-gate 	errmsg2 = NULL;
1952*7c478bd9Sstevel@tonic-gate 	dotrans = 0;
1953*7c478bd9Sstevel@tonic-gate 	dorwlock = 0;
1954*7c478bd9Sstevel@tonic-gate 	dodqlock = 0;
1955*7c478bd9Sstevel@tonic-gate 
1956*7c478bd9Sstevel@tonic-gate 	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SETATTR_MASK);
1957*7c478bd9Sstevel@tonic-gate 	if (error)
1958*7c478bd9Sstevel@tonic-gate 		goto out;
1959*7c478bd9Sstevel@tonic-gate 
1960*7c478bd9Sstevel@tonic-gate 	/*
1961*7c478bd9Sstevel@tonic-gate 	 * Acquire i_rwlock before TRANS_BEGIN_CSYNC() if this is a file.
1962*7c478bd9Sstevel@tonic-gate 	 * This follows the protocol for read()/write().
1963*7c478bd9Sstevel@tonic-gate 	 */
1964*7c478bd9Sstevel@tonic-gate 	if (vp->v_type != VDIR) {
1965*7c478bd9Sstevel@tonic-gate 		rw_enter(&ip->i_rwlock, RW_WRITER);
1966*7c478bd9Sstevel@tonic-gate 		dorwlock = 1;
1967*7c478bd9Sstevel@tonic-gate 	}
1968*7c478bd9Sstevel@tonic-gate 
1969*7c478bd9Sstevel@tonic-gate 	/*
1970*7c478bd9Sstevel@tonic-gate 	 * Truncate file.  Must have write permission and not be a directory.
1971*7c478bd9Sstevel@tonic-gate 	 */
1972*7c478bd9Sstevel@tonic-gate 	if (mask & AT_SIZE) {
1973*7c478bd9Sstevel@tonic-gate 		rw_enter(&ip->i_contents, RW_WRITER);
1974*7c478bd9Sstevel@tonic-gate 		if (vp->v_type == VDIR) {
1975*7c478bd9Sstevel@tonic-gate 			error = EISDIR;
1976*7c478bd9Sstevel@tonic-gate 			goto update_inode;
1977*7c478bd9Sstevel@tonic-gate 		}
1978*7c478bd9Sstevel@tonic-gate 		if (error = ufs_iaccess(ip, IWRITE, cr))
1979*7c478bd9Sstevel@tonic-gate 			goto update_inode;
1980*7c478bd9Sstevel@tonic-gate 
1981*7c478bd9Sstevel@tonic-gate 		rw_exit(&ip->i_contents);
1982*7c478bd9Sstevel@tonic-gate 		error = TRANS_ITRUNC(ip, vap->va_size, 0, cr);
1983*7c478bd9Sstevel@tonic-gate 		if (error) {
1984*7c478bd9Sstevel@tonic-gate 			rw_enter(&ip->i_contents, RW_WRITER);
1985*7c478bd9Sstevel@tonic-gate 			goto update_inode;
1986*7c478bd9Sstevel@tonic-gate 		}
1987*7c478bd9Sstevel@tonic-gate 	}
1988*7c478bd9Sstevel@tonic-gate 
1989*7c478bd9Sstevel@tonic-gate 	if (ulp) {
1990*7c478bd9Sstevel@tonic-gate 		trans_size = (int)TOP_SETATTR_SIZE(ip);
1991*7c478bd9Sstevel@tonic-gate 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_SETATTR, trans_size);
1992*7c478bd9Sstevel@tonic-gate 		++dotrans;
1993*7c478bd9Sstevel@tonic-gate 	}
1994*7c478bd9Sstevel@tonic-gate 
1995*7c478bd9Sstevel@tonic-gate 	/*
1996*7c478bd9Sstevel@tonic-gate 	 * Acquire i_rwlock after TRANS_BEGIN_CSYNC() if this is a directory.
1997*7c478bd9Sstevel@tonic-gate 	 * This follows the protocol established by
1998*7c478bd9Sstevel@tonic-gate 	 * ufs_link/create/remove/rename/mkdir/rmdir/symlink.
1999*7c478bd9Sstevel@tonic-gate 	 */
2000*7c478bd9Sstevel@tonic-gate 	if (vp->v_type == VDIR) {
2001*7c478bd9Sstevel@tonic-gate 		rw_enter(&ip->i_rwlock, RW_WRITER);
2002*7c478bd9Sstevel@tonic-gate 		dorwlock = 1;
2003*7c478bd9Sstevel@tonic-gate 	}
2004*7c478bd9Sstevel@tonic-gate 
2005*7c478bd9Sstevel@tonic-gate 	/*
2006*7c478bd9Sstevel@tonic-gate 	 * Grab quota lock if we are changing the file's owner.
2007*7c478bd9Sstevel@tonic-gate 	 */
2008*7c478bd9Sstevel@tonic-gate 	if (mask & AT_UID) {
2009*7c478bd9Sstevel@tonic-gate 		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
2010*7c478bd9Sstevel@tonic-gate 		dodqlock = 1;
2011*7c478bd9Sstevel@tonic-gate 	}
2012*7c478bd9Sstevel@tonic-gate 	rw_enter(&ip->i_contents, RW_WRITER);
2013*7c478bd9Sstevel@tonic-gate 
2014*7c478bd9Sstevel@tonic-gate 	oldva.va_mode = ip->i_mode;
2015*7c478bd9Sstevel@tonic-gate 	oldva.va_uid = ip->i_uid;
2016*7c478bd9Sstevel@tonic-gate 	oldva.va_gid = ip->i_gid;
2017*7c478bd9Sstevel@tonic-gate 
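	/*
	 * Any size change was already handled by the truncation
	 * above, so drop AT_SIZE from the mask before the
	 * secpolicy check.
	 */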
2018*7c478bd9Sstevel@tonic-gate 	vap->va_mask &= ~AT_SIZE;
2019*7c478bd9Sstevel@tonic-gate 	/*
2020*7c478bd9Sstevel@tonic-gate 	 * ufs_iaccess is "close enough" here; the I-mode bits match the
2021*7c478bd9Sstevel@tonic-gate 	 * V-mode bits (see the compile-time check in ufs_access()),
2022*7c478bd9Sstevel@tonic-gate 	 * so it doesn't need to map the defines.
2022*7c478bd9Sstevel@tonic-gate 	 */
2023*7c478bd9Sstevel@tonic-gate 	error = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
2024*7c478bd9Sstevel@tonic-gate 				ufs_iaccess, ip);
2025*7c478bd9Sstevel@tonic-gate 	if (error)
2026*7c478bd9Sstevel@tonic-gate 		goto update_inode;
2027*7c478bd9Sstevel@tonic-gate 
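	/*
	 * Reload the mask; secpolicy_vnode_setattr() may have
	 * adjusted it.
	 */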
2028*7c478bd9Sstevel@tonic-gate 	mask = vap->va_mask;
2029*7c478bd9Sstevel@tonic-gate 
2030*7c478bd9Sstevel@tonic-gate 	/*
2031*7c478bd9Sstevel@tonic-gate 	 * Change file access modes.
2032*7c478bd9Sstevel@tonic-gate 	 */
2033*7c478bd9Sstevel@tonic-gate 	if (mask & AT_MODE) {
2034*7c478bd9Sstevel@tonic-gate 		ip->i_mode = (ip->i_mode & IFMT) | (vap->va_mode & ~IFMT);
2035*7c478bd9Sstevel@tonic-gate 		TRANS_INODE(ufsvfsp, ip);
2036*7c478bd9Sstevel@tonic-gate 		ip->i_flag |= ICHG;
2037*7c478bd9Sstevel@tonic-gate 		if (stickyhack) {
2038*7c478bd9Sstevel@tonic-gate 			mutex_enter(&vp->v_lock);
2039*7c478bd9Sstevel@tonic-gate 			if ((ip->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX)
2040*7c478bd9Sstevel@tonic-gate 				vp->v_flag |= VSWAPLIKE;
2041*7c478bd9Sstevel@tonic-gate 			else
2042*7c478bd9Sstevel@tonic-gate 				vp->v_flag &= ~VSWAPLIKE;
2043*7c478bd9Sstevel@tonic-gate 			mutex_exit(&vp->v_lock);
2044*7c478bd9Sstevel@tonic-gate 		}
2045*7c478bd9Sstevel@tonic-gate 	}
2046*7c478bd9Sstevel@tonic-gate 	if (mask & (AT_UID|AT_GID)) {
2047*7c478bd9Sstevel@tonic-gate 		if (mask & AT_UID) {
2048*7c478bd9Sstevel@tonic-gate 			/*
2049*7c478bd9Sstevel@tonic-gate 			 * Don't change ownership of the quota inode.
2050*7c478bd9Sstevel@tonic-gate 			 */
2051*7c478bd9Sstevel@tonic-gate 			if (ufsvfsp->vfs_qinod == ip) {
2052*7c478bd9Sstevel@tonic-gate 				ASSERT(ufsvfsp->vfs_qflags & MQ_ENABLED);
2053*7c478bd9Sstevel@tonic-gate 				error = EINVAL;
2054*7c478bd9Sstevel@tonic-gate 				goto update_inode;
2055*7c478bd9Sstevel@tonic-gate 			}
2056*7c478bd9Sstevel@tonic-gate 
2057*7c478bd9Sstevel@tonic-gate 			/*
2058*7c478bd9Sstevel@tonic-gate 			 * No real ownership change.
2059*7c478bd9Sstevel@tonic-gate 			 */
2060*7c478bd9Sstevel@tonic-gate 			if (ip->i_uid == vap->va_uid) {
2061*7c478bd9Sstevel@tonic-gate 				blocks = 0;
2062*7c478bd9Sstevel@tonic-gate 				owner_change = 0;
2063*7c478bd9Sstevel@tonic-gate 			}
2064*7c478bd9Sstevel@tonic-gate 			/*
2065*7c478bd9Sstevel@tonic-gate 			 * Remove the blocks and the file from the old
2066*7c478bd9Sstevel@tonic-gate 			 * user's quota.
2067*7c478bd9Sstevel@tonic-gate 			 */
2068*7c478bd9Sstevel@tonic-gate 			else {
2069*7c478bd9Sstevel@tonic-gate 				blocks = ip->i_blocks;
2070*7c478bd9Sstevel@tonic-gate 				owner_change = 1;
2071*7c478bd9Sstevel@tonic-gate 
2072*7c478bd9Sstevel@tonic-gate 				(void) chkdq(ip, -blocks, /* force */ 1, cr,
2073*7c478bd9Sstevel@tonic-gate 						(char **)NULL, (size_t *)NULL);
2074*7c478bd9Sstevel@tonic-gate 				(void) chkiq(ufsvfsp, /* change */ -1, ip,
2075*7c478bd9Sstevel@tonic-gate 						(uid_t)ip->i_uid,
2076*7c478bd9Sstevel@tonic-gate 						/* force */ 1, cr,
2077*7c478bd9Sstevel@tonic-gate 						(char **)NULL, (size_t *)NULL);
2078*7c478bd9Sstevel@tonic-gate 				dqrele(ip->i_dquot);
2079*7c478bd9Sstevel@tonic-gate 			}
2080*7c478bd9Sstevel@tonic-gate 
2081*7c478bd9Sstevel@tonic-gate 			ip->i_uid = vap->va_uid;
2082*7c478bd9Sstevel@tonic-gate 
2083*7c478bd9Sstevel@tonic-gate 			/*
2084*7c478bd9Sstevel@tonic-gate 			 * There is a real ownership change.
2085*7c478bd9Sstevel@tonic-gate 			 */
2086*7c478bd9Sstevel@tonic-gate 			if (owner_change) {
2087*7c478bd9Sstevel@tonic-gate 				/*
2088*7c478bd9Sstevel@tonic-gate 				 * Add the blocks and the file to the new
2089*7c478bd9Sstevel@tonic-gate 				 * user's quota.
2090*7c478bd9Sstevel@tonic-gate 				 */
2091*7c478bd9Sstevel@tonic-gate 				ip->i_dquot = getinoquota(ip);
2092*7c478bd9Sstevel@tonic-gate 				(void) chkdq(ip, blocks, /* force */ 1, cr,
2093*7c478bd9Sstevel@tonic-gate 						&errmsg1, &len1);
2094*7c478bd9Sstevel@tonic-gate 				(void) chkiq(ufsvfsp, /* change */ 1,
2095*7c478bd9Sstevel@tonic-gate 						(struct inode *)NULL,
2096*7c478bd9Sstevel@tonic-gate 						(uid_t)ip->i_uid,
2097*7c478bd9Sstevel@tonic-gate 						/* force */ 1, cr,
2098*7c478bd9Sstevel@tonic-gate 						&errmsg2, &len2);
2099*7c478bd9Sstevel@tonic-gate 			}
2100*7c478bd9Sstevel@tonic-gate 		}
2101*7c478bd9Sstevel@tonic-gate 		if (mask & AT_GID) {
2102*7c478bd9Sstevel@tonic-gate 			ip->i_gid = vap->va_gid;
2103*7c478bd9Sstevel@tonic-gate 		}
2104*7c478bd9Sstevel@tonic-gate 		TRANS_INODE(ufsvfsp, ip);
2105*7c478bd9Sstevel@tonic-gate 		ip->i_flag |= ICHG;
2106*7c478bd9Sstevel@tonic-gate 	}
2107*7c478bd9Sstevel@tonic-gate 	/*
2108*7c478bd9Sstevel@tonic-gate 	 * Change file access or modified times.
2109*7c478bd9Sstevel@tonic-gate 	 */
2110*7c478bd9Sstevel@tonic-gate 	if (mask & (AT_ATIME|AT_MTIME)) {
2111*7c478bd9Sstevel@tonic-gate 		/* Check that the time value is within ufs range */
2112*7c478bd9Sstevel@tonic-gate 		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2113*7c478bd9Sstevel@tonic-gate 		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2114*7c478bd9Sstevel@tonic-gate 			error = EOVERFLOW;
2115*7c478bd9Sstevel@tonic-gate 			goto update_inode;
2116*7c478bd9Sstevel@tonic-gate 		}
2117*7c478bd9Sstevel@tonic-gate 
2118*7c478bd9Sstevel@tonic-gate 		/*
2119*7c478bd9Sstevel@tonic-gate 		 * If the "noatime" mount option is set and only an atime
2120*7c478bd9Sstevel@tonic-gate 		 * update is requested, do nothing.  No error is returned.
2121*7c478bd9Sstevel@tonic-gate 		 */
2122*7c478bd9Sstevel@tonic-gate 		if ((ufsvfsp->vfs_noatime) &&
2123*7c478bd9Sstevel@tonic-gate 		    ((mask & (AT_ATIME|AT_MTIME)) == AT_ATIME))
2124*7c478bd9Sstevel@tonic-gate 			goto skip_atime;
2125*7c478bd9Sstevel@tonic-gate 
2126*7c478bd9Sstevel@tonic-gate 		if (mask & AT_ATIME) {
2127*7c478bd9Sstevel@tonic-gate 			ip->i_atime.tv_sec = vap->va_atime.tv_sec;
2128*7c478bd9Sstevel@tonic-gate 			ip->i_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2129*7c478bd9Sstevel@tonic-gate 			ip->i_flag &= ~IACC;
2130*7c478bd9Sstevel@tonic-gate 		}
2131*7c478bd9Sstevel@tonic-gate 		if (mask & AT_MTIME) {
2132*7c478bd9Sstevel@tonic-gate 			ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
2133*7c478bd9Sstevel@tonic-gate 			ip->i_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2134*7c478bd9Sstevel@tonic-gate 			gethrestime(&now);
2135*7c478bd9Sstevel@tonic-gate 			if (now.tv_sec > TIME32_MAX) {
2136*7c478bd9Sstevel@tonic-gate 				/*
2137*7c478bd9Sstevel@tonic-gate 				 * In 2038, ctime sticks forever..
2138*7c478bd9Sstevel@tonic-gate 				 */
2139*7c478bd9Sstevel@tonic-gate 				ip->i_ctime.tv_sec = TIME32_MAX;
2140*7c478bd9Sstevel@tonic-gate 				ip->i_ctime.tv_usec = 0;
2141*7c478bd9Sstevel@tonic-gate 			} else {
2142*7c478bd9Sstevel@tonic-gate 				ip->i_ctime.tv_sec = now.tv_sec;
2143*7c478bd9Sstevel@tonic-gate 				ip->i_ctime.tv_usec = now.tv_nsec / 1000;
2144*7c478bd9Sstevel@tonic-gate 			}
2145*7c478bd9Sstevel@tonic-gate 			ip->i_flag &= ~(IUPD|ICHG);
2146*7c478bd9Sstevel@tonic-gate 			ip->i_flag |= IMODTIME;
2147*7c478bd9Sstevel@tonic-gate 		}
2148*7c478bd9Sstevel@tonic-gate 		TRANS_INODE(ufsvfsp, ip);
2149*7c478bd9Sstevel@tonic-gate 		ip->i_flag |= IMOD;
2150*7c478bd9Sstevel@tonic-gate 	}
2151*7c478bd9Sstevel@tonic-gate 
2152*7c478bd9Sstevel@tonic-gate skip_atime:
2153*7c478bd9Sstevel@tonic-gate 	/*
2154*7c478bd9Sstevel@tonic-gate 	 * The presence of a shadow inode may indicate an ACL, but does
2155*7c478bd9Sstevel@tonic-gate 	 * not guarantee one.  Future FSD types should be handled here
2156*7c478bd9Sstevel@tonic-gate 	 * too, checking for the presence of the attribute-specific
2157*7c478bd9Sstevel@tonic-gate 	 * data before referencing it.
2158*7c478bd9Sstevel@tonic-gate 	 */
2159*7c478bd9Sstevel@tonic-gate 	if (ip->i_shadow) {
2160*7c478bd9Sstevel@tonic-gate 		/*
2161*7c478bd9Sstevel@tonic-gate 		 * XXX if ufs_iupdat is changed to sandbagged write fix
2162*7c478bd9Sstevel@tonic-gate 		 * ufs_acl_setattr to push ip to keep acls consistent
2163*7c478bd9Sstevel@tonic-gate 		 *
2164*7c478bd9Sstevel@tonic-gate 		 * Suppress out of inodes messages if we will retry.
2165*7c478bd9Sstevel@tonic-gate 		 */
2166*7c478bd9Sstevel@tonic-gate 		if (retry)
2167*7c478bd9Sstevel@tonic-gate 			ip->i_flag |= IQUIET;
2168*7c478bd9Sstevel@tonic-gate 		error = ufs_acl_setattr(ip, vap, cr);
2169*7c478bd9Sstevel@tonic-gate 		ip->i_flag &= ~IQUIET;
2170*7c478bd9Sstevel@tonic-gate 	}
2171*7c478bd9Sstevel@tonic-gate 
2172*7c478bd9Sstevel@tonic-gate update_inode:
2173*7c478bd9Sstevel@tonic-gate 	/*
2174*7c478bd9Sstevel@tonic-gate 	 * Setattr always increases the sequence number
2175*7c478bd9Sstevel@tonic-gate 	 */
2176*7c478bd9Sstevel@tonic-gate 	ip->i_seq++;
2177*7c478bd9Sstevel@tonic-gate 
2178*7c478bd9Sstevel@tonic-gate 	/*
2179*7c478bd9Sstevel@tonic-gate 	 * If nfsd and not logging, push the inode out synchronously.
2180*7c478bd9Sstevel@tonic-gate 	 */
2181*7c478bd9Sstevel@tonic-gate 	if ((curthread->t_flag & T_DONTPEND) && !TRANS_ISTRANS(ufsvfsp)) {
2182*7c478bd9Sstevel@tonic-gate 		ufs_iupdat(ip, 1);
2183*7c478bd9Sstevel@tonic-gate 	} else {
2184*7c478bd9Sstevel@tonic-gate 		ITIMES_NOLOCK(ip);
2185*7c478bd9Sstevel@tonic-gate 	}
2186*7c478bd9Sstevel@tonic-gate 
2187*7c478bd9Sstevel@tonic-gate 	rw_exit(&ip->i_contents);
2188*7c478bd9Sstevel@tonic-gate 	if (dodqlock) {
2189*7c478bd9Sstevel@tonic-gate 		rw_exit(&ufsvfsp->vfs_dqrwlock);
2190*7c478bd9Sstevel@tonic-gate 	}
2191*7c478bd9Sstevel@tonic-gate 	if (dorwlock)
2192*7c478bd9Sstevel@tonic-gate 		rw_exit(&ip->i_rwlock);
2193*7c478bd9Sstevel@tonic-gate 
2194*7c478bd9Sstevel@tonic-gate 	if (ulp) {
2195*7c478bd9Sstevel@tonic-gate 		if (dotrans) {
2196*7c478bd9Sstevel@tonic-gate 			int terr = 0;
2197*7c478bd9Sstevel@tonic-gate 			TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_SETATTR,
2198*7c478bd9Sstevel@tonic-gate 			    trans_size);
2199*7c478bd9Sstevel@tonic-gate 			if (error == 0)
2200*7c478bd9Sstevel@tonic-gate 				error = terr;
2201*7c478bd9Sstevel@tonic-gate 		}
2202*7c478bd9Sstevel@tonic-gate 		ufs_lockfs_end(ulp);
2203*7c478bd9Sstevel@tonic-gate 	}
2204*7c478bd9Sstevel@tonic-gate out:
2205*7c478bd9Sstevel@tonic-gate 	/*
2206*7c478bd9Sstevel@tonic-gate 	 * If out of inodes or blocks, see if we can free something
2207*7c478bd9Sstevel@tonic-gate 	 * up from the delete queue.
2208*7c478bd9Sstevel@tonic-gate 	 */
2209*7c478bd9Sstevel@tonic-gate 	if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
2210*7c478bd9Sstevel@tonic-gate 		ufs_delete_drain_wait(ufsvfsp, 1);
2211*7c478bd9Sstevel@tonic-gate 		retry = 0;
2212*7c478bd9Sstevel@tonic-gate 		if (errmsg1 != NULL)
2213*7c478bd9Sstevel@tonic-gate 			kmem_free(errmsg1, len1);
2214*7c478bd9Sstevel@tonic-gate 		if (errmsg2 != NULL)
2215*7c478bd9Sstevel@tonic-gate 			kmem_free(errmsg2, len2);
2216*7c478bd9Sstevel@tonic-gate 		goto again;
2217*7c478bd9Sstevel@tonic-gate 	}
2218*7c478bd9Sstevel@tonic-gate 	TRACE_2(TR_FAC_UFS, TR_UFS_SETATTR_END,
2219*7c478bd9Sstevel@tonic-gate 		"ufs_setattr_end:vp %p error %d", vp, error);
2220*7c478bd9Sstevel@tonic-gate 	if (errmsg1 != NULL) {
2221*7c478bd9Sstevel@tonic-gate 		uprintf(errmsg1);
2222*7c478bd9Sstevel@tonic-gate 		kmem_free(errmsg1, len1);
2223*7c478bd9Sstevel@tonic-gate 	}
2224*7c478bd9Sstevel@tonic-gate 	if (errmsg2 != NULL) {
2225*7c478bd9Sstevel@tonic-gate 		uprintf(errmsg2);
2226*7c478bd9Sstevel@tonic-gate 		kmem_free(errmsg2, len2);
2227*7c478bd9Sstevel@tonic-gate 	}
2228*7c478bd9Sstevel@tonic-gate 	return (error);
2229*7c478bd9Sstevel@tonic-gate }
2230*7c478bd9Sstevel@tonic-gate 
2231*7c478bd9Sstevel@tonic-gate /*ARGSUSED*/
2232*7c478bd9Sstevel@tonic-gate static int
2233*7c478bd9Sstevel@tonic-gate ufs_access(struct vnode *vp, int mode, int flags, struct cred *cr)
2234*7c478bd9Sstevel@tonic-gate {
2235*7c478bd9Sstevel@tonic-gate 	struct inode *ip = VTOI(vp);
2236*7c478bd9Sstevel@tonic-gate 	int error;
2237*7c478bd9Sstevel@tonic-gate 
2238*7c478bd9Sstevel@tonic-gate 	TRACE_3(TR_FAC_UFS, TR_UFS_ACCESS_START,
2239*7c478bd9Sstevel@tonic-gate 		"ufs_access_start:vp %p mode %x flags %x", vp, mode, flags);
2240*7c478bd9Sstevel@tonic-gate 
2241*7c478bd9Sstevel@tonic-gate 	if (ip->i_ufsvfs == NULL)
2242*7c478bd9Sstevel@tonic-gate 		return (EIO);
2243*7c478bd9Sstevel@tonic-gate 
2244*7c478bd9Sstevel@tonic-gate 	rw_enter(&ip->i_contents, RW_READER);
2245*7c478bd9Sstevel@tonic-gate 
2246*7c478bd9Sstevel@tonic-gate 	/*
2247*7c478bd9Sstevel@tonic-gate 	 * The ufs_iaccess function wants to be called with
2248*7c478bd9Sstevel@tonic-gate 	 * mode bits expressed as "ufs specific" bits.
2249*7c478bd9Sstevel@tonic-gate 	 * I.e., VWRITE|VREAD|VEXEC do not make sense to
2250*7c478bd9Sstevel@tonic-gate 	 * ufs_iaccess() but IWRITE|IREAD|IEXEC do.
2251*7c478bd9Sstevel@tonic-gate 	 * But since they're the same we just pass the vnode mode
2252*7c478bd9Sstevel@tonic-gate 	 * bit but just verify that assumption at compile time.
2253*7c478bd9Sstevel@tonic-gate 	 */
2254*7c478bd9Sstevel@tonic-gate #if IWRITE != VWRITE || IREAD != VREAD || IEXEC != VEXEC
2255*7c478bd9Sstevel@tonic-gate #error "ufs_access needs to map Vmodes to Imodes"
2256*7c478bd9Sstevel@tonic-gate #endif
2257*7c478bd9Sstevel@tonic-gate 	error = ufs_iaccess(ip, mode, cr);
2258*7c478bd9Sstevel@tonic-gate 
2259*7c478bd9Sstevel@tonic-gate 	rw_exit(&ip->i_contents);
2260*7c478bd9Sstevel@tonic-gate 
2261*7c478bd9Sstevel@tonic-gate 	TRACE_2(TR_FAC_UFS, TR_UFS_ACCESS_END,
2262*7c478bd9Sstevel@tonic-gate 		"ufs_access_end:vp %p error %d", vp, error);
2263*7c478bd9Sstevel@tonic-gate 	return (error);
2264*7c478bd9Sstevel@tonic-gate }
2265*7c478bd9Sstevel@tonic-gate 
2266*7c478bd9Sstevel@tonic-gate /* ARGSUSED */
2267*7c478bd9Sstevel@tonic-gate static int
2268*7c478bd9Sstevel@tonic-gate ufs_readlink(struct vnode *vp, struct uio *uiop, struct cred *cr)
2269*7c478bd9Sstevel@tonic-gate {
2270*7c478bd9Sstevel@tonic-gate 	struct inode *ip = VTOI(vp);
2271*7c478bd9Sstevel@tonic-gate 	struct ufsvfs *ufsvfsp;
2272*7c478bd9Sstevel@tonic-gate 	struct ulockfs *ulp;
2273*7c478bd9Sstevel@tonic-gate 	int error;
2274*7c478bd9Sstevel@tonic-gate 	int fastsymlink;
2275*7c478bd9Sstevel@tonic-gate 
2276*7c478bd9Sstevel@tonic-gate 	TRACE_2(TR_FAC_UFS, TR_UFS_READLINK_START,
2277*7c478bd9Sstevel@tonic-gate 		"ufs_readlink_start:vp %p uiop %p", vp, uiop);
2278*7c478bd9Sstevel@tonic-gate 
2279*7c478bd9Sstevel@tonic-gate 	if (vp->v_type != VLNK) {
2280*7c478bd9Sstevel@tonic-gate 		error = EINVAL;
2281*7c478bd9Sstevel@tonic-gate 		goto nolockout;
2282*7c478bd9Sstevel@tonic-gate 	}
2283*7c478bd9Sstevel@tonic-gate 
2284*7c478bd9Sstevel@tonic-gate 	/*
2285*7c478bd9Sstevel@tonic-gate 	 * If the symbolic link is empty, there is nothing to read;
2286*7c478bd9Sstevel@tonic-gate 	 * fast-track these empty symbolic links.
2287*7c478bd9Sstevel@tonic-gate 	 */
2288*7c478bd9Sstevel@tonic-gate 	if (ip->i_size == 0) {
2289*7c478bd9Sstevel@tonic-gate 		error = 0;
2290*7c478bd9Sstevel@tonic-gate 		goto nolockout;
2291*7c478bd9Sstevel@tonic-gate 	}
2292*7c478bd9Sstevel@tonic-gate 
2293*7c478bd9Sstevel@tonic-gate 	ufsvfsp = ip->i_ufsvfs;
2294*7c478bd9Sstevel@tonic-gate 	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READLINK_MASK);
2295*7c478bd9Sstevel@tonic-gate 	if (error)
2296*7c478bd9Sstevel@tonic-gate 		goto nolockout;
2297*7c478bd9Sstevel@tonic-gate 	/*
2298*7c478bd9Sstevel@tonic-gate 	 * ip->i_rwlock protects the data blocks used for FASTSYMLINK.
2299*7c478bd9Sstevel@tonic-gate 	 */
2300*7c478bd9Sstevel@tonic-gate again:
2301*7c478bd9Sstevel@tonic-gate 	fastsymlink = 0;
2302*7c478bd9Sstevel@tonic-gate 	if (ip->i_flag & IFASTSYMLNK) {
2303*7c478bd9Sstevel@tonic-gate 		rw_enter(&ip->i_rwlock, RW_READER);
2304*7c478bd9Sstevel@tonic-gate 		rw_enter(&ip->i_contents, RW_READER);
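		/*
		 * Re-check the flag now that the locks are held; a
		 * racing thread may have cleared IFASTSYMLNK.
		 */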
2305*7c478bd9Sstevel@tonic-gate 		if (ip->i_flag & IFASTSYMLNK) {
2306*7c478bd9Sstevel@tonic-gate 			if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) &&
2307*7c478bd9Sstevel@tonic-gate 			    (ip->i_fs->fs_ronly == 0) &&
2308*7c478bd9Sstevel@tonic-gate 			    (!ufsvfsp->vfs_noatime)) {
2309*7c478bd9Sstevel@tonic-gate 				mutex_enter(&ip->i_tlock);
2310*7c478bd9Sstevel@tonic-gate 				ip->i_flag |= IACC;
2311*7c478bd9Sstevel@tonic-gate 				mutex_exit(&ip->i_tlock);
2312*7c478bd9Sstevel@tonic-gate 			}
2313*7c478bd9Sstevel@tonic-gate 			error = uiomove((caddr_t)&ip->i_db[1],
2314*7c478bd9Sstevel@tonic-gate 				MIN(ip->i_size, uiop->uio_resid),
2315*7c478bd9Sstevel@tonic-gate 				UIO_READ, uiop);
2316*7c478bd9Sstevel@tonic-gate 			ITIMES(ip);
2317*7c478bd9Sstevel@tonic-gate 			++fastsymlink;
2318*7c478bd9Sstevel@tonic-gate 		}
2319*7c478bd9Sstevel@tonic-gate 		rw_exit(&ip->i_contents);
2320*7c478bd9Sstevel@tonic-gate 		rw_exit(&ip->i_rwlock);
2321*7c478bd9Sstevel@tonic-gate 	}
2322*7c478bd9Sstevel@tonic-gate 	if (!fastsymlink) {
2323*7c478bd9Sstevel@tonic-gate 		ssize_t size;	/* number of bytes read  */
2324*7c478bd9Sstevel@tonic-gate 		caddr_t basep;	/* pointer to input data */
2325*7c478bd9Sstevel@tonic-gate 		ino_t ino;
2326*7c478bd9Sstevel@tonic-gate 		long  igen;
2327*7c478bd9Sstevel@tonic-gate 		struct uio tuio;	/* temp uio struct */
2328*7c478bd9Sstevel@tonic-gate 		struct uio *tuiop;
2329*7c478bd9Sstevel@tonic-gate 		iovec_t tiov;		/* temp iovec struct */
2330*7c478bd9Sstevel@tonic-gate 		char kbuf[FSL_SIZE];	/* buffer to hold fast symlink */
2331*7c478bd9Sstevel@tonic-gate 		int tflag = 0;		/* flag to indicate temp vars used */
2332*7c478bd9Sstevel@tonic-gate 
2333*7c478bd9Sstevel@tonic-gate 		ino = ip->i_number;
2334*7c478bd9Sstevel@tonic-gate 		igen = ip->i_gen;
2335*7c478bd9Sstevel@tonic-gate 		size = uiop->uio_resid;
2336*7c478bd9Sstevel@tonic-gate 		basep = uiop->uio_iov->iov_base;
2337*7c478bd9Sstevel@tonic-gate 		tuiop = uiop;
2338*7c478bd9Sstevel@tonic-gate 
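		/*
		 * Take both locks as writer; if another thread turned
		 * this into a fast symlink meanwhile, retry via the
		 * fast path above.
		 */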
2339*7c478bd9Sstevel@tonic-gate 		rw_enter(&ip->i_rwlock, RW_WRITER);
2340*7c478bd9Sstevel@tonic-gate 		rw_enter(&ip->i_contents, RW_WRITER);
2341*7c478bd9Sstevel@tonic-gate 		if (ip->i_flag & IFASTSYMLNK) {
2342*7c478bd9Sstevel@tonic-gate 			rw_exit(&ip->i_contents);
2343*7c478bd9Sstevel@tonic-gate 			rw_exit(&ip->i_rwlock);
2344*7c478bd9Sstevel@tonic-gate 			goto again;
2345*7c478bd9Sstevel@tonic-gate 		}
2346*7c478bd9Sstevel@tonic-gate 
2347*7c478bd9Sstevel@tonic-gate 		/* can this be a fast symlink and is it a user buffer? */
2348*7c478bd9Sstevel@tonic-gate 		if (ip->i_size <= FSL_SIZE &&
2349*7c478bd9Sstevel@tonic-gate 		    (uiop->uio_segflg == UIO_USERSPACE ||
2350*7c478bd9Sstevel@tonic-gate 		    uiop->uio_segflg == UIO_USERISPACE)) {
2351*7c478bd9Sstevel@tonic-gate 
2352*7c478bd9Sstevel@tonic-gate 			bzero(&tuio, sizeof (struct uio));
2353*7c478bd9Sstevel@tonic-gate 			/*
2354*7c478bd9Sstevel@tonic-gate 			 * Set up a kernel buffer to read the link into.
2355*7c478bd9Sstevel@tonic-gate 			 * This fixes a race condition where the user buffer
2356*7c478bd9Sstevel@tonic-gate 			 * could be corrupted before being copied into the inode.
2357*7c478bd9Sstevel@tonic-gate 			 */
2358*7c478bd9Sstevel@tonic-gate 			size = ip->i_size;
2359*7c478bd9Sstevel@tonic-gate 			tiov.iov_len = size;
2360*7c478bd9Sstevel@tonic-gate 			tiov.iov_base = kbuf;
2361*7c478bd9Sstevel@tonic-gate 			tuio.uio_iov = &tiov;
2362*7c478bd9Sstevel@tonic-gate 			tuio.uio_iovcnt = 1;
2363*7c478bd9Sstevel@tonic-gate 			tuio.uio_offset = uiop->uio_offset;
2364*7c478bd9Sstevel@tonic-gate 			tuio.uio_segflg = UIO_SYSSPACE;
2365*7c478bd9Sstevel@tonic-gate 			tuio.uio_fmode = uiop->uio_fmode;
2366*7c478bd9Sstevel@tonic-gate 			tuio.uio_extflg = uiop->uio_extflg;
2367*7c478bd9Sstevel@tonic-gate 			tuio.uio_limit = uiop->uio_limit;
2368*7c478bd9Sstevel@tonic-gate 			tuio.uio_resid = size;
2369*7c478bd9Sstevel@tonic-gate 
2370*7c478bd9Sstevel@tonic-gate 			basep = tuio.uio_iov->iov_base;
2371*7c478bd9Sstevel@tonic-gate 			tuiop = &tuio;
2372*7c478bd9Sstevel@tonic-gate 			tflag = 1;
2373*7c478bd9Sstevel@tonic-gate 		}
2374*7c478bd9Sstevel@tonic-gate 
2375*7c478bd9Sstevel@tonic-gate 		error = rdip(ip, tuiop, 0, cr);
2376*7c478bd9Sstevel@tonic-gate 		if (!(error == 0 && ip->i_number == ino && ip->i_gen == igen)) {
2377*7c478bd9Sstevel@tonic-gate 			rw_exit(&ip->i_contents);
2378*7c478bd9Sstevel@tonic-gate 			rw_exit(&ip->i_rwlock);
2379*7c478bd9Sstevel@tonic-gate 			goto out;
2380*7c478bd9Sstevel@tonic-gate 		}
2381*7c478bd9Sstevel@tonic-gate 
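		/*
		 * When reading directly into the caller's buffer,
		 * compute the number of bytes actually transferred.
		 */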
2382*7c478bd9Sstevel@tonic-gate 		if (tflag == 0)
2383*7c478bd9Sstevel@tonic-gate 			size -= uiop->uio_resid;
2384*7c478bd9Sstevel@tonic-gate 
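		/*
		 * If the entire link was read and its contents fit in
		 * the inode, cache them there and mark the inode as a
		 * fast symlink.
		 */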
2385*7c478bd9Sstevel@tonic-gate 		if ((tflag == 0 && ip->i_size <= FSL_SIZE &&
2386*7c478bd9Sstevel@tonic-gate 		    ip->i_size == size) || (tflag == 1 &&
2387*7c478bd9Sstevel@tonic-gate 		    tuio.uio_resid == 0)) {
2388*7c478bd9Sstevel@tonic-gate 			error = kcopy(basep, &ip->i_db[1], ip->i_size);
2389*7c478bd9Sstevel@tonic-gate 			if (error == 0) {
2390*7c478bd9Sstevel@tonic-gate 				ip->i_flag |= IFASTSYMLNK;
2391*7c478bd9Sstevel@tonic-gate 				/*
2392*7c478bd9Sstevel@tonic-gate 				 * free the page; the link now lives in the inode
2393*7c478bd9Sstevel@tonic-gate 				 */
2394*7c478bd9Sstevel@tonic-gate 				(void) VOP_PUTPAGE(ITOV(ip),
2395*7c478bd9Sstevel@tonic-gate 				    (offset_t)0, PAGESIZE,
2396*7c478bd9Sstevel@tonic-gate 				    (B_DONTNEED | B_FREE | B_FORCE | B_ASYNC),
2397*7c478bd9Sstevel@tonic-gate 				    cr);
2398*7c478bd9Sstevel@tonic-gate 			} else {
2399*7c478bd9Sstevel@tonic-gate 				int i;
2400*7c478bd9Sstevel@tonic-gate 				/* error, clear garbage left behind */
2401*7c478bd9Sstevel@tonic-gate 				for (i = 1; i < NDADDR; i++)
2402*7c478bd9Sstevel@tonic-gate 					ip->i_db[i] = 0;
2403*7c478bd9Sstevel@tonic-gate 				for (i = 0; i < NIADDR; i++)
2404*7c478bd9Sstevel@tonic-gate 					ip->i_ib[i] = 0;
2405*7c478bd9Sstevel@tonic-gate 			}
2406*7c478bd9Sstevel@tonic-gate 		}
2407*7c478bd9Sstevel@tonic-gate 		if (tflag == 1) {
2408*7c478bd9Sstevel@tonic-gate 			/* now, copy it into the user buffer */
2409*7c478bd9Sstevel@tonic-gate 			error = uiomove((caddr_t)kbuf,
2410*7c478bd9Sstevel@tonic-gate 				MIN(size, uiop->uio_resid),
2411*7c478bd9Sstevel@tonic-gate 				UIO_READ, uiop);
2412*7c478bd9Sstevel@tonic-gate 		}
2413*7c478bd9Sstevel@tonic-gate 		rw_exit(&ip->i_contents);
2414*7c478bd9Sstevel@tonic-gate 		rw_exit(&ip->i_rwlock);
2415*7c478bd9Sstevel@tonic-gate 	}
2416*7c478bd9Sstevel@tonic-gate out:
2417*7c478bd9Sstevel@tonic-gate 	if (ulp) {
2418*7c478bd9Sstevel@tonic-gate 		ufs_lockfs_end(ulp);
2419*7c478bd9Sstevel@tonic-gate 	}
2420*7c478bd9Sstevel@tonic-gate nolockout:
2421*7c478bd9Sstevel@tonic-gate 	TRACE_2(TR_FAC_UFS, TR_UFS_READLINK_END,
2422*7c478bd9Sstevel@tonic-gate 		"ufs_readlink_end:vp %p error %d", vp, error);
2423*7c478bd9Sstevel@tonic-gate 
2424*7c478bd9Sstevel@tonic-gate 	return (error);
2425*7c478bd9Sstevel@tonic-gate }
2426*7c478bd9Sstevel@tonic-gate 
2427*7c478bd9Sstevel@tonic-gate /* ARGSUSED */
2428*7c478bd9Sstevel@tonic-gate static int
2429*7c478bd9Sstevel@tonic-gate ufs_fsync(struct vnode *vp, int syncflag, struct cred *cr)
2430*7c478bd9Sstevel@tonic-gate {
2431*7c478bd9Sstevel@tonic-gate 	struct inode *ip = VTOI(vp);
2432*7c478bd9Sstevel@tonic-gate 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
2433*7c478bd9Sstevel@tonic-gate 	struct ulockfs *ulp;
2434*7c478bd9Sstevel@tonic-gate 	int error;
2435*7c478bd9Sstevel@tonic-gate 
2436*7c478bd9Sstevel@tonic-gate 	TRACE_1(TR_FAC_UFS, TR_UFS_FSYNC_START,
2437*7c478bd9Sstevel@tonic-gate 		"ufs_fsync_start:vp %p", vp);
2438*7c478bd9Sstevel@tonic-gate 
2439*7c478bd9Sstevel@tonic-gate 	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_FSYNC_MASK);
2440*7c478bd9Sstevel@tonic-gate 	if (error)
2441*7c478bd9Sstevel@tonic-gate 		return (error);
2442*7c478bd9Sstevel@tonic-gate 
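	/*
	 * Logging file systems are handled by committing the
	 * outstanding deltas; otherwise the inode (and its data)
	 * is pushed synchronously below.
	 */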
2443*7c478bd9Sstevel@tonic-gate 	if (TRANS_ISTRANS(ufsvfsp)) {
2444*7c478bd9Sstevel@tonic-gate 		/*
2445*7c478bd9Sstevel@tonic-gate 		 * First push out any data pages
2446*7c478bd9Sstevel@tonic-gate 		 */
2447*7c478bd9Sstevel@tonic-gate 		if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) &&
2448*7c478bd9Sstevel@tonic-gate 		    (vp->v_type != VCHR) && !(IS_SWAPVP(vp))) {
2449*7c478bd9Sstevel@tonic-gate 			error = VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
2450*7c478bd9Sstevel@tonic-gate 			    0, CRED());
2451*7c478bd9Sstevel@tonic-gate 			if (error)
2452*7c478bd9Sstevel@tonic-gate 				goto out;
2453*7c478bd9Sstevel@tonic-gate 		}
2454*7c478bd9Sstevel@tonic-gate 
2455*7c478bd9Sstevel@tonic-gate 		/*
2456*7c478bd9Sstevel@tonic-gate 		 * Delta any delayed inode time updates
2457*7c478bd9Sstevel@tonic-gate 		 * and push the inode to the log.
2458*7c478bd9Sstevel@tonic-gate 		 * All other inode deltas will have already been delta'd
2459*7c478bd9Sstevel@tonic-gate 		 * and will be pushed during the commit.
2460*7c478bd9Sstevel@tonic-gate 		 */
2461*7c478bd9Sstevel@tonic-gate 		if (!(syncflag & FDSYNC) &&
2462*7c478bd9Sstevel@tonic-gate 		    ((ip->i_flag & (IMOD|IMODACC)) == IMODACC)) {
2463*7c478bd9Sstevel@tonic-gate 			if (ulp) {
2464*7c478bd9Sstevel@tonic-gate 				TRANS_BEGIN_ASYNC(ufsvfsp, TOP_FSYNC,
2465*7c478bd9Sstevel@tonic-gate 				    TOP_SYNCIP_SIZE);
2466*7c478bd9Sstevel@tonic-gate 			}
2467*7c478bd9Sstevel@tonic-gate 			rw_enter(&ip->i_contents, RW_READER);
2468*7c478bd9Sstevel@tonic-gate 			mutex_enter(&ip->i_tlock);
2469*7c478bd9Sstevel@tonic-gate 			ip->i_flag &= ~IMODTIME;
2470*7c478bd9Sstevel@tonic-gate 			mutex_exit(&ip->i_tlock);
2471*7c478bd9Sstevel@tonic-gate 			ufs_iupdat(ip, I_SYNC);
2472*7c478bd9Sstevel@tonic-gate 			rw_exit(&ip->i_contents);
2473*7c478bd9Sstevel@tonic-gate 			if (ulp) {
2474*7c478bd9Sstevel@tonic-gate 				TRANS_END_ASYNC(ufsvfsp, TOP_FSYNC,
2475*7c478bd9Sstevel@tonic-gate 				    TOP_SYNCIP_SIZE);
2476*7c478bd9Sstevel@tonic-gate 			}
2477*7c478bd9Sstevel@tonic-gate 		}
2478*7c478bd9Sstevel@tonic-gate 
2479*7c478bd9Sstevel@tonic-gate 		/*
2480*7c478bd9Sstevel@tonic-gate 		 * Commit the Moby transaction
2481*7c478bd9Sstevel@tonic-gate 		 *
2482*7c478bd9Sstevel@tonic-gate 		 * Deltas have already been made so we just need to
2483*7c478bd9Sstevel@tonic-gate 		 * commit them with a synchronous transaction.
2484*7c478bd9Sstevel@tonic-gate 		 * TRANS_BEGIN_SYNC() will return an error
2485*7c478bd9Sstevel@tonic-gate 		 * if there are no deltas to commit, i.e.,
2486*7c478bd9Sstevel@tonic-gate 		 * for an empty transaction.
2487*7c478bd9Sstevel@tonic-gate 		 */
2488*7c478bd9Sstevel@tonic-gate 		if (ulp) {
2489*7c478bd9Sstevel@tonic-gate 			TRANS_BEGIN_SYNC(ufsvfsp, TOP_FSYNC, TOP_COMMIT_SIZE,
2490*7c478bd9Sstevel@tonic-gate 			    error);
2491*7c478bd9Sstevel@tonic-gate 			if (error) {
2492*7c478bd9Sstevel@tonic-gate 				error = 0; /* commit wasn't needed */
2493*7c478bd9Sstevel@tonic-gate 				goto out;
2494*7c478bd9Sstevel@tonic-gate 			}
2495*7c478bd9Sstevel@tonic-gate 			TRANS_END_SYNC(ufsvfsp, error, TOP_FSYNC,
2496*7c478bd9Sstevel@tonic-gate 			    TOP_COMMIT_SIZE);
2497*7c478bd9Sstevel@tonic-gate 		}
2498*7c478bd9Sstevel@tonic-gate 	} else {	/* not logging */
2499*7c478bd9Sstevel@tonic-gate 		if (!(IS_SWAPVP(vp)))
2500*7c478bd9Sstevel@tonic-gate 			if (syncflag & FNODSYNC) {
2501*7c478bd9Sstevel@tonic-gate 				/* Just update the inode only */
2502*7c478bd9Sstevel@tonic-gate 				TRANS_IUPDAT(ip, 1);
2503*7c478bd9Sstevel@tonic-gate 				error = 0;
2504*7c478bd9Sstevel@tonic-gate 			} else if (syncflag & FDSYNC)
2505*7c478bd9Sstevel@tonic-gate 				/* Do data-synchronous writes */
2506*7c478bd9Sstevel@tonic-gate 				error = TRANS_SYNCIP(ip, 0, I_DSYNC, TOP_FSYNC);
2507*7c478bd9Sstevel@tonic-gate 			else
2508*7c478bd9Sstevel@tonic-gate 				/* Do synchronous writes */
2509*7c478bd9Sstevel@tonic-gate 				error = TRANS_SYNCIP(ip, 0, I_SYNC, TOP_FSYNC);
2510*7c478bd9Sstevel@tonic-gate 
2511*7c478bd9Sstevel@tonic-gate 		rw_enter(&ip->i_contents, RW_WRITER);
2512*7c478bd9Sstevel@tonic-gate 		if (!error)
2513*7c478bd9Sstevel@tonic-gate 			error = ufs_sync_indir(ip);
2514*7c478bd9Sstevel@tonic-gate 		rw_exit(&ip->i_contents);
2515*7c478bd9Sstevel@tonic-gate 	}
2516*7c478bd9Sstevel@tonic-gate out:
2517*7c478bd9Sstevel@tonic-gate 	if (ulp) {
2518*7c478bd9Sstevel@tonic-gate 		ufs_lockfs_end(ulp);
2519*7c478bd9Sstevel@tonic-gate 	}
2520*7c478bd9Sstevel@tonic-gate 	TRACE_2(TR_FAC_UFS, TR_UFS_FSYNC_END,
2521*7c478bd9Sstevel@tonic-gate 		"ufs_fsync_end:vp %p error %d", vp, error);
2522*7c478bd9Sstevel@tonic-gate 	return (error);
2523*7c478bd9Sstevel@tonic-gate }
2524*7c478bd9Sstevel@tonic-gate 
2525*7c478bd9Sstevel@tonic-gate /*ARGSUSED*/
2526*7c478bd9Sstevel@tonic-gate static void
2527*7c478bd9Sstevel@tonic-gate ufs_inactive(struct vnode *vp, struct cred *cr)
2528*7c478bd9Sstevel@tonic-gate {
2529*7c478bd9Sstevel@tonic-gate 	ufs_iinactive(VTOI(vp));
2530*7c478bd9Sstevel@tonic-gate }
2531*7c478bd9Sstevel@tonic-gate 
2532*7c478bd9Sstevel@tonic-gate /*
2533*7c478bd9Sstevel@tonic-gate  * Unix file system operations having to do with directory manipulation.
2534*7c478bd9Sstevel@tonic-gate  */
2535*7c478bd9Sstevel@tonic-gate int ufs_lookup_idle_count = 2;	/* Number of inodes to idle each time */
2536*7c478bd9Sstevel@tonic-gate /* ARGSUSED */
2537*7c478bd9Sstevel@tonic-gate static int
2538*7c478bd9Sstevel@tonic-gate ufs_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
2539*7c478bd9Sstevel@tonic-gate 	struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cr)
2540*7c478bd9Sstevel@tonic-gate {
2541*7c478bd9Sstevel@tonic-gate 	struct inode *ip;
2542*7c478bd9Sstevel@tonic-gate 	struct inode *sip;
2543*7c478bd9Sstevel@tonic-gate 	struct inode *xip;
2544*7c478bd9Sstevel@tonic-gate 	struct ufsvfs *ufsvfsp;
2545*7c478bd9Sstevel@tonic-gate 	struct ulockfs *ulp;
2546*7c478bd9Sstevel@tonic-gate 	struct vnode *vp;
2547*7c478bd9Sstevel@tonic-gate 	int error;
2548*7c478bd9Sstevel@tonic-gate 
2549*7c478bd9Sstevel@tonic-gate 	TRACE_2(TR_FAC_UFS, TR_UFS_LOOKUP_START,
2550*7c478bd9Sstevel@tonic-gate 		"ufs_lookup_start:dvp %p name %s", dvp, nm);
2551*7c478bd9Sstevel@tonic-gate 
2552*7c478bd9Sstevel@tonic-gate 
2553*7c478bd9Sstevel@tonic-gate 	/*
2554*7c478bd9Sstevel@tonic-gate 	 * Check flags for type of lookup (regular file or attribute file)
2555*7c478bd9Sstevel@tonic-gate 	 */
2556*7c478bd9Sstevel@tonic-gate 
2557*7c478bd9Sstevel@tonic-gate 	ip = VTOI(dvp);
2558*7c478bd9Sstevel@tonic-gate 
2559*7c478bd9Sstevel@tonic-gate 	if (flags & LOOKUP_XATTR) {
2560*7c478bd9Sstevel@tonic-gate 
2561*7c478bd9Sstevel@tonic-gate 		/*
2562*7c478bd9Sstevel@tonic-gate 		 * We don't allow recursive attributes...
2563*7c478bd9Sstevel@tonic-gate 		 * Maybe someday we will.
2564*7c478bd9Sstevel@tonic-gate 		 */
2565*7c478bd9Sstevel@tonic-gate 		if ((ip->i_cflags & IXATTR)) {
2566*7c478bd9Sstevel@tonic-gate 			return (EINVAL);
2567*7c478bd9Sstevel@tonic-gate 		}
2568*7c478bd9Sstevel@tonic-gate 
2569*7c478bd9Sstevel@tonic-gate 		if ((vp = dnlc_lookup(dvp, XATTR_DIR_NAME)) == NULL) {
2570*7c478bd9Sstevel@tonic-gate 			error = ufs_xattr_getattrdir(dvp, &sip, flags, cr);
2571*7c478bd9Sstevel@tonic-gate 			if (error) {
2572*7c478bd9Sstevel@tonic-gate 				*vpp = NULL;
2573*7c478bd9Sstevel@tonic-gate 				goto out;
2574*7c478bd9Sstevel@tonic-gate 			}
2575*7c478bd9Sstevel@tonic-gate 
2576*7c478bd9Sstevel@tonic-gate 			vp = ITOV(sip);
2577*7c478bd9Sstevel@tonic-gate 			dnlc_update(dvp, XATTR_DIR_NAME, vp);
2578*7c478bd9Sstevel@tonic-gate 		}
2579*7c478bd9Sstevel@tonic-gate 
2580*7c478bd9Sstevel@tonic-gate 		/*
2581*7c478bd9Sstevel@tonic-gate 		 * Check accessibility of directory.
2582*7c478bd9Sstevel@tonic-gate 		 */
2583*7c478bd9Sstevel@tonic-gate 		if (vp == DNLC_NO_VNODE) {
2584*7c478bd9Sstevel@tonic-gate 			VN_RELE(vp);
2585*7c478bd9Sstevel@tonic-gate 			error = ENOENT;
2586*7c478bd9Sstevel@tonic-gate 			goto out;
2587*7c478bd9Sstevel@tonic-gate 		}
2588*7c478bd9Sstevel@tonic-gate 		if ((error = ufs_iaccess(VTOI(vp), IEXEC, cr)) != 0) {
2589*7c478bd9Sstevel@tonic-gate 			VN_RELE(vp);
2590*7c478bd9Sstevel@tonic-gate 			goto out;
2591*7c478bd9Sstevel@tonic-gate 		}
2592*7c478bd9Sstevel@tonic-gate 
2593*7c478bd9Sstevel@tonic-gate 		*vpp = vp;
2594*7c478bd9Sstevel@tonic-gate 		return (0);
2595*7c478bd9Sstevel@tonic-gate 	}
2596*7c478bd9Sstevel@tonic-gate 
2597*7c478bd9Sstevel@tonic-gate 	/*
2598*7c478bd9Sstevel@tonic-gate 	 * Check for a null component, which we should treat as
2599*7c478bd9Sstevel@tonic-gate 	 * looking at dvp from within its parent, so we don't
2600*7c478bd9Sstevel@tonic-gate 	 * need a call to ufs_iaccess(), as it has already been
2601*7c478bd9Sstevel@tonic-gate 	 * done.
2602*7c478bd9Sstevel@tonic-gate 	 */
2603*7c478bd9Sstevel@tonic-gate 	if (nm[0] == 0) {
2604*7c478bd9Sstevel@tonic-gate 		VN_HOLD(dvp);
2605*7c478bd9Sstevel@tonic-gate 		error = 0;
2606*7c478bd9Sstevel@tonic-gate 		*vpp = dvp;
2607*7c478bd9Sstevel@tonic-gate 		goto out;
2608*7c478bd9Sstevel@tonic-gate 	}
2609*7c478bd9Sstevel@tonic-gate 
2610*7c478bd9Sstevel@tonic-gate 	/*
2611*7c478bd9Sstevel@tonic-gate 	 * Check for ".", i.e., the directory itself.  This quick check
2612*7c478bd9Sstevel@tonic-gate 	 * avoids adding "." entries to the dnlc (they have been seen
2613*7c478bd9Sstevel@tonic-gate 	 * to occupy >10% of the cache).
2614*7c478bd9Sstevel@tonic-gate 	 */
2615*7c478bd9Sstevel@tonic-gate 	if ((nm[0] == '.') && (nm[1] == 0)) {
2616*7c478bd9Sstevel@tonic-gate 		/*
2617*7c478bd9Sstevel@tonic-gate 		 * Don't return without checking accessibility
2618*7c478bd9Sstevel@tonic-gate 		 * of the directory. We only need the lock if
2619*7c478bd9Sstevel@tonic-gate 		 * we are going to return it.
2620*7c478bd9Sstevel@tonic-gate 		 */
2621*7c478bd9Sstevel@tonic-gate 		if ((error = ufs_iaccess(ip, IEXEC, cr)) == 0) {
2622*7c478bd9Sstevel@tonic-gate 			VN_HOLD(dvp);
2623*7c478bd9Sstevel@tonic-gate 			*vpp = dvp;
2624*7c478bd9Sstevel@tonic-gate 		}
2625*7c478bd9Sstevel@tonic-gate 		goto out;
2626*7c478bd9Sstevel@tonic-gate 	}
2627*7c478bd9Sstevel@tonic-gate 
2628*7c478bd9Sstevel@tonic-gate 	/*
2629*7c478bd9Sstevel@tonic-gate 	 * Fast path: Check the directory name lookup cache.
2630*7c478bd9Sstevel@tonic-gate 	 */
2631*7c478bd9Sstevel@tonic-gate 	if (vp = dnlc_lookup(dvp, nm)) {
2632*7c478bd9Sstevel@tonic-gate 		/*
2633*7c478bd9Sstevel@tonic-gate 		 * Check accessibility of directory.
2634*7c478bd9Sstevel@tonic-gate 		 */
2635*7c478bd9Sstevel@tonic-gate 		if ((error = ufs_iaccess(ip, IEXEC, cr)) != 0) {
2636*7c478bd9Sstevel@tonic-gate 			VN_RELE(vp);
2637*7c478bd9Sstevel@tonic-gate 			goto out;
2638*7c478bd9Sstevel@tonic-gate 		}
2639*7c478bd9Sstevel@tonic-gate 		if (vp == DNLC_NO_VNODE) {
2640*7c478bd9Sstevel@tonic-gate 			VN_RELE(vp);
2641*7c478bd9Sstevel@tonic-gate 			error = ENOENT;
2642*7c478bd9Sstevel@tonic-gate 			goto out;
2643*7c478bd9Sstevel@tonic-gate 		}
2644*7c478bd9Sstevel@tonic-gate 		xip = VTOI(vp);
2645*7c478bd9Sstevel@tonic-gate 		ulp = NULL;
2646*7c478bd9Sstevel@tonic-gate 		goto fastpath;
2647*7c478bd9Sstevel@tonic-gate 	}
2648*7c478bd9Sstevel@tonic-gate 
2649*7c478bd9Sstevel@tonic-gate 	/*
2650*7c478bd9Sstevel@tonic-gate 	 * Keep the idle queue from getting too long by
2651*7c478bd9Sstevel@tonic-gate 	 * idling two inodes before attempting to allocate another.
2652*7c478bd9Sstevel@tonic-gate 	 * This operation must be performed before entering
2653*7c478bd9Sstevel@tonic-gate 	 * lockfs or a transaction.
2654*7c478bd9Sstevel@tonic-gate 	 */
2655*7c478bd9Sstevel@tonic-gate 	if (ufs_idle_q.uq_ne > ufs_idle_q.uq_hiwat)
2656*7c478bd9Sstevel@tonic-gate 		if ((curthread->t_flag & T_DONTBLOCK) == 0) {
2657*7c478bd9Sstevel@tonic-gate 			ins.in_lidles.value.ul += ufs_lookup_idle_count;
2658*7c478bd9Sstevel@tonic-gate 			ufs_idle_some(ufs_lookup_idle_count);
2659*7c478bd9Sstevel@tonic-gate 		}
2660*7c478bd9Sstevel@tonic-gate 
2661*7c478bd9Sstevel@tonic-gate 	ufsvfsp = ip->i_ufsvfs;
2662*7c478bd9Sstevel@tonic-gate 	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_LOOKUP_MASK);
2663*7c478bd9Sstevel@tonic-gate 	if (error)
2664*7c478bd9Sstevel@tonic-gate 		goto out;
2665*7c478bd9Sstevel@tonic-gate 
2666*7c478bd9Sstevel@tonic-gate 	error = ufs_dirlook(ip, nm, &xip, cr, 1);
2667*7c478bd9Sstevel@tonic-gate 
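	/*
	 * Both the dnlc hit above and ufs_dirlook() arrive here;
	 * on success xip holds the inode for the found name.
	 */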
2668*7c478bd9Sstevel@tonic-gate fastpath:
2669*7c478bd9Sstevel@tonic-gate 	if (error == 0) {
2670*7c478bd9Sstevel@tonic-gate 		ip = xip;
2671*7c478bd9Sstevel@tonic-gate 		*vpp = ITOV(ip);
2672*7c478bd9Sstevel@tonic-gate 
2673*7c478bd9Sstevel@tonic-gate 		/*
2674*7c478bd9Sstevel@tonic-gate 		 * If vnode is a device return special vnode instead.
2675*7c478bd9Sstevel@tonic-gate 		 */
2676*7c478bd9Sstevel@tonic-gate 		if (IS_DEVVP(*vpp)) {
2677*7c478bd9Sstevel@tonic-gate 			struct vnode *newvp;
2678*7c478bd9Sstevel@tonic-gate 
2679*7c478bd9Sstevel@tonic-gate 			newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type,
2680*7c478bd9Sstevel@tonic-gate 			    cr);
2681*7c478bd9Sstevel@tonic-gate 			VN_RELE(*vpp);
2682*7c478bd9Sstevel@tonic-gate 			if (newvp == NULL)
2683*7c478bd9Sstevel@tonic-gate 				error = ENOSYS;
2684*7c478bd9Sstevel@tonic-gate 			else
2685*7c478bd9Sstevel@tonic-gate 				*vpp = newvp;
2686*7c478bd9Sstevel@tonic-gate 		}
2687*7c478bd9Sstevel@tonic-gate 	}
2688*7c478bd9Sstevel@tonic-gate 	if (ulp) {
2689*7c478bd9Sstevel@tonic-gate 		ufs_lockfs_end(ulp);
2690*7c478bd9Sstevel@tonic-gate 	}
2691*7c478bd9Sstevel@tonic-gate 
2692*7c478bd9Sstevel@tonic-gate out:
2693*7c478bd9Sstevel@tonic-gate 	TRACE_3(TR_FAC_UFS, TR_UFS_LOOKUP_END,
2694*7c478bd9Sstevel@tonic-gate 		"ufs_lookup_end:dvp %p name %s error %d", dvp, nm, error);
2695*7c478bd9Sstevel@tonic-gate 	return (error);
2696*7c478bd9Sstevel@tonic-gate }
2697*7c478bd9Sstevel@tonic-gate 
2698*7c478bd9Sstevel@tonic-gate static int
2699*7c478bd9Sstevel@tonic-gate ufs_create(struct vnode *dvp, char *name, struct vattr *vap, enum vcexcl excl,
2700*7c478bd9Sstevel@tonic-gate 	int mode, struct vnode **vpp, struct cred *cr, int flag)
2701*7c478bd9Sstevel@tonic-gate {
2702*7c478bd9Sstevel@tonic-gate 	struct inode *ip;
2703*7c478bd9Sstevel@tonic-gate 	struct inode *xip;
2704*7c478bd9Sstevel@tonic-gate 	struct inode *dip;
2705*7c478bd9Sstevel@tonic-gate 	struct vnode *xvp;
2706*7c478bd9Sstevel@tonic-gate 	struct ufsvfs *ufsvfsp;
2707*7c478bd9Sstevel@tonic-gate 	struct ulockfs *ulp;
2708*7c478bd9Sstevel@tonic-gate 	int error;
2709*7c478bd9Sstevel@tonic-gate 	int issync;
2710*7c478bd9Sstevel@tonic-gate 	int truncflag;
2711*7c478bd9Sstevel@tonic-gate 	int trans_size;
2712*7c478bd9Sstevel@tonic-gate 	int noentry;
2713*7c478bd9Sstevel@tonic-gate 	int defer_dip_seq_update = 0;	/* need to defer update of dip->i_seq */
2714*7c478bd9Sstevel@tonic-gate 	int retry = 1;
2715*7c478bd9Sstevel@tonic-gate 
2716*7c478bd9Sstevel@tonic-gate 	TRACE_1(TR_FAC_UFS, TR_UFS_CREATE_START,
2717*7c478bd9Sstevel@tonic-gate 		"ufs_create_start:dvp %p", dvp);
2718*7c478bd9Sstevel@tonic-gate 
2719*7c478bd9Sstevel@tonic-gate again:
2720*7c478bd9Sstevel@tonic-gate 	ip = VTOI(dvp);
2721*7c478bd9Sstevel@tonic-gate 	ufsvfsp = ip->i_ufsvfs;
2722*7c478bd9Sstevel@tonic-gate 	truncflag = 0;
2723*7c478bd9Sstevel@tonic-gate 
2724*7c478bd9Sstevel@tonic-gate 	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_CREATE_MASK);
2725*7c478bd9Sstevel@tonic-gate 	if (error)
2726*7c478bd9Sstevel@tonic-gate 		goto out;
2727*7c478bd9Sstevel@tonic-gate 
2728*7c478bd9Sstevel@tonic-gate 	if (ulp) {
2729*7c478bd9Sstevel@tonic-gate 		trans_size = (int)TOP_CREATE_SIZE(ip);
2730*7c478bd9Sstevel@tonic-gate 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_CREATE, trans_size);
2731*7c478bd9Sstevel@tonic-gate 	}
2732*7c478bd9Sstevel@tonic-gate 
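	/*
	 * Strip the sticky bit unless the caller has the privilege
	 * to set it (secpolicy_vnode_stky_modify()).
	 */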
2733*7c478bd9Sstevel@tonic-gate 	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr) != 0)
2734*7c478bd9Sstevel@tonic-gate 		vap->va_mode &= ~VSVTX;
2735*7c478bd9Sstevel@tonic-gate 
2736*7c478bd9Sstevel@tonic-gate 	if (*name == '\0') {
2737*7c478bd9Sstevel@tonic-gate 		/*
2738*7c478bd9Sstevel@tonic-gate 		 * Null component name refers to the directory itself.
2739*7c478bd9Sstevel@tonic-gate 		 */
2740*7c478bd9Sstevel@tonic-gate 		VN_HOLD(dvp);
2741*7c478bd9Sstevel@tonic-gate 		/*
2742*7c478bd9Sstevel@tonic-gate 		 * Even though this is an error case, we need to grab the
2743*7c478bd9Sstevel@tonic-gate 		 * quota lock since the error handling code below is common.
2744*7c478bd9Sstevel@tonic-gate 		 */
2745*7c478bd9Sstevel@tonic-gate 		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
2746*7c478bd9Sstevel@tonic-gate 		rw_enter(&ip->i_contents, RW_WRITER);
2747*7c478bd9Sstevel@tonic-gate 		error = EEXIST;
2748*7c478bd9Sstevel@tonic-gate 	} else {
2749*7c478bd9Sstevel@tonic-gate 		xip = NULL;
2750*7c478bd9Sstevel@tonic-gate 		noentry = 0;
2751*7c478bd9Sstevel@tonic-gate 		rw_enter(&ip->i_rwlock, RW_WRITER);
2752*7c478bd9Sstevel@tonic-gate 		xvp = dnlc_lookup(dvp, name);
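		/*
		 * A DNLC_NO_VNODE hit is a cached negative entry: the
		 * name is known not to exist.  Note that fact and drop
		 * the cache's hold.
		 */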
2753*7c478bd9Sstevel@tonic-gate 		if (xvp == DNLC_NO_VNODE) {
2754*7c478bd9Sstevel@tonic-gate 			noentry = 1;
2755*7c478bd9Sstevel@tonic-gate 			VN_RELE(xvp);
2756*7c478bd9Sstevel@tonic-gate 			xvp = NULL;
2757*7c478bd9Sstevel@tonic-gate 		}
2758*7c478bd9Sstevel@tonic-gate 		if (xvp) {
2759*7c478bd9Sstevel@tonic-gate 			rw_exit(&ip->i_rwlock);
2760*7c478bd9Sstevel@tonic-gate 			if (error = ufs_iaccess(ip, IEXEC, cr)) {
2761*7c478bd9Sstevel@tonic-gate 				VN_RELE(xvp);
2762*7c478bd9Sstevel@tonic-gate 			} else {
2763*7c478bd9Sstevel@tonic-gate 				error = EEXIST;
2764*7c478bd9Sstevel@tonic-gate 				xip = VTOI(xvp);
2765*7c478bd9Sstevel@tonic-gate 			}
2766*7c478bd9Sstevel@tonic-gate 		} else {
2767*7c478bd9Sstevel@tonic-gate 			/*
2768*7c478bd9Sstevel@tonic-gate 			 * Suppress file system full message if we will retry
2769*7c478bd9Sstevel@tonic-gate 			 */
2770*7c478bd9Sstevel@tonic-gate 			error = ufs_direnter_cm(ip, name, DE_CREATE,
2771*7c478bd9Sstevel@tonic-gate 				vap, &xip, cr,
2772*7c478bd9Sstevel@tonic-gate 				(noentry | (retry ? IQUIET : 0)));
2773*7c478bd9Sstevel@tonic-gate 			rw_exit(&ip->i_rwlock);
2774*7c478bd9Sstevel@tonic-gate 		}
2775*7c478bd9Sstevel@tonic-gate 		ip = xip;
2776*7c478bd9Sstevel@tonic-gate 		if (ip != NULL) {
2777*7c478bd9Sstevel@tonic-gate 			rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
2778*7c478bd9Sstevel@tonic-gate 			rw_enter(&ip->i_contents, RW_WRITER);
2779*7c478bd9Sstevel@tonic-gate 		}
2780*7c478bd9Sstevel@tonic-gate 	}
2781*7c478bd9Sstevel@tonic-gate 
2782*7c478bd9Sstevel@tonic-gate 	/*
2783*7c478bd9Sstevel@tonic-gate 	 * If the file already exists and this is a non-exclusive create,
2784*7c478bd9Sstevel@tonic-gate 	 * check permissions and allow access for non-directories.
2785*7c478bd9Sstevel@tonic-gate 	 * Read-only create of an existing directory is also allowed.
2786*7c478bd9Sstevel@tonic-gate 	 * We fail an exclusive create of anything which already exists.
2787*7c478bd9Sstevel@tonic-gate 	 */
2788*7c478bd9Sstevel@tonic-gate 	if (error == EEXIST) {
2789*7c478bd9Sstevel@tonic-gate 		dip = VTOI(dvp);
2790*7c478bd9Sstevel@tonic-gate 		if (excl == NONEXCL) {
2791*7c478bd9Sstevel@tonic-gate 			if ((((ip->i_mode & IFMT) == IFDIR) ||
2792*7c478bd9Sstevel@tonic-gate 			    ((ip->i_mode & IFMT) == IFATTRDIR)) &&
2793*7c478bd9Sstevel@tonic-gate 			    (mode & IWRITE))
2794*7c478bd9Sstevel@tonic-gate 				error = EISDIR;
2795*7c478bd9Sstevel@tonic-gate 			else if (mode)
2796*7c478bd9Sstevel@tonic-gate 				error = ufs_iaccess(ip, mode, cr);
2797*7c478bd9Sstevel@tonic-gate 			else
2798*7c478bd9Sstevel@tonic-gate 				error = 0;
2799*7c478bd9Sstevel@tonic-gate 		}
2800*7c478bd9Sstevel@tonic-gate 		if (error) {
2801*7c478bd9Sstevel@tonic-gate 			rw_exit(&ip->i_contents);
2802*7c478bd9Sstevel@tonic-gate 			rw_exit(&ufsvfsp->vfs_dqrwlock);
2803*7c478bd9Sstevel@tonic-gate 			VN_RELE(ITOV(ip));
2804*7c478bd9Sstevel@tonic-gate 			goto unlock;
2805*7c478bd9Sstevel@tonic-gate 		}
2806*7c478bd9Sstevel@tonic-gate 		/*
2807*7c478bd9Sstevel@tonic-gate 		 * If the error EEXIST was set, then i_seq cannot
2808*7c478bd9Sstevel@tonic-gate 		 * have been updated.  The sequence number interface
2809*7c478bd9Sstevel@tonic-gate 		 * is defined such that a non-error VOP_CREATE must
2810*7c478bd9Sstevel@tonic-gate 		 * increase the dir va_seq by at least one.  If we
2811*7c478bd9Sstevel@tonic-gate 		 * have cleared the error, increase i_seq. Note that
2812*7c478bd9Sstevel@tonic-gate 		 * we are increasing the dir i_seq and in rare cases
2813*7c478bd9Sstevel@tonic-gate 		 * ip may actually be from the dvp, so we already have
2814*7c478bd9Sstevel@tonic-gate 		 * the locks and it will not be subject to truncation.
2815*7c478bd9Sstevel@tonic-gate 		 * In case we have to update i_seq of the parent
2816*7c478bd9Sstevel@tonic-gate 		 * directory dip, we have to defer it till we have
2817*7c478bd9Sstevel@tonic-gate 		 * released our locks on ip due to lock ordering requirements.
2818*7c478bd9Sstevel@tonic-gate 		 */
2819*7c478bd9Sstevel@tonic-gate 		if (ip != dip)
2820*7c478bd9Sstevel@tonic-gate 			defer_dip_seq_update = 1;
2821*7c478bd9Sstevel@tonic-gate 		else
2822*7c478bd9Sstevel@tonic-gate 			ip->i_seq++;
2823*7c478bd9Sstevel@tonic-gate 
2824*7c478bd9Sstevel@tonic-gate 		if (((ip->i_mode & IFMT) == IFREG) &&
2825*7c478bd9Sstevel@tonic-gate 		    (vap->va_mask & AT_SIZE) && vap->va_size == 0) {
2826*7c478bd9Sstevel@tonic-gate 			/*
2827*7c478bd9Sstevel@tonic-gate 			 * Truncate regular files, if requested by caller.
2828*7c478bd9Sstevel@tonic-gate 			 * Grab i_rwlock to make sure no one else is
2829*7c478bd9Sstevel@tonic-gate 			 * currently writing to the file (we promised
2830*7c478bd9Sstevel@tonic-gate 			 * bmap we would do this).
2831*7c478bd9Sstevel@tonic-gate 			 * Must get the locks in the correct order.
2832*7c478bd9Sstevel@tonic-gate 			 */
2833*7c478bd9Sstevel@tonic-gate 			if (ip->i_size == 0) {
2834*7c478bd9Sstevel@tonic-gate 				ip->i_flag |= ICHG | IUPD;
2835*7c478bd9Sstevel@tonic-gate 				ip->i_seq++;
2836*7c478bd9Sstevel@tonic-gate 				TRANS_INODE(ufsvfsp, ip);
2837*7c478bd9Sstevel@tonic-gate 			} else {
2838*7c478bd9Sstevel@tonic-gate 				/*
2839*7c478bd9Sstevel@tonic-gate 				 * Large Files: Why this check here?
2840*7c478bd9Sstevel@tonic-gate 				 * Though we do it in vn_create(), we really
2841*7c478bd9Sstevel@tonic-gate 				 * want to guarantee that we do not destroy
2842*7c478bd9Sstevel@tonic-gate 				 * large-file data by atomically checking
2843*7c478bd9Sstevel@tonic-gate 				 * the size while holding the contents
2844*7c478bd9Sstevel@tonic-gate 				 * lock.
2845*7c478bd9Sstevel@tonic-gate 				 */
2846*7c478bd9Sstevel@tonic-gate 				if (flag && !(flag & FOFFMAX) &&
2847*7c478bd9Sstevel@tonic-gate 				    ((ip->i_mode & IFMT) == IFREG) &&
2848*7c478bd9Sstevel@tonic-gate 				    (ip->i_size > (offset_t)MAXOFF32_T)) {
2849*7c478bd9Sstevel@tonic-gate 					rw_exit(&ip->i_contents);
2850*7c478bd9Sstevel@tonic-gate 					rw_exit(&ufsvfsp->vfs_dqrwlock);
2851*7c478bd9Sstevel@tonic-gate 					error = EOVERFLOW;
2852*7c478bd9Sstevel@tonic-gate 					goto unlock;
2853*7c478bd9Sstevel@tonic-gate 				}
2854*7c478bd9Sstevel@tonic-gate 				if (TRANS_ISTRANS(ufsvfsp))
2855*7c478bd9Sstevel@tonic-gate 					truncflag++;
2856*7c478bd9Sstevel@tonic-gate 				else {
2857*7c478bd9Sstevel@tonic-gate 					rw_exit(&ip->i_contents);
2858*7c478bd9Sstevel@tonic-gate 					rw_exit(&ufsvfsp->vfs_dqrwlock);
2859*7c478bd9Sstevel@tonic-gate 					rw_enter(&ip->i_rwlock, RW_WRITER);
2860*7c478bd9Sstevel@tonic-gate 					rw_enter(&ufsvfsp->vfs_dqrwlock,
2861*7c478bd9Sstevel@tonic-gate 							RW_READER);
2862*7c478bd9Sstevel@tonic-gate 					rw_enter(&ip->i_contents, RW_WRITER);
2863*7c478bd9Sstevel@tonic-gate 					(void) ufs_itrunc(ip, (u_offset_t)0, 0,
2864*7c478bd9Sstevel@tonic-gate 								cr);
2865*7c478bd9Sstevel@tonic-gate 					rw_exit(&ip->i_rwlock);
2866*7c478bd9Sstevel@tonic-gate 				}
2867*7c478bd9Sstevel@tonic-gate 			}
2868*7c478bd9Sstevel@tonic-gate 		}
2869*7c478bd9Sstevel@tonic-gate 	}
2870*7c478bd9Sstevel@tonic-gate 
2871*7c478bd9Sstevel@tonic-gate 	if (error) {
2872*7c478bd9Sstevel@tonic-gate 		if (ip != NULL) {
2873*7c478bd9Sstevel@tonic-gate 			rw_exit(&ufsvfsp->vfs_dqrwlock);
2874*7c478bd9Sstevel@tonic-gate 			rw_exit(&ip->i_contents);
2875*7c478bd9Sstevel@tonic-gate 		}
2876*7c478bd9Sstevel@tonic-gate 		goto unlock;
2877*7c478bd9Sstevel@tonic-gate 	}
2878*7c478bd9Sstevel@tonic-gate 
2879*7c478bd9Sstevel@tonic-gate 	*vpp = ITOV(ip);
2880*7c478bd9Sstevel@tonic-gate 	ITIMES(ip);
2881*7c478bd9Sstevel@tonic-gate 	rw_exit(&ip->i_contents);
2882*7c478bd9Sstevel@tonic-gate 	rw_exit(&ufsvfsp->vfs_dqrwlock);
2883*7c478bd9Sstevel@tonic-gate 
2884*7c478bd9Sstevel@tonic-gate 	/*
2885*7c478bd9Sstevel@tonic-gate 	 * If vnode is a device return special vnode instead.
2886*7c478bd9Sstevel@tonic-gate 	 */
2887*7c478bd9Sstevel@tonic-gate 	if (!error && IS_DEVVP(*vpp)) {
2888*7c478bd9Sstevel@tonic-gate 		struct vnode *newvp;
2889*7c478bd9Sstevel@tonic-gate 
2890*7c478bd9Sstevel@tonic-gate 		newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
2891*7c478bd9Sstevel@tonic-gate 		VN_RELE(*vpp);
2892*7c478bd9Sstevel@tonic-gate 		if (newvp == NULL) {
2893*7c478bd9Sstevel@tonic-gate 			error = ENOSYS;
2894*7c478bd9Sstevel@tonic-gate 			goto unlock;
2895*7c478bd9Sstevel@tonic-gate 		}
2896*7c478bd9Sstevel@tonic-gate 		truncflag = 0;
2897*7c478bd9Sstevel@tonic-gate 		*vpp = newvp;
2898*7c478bd9Sstevel@tonic-gate 	}
2899*7c478bd9Sstevel@tonic-gate unlock:
2900*7c478bd9Sstevel@tonic-gate 
2901*7c478bd9Sstevel@tonic-gate 	/*
2902*7c478bd9Sstevel@tonic-gate 	 * Do the deferred update of the parent directory's sequence
2903*7c478bd9Sstevel@tonic-gate 	 * number now.
2904*7c478bd9Sstevel@tonic-gate 	 */
2905*7c478bd9Sstevel@tonic-gate 	if (defer_dip_seq_update == 1) {
2906*7c478bd9Sstevel@tonic-gate 		rw_enter(&dip->i_contents, RW_READER);
2907*7c478bd9Sstevel@tonic-gate 		mutex_enter(&dip->i_tlock);
2908*7c478bd9Sstevel@tonic-gate 		dip->i_seq++;
2909*7c478bd9Sstevel@tonic-gate 		mutex_exit(&dip->i_tlock);
2910*7c478bd9Sstevel@tonic-gate 		rw_exit(&dip->i_contents);
2911*7c478bd9Sstevel@tonic-gate 	}
2912*7c478bd9Sstevel@tonic-gate 
2913*7c478bd9Sstevel@tonic-gate 	if (ulp) {
2914*7c478bd9Sstevel@tonic-gate 		int terr = 0;
2915*7c478bd9Sstevel@tonic-gate 
2916*7c478bd9Sstevel@tonic-gate 		TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_CREATE,
2917*7c478bd9Sstevel@tonic-gate 		    trans_size);
2918*7c478bd9Sstevel@tonic-gate 
2919*7c478bd9Sstevel@tonic-gate 		/*
2920*7c478bd9Sstevel@tonic-gate 		 * If we haven't had a more interesting failure
2921*7c478bd9Sstevel@tonic-gate 		 * already, then anything that might've happened
2922*7c478bd9Sstevel@tonic-gate 		 * here should be reported.
2923*7c478bd9Sstevel@tonic-gate 		 */
2924*7c478bd9Sstevel@tonic-gate 		if (error == 0)
2925*7c478bd9Sstevel@tonic-gate 			error = terr;
2926*7c478bd9Sstevel@tonic-gate 	}
2927*7c478bd9Sstevel@tonic-gate 
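	/*
	 * The truncate was deferred above on logging file systems;
	 * perform it now that the create transaction has ended.
	 */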
2928*7c478bd9Sstevel@tonic-gate 	if (!error && truncflag) {
2929*7c478bd9Sstevel@tonic-gate 		rw_enter(&ip->i_rwlock, RW_WRITER);
2930*7c478bd9Sstevel@tonic-gate 		(void) TRANS_ITRUNC(ip, (u_offset_t)0, 0, cr);
2931*7c478bd9Sstevel@tonic-gate 		rw_exit(&ip->i_rwlock);
2932*7c478bd9Sstevel@tonic-gate 	}
2933*7c478bd9Sstevel@tonic-gate 
2934*7c478bd9Sstevel@tonic-gate 	if (ulp)
2935*7c478bd9Sstevel@tonic-gate 		ufs_lockfs_end(ulp);
2936*7c478bd9Sstevel@tonic-gate 
2937*7c478bd9Sstevel@tonic-gate 	/*
2938*7c478bd9Sstevel@tonic-gate 	 * If no inodes available, try to free one up out of the
2939*7c478bd9Sstevel@tonic-gate 	 * pending delete queue.
2940*7c478bd9Sstevel@tonic-gate 	 */
2941*7c478bd9Sstevel@tonic-gate 	if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
2942*7c478bd9Sstevel@tonic-gate 		ufs_delete_drain_wait(ufsvfsp, 1);
2943*7c478bd9Sstevel@tonic-gate 		retry = 0;
2944*7c478bd9Sstevel@tonic-gate 		goto again;
2945*7c478bd9Sstevel@tonic-gate 	}
2946*7c478bd9Sstevel@tonic-gate 
2947*7c478bd9Sstevel@tonic-gate out:
2948*7c478bd9Sstevel@tonic-gate 	TRACE_3(TR_FAC_UFS, TR_UFS_CREATE_END,
2949*7c478bd9Sstevel@tonic-gate 		"ufs_create_end:dvp %p name %s error %d", dvp, name, error);
2950*7c478bd9Sstevel@tonic-gate 	return (error);
2951*7c478bd9Sstevel@tonic-gate }
2952*7c478bd9Sstevel@tonic-gate 
2953*7c478bd9Sstevel@tonic-gate extern int ufs_idle_max;
2954*7c478bd9Sstevel@tonic-gate /*ARGSUSED*/
2955*7c478bd9Sstevel@tonic-gate static int
2956*7c478bd9Sstevel@tonic-gate ufs_remove(struct vnode *vp, char *nm, struct cred *cr)
2957*7c478bd9Sstevel@tonic-gate {
2958*7c478bd9Sstevel@tonic-gate 	struct inode *ip = VTOI(vp);
2959*7c478bd9Sstevel@tonic-gate 	struct ufsvfs *ufsvfsp	= ip->i_ufsvfs;
2960*7c478bd9Sstevel@tonic-gate 	struct ulockfs *ulp;
2961*7c478bd9Sstevel@tonic-gate 	vnode_t *rmvp = NULL;	/* Vnode corresponding to name being removed */
2962*7c478bd9Sstevel@tonic-gate 	int error;
2963*7c478bd9Sstevel@tonic-gate 	int issync;
2964*7c478bd9Sstevel@tonic-gate 	int trans_size;
2965*7c478bd9Sstevel@tonic-gate 
2966*7c478bd9Sstevel@tonic-gate 	TRACE_1(TR_FAC_UFS, TR_UFS_REMOVE_START,
2967*7c478bd9Sstevel@tonic-gate 		"ufs_remove_start:vp %p", vp);
2968*7c478bd9Sstevel@tonic-gate 
	/*
	 * Fail if the file system has been forcibly unmounted;
	 * otherwise, don't let the delete queue get too long.
	 */
2972*7c478bd9Sstevel@tonic-gate 	if (ufsvfsp == NULL) {
2973*7c478bd9Sstevel@tonic-gate 		error = EIO;
2974*7c478bd9Sstevel@tonic-gate 		goto out;
2975*7c478bd9Sstevel@tonic-gate 	}
2976*7c478bd9Sstevel@tonic-gate 	if (ufsvfsp->vfs_delete.uq_ne > ufs_idle_max)
2977*7c478bd9Sstevel@tonic-gate 		ufs_delete_drain(vp->v_vfsp, 1, 1);
2978*7c478bd9Sstevel@tonic-gate 
2979*7c478bd9Sstevel@tonic-gate 	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_REMOVE_MASK);
2980*7c478bd9Sstevel@tonic-gate 	if (error)
2981*7c478bd9Sstevel@tonic-gate 		goto out;
2982*7c478bd9Sstevel@tonic-gate 
2983*7c478bd9Sstevel@tonic-gate 	if (ulp)
2984*7c478bd9Sstevel@tonic-gate 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE,
2985*7c478bd9Sstevel@tonic-gate 		    trans_size = (int)TOP_REMOVE_SIZE(VTOI(vp)));
2986*7c478bd9Sstevel@tonic-gate 
2987*7c478bd9Sstevel@tonic-gate 	rw_enter(&ip->i_rwlock, RW_WRITER);
2988*7c478bd9Sstevel@tonic-gate 	error = ufs_dirremove(ip, nm, (struct inode *)0, (struct vnode *)0,
2989*7c478bd9Sstevel@tonic-gate 	    DR_REMOVE, cr, &rmvp);
2990*7c478bd9Sstevel@tonic-gate 	rw_exit(&ip->i_rwlock);
2991*7c478bd9Sstevel@tonic-gate 
2992*7c478bd9Sstevel@tonic-gate 	if (ulp) {
2993*7c478bd9Sstevel@tonic-gate 		TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_REMOVE, trans_size);
2994*7c478bd9Sstevel@tonic-gate 		ufs_lockfs_end(ulp);
2995*7c478bd9Sstevel@tonic-gate 	}
2996*7c478bd9Sstevel@tonic-gate 
2997*7c478bd9Sstevel@tonic-gate 	/*
2998*7c478bd9Sstevel@tonic-gate 	 * This must be called after the remove transaction is closed.
2999*7c478bd9Sstevel@tonic-gate 	 */
3000*7c478bd9Sstevel@tonic-gate 	if (rmvp != NULL) {
3001*7c478bd9Sstevel@tonic-gate 		/* Only send the event if there were no errors */
3002*7c478bd9Sstevel@tonic-gate 		if (error == 0)
3003*7c478bd9Sstevel@tonic-gate 			vnevent_remove(rmvp);
3004*7c478bd9Sstevel@tonic-gate 		VN_RELE(rmvp);
3005*7c478bd9Sstevel@tonic-gate 	}
3006*7c478bd9Sstevel@tonic-gate out:
3007*7c478bd9Sstevel@tonic-gate 	TRACE_3(TR_FAC_UFS, TR_UFS_REMOVE_END,
3008*7c478bd9Sstevel@tonic-gate 		"ufs_remove_end:vp %p name %s error %d", vp, nm, error);
3009*7c478bd9Sstevel@tonic-gate 	return (error);
3010*7c478bd9Sstevel@tonic-gate }
3011*7c478bd9Sstevel@tonic-gate 
3012*7c478bd9Sstevel@tonic-gate /*
3013*7c478bd9Sstevel@tonic-gate  * Link a file or a directory.  Only privileged processes are allowed to
3014*7c478bd9Sstevel@tonic-gate  * make links to directories.
3015*7c478bd9Sstevel@tonic-gate  */
3016*7c478bd9Sstevel@tonic-gate static int
3017*7c478bd9Sstevel@tonic-gate ufs_link(struct vnode *tdvp, struct vnode *svp, char *tnm, struct cred *cr)
3018*7c478bd9Sstevel@tonic-gate {
3019*7c478bd9Sstevel@tonic-gate 	struct inode *sip;
3020*7c478bd9Sstevel@tonic-gate 	struct inode *tdp = VTOI(tdvp);
3021*7c478bd9Sstevel@tonic-gate 	struct ufsvfs *ufsvfsp = tdp->i_ufsvfs;
3022*7c478bd9Sstevel@tonic-gate 	struct ulockfs *ulp;
3023*7c478bd9Sstevel@tonic-gate 	struct vnode *realvp;
3024*7c478bd9Sstevel@tonic-gate 	int error;
3025*7c478bd9Sstevel@tonic-gate 	int issync;
3026*7c478bd9Sstevel@tonic-gate 	int trans_size;
3027*7c478bd9Sstevel@tonic-gate 	int isdev;
3028*7c478bd9Sstevel@tonic-gate 
3029*7c478bd9Sstevel@tonic-gate 	TRACE_1(TR_FAC_UFS, TR_UFS_LINK_START,
3030*7c478bd9Sstevel@tonic-gate 		"ufs_link_start:tdvp %p", tdvp);
3031*7c478bd9Sstevel@tonic-gate 
3032*7c478bd9Sstevel@tonic-gate 	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_LINK_MASK);
3033*7c478bd9Sstevel@tonic-gate 	if (error)
3034*7c478bd9Sstevel@tonic-gate 		goto out;
3035*7c478bd9Sstevel@tonic-gate 
3036*7c478bd9Sstevel@tonic-gate 	if (ulp)
3037*7c478bd9Sstevel@tonic-gate 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_LINK,
3038*7c478bd9Sstevel@tonic-gate 		    trans_size = (int)TOP_LINK_SIZE(VTOI(tdvp)));
3039*7c478bd9Sstevel@tonic-gate 
3040*7c478bd9Sstevel@tonic-gate 	if (VOP_REALVP(svp, &realvp) == 0)
3041*7c478bd9Sstevel@tonic-gate 		svp = realvp;
3042*7c478bd9Sstevel@tonic-gate 
	/*
	 * Make sure a link involving extended attributes is valid:
	 * we only support hard linking of an attribute in an ATTRDIR
	 * to another ATTRDIR.
	 *
	 * Make certain we don't attempt to look at a device node as
	 * a ufs inode.
	 */
3050*7c478bd9Sstevel@tonic-gate 
3051*7c478bd9Sstevel@tonic-gate 	isdev = IS_DEVVP(svp);
3052*7c478bd9Sstevel@tonic-gate 	if (((isdev == 0) && ((VTOI(svp)->i_cflags & IXATTR) == 0) &&
3053*7c478bd9Sstevel@tonic-gate 	    ((tdp->i_mode & IFMT) == IFATTRDIR)) ||
3054*7c478bd9Sstevel@tonic-gate 	    ((isdev == 0) && (VTOI(svp)->i_cflags & IXATTR) &&
3055*7c478bd9Sstevel@tonic-gate 	    ((tdp->i_mode & IFMT) == IFDIR))) {
3056*7c478bd9Sstevel@tonic-gate 		error = EINVAL;
3057*7c478bd9Sstevel@tonic-gate 		goto unlock;
3058*7c478bd9Sstevel@tonic-gate 	}
3059*7c478bd9Sstevel@tonic-gate 
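	/*
	 * Linking a directory requires secpolicy_fs_linkdir() privilege;
	 * linking a file owned by someone else requires
	 * secpolicy_basic_link().
	 */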
3060*7c478bd9Sstevel@tonic-gate 	sip = VTOI(svp);
3061*7c478bd9Sstevel@tonic-gate 	if ((svp->v_type == VDIR &&
3062*7c478bd9Sstevel@tonic-gate 	    secpolicy_fs_linkdir(cr, ufsvfsp->vfs_vfs) != 0) ||
3063*7c478bd9Sstevel@tonic-gate 	    (sip->i_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0)) {
3064*7c478bd9Sstevel@tonic-gate 		error = EPERM;
3065*7c478bd9Sstevel@tonic-gate 		goto unlock;
3066*7c478bd9Sstevel@tonic-gate 	}
3067*7c478bd9Sstevel@tonic-gate 	rw_enter(&tdp->i_rwlock, RW_WRITER);
3068*7c478bd9Sstevel@tonic-gate 	error = ufs_direnter_lr(tdp, tnm, DE_LINK, (struct inode *)0,
3069*7c478bd9Sstevel@tonic-gate 	    sip, cr, NULL);
3070*7c478bd9Sstevel@tonic-gate 	rw_exit(&tdp->i_rwlock);
3071*7c478bd9Sstevel@tonic-gate 
3072*7c478bd9Sstevel@tonic-gate unlock:
3073*7c478bd9Sstevel@tonic-gate 	if (ulp) {
3074*7c478bd9Sstevel@tonic-gate 		TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_LINK, trans_size);
3075*7c478bd9Sstevel@tonic-gate 		ufs_lockfs_end(ulp);
3076*7c478bd9Sstevel@tonic-gate 	}
3077*7c478bd9Sstevel@tonic-gate out:
3078*7c478bd9Sstevel@tonic-gate 	TRACE_2(TR_FAC_UFS, TR_UFS_LINK_END,
3079*7c478bd9Sstevel@tonic-gate 		"ufs_link_end:tdvp %p error %d", tdvp, error);
3080*7c478bd9Sstevel@tonic-gate 	return (error);
3081*7c478bd9Sstevel@tonic-gate }
3082*7c478bd9Sstevel@tonic-gate 
3083*7c478bd9Sstevel@tonic-gate uint64_t ufs_rename_retry_cnt;
3084*7c478bd9Sstevel@tonic-gate uint64_t ufs_rename_upgrade_retry_cnt;
3085*7c478bd9Sstevel@tonic-gate uint64_t ufs_rename_dircheck_retry_cnt;
3086*7c478bd9Sstevel@tonic-gate clock_t	 ufs_rename_backoff_delay = 1;
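/*
 * The counters above record how often ufs_rename() had to retry: because
 * the initial tryenter on the second directory lock failed, because a
 * reader-to-writer lock upgrade failed, or because ufs_dircheckpath()
 * backed out with EAGAIN.  ufs_rename_backoff_delay is the delay()
 * interval, in clock ticks, applied before the dircheck and upgrade
 * retries.
 */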
3087*7c478bd9Sstevel@tonic-gate 
3088*7c478bd9Sstevel@tonic-gate /*
3089*7c478bd9Sstevel@tonic-gate  * Rename a file or directory.
3090*7c478bd9Sstevel@tonic-gate  * We are given the vnode and entry string of the source and the
3091*7c478bd9Sstevel@tonic-gate  * vnode and entry string of the place we want to move the source
3092*7c478bd9Sstevel@tonic-gate  * to (the target). The essential operation is:
3093*7c478bd9Sstevel@tonic-gate  *	unlink(target);
3094*7c478bd9Sstevel@tonic-gate  *	link(source, target);
3095*7c478bd9Sstevel@tonic-gate  *	unlink(source);
3096*7c478bd9Sstevel@tonic-gate  * but "atomically".  Can't do full commit without saving state in
3097*7c478bd9Sstevel@tonic-gate  * the inode on disk, which isn't feasible at this time.  Best we
3098*7c478bd9Sstevel@tonic-gate  * can do is always guarantee that the TARGET exists.
3099*7c478bd9Sstevel@tonic-gate  */
3100*7c478bd9Sstevel@tonic-gate /*ARGSUSED*/
3101*7c478bd9Sstevel@tonic-gate static int
3102*7c478bd9Sstevel@tonic-gate ufs_rename(
3103*7c478bd9Sstevel@tonic-gate 	struct vnode *sdvp,		/* old (source) parent vnode */
3104*7c478bd9Sstevel@tonic-gate 	char *snm,			/* old (source) entry name */
3105*7c478bd9Sstevel@tonic-gate 	struct vnode *tdvp,		/* new (target) parent vnode */
3106*7c478bd9Sstevel@tonic-gate 	char *tnm,			/* new (target) entry name */
3107*7c478bd9Sstevel@tonic-gate 	struct cred *cr)
3108*7c478bd9Sstevel@tonic-gate {
3109*7c478bd9Sstevel@tonic-gate 	struct inode *sip = NULL;	/* source inode */
3110*7c478bd9Sstevel@tonic-gate 	struct inode *sdp;		/* old (source) parent inode */
3111*7c478bd9Sstevel@tonic-gate 	struct inode *tdp;		/* new (target) parent inode */
3112*7c478bd9Sstevel@tonic-gate 	struct vnode *tvp = NULL;	/* target vnode, if it exists */
3113*7c478bd9Sstevel@tonic-gate 	struct vnode *realvp;
3114*7c478bd9Sstevel@tonic-gate 	struct ufsvfs *ufsvfsp;
3115*7c478bd9Sstevel@tonic-gate 	struct ulockfs *ulp;
3116*7c478bd9Sstevel@tonic-gate 	int error;
3117*7c478bd9Sstevel@tonic-gate 	int issync;
3118*7c478bd9Sstevel@tonic-gate 	int trans_size;
3119*7c478bd9Sstevel@tonic-gate 
3120*7c478bd9Sstevel@tonic-gate 	TRACE_1(TR_FAC_UFS, TR_UFS_RENAME_START,
3121*7c478bd9Sstevel@tonic-gate 		"ufs_rename_start:sdvp %p", sdvp);
3122*7c478bd9Sstevel@tonic-gate 
3123*7c478bd9Sstevel@tonic-gate 
3124*7c478bd9Sstevel@tonic-gate 	sdp = VTOI(sdvp);
3125*7c478bd9Sstevel@tonic-gate 	ufsvfsp = sdp->i_ufsvfs;
3126*7c478bd9Sstevel@tonic-gate 	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_RENAME_MASK);
3127*7c478bd9Sstevel@tonic-gate 	if (error)
3128*7c478bd9Sstevel@tonic-gate 		goto out;
3129*7c478bd9Sstevel@tonic-gate 
3130*7c478bd9Sstevel@tonic-gate 	if (ulp)
3131*7c478bd9Sstevel@tonic-gate 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_RENAME,
3132*7c478bd9Sstevel@tonic-gate 		    trans_size = (int)TOP_RENAME_SIZE(sdp));
3133*7c478bd9Sstevel@tonic-gate 
3134*7c478bd9Sstevel@tonic-gate 	if (VOP_REALVP(tdvp, &realvp) == 0)
3135*7c478bd9Sstevel@tonic-gate 		tdvp = realvp;
3136*7c478bd9Sstevel@tonic-gate 
3137*7c478bd9Sstevel@tonic-gate 	tdp = VTOI(tdvp);
3138*7c478bd9Sstevel@tonic-gate 
3139*7c478bd9Sstevel@tonic-gate 	/*
3140*7c478bd9Sstevel@tonic-gate 	 * We only allow renaming of attributes from ATTRDIR to ATTRDIR.
3141*7c478bd9Sstevel@tonic-gate 	 */
3142*7c478bd9Sstevel@tonic-gate 	if ((tdp->i_mode & IFMT) != (sdp->i_mode & IFMT)) {
3143*7c478bd9Sstevel@tonic-gate 		error = EINVAL;
3144*7c478bd9Sstevel@tonic-gate 		goto unlock;
3145*7c478bd9Sstevel@tonic-gate 	}
3146*7c478bd9Sstevel@tonic-gate 
3147*7c478bd9Sstevel@tonic-gate 	/*
3148*7c478bd9Sstevel@tonic-gate 	 * Look up inode of file we're supposed to rename.
3149*7c478bd9Sstevel@tonic-gate 	 */
3150*7c478bd9Sstevel@tonic-gate 	if (error = ufs_dirlook(sdp, snm, &sip, cr, 0)) {
3151*7c478bd9Sstevel@tonic-gate 		goto unlock;
3152*7c478bd9Sstevel@tonic-gate 	}
3153*7c478bd9Sstevel@tonic-gate 
	/*
	 * Lock both the source and target directories (they may be
	 * the same) to provide the atomicity semantics that were
	 * previously provided by the per-file-system vfs_rename_lock.
	 *
	 * With vfs_rename_lock removed to allow simultaneous renames
	 * within a file system, ufs_dircheckpath can deadlock while
	 * traversing back to ensure that the source is not a parent
	 * directory of the target's parent directory.  This is because
	 * we enter ufs_dircheckpath with the sdp and tdp locks held as
	 * RW_WRITER.  If the tdp and sdp of two simultaneous renames
	 * happen to be in each other's paths, that can lead to a
	 * deadlock.  This is avoided by taking the locks as RW_READER
	 * here and upgrading to RW_WRITER once ufs_dircheckpath is done.
	 */
3169*7c478bd9Sstevel@tonic-gate retry:
3170*7c478bd9Sstevel@tonic-gate 	rw_enter(&tdp->i_rwlock, RW_READER);
3171*7c478bd9Sstevel@tonic-gate 	if (tdp != sdp) {
		/*
		 * We're taking two peer-level locks, so we must use
		 * tryenter on the second to avoid the deadlocks that
		 * would occur if we renamed a->b and b->a concurrently.
		 */
3177*7c478bd9Sstevel@tonic-gate 		if (!rw_tryenter(&sdp->i_rwlock, RW_READER)) {
3178*7c478bd9Sstevel@tonic-gate 			/*
3179*7c478bd9Sstevel@tonic-gate 			 * Reverse the lock grabs in case we have heavy
3180*7c478bd9Sstevel@tonic-gate 			 * contention on the 2nd lock.
3181*7c478bd9Sstevel@tonic-gate 			 */
3182*7c478bd9Sstevel@tonic-gate 			rw_exit(&tdp->i_rwlock);
3183*7c478bd9Sstevel@tonic-gate 			rw_enter(&sdp->i_rwlock, RW_READER);
3184*7c478bd9Sstevel@tonic-gate 			if (!rw_tryenter(&tdp->i_rwlock, RW_READER)) {
3185*7c478bd9Sstevel@tonic-gate 				ufs_rename_retry_cnt++;
3186*7c478bd9Sstevel@tonic-gate 				rw_exit(&sdp->i_rwlock);
3187*7c478bd9Sstevel@tonic-gate 				goto retry;
3188*7c478bd9Sstevel@tonic-gate 			}
3189*7c478bd9Sstevel@tonic-gate 		}
3190*7c478bd9Sstevel@tonic-gate 	}
3191*7c478bd9Sstevel@tonic-gate 
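	/*
	 * A source that is the same inode as the target's parent
	 * directory (sip == tdp) would be a directory being moved
	 * underneath itself; reject that outright.
	 */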
3192*7c478bd9Sstevel@tonic-gate 	if (sip == tdp) {
3193*7c478bd9Sstevel@tonic-gate 		error = EINVAL;
3194*7c478bd9Sstevel@tonic-gate 		goto errout;
3195*7c478bd9Sstevel@tonic-gate 	}
3196*7c478bd9Sstevel@tonic-gate 	/*
3197*7c478bd9Sstevel@tonic-gate 	 * Make sure we can delete the source entry.  This requires
3198*7c478bd9Sstevel@tonic-gate 	 * write permission on the containing directory.
3199*7c478bd9Sstevel@tonic-gate 	 * Check for sticky directories.
3200*7c478bd9Sstevel@tonic-gate 	 */
3201*7c478bd9Sstevel@tonic-gate 	rw_enter(&sdp->i_contents, RW_READER);
3202*7c478bd9Sstevel@tonic-gate 	rw_enter(&sip->i_contents, RW_READER);
3203*7c478bd9Sstevel@tonic-gate 	if ((error = ufs_iaccess(sdp, IWRITE, cr)) != 0 ||
3204*7c478bd9Sstevel@tonic-gate 	    (error = ufs_sticky_remove_access(sdp, sip, cr)) != 0) {
3205*7c478bd9Sstevel@tonic-gate 		rw_exit(&sip->i_contents);
3206*7c478bd9Sstevel@tonic-gate 		rw_exit(&sdp->i_contents);
3207*7c478bd9Sstevel@tonic-gate 		goto errout;
3208*7c478bd9Sstevel@tonic-gate 	}
3209*7c478bd9Sstevel@tonic-gate 
3210*7c478bd9Sstevel@tonic-gate 	/*
3211*7c478bd9Sstevel@tonic-gate 	 * If this is a rename of a directory and the parent is
3212*7c478bd9Sstevel@tonic-gate 	 * different (".." must be changed), then the source
3213*7c478bd9Sstevel@tonic-gate 	 * directory must not be in the directory hierarchy
3214*7c478bd9Sstevel@tonic-gate 	 * above the target, as this would orphan everything
3215*7c478bd9Sstevel@tonic-gate 	 * below the source directory.  Also the user must have
3216*7c478bd9Sstevel@tonic-gate 	 * write permission in the source so as to be able to
3217*7c478bd9Sstevel@tonic-gate 	 * change "..".
3218*7c478bd9Sstevel@tonic-gate 	 */
3219*7c478bd9Sstevel@tonic-gate 	if ((((sip->i_mode & IFMT) == IFDIR) ||
3220*7c478bd9Sstevel@tonic-gate 	    ((sip->i_mode & IFMT) == IFATTRDIR)) && sdp != tdp) {
3221*7c478bd9Sstevel@tonic-gate 		ino_t	inum;
3222*7c478bd9Sstevel@tonic-gate 
3223*7c478bd9Sstevel@tonic-gate 		if ((error = ufs_iaccess(sip, IWRITE, cr))) {
3224*7c478bd9Sstevel@tonic-gate 			rw_exit(&sip->i_contents);
3225*7c478bd9Sstevel@tonic-gate 			rw_exit(&sdp->i_contents);
3226*7c478bd9Sstevel@tonic-gate 			goto errout;
3227*7c478bd9Sstevel@tonic-gate 		}
3228*7c478bd9Sstevel@tonic-gate 		inum = sip->i_number;
3229*7c478bd9Sstevel@tonic-gate 		rw_exit(&sip->i_contents);
3230*7c478bd9Sstevel@tonic-gate 		rw_exit(&sdp->i_contents);
3231*7c478bd9Sstevel@tonic-gate 		if ((error = ufs_dircheckpath(inum, tdp, sdp, cr))) {
3232*7c478bd9Sstevel@tonic-gate 			/*
3233*7c478bd9Sstevel@tonic-gate 			 * If we got EAGAIN ufs_dircheckpath detected a
3234*7c478bd9Sstevel@tonic-gate 			 * potential deadlock and backed out. We need
3235*7c478bd9Sstevel@tonic-gate 			 * to retry the operation since sdp and tdp have
3236*7c478bd9Sstevel@tonic-gate 			 * to be released to avoid the deadlock.
3237*7c478bd9Sstevel@tonic-gate 			 */
3238*7c478bd9Sstevel@tonic-gate 			if (error == EAGAIN) {
3239*7c478bd9Sstevel@tonic-gate 				rw_exit(&tdp->i_rwlock);
3240*7c478bd9Sstevel@tonic-gate 				if (tdp != sdp)
3241*7c478bd9Sstevel@tonic-gate 					rw_exit(&sdp->i_rwlock);
3242*7c478bd9Sstevel@tonic-gate 				delay(ufs_rename_backoff_delay);
3243*7c478bd9Sstevel@tonic-gate 				ufs_rename_dircheck_retry_cnt++;
3244*7c478bd9Sstevel@tonic-gate 				goto retry;
3245*7c478bd9Sstevel@tonic-gate 			}
3246*7c478bd9Sstevel@tonic-gate 			goto errout;
3247*7c478bd9Sstevel@tonic-gate 		}
3248*7c478bd9Sstevel@tonic-gate 	} else {
3249*7c478bd9Sstevel@tonic-gate 		rw_exit(&sip->i_contents);
3250*7c478bd9Sstevel@tonic-gate 		rw_exit(&sdp->i_contents);
3251*7c478bd9Sstevel@tonic-gate 	}
3252*7c478bd9Sstevel@tonic-gate 
3253*7c478bd9Sstevel@tonic-gate 
3254*7c478bd9Sstevel@tonic-gate 	/*
3255*7c478bd9Sstevel@tonic-gate 	 * Check for renaming '.' or '..' or alias of '.'
3256*7c478bd9Sstevel@tonic-gate 	 */
3257*7c478bd9Sstevel@tonic-gate 	if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0 || sdp == sip) {
3258*7c478bd9Sstevel@tonic-gate 		error = EINVAL;
3259*7c478bd9Sstevel@tonic-gate 		goto errout;
3260*7c478bd9Sstevel@tonic-gate 	}
3261*7c478bd9Sstevel@tonic-gate 
	/*
	 * Simultaneous renames can deadlock in ufs_dircheckpath since it
	 * tries to traverse back up the file tree with both tdp and sdp
	 * held as RW_WRITER.  To avoid that, we hold the tdp and sdp
	 * locks as RW_READER until ufs_dircheckpath is done.
	 * Now that ufs_dircheckpath has completed, we can upgrade the
	 * locks to RW_WRITER.
	 */
3270*7c478bd9Sstevel@tonic-gate 	if (!rw_tryupgrade(&tdp->i_rwlock)) {
		/*
		 * The upgrade failed.  We have to give away the lock
		 * to avoid deadlocking with someone else who is
		 * waiting for the writer lock.  With the lock gone, we
		 * cannot be sure the checks done above will still
		 * hold when we eventually get it back as writer.
		 * So if we can't upgrade, we drop the locks and retry
		 * everything again.
		 */
3280*7c478bd9Sstevel@tonic-gate 		rw_exit(&tdp->i_rwlock);
3281*7c478bd9Sstevel@tonic-gate 		if (tdp != sdp)
3282*7c478bd9Sstevel@tonic-gate 			rw_exit(&sdp->i_rwlock);
3283*7c478bd9Sstevel@tonic-gate 		delay(ufs_rename_backoff_delay);
3284*7c478bd9Sstevel@tonic-gate 		ufs_rename_upgrade_retry_cnt++;
3285*7c478bd9Sstevel@tonic-gate 		goto retry;
3286*7c478bd9Sstevel@tonic-gate 	}
3287*7c478bd9Sstevel@tonic-gate 	if (tdp != sdp) {
3288*7c478bd9Sstevel@tonic-gate 		if (!rw_tryupgrade(&sdp->i_rwlock)) {
			/*
			 * The upgrade failed.  We have to give away the
			 * lock to avoid deadlocking with someone else
			 * who is waiting for the writer lock.  With the
			 * lock gone, we cannot be sure the checks done
			 * above will still hold when we eventually get
			 * it back as writer.  So if we can't upgrade,
			 * we drop the locks and retry everything again.
			 */
3298*7c478bd9Sstevel@tonic-gate 			rw_exit(&tdp->i_rwlock);
3299*7c478bd9Sstevel@tonic-gate 			rw_exit(&sdp->i_rwlock);
3300*7c478bd9Sstevel@tonic-gate 			delay(ufs_rename_backoff_delay);
3301*7c478bd9Sstevel@tonic-gate 			ufs_rename_upgrade_retry_cnt++;
3302*7c478bd9Sstevel@tonic-gate 			goto retry;
3303*7c478bd9Sstevel@tonic-gate 		}
3304*7c478bd9Sstevel@tonic-gate 	}
3305*7c478bd9Sstevel@tonic-gate 	/*
3306*7c478bd9Sstevel@tonic-gate 	 * Link source to the target.  If a target exists, return its
3307*7c478bd9Sstevel@tonic-gate 	 * vnode pointer in tvp.  We'll release it after sending the
3308*7c478bd9Sstevel@tonic-gate 	 * vnevent.
3309*7c478bd9Sstevel@tonic-gate 	 */
3310*7c478bd9Sstevel@tonic-gate 	if (error = ufs_direnter_lr(tdp, tnm, DE_RENAME, sdp, sip, cr, &tvp)) {
3311*7c478bd9Sstevel@tonic-gate 		/*
3312*7c478bd9Sstevel@tonic-gate 		 * ESAME isn't really an error; it indicates that the
3313*7c478bd9Sstevel@tonic-gate 		 * operation should not be done because the source and target
3314*7c478bd9Sstevel@tonic-gate 		 * are the same file, but that no error should be reported.
3315*7c478bd9Sstevel@tonic-gate 		 */
3316*7c478bd9Sstevel@tonic-gate 		if (error == ESAME)
3317*7c478bd9Sstevel@tonic-gate 			error = 0;
3318*7c478bd9Sstevel@tonic-gate 		goto errout;
3319*7c478bd9Sstevel@tonic-gate 	}
3320*7c478bd9Sstevel@tonic-gate 
3321*7c478bd9Sstevel@tonic-gate 	/*
3322*7c478bd9Sstevel@tonic-gate 	 * Unlink the source.
3323*7c478bd9Sstevel@tonic-gate 	 * Remove the source entry.  ufs_dirremove() checks that the entry
3324*7c478bd9Sstevel@tonic-gate 	 * still reflects sip, and returns an error if it doesn't.
3325*7c478bd9Sstevel@tonic-gate 	 * If the entry has changed just forget about it.  Release
3326*7c478bd9Sstevel@tonic-gate 	 * the source inode.
3327*7c478bd9Sstevel@tonic-gate 	 */
3328*7c478bd9Sstevel@tonic-gate 	if ((error = ufs_dirremove(sdp, snm, sip, (struct vnode *)0,
3329*7c478bd9Sstevel@tonic-gate 	    DR_RENAME, cr, NULL)) == ENOENT)
3330*7c478bd9Sstevel@tonic-gate 		error = 0;
3331*7c478bd9Sstevel@tonic-gate 
3332*7c478bd9Sstevel@tonic-gate errout:
3333*7c478bd9Sstevel@tonic-gate 	rw_exit(&tdp->i_rwlock);
3334*7c478bd9Sstevel@tonic-gate 	if (sdp != tdp) {
3335*7c478bd9Sstevel@tonic-gate 		rw_exit(&sdp->i_rwlock);
3336*7c478bd9Sstevel@tonic-gate 	}
3337*7c478bd9Sstevel@tonic-gate 
3338*7c478bd9Sstevel@tonic-gate unlock:
3339*7c478bd9Sstevel@tonic-gate 	if (ulp) {
3340*7c478bd9Sstevel@tonic-gate 		TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RENAME, trans_size);
3341*7c478bd9Sstevel@tonic-gate 		ufs_lockfs_end(ulp);
3342*7c478bd9Sstevel@tonic-gate 	}
3343*7c478bd9Sstevel@tonic-gate 
3344*7c478bd9Sstevel@tonic-gate 	/*
3345*7c478bd9Sstevel@tonic-gate 	 * If no errors, send the appropriate events on the source
	 * and destination (a.k.a., the target) vnodes, if they exist.
3347*7c478bd9Sstevel@tonic-gate 	 * This has to be done after the rename transaction has closed.
3348*7c478bd9Sstevel@tonic-gate 	 */
3349*7c478bd9Sstevel@tonic-gate 	if (error == 0) {
3350*7c478bd9Sstevel@tonic-gate 		if (tvp != NULL)
3351*7c478bd9Sstevel@tonic-gate 			vnevent_rename_dest(tvp);
3352*7c478bd9Sstevel@tonic-gate 		/*
3353*7c478bd9Sstevel@tonic-gate 		 * Note that if ufs_direnter_lr() returned ESAME then
3354*7c478bd9Sstevel@tonic-gate 		 * this event will still be sent.  This isn't expected
3355*7c478bd9Sstevel@tonic-gate 		 * to be a problem for anticipated usage by consumers.
3356*7c478bd9Sstevel@tonic-gate 		 */
3357*7c478bd9Sstevel@tonic-gate 		if (sip != NULL)
3358*7c478bd9Sstevel@tonic-gate 			vnevent_rename_src(ITOV(sip));
3359*7c478bd9Sstevel@tonic-gate 	}
3360*7c478bd9Sstevel@tonic-gate 
3361*7c478bd9Sstevel@tonic-gate 	if (tvp != NULL)
3362*7c478bd9Sstevel@tonic-gate 		VN_RELE(tvp);
3363*7c478bd9Sstevel@tonic-gate 
3364*7c478bd9Sstevel@tonic-gate 	if (sip != NULL)
3365*7c478bd9Sstevel@tonic-gate 		VN_RELE(ITOV(sip));
3366*7c478bd9Sstevel@tonic-gate 
3367*7c478bd9Sstevel@tonic-gate out:
3368*7c478bd9Sstevel@tonic-gate 	TRACE_5(TR_FAC_UFS, TR_UFS_RENAME_END,
3369*7c478bd9Sstevel@tonic-gate 		"ufs_rename_end:sdvp %p snm %s tdvp %p tnm %s error %d",
3370*7c478bd9Sstevel@tonic-gate 			sdvp, snm, tdvp, tnm, error);
3371*7c478bd9Sstevel@tonic-gate 	return (error);
3372*7c478bd9Sstevel@tonic-gate }
3373*7c478bd9Sstevel@tonic-gate 
3374*7c478bd9Sstevel@tonic-gate /*ARGSUSED*/
3375*7c478bd9Sstevel@tonic-gate static int
3376*7c478bd9Sstevel@tonic-gate ufs_mkdir(struct vnode *dvp, char *dirname, struct vattr *vap,
3377*7c478bd9Sstevel@tonic-gate 	struct vnode **vpp, struct cred *cr)
3378*7c478bd9Sstevel@tonic-gate {
3379*7c478bd9Sstevel@tonic-gate 	struct inode *ip;
3380*7c478bd9Sstevel@tonic-gate 	struct inode *xip;
3381*7c478bd9Sstevel@tonic-gate 	struct ufsvfs *ufsvfsp;
3382*7c478bd9Sstevel@tonic-gate 	struct ulockfs *ulp;
3383*7c478bd9Sstevel@tonic-gate 	int error;
3384*7c478bd9Sstevel@tonic-gate 	int issync;
3385*7c478bd9Sstevel@tonic-gate 	int trans_size;
3386*7c478bd9Sstevel@tonic-gate 	int retry = 1;
3387*7c478bd9Sstevel@tonic-gate 
3388*7c478bd9Sstevel@tonic-gate 	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
3389*7c478bd9Sstevel@tonic-gate 
3390*7c478bd9Sstevel@tonic-gate 	TRACE_1(TR_FAC_UFS, TR_UFS_MKDIR_START,
3391*7c478bd9Sstevel@tonic-gate 		"ufs_mkdir_start:dvp %p", dvp);
3392*7c478bd9Sstevel@tonic-gate 
3393*7c478bd9Sstevel@tonic-gate 	/*
	 * Can't make a directory in an attribute (hidden) directory.
3395*7c478bd9Sstevel@tonic-gate 	 */
3396*7c478bd9Sstevel@tonic-gate 	if ((VTOI(dvp)->i_mode & IFMT) == IFATTRDIR)
3397*7c478bd9Sstevel@tonic-gate 		return (EINVAL);
3398*7c478bd9Sstevel@tonic-gate 
3399*7c478bd9Sstevel@tonic-gate again:
3400*7c478bd9Sstevel@tonic-gate 	ip = VTOI(dvp);
3401*7c478bd9Sstevel@tonic-gate 	ufsvfsp = ip->i_ufsvfs;
3402*7c478bd9Sstevel@tonic-gate 	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MKDIR_MASK);
3403*7c478bd9Sstevel@tonic-gate 	if (error)
3404*7c478bd9Sstevel@tonic-gate 		goto out;
3405*7c478bd9Sstevel@tonic-gate 	if (ulp)
3406*7c478bd9Sstevel@tonic-gate 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_MKDIR,
3407*7c478bd9Sstevel@tonic-gate 		    trans_size = (int)TOP_MKDIR_SIZE(ip));
3408*7c478bd9Sstevel@tonic-gate 
3409*7c478bd9Sstevel@tonic-gate 	rw_enter(&ip->i_rwlock, RW_WRITER);
3410*7c478bd9Sstevel@tonic-gate 
3411*7c478bd9Sstevel@tonic-gate 	error = ufs_direnter_cm(ip, dirname, DE_MKDIR, vap, &xip, cr,
3412*7c478bd9Sstevel@tonic-gate 		(retry ? IQUIET : 0));
3413*7c478bd9Sstevel@tonic-gate 
3414*7c478bd9Sstevel@tonic-gate 	rw_exit(&ip->i_rwlock);
3415*7c478bd9Sstevel@tonic-gate 	if (error == 0) {
3416*7c478bd9Sstevel@tonic-gate 		ip = xip;
3417*7c478bd9Sstevel@tonic-gate 		*vpp = ITOV(ip);
3418*7c478bd9Sstevel@tonic-gate 	} else if (error == EEXIST)
3419*7c478bd9Sstevel@tonic-gate 		VN_RELE(ITOV(xip));
3420*7c478bd9Sstevel@tonic-gate 
3421*7c478bd9Sstevel@tonic-gate 	if (ulp) {
3422*7c478bd9Sstevel@tonic-gate 		int terr = 0;
3423*7c478bd9Sstevel@tonic-gate 		TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_MKDIR, trans_size);
3424*7c478bd9Sstevel@tonic-gate 		ufs_lockfs_end(ulp);
3425*7c478bd9Sstevel@tonic-gate 		if (error == 0)
3426*7c478bd9Sstevel@tonic-gate 			error = terr;
3427*7c478bd9Sstevel@tonic-gate 	}
3428*7c478bd9Sstevel@tonic-gate out:
3429*7c478bd9Sstevel@tonic-gate 	if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
3430*7c478bd9Sstevel@tonic-gate 		ufs_delete_drain_wait(ufsvfsp, 1);
3431*7c478bd9Sstevel@tonic-gate 		retry = 0;
3432*7c478bd9Sstevel@tonic-gate 		goto again;
3433*7c478bd9Sstevel@tonic-gate 	}
3434*7c478bd9Sstevel@tonic-gate 
3435*7c478bd9Sstevel@tonic-gate 	TRACE_2(TR_FAC_UFS, TR_UFS_MKDIR_END,
3436*7c478bd9Sstevel@tonic-gate 		"ufs_mkdir_end:dvp %p error %d", dvp, error);
3437*7c478bd9Sstevel@tonic-gate 	return (error);
3438*7c478bd9Sstevel@tonic-gate }
3439*7c478bd9Sstevel@tonic-gate 
3440*7c478bd9Sstevel@tonic-gate /*ARGSUSED*/
3441*7c478bd9Sstevel@tonic-gate static int
3442*7c478bd9Sstevel@tonic-gate ufs_rmdir(struct vnode *vp, char *nm, struct vnode *cdir, struct cred *cr)
3443*7c478bd9Sstevel@tonic-gate {
3444*7c478bd9Sstevel@tonic-gate 	struct inode *ip = VTOI(vp);
3445*7c478bd9Sstevel@tonic-gate 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
3446*7c478bd9Sstevel@tonic-gate 	struct ulockfs *ulp;
3447*7c478bd9Sstevel@tonic-gate 	vnode_t *rmvp = NULL;	/* Vnode of removed directory */
3448*7c478bd9Sstevel@tonic-gate 	int error;
3449*7c478bd9Sstevel@tonic-gate 	int issync;
3450*7c478bd9Sstevel@tonic-gate 
3451*7c478bd9Sstevel@tonic-gate 	TRACE_1(TR_FAC_UFS, TR_UFS_RMDIR_START,
3452*7c478bd9Sstevel@tonic-gate 		"ufs_rmdir_start:vp %p", vp);
3453*7c478bd9Sstevel@tonic-gate 
	/*
	 * Fail if the file system has been forcibly unmounted;
	 * otherwise, don't let the delete queue get too long.
	 */
3457*7c478bd9Sstevel@tonic-gate 	if (ufsvfsp == NULL) {
3458*7c478bd9Sstevel@tonic-gate 		error = EIO;
3459*7c478bd9Sstevel@tonic-gate 		goto out;
3460*7c478bd9Sstevel@tonic-gate 	}
3461*7c478bd9Sstevel@tonic-gate 	if (ufsvfsp->vfs_delete.uq_ne > ufs_idle_max)
3462*7c478bd9Sstevel@tonic-gate 		ufs_delete_drain(vp->v_vfsp, 1, 1);
3463*7c478bd9Sstevel@tonic-gate 
3464*7c478bd9Sstevel@tonic-gate 	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_RMDIR_MASK);
3465*7c478bd9Sstevel@tonic-gate 	if (error)
3466*7c478bd9Sstevel@tonic-gate 		goto out;
3467*7c478bd9Sstevel@tonic-gate 
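	/*
	 * Unlike remove, link, and rename, which size their transactions
	 * from the inode involved, rmdir logs a fixed-size TOP_RMDIR_SIZE
	 * transaction.
	 */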
3468*7c478bd9Sstevel@tonic-gate 	if (ulp)
3469*7c478bd9Sstevel@tonic-gate 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_RMDIR, TOP_RMDIR_SIZE);
3470*7c478bd9Sstevel@tonic-gate 
3471*7c478bd9Sstevel@tonic-gate 	rw_enter(&ip->i_rwlock, RW_WRITER);
3472*7c478bd9Sstevel@tonic-gate 	error = ufs_dirremove(ip, nm, (struct inode *)0, cdir, DR_RMDIR, cr,
	    &rmvp);
3474*7c478bd9Sstevel@tonic-gate 	rw_exit(&ip->i_rwlock);
3475*7c478bd9Sstevel@tonic-gate 
3476*7c478bd9Sstevel@tonic-gate 	if (ulp) {
3477*7c478bd9Sstevel@tonic-gate 		TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RMDIR,
		    TOP_RMDIR_SIZE);
3479*7c478bd9Sstevel@tonic-gate 		ufs_lockfs_end(ulp);
3480*7c478bd9Sstevel@tonic-gate 	}
3481*7c478bd9Sstevel@tonic-gate 
3482*7c478bd9Sstevel@tonic-gate 	/*
3483*7c478bd9Sstevel@tonic-gate 	 * This must be done AFTER the rmdir transaction has closed.
3484*7c478bd9Sstevel@tonic-gate 	 */
3485*7c478bd9Sstevel@tonic-gate 	if (rmvp != NULL) {
3486*7c478bd9Sstevel@tonic-gate 		/* Only send the event if there were no errors */
3487*7c478bd9Sstevel@tonic-gate 		if (error == 0)
3488*7c478bd9Sstevel@tonic-gate 			vnevent_rmdir(rmvp);
3489*7c478bd9Sstevel@tonic-gate 		VN_RELE(rmvp);
3490*7c478bd9Sstevel@tonic-gate 	}
3491*7c478bd9Sstevel@tonic-gate out:
3492*7c478bd9Sstevel@tonic-gate 	TRACE_2(TR_FAC_UFS, TR_UFS_RMDIR_END,
3493*7c478bd9Sstevel@tonic-gate 		"ufs_rmdir_end:vp %p error %d", vp, error);
3494*7c478bd9Sstevel@tonic-gate 
3495*7c478bd9Sstevel@tonic-gate 	return (error);
3496*7c478bd9Sstevel@tonic-gate }
3497*7c478bd9Sstevel@tonic-gate 
3498*7c478bd9Sstevel@tonic-gate /* ARGSUSED */
3499*7c478bd9Sstevel@tonic-gate static int
3500*7c478bd9Sstevel@tonic-gate ufs_readdir(
3501*7c478bd9Sstevel@tonic-gate 	struct vnode *vp,
3502*7c478bd9Sstevel@tonic-gate 	struct uio *uiop,
3503*7c478bd9Sstevel@tonic-gate 	struct cred *cr,
3504*7c478bd9Sstevel@tonic-gate 	int *eofp)
3505*7c478bd9Sstevel@tonic-gate {
3506*7c478bd9Sstevel@tonic-gate 	struct iovec *iovp;
3507*7c478bd9Sstevel@tonic-gate 	struct inode *ip;
3508*7c478bd9Sstevel@tonic-gate 	struct direct *idp;
3509*7c478bd9Sstevel@tonic-gate 	struct dirent64 *odp;
3510*7c478bd9Sstevel@tonic-gate 	struct fbuf *fbp;
3511*7c478bd9Sstevel@tonic-gate 	struct ufsvfs *ufsvfsp;
3512*7c478bd9Sstevel@tonic-gate 	struct ulockfs *ulp;
3513*7c478bd9Sstevel@tonic-gate 	caddr_t outbuf;
3514*7c478bd9Sstevel@tonic-gate 	size_t bufsize;
3515*7c478bd9Sstevel@tonic-gate 	uint_t offset;
3516*7c478bd9Sstevel@tonic-gate 	uint_t bytes_wanted, total_bytes_wanted;
3517*7c478bd9Sstevel@tonic-gate 	int incount = 0;
3518*7c478bd9Sstevel@tonic-gate 	int outcount = 0;
3519*7c478bd9Sstevel@tonic-gate 	int error;
3520*7c478bd9Sstevel@tonic-gate 
3521*7c478bd9Sstevel@tonic-gate 	ip = VTOI(vp);
3522*7c478bd9Sstevel@tonic-gate 	ASSERT(RW_READ_HELD(&ip->i_rwlock));
3523*7c478bd9Sstevel@tonic-gate 
3524*7c478bd9Sstevel@tonic-gate 	TRACE_2(TR_FAC_UFS, TR_UFS_READDIR_START,
3525*7c478bd9Sstevel@tonic-gate 		"ufs_readdir_start:vp %p uiop %p", vp, uiop);
3526*7c478bd9Sstevel@tonic-gate 
3527*7c478bd9Sstevel@tonic-gate 	if (uiop->uio_loffset >= MAXOFF32_T) {
3528*7c478bd9Sstevel@tonic-gate 		if (eofp)
3529*7c478bd9Sstevel@tonic-gate 			*eofp = 1;
3530*7c478bd9Sstevel@tonic-gate 		return (0);
3531*7c478bd9Sstevel@tonic-gate 	}
3532*7c478bd9Sstevel@tonic-gate 
3533*7c478bd9Sstevel@tonic-gate 	/*
	 * Check that we have been called with a valid iov_len,
	 * and bail out if not; otherwise we could loop forever
	 * further down.
3537*7c478bd9Sstevel@tonic-gate 	 */
3538*7c478bd9Sstevel@tonic-gate 	if (uiop->uio_iov->iov_len <= 0) {
3539*7c478bd9Sstevel@tonic-gate 		error = EINVAL;
3540*7c478bd9Sstevel@tonic-gate 		goto out;
3541*7c478bd9Sstevel@tonic-gate 	}
3542*7c478bd9Sstevel@tonic-gate 
3543*7c478bd9Sstevel@tonic-gate 	/*
3544*7c478bd9Sstevel@tonic-gate 	 * Large Files: When we come here we are guaranteed that
3545*7c478bd9Sstevel@tonic-gate 	 * uio_offset can be used safely. The high word is zero.
3546*7c478bd9Sstevel@tonic-gate 	 */
3547*7c478bd9Sstevel@tonic-gate 
3548*7c478bd9Sstevel@tonic-gate 	ufsvfsp = ip->i_ufsvfs;
3549*7c478bd9Sstevel@tonic-gate 	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READDIR_MASK);
3550*7c478bd9Sstevel@tonic-gate 	if (error)
3551*7c478bd9Sstevel@tonic-gate 		goto out;
3552*7c478bd9Sstevel@tonic-gate 
3553*7c478bd9Sstevel@tonic-gate 	iovp = uiop->uio_iov;
3554*7c478bd9Sstevel@tonic-gate 	total_bytes_wanted = iovp->iov_len;
3555*7c478bd9Sstevel@tonic-gate 
3556*7c478bd9Sstevel@tonic-gate 	/* Large Files: directory files should not be "large" */
3557*7c478bd9Sstevel@tonic-gate 
3558*7c478bd9Sstevel@tonic-gate 	ASSERT(ip->i_size <= MAXOFF32_T);
3559*7c478bd9Sstevel@tonic-gate 
3560*7c478bd9Sstevel@tonic-gate 	/* Force offset to be valid (to guard against bogus lseek() values) */
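	/*
	 * For example, with the usual DIRBLKSIZ of 512 (an assumed value,
	 * not defined in this file), a stale offset of 513 would be
	 * rounded down here to 512, the start of its directory block.
	 */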
3561*7c478bd9Sstevel@tonic-gate 	offset = (uint_t)uiop->uio_offset & ~(DIRBLKSIZ - 1);
3562*7c478bd9Sstevel@tonic-gate 
	/* Quit if at end of file or link count of zero (POSIX) */
3564*7c478bd9Sstevel@tonic-gate 	if (offset >= (uint_t)ip->i_size || ip->i_nlink <= 0) {
3565*7c478bd9Sstevel@tonic-gate 		if (eofp)
3566*7c478bd9Sstevel@tonic-gate 			*eofp = 1;
3567*7c478bd9Sstevel@tonic-gate 		error = 0;
3568*7c478bd9Sstevel@tonic-gate 		goto unlock;
3569*7c478bd9Sstevel@tonic-gate 	}
3570*7c478bd9Sstevel@tonic-gate 
3571*7c478bd9Sstevel@tonic-gate 	/*
3572*7c478bd9Sstevel@tonic-gate 	 * Get space to change directory entries into fs independent format.
	 * Do fast alloc for the most commonly used request size (filesystem
3574*7c478bd9Sstevel@tonic-gate 	 * block size).
3575*7c478bd9Sstevel@tonic-gate 	 */
3576*7c478bd9Sstevel@tonic-gate 	if (uiop->uio_segflg != UIO_SYSSPACE || uiop->uio_iovcnt != 1) {
3577*7c478bd9Sstevel@tonic-gate 		bufsize = total_bytes_wanted;
3578*7c478bd9Sstevel@tonic-gate 		outbuf = kmem_alloc(bufsize, KM_SLEEP);
3579*7c478bd9Sstevel@tonic-gate 		odp = (struct dirent64 *)outbuf;
3580*7c478bd9Sstevel@tonic-gate 	} else {
3581*7c478bd9Sstevel@tonic-gate 		bufsize = total_bytes_wanted;
3582*7c478bd9Sstevel@tonic-gate 		odp = (struct dirent64 *)iovp->iov_base;
3583*7c478bd9Sstevel@tonic-gate 	}
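	/*
	 * In the first case above, entries are translated into a scratch
	 * buffer and uiomove()d out at the end; in the second (a single
	 * kernel-space iovec), they are built directly in the caller's
	 * buffer and the uio is adjusted by hand after the loop.
	 */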
3584*7c478bd9Sstevel@tonic-gate 
3585*7c478bd9Sstevel@tonic-gate nextblk:
3586*7c478bd9Sstevel@tonic-gate 	bytes_wanted = total_bytes_wanted;
3587*7c478bd9Sstevel@tonic-gate 
3588*7c478bd9Sstevel@tonic-gate 	/* Truncate request to file size */
3589*7c478bd9Sstevel@tonic-gate 	if (offset + bytes_wanted > (int)ip->i_size)
3590*7c478bd9Sstevel@tonic-gate 		bytes_wanted = (int)(ip->i_size - offset);
3591*7c478bd9Sstevel@tonic-gate 
3592*7c478bd9Sstevel@tonic-gate 	/* Comply with MAXBSIZE boundary restrictions of fbread() */
3593*7c478bd9Sstevel@tonic-gate 	if ((offset & MAXBOFFSET) + bytes_wanted > MAXBSIZE)
3594*7c478bd9Sstevel@tonic-gate 		bytes_wanted = MAXBSIZE - (offset & MAXBOFFSET);
3595*7c478bd9Sstevel@tonic-gate 
3596*7c478bd9Sstevel@tonic-gate 	/*
3597*7c478bd9Sstevel@tonic-gate 	 * Read in the next chunk.
3598*7c478bd9Sstevel@tonic-gate 	 * We are still holding the i_rwlock.
3599*7c478bd9Sstevel@tonic-gate 	 */
3600*7c478bd9Sstevel@tonic-gate 	error = fbread(vp, (offset_t)offset, bytes_wanted, S_OTHER, &fbp);
3601*7c478bd9Sstevel@tonic-gate 
3602*7c478bd9Sstevel@tonic-gate 	if (error)
3603*7c478bd9Sstevel@tonic-gate 		goto update_inode;
3604*7c478bd9Sstevel@tonic-gate 	if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) && (ip->i_fs->fs_ronly == 0) &&
3605*7c478bd9Sstevel@tonic-gate 	    (!ufsvfsp->vfs_noatime)) {
3606*7c478bd9Sstevel@tonic-gate 		ip->i_flag |= IACC;
3607*7c478bd9Sstevel@tonic-gate 	}
3608*7c478bd9Sstevel@tonic-gate 	incount = 0;
3609*7c478bd9Sstevel@tonic-gate 	idp = (struct direct *)fbp->fb_addr;
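	/*
	 * An all-zero first entry (no inode, no record length, no name)
	 * means the directory block is corrupt; complain and give up.
	 */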
3610*7c478bd9Sstevel@tonic-gate 	if (idp->d_ino == 0 && idp->d_reclen == 0 &&
3611*7c478bd9Sstevel@tonic-gate 		idp->d_namlen == 0) {
3612*7c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN, "ufs_readdir: bad dir, inumber = %llu, "
3613*7c478bd9Sstevel@tonic-gate 			"fs = %s\n",
3614*7c478bd9Sstevel@tonic-gate 			(u_longlong_t)ip->i_number, ufsvfsp->vfs_fs->fs_fsmnt);
3615*7c478bd9Sstevel@tonic-gate 		fbrelse(fbp, S_OTHER);
3616*7c478bd9Sstevel@tonic-gate 		error = ENXIO;
3617*7c478bd9Sstevel@tonic-gate 		goto update_inode;
3618*7c478bd9Sstevel@tonic-gate 	}
3619*7c478bd9Sstevel@tonic-gate 	/* Transform to file-system independent format */
3620*7c478bd9Sstevel@tonic-gate 	while (incount < bytes_wanted) {
3621*7c478bd9Sstevel@tonic-gate 		/*
3622*7c478bd9Sstevel@tonic-gate 		 * If the current directory entry is mangled, then skip
3623*7c478bd9Sstevel@tonic-gate 		 * to the next block.  It would be nice to set the FSBAD
3624*7c478bd9Sstevel@tonic-gate 		 * flag in the super-block so that a fsck is forced on
3625*7c478bd9Sstevel@tonic-gate 		 * next reboot, but locking is a problem.
3626*7c478bd9Sstevel@tonic-gate 		 */
3627*7c478bd9Sstevel@tonic-gate 		if (idp->d_reclen & 0x3) {
3628*7c478bd9Sstevel@tonic-gate 			offset = (offset + DIRBLKSIZ) & ~(DIRBLKSIZ-1);
3629*7c478bd9Sstevel@tonic-gate 			break;
3630*7c478bd9Sstevel@tonic-gate 		}
3631*7c478bd9Sstevel@tonic-gate 
3632*7c478bd9Sstevel@tonic-gate 		/* Skip to requested offset and skip empty entries */
3633*7c478bd9Sstevel@tonic-gate 		if (idp->d_ino != 0 && offset >= (uint_t)uiop->uio_offset) {
3634*7c478bd9Sstevel@tonic-gate 			ushort_t this_reclen =
3635*7c478bd9Sstevel@tonic-gate 			    DIRENT64_RECLEN(idp->d_namlen);
3636*7c478bd9Sstevel@tonic-gate 			/* Buffer too small for any entries */
3637*7c478bd9Sstevel@tonic-gate 			if (!outcount && this_reclen > bufsize) {
3638*7c478bd9Sstevel@tonic-gate 				fbrelse(fbp, S_OTHER);
3639*7c478bd9Sstevel@tonic-gate 				error = EINVAL;
3640*7c478bd9Sstevel@tonic-gate 				goto update_inode;
3641*7c478bd9Sstevel@tonic-gate 			}
			/* If it would overrun the buffer, quit */
3643*7c478bd9Sstevel@tonic-gate 			if (outcount + this_reclen > bufsize) {
3644*7c478bd9Sstevel@tonic-gate 				break;
3645*7c478bd9Sstevel@tonic-gate 			}
3646*7c478bd9Sstevel@tonic-gate 			/* Take this entry */
3647*7c478bd9Sstevel@tonic-gate 			odp->d_ino = (ino64_t)idp->d_ino;
3648*7c478bd9Sstevel@tonic-gate 			odp->d_reclen = (ushort_t)this_reclen;
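			/*
			 * d_off records the offset at which the next
			 * directory entry begins, so a later readdir
			 * can resume from there.
			 */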
3649*7c478bd9Sstevel@tonic-gate 			odp->d_off = (offset_t)(offset + idp->d_reclen);
3650*7c478bd9Sstevel@tonic-gate 
3651*7c478bd9Sstevel@tonic-gate 			/* use strncpy(9f) to zero out uninitialized bytes */
3652*7c478bd9Sstevel@tonic-gate 
3653*7c478bd9Sstevel@tonic-gate 			ASSERT(strlen(idp->d_name) + 1 <=
3654*7c478bd9Sstevel@tonic-gate 			    DIRENT64_NAMELEN(this_reclen));
3655*7c478bd9Sstevel@tonic-gate 			(void) strncpy(odp->d_name, idp->d_name,
3656*7c478bd9Sstevel@tonic-gate 			    DIRENT64_NAMELEN(this_reclen));
3657*7c478bd9Sstevel@tonic-gate 			outcount += odp->d_reclen;
3658*7c478bd9Sstevel@tonic-gate 			odp = (struct dirent64 *)((intptr_t)odp +
3659*7c478bd9Sstevel@tonic-gate 				    odp->d_reclen);
3660*7c478bd9Sstevel@tonic-gate 			ASSERT(outcount <= bufsize);
3661*7c478bd9Sstevel@tonic-gate 		}
3662*7c478bd9Sstevel@tonic-gate 		if (idp->d_reclen) {
3663*7c478bd9Sstevel@tonic-gate 			incount += idp->d_reclen;
3664*7c478bd9Sstevel@tonic-gate 			offset += idp->d_reclen;
3665*7c478bd9Sstevel@tonic-gate 			idp = (struct direct *)((intptr_t)idp + idp->d_reclen);
3666*7c478bd9Sstevel@tonic-gate 		} else {
3667*7c478bd9Sstevel@tonic-gate 			offset = (offset + DIRBLKSIZ) & ~(DIRBLKSIZ-1);
3668*7c478bd9Sstevel@tonic-gate 			break;
3669*7c478bd9Sstevel@tonic-gate 		}
3670*7c478bd9Sstevel@tonic-gate 	}
3671*7c478bd9Sstevel@tonic-gate 	/* Release the chunk */
3672*7c478bd9Sstevel@tonic-gate 	fbrelse(fbp, S_OTHER);
3673*7c478bd9Sstevel@tonic-gate 
	/* Read a whole block but got no entries; read another if not at EOF */
3675*7c478bd9Sstevel@tonic-gate 
3676*7c478bd9Sstevel@tonic-gate 	/*
3677*7c478bd9Sstevel@tonic-gate 	 * Large Files: casting i_size to int here is not a problem
3678*7c478bd9Sstevel@tonic-gate 	 * because directory sizes are always less than MAXOFF32_T.
3679*7c478bd9Sstevel@tonic-gate 	 * See assertion above.
3680*7c478bd9Sstevel@tonic-gate 	 */
3681*7c478bd9Sstevel@tonic-gate 
3682*7c478bd9Sstevel@tonic-gate 	if (offset < (int)ip->i_size && !outcount)
3683*7c478bd9Sstevel@tonic-gate 		goto nextblk;
3684*7c478bd9Sstevel@tonic-gate 
3685*7c478bd9Sstevel@tonic-gate 	/* Copy out the entry data */
3686*7c478bd9Sstevel@tonic-gate 	if (uiop->uio_segflg == UIO_SYSSPACE && uiop->uio_iovcnt == 1) {
3687*7c478bd9Sstevel@tonic-gate 		iovp->iov_base += outcount;
3688*7c478bd9Sstevel@tonic-gate 		iovp->iov_len -= outcount;
3689*7c478bd9Sstevel@tonic-gate 		uiop->uio_resid -= outcount;
3690*7c478bd9Sstevel@tonic-gate 		uiop->uio_offset = offset;
3691*7c478bd9Sstevel@tonic-gate 	} else if ((error = uiomove(outbuf, (long)outcount, UIO_READ,
3692*7c478bd9Sstevel@tonic-gate 				    uiop)) == 0)
3693*7c478bd9Sstevel@tonic-gate 		uiop->uio_offset = offset;
3694*7c478bd9Sstevel@tonic-gate update_inode:
3695*7c478bd9Sstevel@tonic-gate 	ITIMES(ip);
3696*7c478bd9Sstevel@tonic-gate 	if (uiop->uio_segflg != UIO_SYSSPACE || uiop->uio_iovcnt != 1)
3697*7c478bd9Sstevel@tonic-gate 		kmem_free(outbuf, bufsize);
3698*7c478bd9Sstevel@tonic-gate 
3699*7c478bd9Sstevel@tonic-gate 	if (eofp && error == 0)
3700*7c478bd9Sstevel@tonic-gate 		*eofp = (uiop->uio_offset >= (int)ip->i_size);
3701*7c478bd9Sstevel@tonic-gate unlock:
3702*7c478bd9Sstevel@tonic-gate 	if (ulp) {
3703*7c478bd9Sstevel@tonic-gate 		ufs_lockfs_end(ulp);
3704*7c478bd9Sstevel@tonic-gate 	}
3705*7c478bd9Sstevel@tonic-gate out:
3706*7c478bd9Sstevel@tonic-gate 	TRACE_2(TR_FAC_UFS, TR_UFS_READDIR_END,
3707*7c478bd9Sstevel@tonic-gate 		"ufs_readdir_end:vp %p error %d", vp, error);
3708*7c478bd9Sstevel@tonic-gate 	return (error);
3709*7c478bd9Sstevel@tonic-gate }
3710*7c478bd9Sstevel@tonic-gate 
3711*7c478bd9Sstevel@tonic-gate /*ARGSUSED*/
3712*7c478bd9Sstevel@tonic-gate static int
3713*7c478bd9Sstevel@tonic-gate ufs_symlink(
3714*7c478bd9Sstevel@tonic-gate 	struct vnode *dvp,		/* ptr to parent dir vnode */
3715*7c478bd9Sstevel@tonic-gate 	char *linkname,			/* name of symbolic link */
3716*7c478bd9Sstevel@tonic-gate 	struct vattr *vap,		/* attributes */
3717*7c478bd9Sstevel@tonic-gate 	char *target,			/* target path */
3718*7c478bd9Sstevel@tonic-gate 	struct cred *cr)		/* user credentials */
3719*7c478bd9Sstevel@tonic-gate {
3720*7c478bd9Sstevel@tonic-gate 	struct inode *ip, *dip = VTOI(dvp);
3721*7c478bd9Sstevel@tonic-gate 	struct ufsvfs *ufsvfsp = dip->i_ufsvfs;
3722*7c478bd9Sstevel@tonic-gate 	struct ulockfs *ulp;
3723*7c478bd9Sstevel@tonic-gate 	int error;
3724*7c478bd9Sstevel@tonic-gate 	int issync;
3725*7c478bd9Sstevel@tonic-gate 	int trans_size;
3726*7c478bd9Sstevel@tonic-gate 	int residual;
3727*7c478bd9Sstevel@tonic-gate 	int ioflag;
3728*7c478bd9Sstevel@tonic-gate 	int retry = 1;
3729*7c478bd9Sstevel@tonic-gate 
3730*7c478bd9Sstevel@tonic-gate 	TRACE_1(TR_FAC_UFS, TR_UFS_SYMLINK_START,
3731*7c478bd9Sstevel@tonic-gate 		"ufs_symlink_start:dvp %p", dvp);
3732*7c478bd9Sstevel@tonic-gate 
3733*7c478bd9Sstevel@tonic-gate 	/*
3734*7c478bd9Sstevel@tonic-gate 	 * No symlinks in attrdirs at this time
3735*7c478bd9Sstevel@tonic-gate 	 */
3736*7c478bd9Sstevel@tonic-gate 	if ((VTOI(dvp)->i_mode & IFMT) == IFATTRDIR)
3737*7c478bd9Sstevel@tonic-gate 		return (EINVAL);
3738*7c478bd9Sstevel@tonic-gate 
3739*7c478bd9Sstevel@tonic-gate again:
3740*7c478bd9Sstevel@tonic-gate 	ip = (struct inode *)NULL;
3741*7c478bd9Sstevel@tonic-gate 	vap->va_type = VLNK;
3742*7c478bd9Sstevel@tonic-gate 	vap->va_rdev = 0;
3743*7c478bd9Sstevel@tonic-gate 
3744*7c478bd9Sstevel@tonic-gate 	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SYMLINK_MASK);
3745*7c478bd9Sstevel@tonic-gate 	if (error)
3746*7c478bd9Sstevel@tonic-gate 		goto out;
3747*7c478bd9Sstevel@tonic-gate 
3748*7c478bd9Sstevel@tonic-gate 	if (ulp)
3749*7c478bd9Sstevel@tonic-gate 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_SYMLINK,
3750*7c478bd9Sstevel@tonic-gate 		    trans_size = (int)TOP_SYMLINK_SIZE(dip));
3751*7c478bd9Sstevel@tonic-gate 
3752*7c478bd9Sstevel@tonic-gate 	/*
3753*7c478bd9Sstevel@tonic-gate 	 * We must create the inode before the directory entry, to avoid
3754*7c478bd9Sstevel@tonic-gate 	 * racing with readlink().  ufs_dirmakeinode requires that we
3755*7c478bd9Sstevel@tonic-gate 	 * hold the quota lock as reader, and directory locks as writer.
3756*7c478bd9Sstevel@tonic-gate 	 */
3757*7c478bd9Sstevel@tonic-gate 
3758*7c478bd9Sstevel@tonic-gate 	rw_enter(&dip->i_rwlock, RW_WRITER);
3759*7c478bd9Sstevel@tonic-gate 	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
3760*7c478bd9Sstevel@tonic-gate 	rw_enter(&dip->i_contents, RW_WRITER);
3761*7c478bd9Sstevel@tonic-gate 
3762*7c478bd9Sstevel@tonic-gate 	/*
	 * Suppress any out-of-inodes messages if we will retry
	 * on ENOSPC.
3765*7c478bd9Sstevel@tonic-gate 	 */
3766*7c478bd9Sstevel@tonic-gate 	if (retry)
3767*7c478bd9Sstevel@tonic-gate 		dip->i_flag |= IQUIET;
3768*7c478bd9Sstevel@tonic-gate 
3769*7c478bd9Sstevel@tonic-gate 	error = ufs_dirmakeinode(dip, &ip, vap, DE_SYMLINK, cr);
3770*7c478bd9Sstevel@tonic-gate 
3771*7c478bd9Sstevel@tonic-gate 	dip->i_flag &= ~IQUIET;
3772*7c478bd9Sstevel@tonic-gate 
3773*7c478bd9Sstevel@tonic-gate 	rw_exit(&dip->i_contents);
3774*7c478bd9Sstevel@tonic-gate 	rw_exit(&ufsvfsp->vfs_dqrwlock);
3775*7c478bd9Sstevel@tonic-gate 	rw_exit(&dip->i_rwlock);
3776*7c478bd9Sstevel@tonic-gate 
3777*7c478bd9Sstevel@tonic-gate 	if (error)
3778*7c478bd9Sstevel@tonic-gate 		goto unlock;
3779*7c478bd9Sstevel@tonic-gate 
3780*7c478bd9Sstevel@tonic-gate 	/*
3781*7c478bd9Sstevel@tonic-gate 	 * OK.  The inode has been created.  Write out the data of the
3782*7c478bd9Sstevel@tonic-gate 	 * symbolic link.  Since symbolic links are metadata, and should
3783*7c478bd9Sstevel@tonic-gate 	 * remain consistent across a system crash, we need to force the
3784*7c478bd9Sstevel@tonic-gate 	 * data out synchronously.
3785*7c478bd9Sstevel@tonic-gate 	 *
3786*7c478bd9Sstevel@tonic-gate 	 * (This is a change from the semantics in earlier releases, which
3787*7c478bd9Sstevel@tonic-gate 	 * only created symbolic links synchronously if the semi-documented
3788*7c478bd9Sstevel@tonic-gate 	 * 'syncdir' option was set, or if we were being invoked by the NFS
3789*7c478bd9Sstevel@tonic-gate 	 * server, which requires symbolic links to be created synchronously.)
3790*7c478bd9Sstevel@tonic-gate 	 *
3791*7c478bd9Sstevel@tonic-gate 	 * We need to pass in a pointer for the residual length; otherwise
3792*7c478bd9Sstevel@tonic-gate 	 * ufs_rdwri() will always return EIO if it can't write the data,
3793*7c478bd9Sstevel@tonic-gate 	 * even if the error was really ENOSPC or EDQUOT.
3794*7c478bd9Sstevel@tonic-gate 	 */
3795*7c478bd9Sstevel@tonic-gate 
3796*7c478bd9Sstevel@tonic-gate 	ioflag = FWRITE | FDSYNC;
3797*7c478bd9Sstevel@tonic-gate 	residual = 0;
3798*7c478bd9Sstevel@tonic-gate 
3799*7c478bd9Sstevel@tonic-gate 	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
3800*7c478bd9Sstevel@tonic-gate 	rw_enter(&ip->i_contents, RW_WRITER);
3801*7c478bd9Sstevel@tonic-gate 
3802*7c478bd9Sstevel@tonic-gate 	/*
3803*7c478bd9Sstevel@tonic-gate 	 * Suppress file system full messages if we will retry
3804*7c478bd9Sstevel@tonic-gate 	 */
3805*7c478bd9Sstevel@tonic-gate 	if (retry)
3806*7c478bd9Sstevel@tonic-gate 		ip->i_flag |= IQUIET;
3807*7c478bd9Sstevel@tonic-gate 
3808*7c478bd9Sstevel@tonic-gate 	error = ufs_rdwri(UIO_WRITE, ioflag, ip, target, strlen(target),
3809*7c478bd9Sstevel@tonic-gate 	    (offset_t)0, UIO_SYSSPACE, &residual, cr);
3810*7c478bd9Sstevel@tonic-gate 
3811*7c478bd9Sstevel@tonic-gate 	ip->i_flag &= ~IQUIET;
3812*7c478bd9Sstevel@tonic-gate 
3813*7c478bd9Sstevel@tonic-gate 	if (error) {
3814*7c478bd9Sstevel@tonic-gate 		rw_exit(&ip->i_contents);
3815*7c478bd9Sstevel@tonic-gate 		rw_exit(&ufsvfsp->vfs_dqrwlock);
3816*7c478bd9Sstevel@tonic-gate 		goto remove;
3817*7c478bd9Sstevel@tonic-gate 	}
3818*7c478bd9Sstevel@tonic-gate 
3819*7c478bd9Sstevel@tonic-gate 	/*
3820*7c478bd9Sstevel@tonic-gate 	 * If the link's data is small enough, we can cache it in the inode.
3821*7c478bd9Sstevel@tonic-gate 	 * This is a "fast symbolic link".  We don't use the first direct
3822*7c478bd9Sstevel@tonic-gate 	 * block because that's actually used to point at the symbolic link's
3823*7c478bd9Sstevel@tonic-gate 	 * contents on disk; but we know that none of the other direct or
3824*7c478bd9Sstevel@tonic-gate 	 * indirect blocks can be used because symbolic links are restricted
3825*7c478bd9Sstevel@tonic-gate 	 * to be smaller than a file system block.
3826*7c478bd9Sstevel@tonic-gate 	 */
3827*7c478bd9Sstevel@tonic-gate 
3828*7c478bd9Sstevel@tonic-gate 	ASSERT(MAXPATHLEN <= VBSIZE(ITOV(ip)));
3829*7c478bd9Sstevel@tonic-gate 
3830*7c478bd9Sstevel@tonic-gate 	if (ip->i_size > 0 && ip->i_size <= FSL_SIZE) {
3831*7c478bd9Sstevel@tonic-gate 		if (kcopy(target, &ip->i_db[1], ip->i_size) == 0) {
3832*7c478bd9Sstevel@tonic-gate 			ip->i_flag |= IFASTSYMLNK;
3833*7c478bd9Sstevel@tonic-gate 		} else {
3834*7c478bd9Sstevel@tonic-gate 			int i;
3835*7c478bd9Sstevel@tonic-gate 			/* error, clear garbage left behind */
3836*7c478bd9Sstevel@tonic-gate 			for (i = 1; i < NDADDR; i++)
3837*7c478bd9Sstevel@tonic-gate 				ip->i_db[i] = 0;
3838*7c478bd9Sstevel@tonic-gate 			for (i = 0; i < NIADDR; i++)
3839*7c478bd9Sstevel@tonic-gate 				ip->i_ib[i] = 0;
3840*7c478bd9Sstevel@tonic-gate 		}
3841*7c478bd9Sstevel@tonic-gate 	}
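
	/*
	 * At this point a small target has been cached starting at
	 * i_db[1] and IFASTSYMLNK set, so readlink() can be satisfied
	 * from the inode itself; on a kcopy() failure the direct and
	 * indirect block arrays were cleared instead.
	 */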
3842*7c478bd9Sstevel@tonic-gate 
3843*7c478bd9Sstevel@tonic-gate 	rw_exit(&ip->i_contents);
3844*7c478bd9Sstevel@tonic-gate 	rw_exit(&ufsvfsp->vfs_dqrwlock);
3845*7c478bd9Sstevel@tonic-gate 
3846*7c478bd9Sstevel@tonic-gate 	/*
3847*7c478bd9Sstevel@tonic-gate 	 * OK.  We've successfully created the symbolic link.  All that
3848*7c478bd9Sstevel@tonic-gate 	 * remains is to insert it into the appropriate directory.
3849*7c478bd9Sstevel@tonic-gate 	 */
3850*7c478bd9Sstevel@tonic-gate 
3851*7c478bd9Sstevel@tonic-gate 	rw_enter(&dip->i_rwlock, RW_WRITER);
3852*7c478bd9Sstevel@tonic-gate 	error = ufs_direnter_lr(dip, linkname, DE_SYMLINK, NULL, ip, cr, NULL);
3853*7c478bd9Sstevel@tonic-gate 	rw_exit(&dip->i_rwlock);
3854*7c478bd9Sstevel@tonic-gate 
3855*7c478bd9Sstevel@tonic-gate 	/*
3856*7c478bd9Sstevel@tonic-gate 	 * Fall through into remove-on-error code.  We're either done, or we
3857*7c478bd9Sstevel@tonic-gate 	 * need to remove the inode (if we couldn't insert it).
3858*7c478bd9Sstevel@tonic-gate 	 */
3859*7c478bd9Sstevel@tonic-gate 
3860*7c478bd9Sstevel@tonic-gate remove:
3861*7c478bd9Sstevel@tonic-gate 	if (error && (ip != NULL)) {
3862*7c478bd9Sstevel@tonic-gate 		rw_enter(&ip->i_contents, RW_WRITER);
3863*7c478bd9Sstevel@tonic-gate 		ip->i_nlink--;
3864*7c478bd9Sstevel@tonic-gate 		ip->i_flag |= ICHG;
3865*7c478bd9Sstevel@tonic-gate 		ip->i_seq++;
3866*7c478bd9Sstevel@tonic-gate 		ufs_setreclaim(ip);
3867*7c478bd9Sstevel@tonic-gate 		rw_exit(&ip->i_contents);
3868*7c478bd9Sstevel@tonic-gate 	}
3869*7c478bd9Sstevel@tonic-gate 
3870*7c478bd9Sstevel@tonic-gate unlock:
3871*7c478bd9Sstevel@tonic-gate 	if (ip != NULL)
3872*7c478bd9Sstevel@tonic-gate 		VN_RELE(ITOV(ip));
3873*7c478bd9Sstevel@tonic-gate 
3874*7c478bd9Sstevel@tonic-gate 	if (ulp) {
3875*7c478bd9Sstevel@tonic-gate 		int terr = 0;
3876*7c478bd9Sstevel@tonic-gate 
3877*7c478bd9Sstevel@tonic-gate 		TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_SYMLINK,
		    trans_size);
3879*7c478bd9Sstevel@tonic-gate 		ufs_lockfs_end(ulp);
3880*7c478bd9Sstevel@tonic-gate 		if (error == 0)
3881*7c478bd9Sstevel@tonic-gate 			error = terr;
3882*7c478bd9Sstevel@tonic-gate 	}
3883*7c478bd9Sstevel@tonic-gate 
3884*7c478bd9Sstevel@tonic-gate 	/*
3885*7c478bd9Sstevel@tonic-gate 	 * We may have failed due to lack of an inode or of a block to
3886*7c478bd9Sstevel@tonic-gate 	 * store the target in.  Try flushing the delete queue to free
3887*7c478bd9Sstevel@tonic-gate 	 * logically-available things up and try again.
	 * logically available things up and try again.
3889*7c478bd9Sstevel@tonic-gate 	if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
3890*7c478bd9Sstevel@tonic-gate 		ufs_delete_drain_wait(ufsvfsp, 1);
3891*7c478bd9Sstevel@tonic-gate 		retry = 0;
3892*7c478bd9Sstevel@tonic-gate 		goto again;
3893*7c478bd9Sstevel@tonic-gate 	}
3894*7c478bd9Sstevel@tonic-gate 
3895*7c478bd9Sstevel@tonic-gate out:
3896*7c478bd9Sstevel@tonic-gate 	TRACE_2(TR_FAC_UFS, TR_UFS_SYMLINK_END,
3897*7c478bd9Sstevel@tonic-gate 		"ufs_symlink_end:dvp %p error %d", dvp, error);
3898*7c478bd9Sstevel@tonic-gate 	return (error);
3899*7c478bd9Sstevel@tonic-gate }
3900*7c478bd9Sstevel@tonic-gate 
3901*7c478bd9Sstevel@tonic-gate /*
 * UFS-specific routine used to do UFS I/O.
3903*7c478bd9Sstevel@tonic-gate  */
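/*
 * For example, ufs_symlink() above writes a link's target through this
 * routine, synchronously and with a residual count so that a short
 * write surfaces as ENOSPC or EDQUOT rather than EIO:
 *
 *	error = ufs_rdwri(UIO_WRITE, FWRITE | FDSYNC, ip, target,
 *	    strlen(target), (offset_t)0, UIO_SYSSPACE, &residual, cr);
 */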
3904*7c478bd9Sstevel@tonic-gate int
3905*7c478bd9Sstevel@tonic-gate ufs_rdwri(enum uio_rw rw, int ioflag, struct inode *ip, caddr_t base,
3906*7c478bd9Sstevel@tonic-gate 	ssize_t len, offset_t offset, enum uio_seg seg, int *aresid,
3907*7c478bd9Sstevel@tonic-gate 	struct cred *cr)
3908*7c478bd9Sstevel@tonic-gate {
3909*7c478bd9Sstevel@tonic-gate 	struct uio auio;
3910*7c478bd9Sstevel@tonic-gate 	struct iovec aiov;
3911*7c478bd9Sstevel@tonic-gate 	int error;
3912*7c478bd9Sstevel@tonic-gate 
3913*7c478bd9Sstevel@tonic-gate 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
3914*7c478bd9Sstevel@tonic-gate 
3915*7c478bd9Sstevel@tonic-gate 	bzero((caddr_t)&auio, sizeof (uio_t));
3916*7c478bd9Sstevel@tonic-gate 	bzero((caddr_t)&aiov, sizeof (iovec_t));
3917*7c478bd9Sstevel@tonic-gate 
3918*7c478bd9Sstevel@tonic-gate 	aiov.iov_base = base;
3919*7c478bd9Sstevel@tonic-gate 	aiov.iov_len = len;
3920*7c478bd9Sstevel@tonic-gate 	auio.uio_iov = &aiov;
3921*7c478bd9Sstevel@tonic-gate 	auio.uio_iovcnt = 1;
3922*7c478bd9Sstevel@tonic-gate 	auio.uio_loffset = offset;
3923*7c478bd9Sstevel@tonic-gate 	auio.uio_segflg = (short)seg;
3924*7c478bd9Sstevel@tonic-gate 	auio.uio_resid = len;
3925*7c478bd9Sstevel@tonic-gate 
3926*7c478bd9Sstevel@tonic-gate 	if (rw == UIO_WRITE) {
3927*7c478bd9Sstevel@tonic-gate 		auio.uio_fmode = FWRITE;
3928*7c478bd9Sstevel@tonic-gate 		auio.uio_extflg = UIO_COPY_DEFAULT;
3929*7c478bd9Sstevel@tonic-gate 		auio.uio_llimit = curproc->p_fsz_ctl;
3930*7c478bd9Sstevel@tonic-gate 		error = wrip(ip, &auio, ioflag, cr);
3931*7c478bd9Sstevel@tonic-gate 	} else {
3932*7c478bd9Sstevel@tonic-gate 		auio.uio_fmode = FREAD;
3933*7c478bd9Sstevel@tonic-gate 		auio.uio_extflg = UIO_COPY_CACHED;
3934*7c478bd9Sstevel@tonic-gate 		auio.uio_llimit = MAXOFFSET_T;
3935*7c478bd9Sstevel@tonic-gate 		error = rdip(ip, &auio, ioflag, cr);
3936*7c478bd9Sstevel@tonic-gate 	}
3937*7c478bd9Sstevel@tonic-gate 
3938*7c478bd9Sstevel@tonic-gate 	if (aresid) {
3939*7c478bd9Sstevel@tonic-gate 		*aresid = auio.uio_resid;
3940*7c478bd9Sstevel@tonic-gate 	} else if (auio.uio_resid) {
3941*7c478bd9Sstevel@tonic-gate 		error = EIO;
3942*7c478bd9Sstevel@tonic-gate 	}
3943*7c478bd9Sstevel@tonic-gate 	return (error);
3944*7c478bd9Sstevel@tonic-gate }
3945*7c478bd9Sstevel@tonic-gate 
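/*
 * Build a file identifier (inode number plus generation count) for this
 * vnode, e.g. for file-handle construction by NFS.  If the caller's fid
 * buffer is too small, set fid_len to the size required and return
 * ENOSPC.
 */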
3946*7c478bd9Sstevel@tonic-gate static int
ufs_fid(struct vnode *vp, struct fid *fidp)
{
3951*7c478bd9Sstevel@tonic-gate 	struct ufid *ufid;
3952*7c478bd9Sstevel@tonic-gate 	struct inode *ip = VTOI(vp);
3953*7c478bd9Sstevel@tonic-gate 
3954*7c478bd9Sstevel@tonic-gate 	if (ip->i_ufsvfs == NULL)
3955*7c478bd9Sstevel@tonic-gate 		return (EIO);
3956*7c478bd9Sstevel@tonic-gate 
3957*7c478bd9Sstevel@tonic-gate 	if (fidp->fid_len < (sizeof (struct ufid) - sizeof (ushort_t))) {
3958*7c478bd9Sstevel@tonic-gate 		fidp->fid_len = sizeof (struct ufid) - sizeof (ushort_t);
3959*7c478bd9Sstevel@tonic-gate 		return (ENOSPC);
3960*7c478bd9Sstevel@tonic-gate 	}
3961*7c478bd9Sstevel@tonic-gate 
3962*7c478bd9Sstevel@tonic-gate 	ufid = (struct ufid *)fidp;
3963*7c478bd9Sstevel@tonic-gate 	bzero((char *)ufid, sizeof (struct ufid));
3964*7c478bd9Sstevel@tonic-gate 	ufid->ufid_len = sizeof (struct ufid) - sizeof (ushort_t);
3965*7c478bd9Sstevel@tonic-gate 	ufid->ufid_ino = ip->i_number;
3966*7c478bd9Sstevel@tonic-gate 	ufid->ufid_gen = ip->i_gen;
3967*7c478bd9Sstevel@tonic-gate 
3968*7c478bd9Sstevel@tonic-gate 	return (0);
3969*7c478bd9Sstevel@tonic-gate }
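
/*
 * Illustrative sketch, not part of the original source: file-handle
 * consumers (e.g. the NFS server) size the fid up front so no retry
 * is needed:
 *
 *	struct fid fid;
 *
 *	fid.fid_len = MAXFIDSZ;
 *	error = VOP_FID(vp, &fid);
 *
 * A caller that passes too small a fid_len gets ENOSPC back with the
 * required length stored in fid_len, so it can resize and call again.
 */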
3970*7c478bd9Sstevel@tonic-gate 
3971*7c478bd9Sstevel@tonic-gate /* ARGSUSED2 */
3972*7c478bd9Sstevel@tonic-gate static int
3973*7c478bd9Sstevel@tonic-gate ufs_rwlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
3974*7c478bd9Sstevel@tonic-gate {
3975*7c478bd9Sstevel@tonic-gate 	struct inode	*ip = VTOI(vp);
3976*7c478bd9Sstevel@tonic-gate 	struct ufsvfs	*ufsvfsp;
3977*7c478bd9Sstevel@tonic-gate 	int		forcedirectio;
3978*7c478bd9Sstevel@tonic-gate 
3979*7c478bd9Sstevel@tonic-gate 	/*
3980*7c478bd9Sstevel@tonic-gate 	 * Read case is easy.
3981*7c478bd9Sstevel@tonic-gate 	 */
3982*7c478bd9Sstevel@tonic-gate 	if (!write_lock) {
3983*7c478bd9Sstevel@tonic-gate 		rw_enter(&ip->i_rwlock, RW_READER);
3984*7c478bd9Sstevel@tonic-gate 		return (V_WRITELOCK_FALSE);
3985*7c478bd9Sstevel@tonic-gate 	}
3986*7c478bd9Sstevel@tonic-gate 
3987*7c478bd9Sstevel@tonic-gate 	/*
3988*7c478bd9Sstevel@tonic-gate 	 * Caller has requested a writer lock, but that inhibits any
3989*7c478bd9Sstevel@tonic-gate 	 * concurrency in the VOPs that follow. Acquire the lock shared
3990*7c478bd9Sstevel@tonic-gate 	 * and defer exclusive access until it is known to be needed in
3991*7c478bd9Sstevel@tonic-gate 	 * other VOP handlers. Some cases can be determined here.
3992*7c478bd9Sstevel@tonic-gate 	 */
3993*7c478bd9Sstevel@tonic-gate 
3994*7c478bd9Sstevel@tonic-gate 	/*
3995*7c478bd9Sstevel@tonic-gate 	 * If directio is not set, there is no chance of concurrency,
3996*7c478bd9Sstevel@tonic-gate 	 * so just acquire the lock exclusive. Beware of a forced
3997*7c478bd9Sstevel@tonic-gate 	 * unmount before looking at the mount option.
3998*7c478bd9Sstevel@tonic-gate 	 */
3999*7c478bd9Sstevel@tonic-gate 	ufsvfsp = ip->i_ufsvfs;
4000*7c478bd9Sstevel@tonic-gate 	forcedirectio = ufsvfsp ? ufsvfsp->vfs_forcedirectio : 0;
4001*7c478bd9Sstevel@tonic-gate 	if (!(ip->i_flag & IDIRECTIO || forcedirectio) ||
4002*7c478bd9Sstevel@tonic-gate 	    !ufs_allow_shared_writes) {
4003*7c478bd9Sstevel@tonic-gate 		rw_enter(&ip->i_rwlock, RW_WRITER);
4004*7c478bd9Sstevel@tonic-gate 		return (V_WRITELOCK_TRUE);
4005*7c478bd9Sstevel@tonic-gate 	}
4006*7c478bd9Sstevel@tonic-gate 
4007*7c478bd9Sstevel@tonic-gate 	/*
4008*7c478bd9Sstevel@tonic-gate 	 * Mandatory locking forces acquiring i_rwlock exclusive.
4009*7c478bd9Sstevel@tonic-gate 	 */
4010*7c478bd9Sstevel@tonic-gate 	if (MANDLOCK(vp, ip->i_mode)) {
4011*7c478bd9Sstevel@tonic-gate 		rw_enter(&ip->i_rwlock, RW_WRITER);
4012*7c478bd9Sstevel@tonic-gate 		return (V_WRITELOCK_TRUE);
4013*7c478bd9Sstevel@tonic-gate 	}
4014*7c478bd9Sstevel@tonic-gate 
4015*7c478bd9Sstevel@tonic-gate 	/*
4016*7c478bd9Sstevel@tonic-gate 	 * Acquire the lock shared in case a concurrent write follows.
4017*7c478bd9Sstevel@tonic-gate 	 * Mandatory locking could have become enabled before the lock
4018*7c478bd9Sstevel@tonic-gate 	 * was acquired. Re-check and upgrade if needed.
4019*7c478bd9Sstevel@tonic-gate 	 */
4020*7c478bd9Sstevel@tonic-gate 	rw_enter(&ip->i_rwlock, RW_READER);
4021*7c478bd9Sstevel@tonic-gate 	if (MANDLOCK(vp, ip->i_mode)) {
4022*7c478bd9Sstevel@tonic-gate 		rw_exit(&ip->i_rwlock);
4023*7c478bd9Sstevel@tonic-gate 		rw_enter(&ip->i_rwlock, RW_WRITER);
4024*7c478bd9Sstevel@tonic-gate 		return (V_WRITELOCK_TRUE);
4025*7c478bd9Sstevel@tonic-gate 	}
4026*7c478bd9Sstevel@tonic-gate 	return (V_WRITELOCK_FALSE);
4027*7c478bd9Sstevel@tonic-gate }
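
/*
 * Illustrative caller pattern, not part of the original source: the
 * value returned here must be handed back to VOP_RWUNLOCK(), since the
 * shared-writer optimization above can return V_WRITELOCK_FALSE even
 * though a write lock was requested:
 *
 *	int wlock;
 *
 *	wlock = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
 *	error = VOP_WRITE(vp, &uio, ioflag, cr, NULL);
 *	VOP_RWUNLOCK(vp, wlock, NULL);
 */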
4028*7c478bd9Sstevel@tonic-gate 
4029*7c478bd9Sstevel@tonic-gate /*ARGSUSED*/
4030*7c478bd9Sstevel@tonic-gate static void
4031*7c478bd9Sstevel@tonic-gate ufs_rwunlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
4032*7c478bd9Sstevel@tonic-gate {
4033*7c478bd9Sstevel@tonic-gate 	struct inode	*ip = VTOI(vp);
4034*7c478bd9Sstevel@tonic-gate 
4035*7c478bd9Sstevel@tonic-gate 	rw_exit(&ip->i_rwlock);
4036*7c478bd9Sstevel@tonic-gate }
4037*7c478bd9Sstevel@tonic-gate 
4038*7c478bd9Sstevel@tonic-gate /* ARGSUSED */
4039*7c478bd9Sstevel@tonic-gate static int
4040*7c478bd9Sstevel@tonic-gate ufs_seek(struct vnode *vp, offset_t ooff, offset_t *noffp)
4041*7c478bd9Sstevel@tonic-gate {
4042*7c478bd9Sstevel@tonic-gate 	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
4043*7c478bd9Sstevel@tonic-gate }
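
/*
 * For example, a seek that would land at a negative offset or past
 * MAXOFFSET_T is rejected here with EINVAL; any offset in the range
 * [0, MAXOFFSET_T] is accepted unchanged, as UFS keeps no per-seek
 * state.
 */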
4044*7c478bd9Sstevel@tonic-gate 
4045*7c478bd9Sstevel@tonic-gate /* ARGSUSED */
4046*7c478bd9Sstevel@tonic-gate static int
4047*7c478bd9Sstevel@tonic-gate ufs_frlock(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
4048*7c478bd9Sstevel@tonic-gate 	offset_t offset, struct flk_callback *flk_cbp, struct cred *cr)
4049*7c478bd9Sstevel@tonic-gate {
4050*7c478bd9Sstevel@tonic-gate 	struct inode *ip = VTOI(vp);
4051*7c478bd9Sstevel@tonic-gate 
4052*7c478bd9Sstevel@tonic-gate 	if (ip->i_ufsvfs == NULL)
4053*7c478bd9Sstevel@tonic-gate 		return (EIO);
4054*7c478bd9Sstevel@tonic-gate 
4055*7c478bd9Sstevel@tonic-gate 	/*
4056*7c478bd9Sstevel@tonic-gate 	 * If file is being mapped, disallow frlock.
4057*7c478bd9Sstevel@tonic-gate 	 * XXX I am not holding tlock while checking i_mapcnt because the
4058*7c478bd9Sstevel@tonic-gate 	 * current locking strategy drops all locks before calling fs_frlock.
4059*7c478bd9Sstevel@tonic-gate 	 * So, mapcnt could change before we enter fs_frlock, making it
4060*7c478bd9Sstevel@tonic-gate 	 * meaningless to have held tlock in the first place.
4061*7c478bd9Sstevel@tonic-gate 	 */
4062*7c478bd9Sstevel@tonic-gate 	if (ip->i_mapcnt > 0 && MANDLOCK(vp, ip->i_mode))
4063*7c478bd9Sstevel@tonic-gate 		return (EAGAIN);
4064*7c478bd9Sstevel@tonic-gate 	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr));
4065*7c478bd9Sstevel@tonic-gate }
4066*7c478bd9Sstevel@tonic-gate 
4067*7c478bd9Sstevel@tonic-gate /* ARGSUSED */
4068*7c478bd9Sstevel@tonic-gate static int
4069*7c478bd9Sstevel@tonic-gate ufs_space(
4070*7c478bd9Sstevel@tonic-gate 	struct vnode *vp,
4071*7c478bd9Sstevel@tonic-gate 	int cmd,
4072*7c478bd9Sstevel@tonic-gate 	struct flock64 *bfp,
4073*7c478bd9Sstevel@tonic-gate 	int flag,
4074*7c478bd9Sstevel@tonic-gate 	offset_t offset,
4075*7c478bd9Sstevel@tonic-gate 	cred_t *cr,
4076*7c478bd9Sstevel@tonic-gate 	caller_context_t *ct)
4077*7c478bd9Sstevel@tonic-gate {
4078*7c478bd9Sstevel@tonic-gate 	struct ufsvfs *ufsvfsp	= VTOI(vp)->i_ufsvfs;
4079*7c478bd9Sstevel@tonic-gate 	struct ulockfs *ulp;
4080*7c478bd9Sstevel@tonic-gate 	int error;
4081*7c478bd9Sstevel@tonic-gate 
4082*7c478bd9Sstevel@tonic-gate 	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SPACE_MASK);
4083*7c478bd9Sstevel@tonic-gate 	if (error)
4084*7c478bd9Sstevel@tonic-gate 		return (error);
4085*7c478bd9Sstevel@tonic-gate 
4087*7c478bd9Sstevel@tonic-gate 	if (cmd != F_FREESP)
4088*7c478bd9Sstevel@tonic-gate 		error = EINVAL;
4089*7c478bd9Sstevel@tonic-gate 	else if ((error = convoff(vp, bfp, 0, offset)) == 0)
4090*7c478bd9Sstevel@tonic-gate 		error = ufs_freesp(vp, bfp, flag, cr);
4091*7c478bd9Sstevel@tonic-gate 
4092*7c478bd9Sstevel@tonic-gate 	if (ulp)
4093*7c478bd9Sstevel@tonic-gate 		ufs_lockfs_end(ulp);
4094*7c478bd9Sstevel@tonic-gate 	return (error);
4095*7c478bd9Sstevel@tonic-gate }
4096*7c478bd9Sstevel@tonic-gate 
4097*7c478bd9Sstevel@tonic-gate /*
4098*7c478bd9Sstevel@tonic-gate  * Used to determine if read ahead should be done. Also used
4099*7c478bd9Sstevel@tonic-gate  * to determine when write back occurs.
4100*7c478bd9Sstevel@tonic-gate  */
4101*7c478bd9Sstevel@tonic-gate #define	CLUSTSZ(ip)		((ip)->i_ufsvfs->vfs_ioclustsz)
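
/*
 * Hypothetical example (the value below is an assumption, not taken
 * from this source): if vfs_ioclustsz were 64K, sequential read ahead
 * would be considered in 64K units and delayed writes in ufs_putpage()
 * would be pushed once roughly 64K had accumulated.
 */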
4102*7c478bd9Sstevel@tonic-gate 
4103*7c478bd9Sstevel@tonic-gate /*
4104*7c478bd9Sstevel@tonic-gate  * A faster version of ufs_getpage.
4105*7c478bd9Sstevel@tonic-gate  *
4106*7c478bd9Sstevel@tonic-gate  * We optimize by inlining the pvn_getpages iterator, eliminating
4107*7c478bd9Sstevel@tonic-gate  * calls to bmap_read if file doesn't have UFS holes, and avoiding
4108*7c478bd9Sstevel@tonic-gate  * the overhead of page_exists().
4109*7c478bd9Sstevel@tonic-gate  *
4110*7c478bd9Sstevel@tonic-gate  * When a file has UFS holes and ufs_getpage is called with S_READ,
4111*7c478bd9Sstevel@tonic-gate  * we set *protp to PROT_READ to avoid calling bmap_read. This approach
4112*7c478bd9Sstevel@tonic-gate  * penalizes performance when a file with UFS holes is faulted
4113*7c478bd9Sstevel@tonic-gate  * first in S_READ mode and then in S_WRITE mode: we take
4114*7c478bd9Sstevel@tonic-gate  * two MMU faults in this case.
4115*7c478bd9Sstevel@tonic-gate  *
4116*7c478bd9Sstevel@tonic-gate  * XXX - the inode fields which control the sequential mode are not
4117*7c478bd9Sstevel@tonic-gate  *	 protected by any mutex. The read ahead can misbehave if
4118*7c478bd9Sstevel@tonic-gate  *	 multiple processes access the file concurrently and
4119*7c478bd9Sstevel@tonic-gate  *	 some of them are in sequential mode. One particularly bad
4120*7c478bd9Sstevel@tonic-gate  *	 case is when another thread changes the value of i_nextrio
4121*7c478bd9Sstevel@tonic-gate  *	 between the time this thread tests i_nextrio and the time it
4122*7c478bd9Sstevel@tonic-gate  *	 reads it again to use as the offset for the read ahead.
4123*7c478bd9Sstevel@tonic-gate  */
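/*
 * Example of the double-fault cost described above: a load from a page
 * backed by a hole faults in S_READ mode and is mapped without
 * PROT_WRITE; a later store to the same page takes a second, S_WRITE
 * fault, and only then is bmap_write() called to allocate the backing
 * disk block.
 */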
4124*7c478bd9Sstevel@tonic-gate static int
4125*7c478bd9Sstevel@tonic-gate ufs_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
4126*7c478bd9Sstevel@tonic-gate 	page_t *plarr[], size_t plsz, struct seg *seg, caddr_t addr,
4127*7c478bd9Sstevel@tonic-gate 	enum seg_rw rw, struct cred *cr)
4128*7c478bd9Sstevel@tonic-gate {
4129*7c478bd9Sstevel@tonic-gate 	u_offset_t	uoff = (u_offset_t)off; /* type conversion */
4130*7c478bd9Sstevel@tonic-gate 	u_offset_t	pgoff;
4131*7c478bd9Sstevel@tonic-gate 	u_offset_t	eoff;
4132*7c478bd9Sstevel@tonic-gate 	struct inode 	*ip = VTOI(vp);
4133*7c478bd9Sstevel@tonic-gate 	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
4134*7c478bd9Sstevel@tonic-gate 	struct fs 	*fs;
4135*7c478bd9Sstevel@tonic-gate 	struct ulockfs	*ulp;
4136*7c478bd9Sstevel@tonic-gate 	page_t		**pl;
4137*7c478bd9Sstevel@tonic-gate 	caddr_t		pgaddr;
4138*7c478bd9Sstevel@tonic-gate 	krw_t		rwtype;
4139*7c478bd9Sstevel@tonic-gate 	int 		err;
4140*7c478bd9Sstevel@tonic-gate 	int		has_holes;
4141*7c478bd9Sstevel@tonic-gate 	int		beyond_eof;
4142*7c478bd9Sstevel@tonic-gate 	int		seqmode;
4143*7c478bd9Sstevel@tonic-gate 	int		pgsize = PAGESIZE;
4144*7c478bd9Sstevel@tonic-gate 	int		dolock;
4145*7c478bd9Sstevel@tonic-gate 	int		do_qlock;
4146*7c478bd9Sstevel@tonic-gate 	int		trans_size;
4147*7c478bd9Sstevel@tonic-gate 
4148*7c478bd9Sstevel@tonic-gate 	TRACE_1(TR_FAC_UFS, TR_UFS_GETPAGE_START,
4149*7c478bd9Sstevel@tonic-gate 		"ufs_getpage_start:vp %p", vp);
4150*7c478bd9Sstevel@tonic-gate 
4151*7c478bd9Sstevel@tonic-gate 	ASSERT((uoff & PAGEOFFSET) == 0);
4152*7c478bd9Sstevel@tonic-gate 
4153*7c478bd9Sstevel@tonic-gate 	if (protp)
4154*7c478bd9Sstevel@tonic-gate 		*protp = PROT_ALL;
4155*7c478bd9Sstevel@tonic-gate 
4156*7c478bd9Sstevel@tonic-gate 	/*
4157*7c478bd9Sstevel@tonic-gate 	 * Obey the lockfs protocol
4158*7c478bd9Sstevel@tonic-gate 	 */
4159*7c478bd9Sstevel@tonic-gate 	err = ufs_lockfs_begin_getpage(ufsvfsp, &ulp, seg,
4160*7c478bd9Sstevel@tonic-gate 			rw == S_READ || rw == S_EXEC, protp);
4161*7c478bd9Sstevel@tonic-gate 	if (err)
4162*7c478bd9Sstevel@tonic-gate 		goto out;
4163*7c478bd9Sstevel@tonic-gate 
4164*7c478bd9Sstevel@tonic-gate 	fs = ufsvfsp->vfs_fs;
4165*7c478bd9Sstevel@tonic-gate 
4166*7c478bd9Sstevel@tonic-gate 	if (ulp && (rw == S_CREATE || rw == S_WRITE) &&
4167*7c478bd9Sstevel@tonic-gate 	    !(vp->v_flag & VISSWAP)) {
4168*7c478bd9Sstevel@tonic-gate 		/*
4169*7c478bd9Sstevel@tonic-gate 		 * Try to start a transaction; this will return EWOULDBLOCK
4170*7c478bd9Sstevel@tonic-gate 		 * if blocking is expected to occur and the address space
4171*7c478bd9Sstevel@tonic-gate 		 * is not the kernel address space.
4172*7c478bd9Sstevel@tonic-gate 		 */
4173*7c478bd9Sstevel@tonic-gate 		trans_size = TOP_GETPAGE_SIZE(ip);
4174*7c478bd9Sstevel@tonic-gate 		if (seg->s_as != &kas) {
4175*7c478bd9Sstevel@tonic-gate 			TRANS_TRY_BEGIN_ASYNC(ufsvfsp, TOP_GETPAGE,
4176*7c478bd9Sstevel@tonic-gate 				trans_size, err)
4177*7c478bd9Sstevel@tonic-gate 			if (err == EWOULDBLOCK) {
4178*7c478bd9Sstevel@tonic-gate 				/*
4179*7c478bd9Sstevel@tonic-gate 				 * Use EDEADLK here because the VM code
4180*7c478bd9Sstevel@tonic-gate 				 * can normally never see this error.
4181*7c478bd9Sstevel@tonic-gate 				 */
4182*7c478bd9Sstevel@tonic-gate 				err = EDEADLK;
4183*7c478bd9Sstevel@tonic-gate 				ufs_lockfs_end(ulp);
4184*7c478bd9Sstevel@tonic-gate 				goto out;
4185*7c478bd9Sstevel@tonic-gate 			}
4186*7c478bd9Sstevel@tonic-gate 		} else {
4187*7c478bd9Sstevel@tonic-gate 			TRANS_BEGIN_ASYNC(ufsvfsp, TOP_GETPAGE, trans_size);
4188*7c478bd9Sstevel@tonic-gate 		}
4189*7c478bd9Sstevel@tonic-gate 	}
4190*7c478bd9Sstevel@tonic-gate 
4191*7c478bd9Sstevel@tonic-gate 	if (vp->v_flag & VNOMAP) {
4192*7c478bd9Sstevel@tonic-gate 		err = ENOSYS;
4193*7c478bd9Sstevel@tonic-gate 		goto unlock;
4194*7c478bd9Sstevel@tonic-gate 	}
4195*7c478bd9Sstevel@tonic-gate 
4196*7c478bd9Sstevel@tonic-gate 	seqmode = ip->i_nextr == uoff && rw != S_CREATE;
4197*7c478bd9Sstevel@tonic-gate 
4198*7c478bd9Sstevel@tonic-gate 	rwtype = RW_READER;		/* start as a reader */
4199*7c478bd9Sstevel@tonic-gate 	dolock = (rw_owner(&ip->i_contents) != curthread);
4200*7c478bd9Sstevel@tonic-gate 	/*
4201*7c478bd9Sstevel@tonic-gate 	 * If this thread owns the lock, i.e., this thread grabbed it
4202*7c478bd9Sstevel@tonic-gate 	 * as writer somewhere above, then we don't need to grab the
4203*7c478bd9Sstevel@tonic-gate 	 * lock as reader in this routine.
4204*7c478bd9Sstevel@tonic-gate 	 */
4205*7c478bd9Sstevel@tonic-gate 	do_qlock = (rw_owner(&ufsvfsp->vfs_dqrwlock) != curthread);
4206*7c478bd9Sstevel@tonic-gate 
4207*7c478bd9Sstevel@tonic-gate retrylock:
4208*7c478bd9Sstevel@tonic-gate 	if (dolock) {
4209*7c478bd9Sstevel@tonic-gate 		/*
4210*7c478bd9Sstevel@tonic-gate 		 * Grab the quota lock if we need to call
4211*7c478bd9Sstevel@tonic-gate 		 * bmap_write() below (with i_contents as writer).
4212*7c478bd9Sstevel@tonic-gate 		 */
4213*7c478bd9Sstevel@tonic-gate 		if (do_qlock && rwtype == RW_WRITER)
4214*7c478bd9Sstevel@tonic-gate 			rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
4215*7c478bd9Sstevel@tonic-gate 		rw_enter(&ip->i_contents, rwtype);
4216*7c478bd9Sstevel@tonic-gate 	}
4217*7c478bd9Sstevel@tonic-gate 
4218*7c478bd9Sstevel@tonic-gate 	/*
4219*7c478bd9Sstevel@tonic-gate 	 * We may be getting called as a side effect of a bmap using
4220*7c478bd9Sstevel@tonic-gate 	 * fbread() when the blocks might be being allocated and the
4221*7c478bd9Sstevel@tonic-gate 	 * size has not yet been up'ed.  In this case we want to be
4222*7c478bd9Sstevel@tonic-gate 	 * able to return zero pages if we get back UFS_HOLE from
4223*7c478bd9Sstevel@tonic-gate 	 * calling bmap for a non-write case here.  We also might have
4224*7c478bd9Sstevel@tonic-gate 	 * to read some frags from the disk into a page if we are
4225*7c478bd9Sstevel@tonic-gate 	 * extending the number of frags for a given lbn in bmap().
4226*7c478bd9Sstevel@tonic-gate 	 * Large Files: The read of i_size here is atomic because
4227*7c478bd9Sstevel@tonic-gate 	 * i_contents is held here. If dolock is zero, the lock
4228*7c478bd9Sstevel@tonic-gate 	 * is held in bmap routines.
4229*7c478bd9Sstevel@tonic-gate 	 */
4230*7c478bd9Sstevel@tonic-gate 	beyond_eof = uoff + len > ip->i_size + PAGEOFFSET;
4231*7c478bd9Sstevel@tonic-gate 	if (beyond_eof && seg != segkmap) {
4232*7c478bd9Sstevel@tonic-gate 		if (dolock) {
4233*7c478bd9Sstevel@tonic-gate 			rw_exit(&ip->i_contents);
4234*7c478bd9Sstevel@tonic-gate 			if (do_qlock && rwtype == RW_WRITER)
4235*7c478bd9Sstevel@tonic-gate 				rw_exit(&ufsvfsp->vfs_dqrwlock);
4236*7c478bd9Sstevel@tonic-gate 		}
4237*7c478bd9Sstevel@tonic-gate 		err = EFAULT;
4238*7c478bd9Sstevel@tonic-gate 		goto unlock;
4239*7c478bd9Sstevel@tonic-gate 	}
4240*7c478bd9Sstevel@tonic-gate 
4241*7c478bd9Sstevel@tonic-gate 	/*
4242*7c478bd9Sstevel@tonic-gate 	 * Must hold i_contents lock throughout the call to pvn_getpages
4243*7c478bd9Sstevel@tonic-gate 	 * since locked pages are returned from each call to ufs_getapage.
4244*7c478bd9Sstevel@tonic-gate 	 * Must *not* return locked pages and then try for contents lock
4245*7c478bd9Sstevel@tonic-gate 	 * due to lock ordering requirements (inode > page).
4246*7c478bd9Sstevel@tonic-gate 	 */
4247*7c478bd9Sstevel@tonic-gate 
4248*7c478bd9Sstevel@tonic-gate 	has_holes = bmap_has_holes(ip);
4249*7c478bd9Sstevel@tonic-gate 
4250*7c478bd9Sstevel@tonic-gate 	if ((rw == S_WRITE || rw == S_CREATE) && has_holes && !beyond_eof) {
4251*7c478bd9Sstevel@tonic-gate 		int	blk_size;
4252*7c478bd9Sstevel@tonic-gate 		u_offset_t offset;
4253*7c478bd9Sstevel@tonic-gate 
4254*7c478bd9Sstevel@tonic-gate 		/*
4255*7c478bd9Sstevel@tonic-gate 		 * We must acquire the RW_WRITER lock in order to
4256*7c478bd9Sstevel@tonic-gate 		 * call bmap_write().
4257*7c478bd9Sstevel@tonic-gate 		 */
4258*7c478bd9Sstevel@tonic-gate 		if (dolock && rwtype == RW_READER) {
4259*7c478bd9Sstevel@tonic-gate 			rwtype = RW_WRITER;
4260*7c478bd9Sstevel@tonic-gate 
4261*7c478bd9Sstevel@tonic-gate 			/*
4262*7c478bd9Sstevel@tonic-gate 			 * Grab the quota lock before
4263*7c478bd9Sstevel@tonic-gate 			 * upgrading i_contents, but if we can't grab it
4264*7c478bd9Sstevel@tonic-gate 			 * don't wait here due to lock order:
4265*7c478bd9Sstevel@tonic-gate 			 * vfs_dqrwlock > i_contents.
4266*7c478bd9Sstevel@tonic-gate 			 */
4267*7c478bd9Sstevel@tonic-gate 			if (do_qlock && rw_tryenter(&ufsvfsp->vfs_dqrwlock,
4268*7c478bd9Sstevel@tonic-gate 							RW_READER) == 0) {
4269*7c478bd9Sstevel@tonic-gate 				rw_exit(&ip->i_contents);
4270*7c478bd9Sstevel@tonic-gate 				goto retrylock;
4271*7c478bd9Sstevel@tonic-gate 			}
4272*7c478bd9Sstevel@tonic-gate 			if (!rw_tryupgrade(&ip->i_contents)) {
4273*7c478bd9Sstevel@tonic-gate 				rw_exit(&ip->i_contents);
4274*7c478bd9Sstevel@tonic-gate 				if (do_qlock)
4275*7c478bd9Sstevel@tonic-gate 					rw_exit(&ufsvfsp->vfs_dqrwlock);
4276*7c478bd9Sstevel@tonic-gate 				goto retrylock;
4277*7c478bd9Sstevel@tonic-gate 			}
4278*7c478bd9Sstevel@tonic-gate 		}
4279*7c478bd9Sstevel@tonic-gate 
4280*7c478bd9Sstevel@tonic-gate 		/*
4281*7c478bd9Sstevel@tonic-gate 		 * May be allocating disk blocks for holes here as
4282*7c478bd9Sstevel@tonic-gate 		 * a result of mmap faults. write(2) does the bmap_write
4283*7c478bd9Sstevel@tonic-gate 		 * in rdip/wrip, not here. We are not dealing with frags
4284*7c478bd9Sstevel@tonic-gate 		 * in this case.
4285*7c478bd9Sstevel@tonic-gate 		 */
4286*7c478bd9Sstevel@tonic-gate 		/*
4287*7c478bd9Sstevel@tonic-gate 		 * Large Files: We cast fs_bmask field to offset_t
4288*7c478bd9Sstevel@tonic-gate 		 * just as we do for MAXBMASK because uoff is a 64-bit
4289*7c478bd9Sstevel@tonic-gate 		 * data type. fs_bmask will still be a 32-bit type
4290*7c478bd9Sstevel@tonic-gate 		 * as we cannot change any ondisk data structures.
4291*7c478bd9Sstevel@tonic-gate 		 */
4292*7c478bd9Sstevel@tonic-gate 
4293*7c478bd9Sstevel@tonic-gate 		offset = uoff & (offset_t)fs->fs_bmask;
4294*7c478bd9Sstevel@tonic-gate 		while (offset < uoff + len) {
4295*7c478bd9Sstevel@tonic-gate 			blk_size = (int)blksize(fs, ip, lblkno(fs, offset));
4296*7c478bd9Sstevel@tonic-gate 			err = bmap_write(ip, offset, blk_size, 0, cr);
4297*7c478bd9Sstevel@tonic-gate 			if (ip->i_flag & (ICHG|IUPD))
4298*7c478bd9Sstevel@tonic-gate 				ip->i_seq++;
4299*7c478bd9Sstevel@tonic-gate 			if (err)
4300*7c478bd9Sstevel@tonic-gate 				goto update_inode;
4301*7c478bd9Sstevel@tonic-gate 			offset += blk_size; /* XXX - make this contig */
4302*7c478bd9Sstevel@tonic-gate 		}
4303*7c478bd9Sstevel@tonic-gate 	}
4304*7c478bd9Sstevel@tonic-gate 
4305*7c478bd9Sstevel@tonic-gate 	/*
4306*7c478bd9Sstevel@tonic-gate 	 * Can be a reader from now on.
4307*7c478bd9Sstevel@tonic-gate 	 */
4308*7c478bd9Sstevel@tonic-gate 	if (dolock && rwtype == RW_WRITER) {
4309*7c478bd9Sstevel@tonic-gate 		rw_downgrade(&ip->i_contents);
4310*7c478bd9Sstevel@tonic-gate 		/*
4311*7c478bd9Sstevel@tonic-gate 		 * We can release vfs_dqrwlock early so do it, but make
4312*7c478bd9Sstevel@tonic-gate 		 * sure we don't try to release it again at the bottom.
4313*7c478bd9Sstevel@tonic-gate 		 */
4314*7c478bd9Sstevel@tonic-gate 		if (do_qlock) {
4315*7c478bd9Sstevel@tonic-gate 			rw_exit(&ufsvfsp->vfs_dqrwlock);
4316*7c478bd9Sstevel@tonic-gate 			do_qlock = 0;
4317*7c478bd9Sstevel@tonic-gate 		}
4318*7c478bd9Sstevel@tonic-gate 	}
4319*7c478bd9Sstevel@tonic-gate 
4320*7c478bd9Sstevel@tonic-gate 	/*
4321*7c478bd9Sstevel@tonic-gate 	 * We remove PROT_WRITE in cases when the file has UFS holes
4322*7c478bd9Sstevel@tonic-gate 	 * because we don't want to call bmap_read() to check each
4323*7c478bd9Sstevel@tonic-gate 	 * page if it is backed with a disk block.
4324*7c478bd9Sstevel@tonic-gate 	 */
4325*7c478bd9Sstevel@tonic-gate 	if (protp && has_holes && rw != S_WRITE && rw != S_CREATE)
4326*7c478bd9Sstevel@tonic-gate 		*protp &= ~PROT_WRITE;
4327*7c478bd9Sstevel@tonic-gate 
4328*7c478bd9Sstevel@tonic-gate 	err = 0;
4329*7c478bd9Sstevel@tonic-gate 
4330*7c478bd9Sstevel@tonic-gate 	/*
4331*7c478bd9Sstevel@tonic-gate 	 * The loop looks up pages in the range [off, off + len).
4332*7c478bd9Sstevel@tonic-gate 	 * For each page, we first check if we should initiate an asynchronous
4333*7c478bd9Sstevel@tonic-gate 	 * read ahead before we call page_lookup (we may sleep in page_lookup
4334*7c478bd9Sstevel@tonic-gate 	 * for a previously initiated disk read).
4335*7c478bd9Sstevel@tonic-gate 	 */
4336*7c478bd9Sstevel@tonic-gate 	eoff = (uoff + len);
4337*7c478bd9Sstevel@tonic-gate 	for (pgoff = uoff, pgaddr = addr, pl = plarr;
4338*7c478bd9Sstevel@tonic-gate 	    pgoff < eoff; /* empty */) {
4339*7c478bd9Sstevel@tonic-gate 		page_t	*pp;
4340*7c478bd9Sstevel@tonic-gate 		u_offset_t	nextrio;
4341*7c478bd9Sstevel@tonic-gate 		se_t	se;
4342*7c478bd9Sstevel@tonic-gate 		int retval;
4343*7c478bd9Sstevel@tonic-gate 
4344*7c478bd9Sstevel@tonic-gate 		se = ((rw == S_CREATE || rw == S_OTHER) ? SE_EXCL : SE_SHARED);
4345*7c478bd9Sstevel@tonic-gate 
4346*7c478bd9Sstevel@tonic-gate 		/* Handle async getpage (faultahead) */
4347*7c478bd9Sstevel@tonic-gate 		if (plarr == NULL) {
4348*7c478bd9Sstevel@tonic-gate 			ip->i_nextrio = pgoff;
4349*7c478bd9Sstevel@tonic-gate 			(void) ufs_getpage_ra(vp, pgoff, seg, pgaddr);
4350*7c478bd9Sstevel@tonic-gate 			pgoff += pgsize;
4351*7c478bd9Sstevel@tonic-gate 			pgaddr += pgsize;
4352*7c478bd9Sstevel@tonic-gate 			continue;
4353*7c478bd9Sstevel@tonic-gate 		}
4354*7c478bd9Sstevel@tonic-gate 		/*
4355*7c478bd9Sstevel@tonic-gate 		 * Check if we should initiate read ahead of next cluster.
4356*7c478bd9Sstevel@tonic-gate 		 * We call page_exists only when we need to confirm that
4357*7c478bd9Sstevel@tonic-gate 		 * we have the current page before we initiate the read ahead.
4358*7c478bd9Sstevel@tonic-gate 		 */
4359*7c478bd9Sstevel@tonic-gate 		nextrio = ip->i_nextrio;
4360*7c478bd9Sstevel@tonic-gate 		if (seqmode &&
4361*7c478bd9Sstevel@tonic-gate 		    pgoff + CLUSTSZ(ip) >= nextrio && pgoff <= nextrio &&
4362*7c478bd9Sstevel@tonic-gate 		    nextrio < ip->i_size && page_exists(vp, pgoff)) {
4363*7c478bd9Sstevel@tonic-gate 			retval = ufs_getpage_ra(vp, pgoff, seg, pgaddr);
4364*7c478bd9Sstevel@tonic-gate 			/*
4365*7c478bd9Sstevel@tonic-gate 			 * We always read ahead the next cluster of data
4366*7c478bd9Sstevel@tonic-gate 			 * starting from i_nextrio. If the page (vp,nextrio)
4367*7c478bd9Sstevel@tonic-gate 			 * is actually in core at this point, the routine
4368*7c478bd9Sstevel@tonic-gate 			 * ufs_getpage_ra() will stop pre-fetching data
4369*7c478bd9Sstevel@tonic-gate 			 * until we read that page in a synchronized manner
4370*7c478bd9Sstevel@tonic-gate 			 * through ufs_getpage_miss(). So, we should increase
4371*7c478bd9Sstevel@tonic-gate 			 * i_nextrio if the page (vp, nextrio) exists.
4372*7c478bd9Sstevel@tonic-gate 			 */
4373*7c478bd9Sstevel@tonic-gate 			if ((retval == 0) && page_exists(vp, nextrio)) {
4374*7c478bd9Sstevel@tonic-gate 				ip->i_nextrio = nextrio + pgsize;
4375*7c478bd9Sstevel@tonic-gate 			}
4376*7c478bd9Sstevel@tonic-gate 		}
4377*7c478bd9Sstevel@tonic-gate 
4378*7c478bd9Sstevel@tonic-gate 		if ((pp = page_lookup(vp, pgoff, se)) != NULL) {
4379*7c478bd9Sstevel@tonic-gate 			/*
4380*7c478bd9Sstevel@tonic-gate 			 * We found the page in the page cache.
4381*7c478bd9Sstevel@tonic-gate 			 */
4382*7c478bd9Sstevel@tonic-gate 			*pl++ = pp;
4383*7c478bd9Sstevel@tonic-gate 			pgoff += pgsize;
4384*7c478bd9Sstevel@tonic-gate 			pgaddr += pgsize;
4385*7c478bd9Sstevel@tonic-gate 			len -= pgsize;
4386*7c478bd9Sstevel@tonic-gate 			plsz -= pgsize;
4387*7c478bd9Sstevel@tonic-gate 		} else  {
4388*7c478bd9Sstevel@tonic-gate 			/*
4389*7c478bd9Sstevel@tonic-gate 			 * We have to create the page, or read it from disk.
4390*7c478bd9Sstevel@tonic-gate 			 */
4391*7c478bd9Sstevel@tonic-gate 			if (err = ufs_getpage_miss(vp, pgoff, len, seg, pgaddr,
4392*7c478bd9Sstevel@tonic-gate 			    pl, plsz, rw, seqmode))
4393*7c478bd9Sstevel@tonic-gate 				goto error;
4394*7c478bd9Sstevel@tonic-gate 
4395*7c478bd9Sstevel@tonic-gate 			while (*pl != NULL) {
4396*7c478bd9Sstevel@tonic-gate 				pl++;
4397*7c478bd9Sstevel@tonic-gate 				pgoff += pgsize;
4398*7c478bd9Sstevel@tonic-gate 				pgaddr += pgsize;
4399*7c478bd9Sstevel@tonic-gate 				len -= pgsize;
4400*7c478bd9Sstevel@tonic-gate 				plsz -= pgsize;
4401*7c478bd9Sstevel@tonic-gate 			}
4402*7c478bd9Sstevel@tonic-gate 		}
4403*7c478bd9Sstevel@tonic-gate 	}
4404*7c478bd9Sstevel@tonic-gate 
4405*7c478bd9Sstevel@tonic-gate 	/*
4406*7c478bd9Sstevel@tonic-gate 	 * Return pages up to plsz if they are in the page cache.
4407*7c478bd9Sstevel@tonic-gate 	 * We cannot return pages if there is a chance that they are
4408*7c478bd9Sstevel@tonic-gate 	 * backed with a UFS hole and rw is S_WRITE or S_CREATE.
4409*7c478bd9Sstevel@tonic-gate 	 */
4410*7c478bd9Sstevel@tonic-gate 	if (plarr && !(has_holes && (rw == S_WRITE || rw == S_CREATE))) {
4411*7c478bd9Sstevel@tonic-gate 
4412*7c478bd9Sstevel@tonic-gate 		ASSERT((protp == NULL) ||
4413*7c478bd9Sstevel@tonic-gate 			!(has_holes && (*protp & PROT_WRITE)));
4414*7c478bd9Sstevel@tonic-gate 
4415*7c478bd9Sstevel@tonic-gate 		eoff = pgoff + plsz;
4416*7c478bd9Sstevel@tonic-gate 		while (pgoff < eoff) {
4417*7c478bd9Sstevel@tonic-gate 			page_t		*pp;
4418*7c478bd9Sstevel@tonic-gate 
4419*7c478bd9Sstevel@tonic-gate 			if ((pp = page_lookup_nowait(vp, pgoff,
4420*7c478bd9Sstevel@tonic-gate 			    SE_SHARED)) == NULL)
4421*7c478bd9Sstevel@tonic-gate 				break;
4422*7c478bd9Sstevel@tonic-gate 
4423*7c478bd9Sstevel@tonic-gate 			*pl++ = pp;
4424*7c478bd9Sstevel@tonic-gate 			pgoff += pgsize;
4425*7c478bd9Sstevel@tonic-gate 			plsz -= pgsize;
4426*7c478bd9Sstevel@tonic-gate 		}
4427*7c478bd9Sstevel@tonic-gate 	}
4428*7c478bd9Sstevel@tonic-gate 
4429*7c478bd9Sstevel@tonic-gate 	if (plarr)
4430*7c478bd9Sstevel@tonic-gate 		*pl = NULL;			/* Terminate page list */
4431*7c478bd9Sstevel@tonic-gate 	ip->i_nextr = pgoff;
4432*7c478bd9Sstevel@tonic-gate 
4433*7c478bd9Sstevel@tonic-gate error:
4434*7c478bd9Sstevel@tonic-gate 	if (err && plarr) {
4435*7c478bd9Sstevel@tonic-gate 		/*
4436*7c478bd9Sstevel@tonic-gate 		 * Release any pages we have locked.
4437*7c478bd9Sstevel@tonic-gate 		 */
4438*7c478bd9Sstevel@tonic-gate 		while (pl > &plarr[0])
4439*7c478bd9Sstevel@tonic-gate 			page_unlock(*--pl);
4440*7c478bd9Sstevel@tonic-gate 
4441*7c478bd9Sstevel@tonic-gate 		plarr[0] = NULL;
4442*7c478bd9Sstevel@tonic-gate 	}
4443*7c478bd9Sstevel@tonic-gate 
4444*7c478bd9Sstevel@tonic-gate update_inode:
4445*7c478bd9Sstevel@tonic-gate 	/*
4446*7c478bd9Sstevel@tonic-gate 	 * If the inode is not already marked for IACC (in rdip() for read)
4447*7c478bd9Sstevel@tonic-gate 	 * and the inode is not marked for no access time update (in wrip()
4448*7c478bd9Sstevel@tonic-gate 	 * for write) then update the inode access time and mod time now.
4449*7c478bd9Sstevel@tonic-gate 	 */
4450*7c478bd9Sstevel@tonic-gate 	if ((ip->i_flag & (IACC | INOACC)) == 0) {
4451*7c478bd9Sstevel@tonic-gate 		if ((rw != S_OTHER) && (ip->i_mode & IFMT) != IFDIR) {
4452*7c478bd9Sstevel@tonic-gate 			if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) &&
4453*7c478bd9Sstevel@tonic-gate 			    (fs->fs_ronly == 0) &&
4454*7c478bd9Sstevel@tonic-gate 			    (!ufsvfsp->vfs_noatime)) {
4455*7c478bd9Sstevel@tonic-gate 				mutex_enter(&ip->i_tlock);
4456*7c478bd9Sstevel@tonic-gate 				ip->i_flag |= IACC;
4457*7c478bd9Sstevel@tonic-gate 				ITIMES_NOLOCK(ip);
4458*7c478bd9Sstevel@tonic-gate 				mutex_exit(&ip->i_tlock);
4459*7c478bd9Sstevel@tonic-gate 			}
4460*7c478bd9Sstevel@tonic-gate 		}
4461*7c478bd9Sstevel@tonic-gate 	}
4462*7c478bd9Sstevel@tonic-gate 
4463*7c478bd9Sstevel@tonic-gate 	if (dolock) {
4464*7c478bd9Sstevel@tonic-gate 		rw_exit(&ip->i_contents);
4465*7c478bd9Sstevel@tonic-gate 		if (do_qlock && rwtype == RW_WRITER)
4466*7c478bd9Sstevel@tonic-gate 			rw_exit(&ufsvfsp->vfs_dqrwlock);
4467*7c478bd9Sstevel@tonic-gate 	}
4468*7c478bd9Sstevel@tonic-gate 
4469*7c478bd9Sstevel@tonic-gate unlock:
4470*7c478bd9Sstevel@tonic-gate 	if (ulp) {
4471*7c478bd9Sstevel@tonic-gate 		if ((rw == S_CREATE || rw == S_WRITE) &&
4472*7c478bd9Sstevel@tonic-gate 		    !(vp->v_flag & VISSWAP)) {
4473*7c478bd9Sstevel@tonic-gate 			TRANS_END_ASYNC(ufsvfsp, TOP_GETPAGE, trans_size);
4474*7c478bd9Sstevel@tonic-gate 		}
4475*7c478bd9Sstevel@tonic-gate 		ufs_lockfs_end(ulp);
4476*7c478bd9Sstevel@tonic-gate 	}
4477*7c478bd9Sstevel@tonic-gate out:
4478*7c478bd9Sstevel@tonic-gate 	TRACE_2(TR_FAC_UFS, TR_UFS_GETPAGE_END,
4479*7c478bd9Sstevel@tonic-gate 		"ufs_getpage_end:vp %p error %d", vp, err);
4480*7c478bd9Sstevel@tonic-gate 	return (err);
4481*7c478bd9Sstevel@tonic-gate }
4482*7c478bd9Sstevel@tonic-gate 
4483*7c478bd9Sstevel@tonic-gate /*
4484*7c478bd9Sstevel@tonic-gate  * ufs_getpage_miss is called when ufs_getpage missed the page in the page
4485*7c478bd9Sstevel@tonic-gate  * cache. The page is either read from the disk, or it's created.
4486*7c478bd9Sstevel@tonic-gate  * A page is created (without disk read) if rw == S_CREATE, or if
4487*7c478bd9Sstevel@tonic-gate  * the page is not backed with a real disk block (UFS hole).
4488*7c478bd9Sstevel@tonic-gate  */
4489*7c478bd9Sstevel@tonic-gate /* ARGSUSED */
4490*7c478bd9Sstevel@tonic-gate static int
4491*7c478bd9Sstevel@tonic-gate ufs_getpage_miss(struct vnode *vp, u_offset_t off, size_t len, struct seg *seg,
4492*7c478bd9Sstevel@tonic-gate 	caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw, int seq)
4493*7c478bd9Sstevel@tonic-gate {
4494*7c478bd9Sstevel@tonic-gate 	struct inode	*ip = VTOI(vp);
4495*7c478bd9Sstevel@tonic-gate 	page_t		*pp;
4496*7c478bd9Sstevel@tonic-gate 	daddr_t		bn;
4497*7c478bd9Sstevel@tonic-gate 	size_t		io_len;
4498*7c478bd9Sstevel@tonic-gate 	int		crpage;
4499*7c478bd9Sstevel@tonic-gate 	int		err;
4500*7c478bd9Sstevel@tonic-gate 	int		contig;
4501*7c478bd9Sstevel@tonic-gate 	int		bsize = ip->i_fs->fs_bsize;
4502*7c478bd9Sstevel@tonic-gate 
4503*7c478bd9Sstevel@tonic-gate 	/*
4504*7c478bd9Sstevel@tonic-gate 	 * Figure out whether the page can be created, or must be
4505*7c478bd9Sstevel@tonic-gate 	 * read from the disk.
4506*7c478bd9Sstevel@tonic-gate 	 */
4507*7c478bd9Sstevel@tonic-gate 	if (rw == S_CREATE)
4508*7c478bd9Sstevel@tonic-gate 		crpage = 1;
4509*7c478bd9Sstevel@tonic-gate 	else {
4510*7c478bd9Sstevel@tonic-gate 		contig = 0;
4511*7c478bd9Sstevel@tonic-gate 		if (err = bmap_read(ip, off, &bn, &contig))
4512*7c478bd9Sstevel@tonic-gate 			return (err);
4513*7c478bd9Sstevel@tonic-gate 		crpage = (bn == UFS_HOLE);
4514*7c478bd9Sstevel@tonic-gate 	}
4515*7c478bd9Sstevel@tonic-gate 
4516*7c478bd9Sstevel@tonic-gate 	if (crpage) {
4517*7c478bd9Sstevel@tonic-gate 		if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT, seg,
4518*7c478bd9Sstevel@tonic-gate 		    addr)) == NULL) {
4519*7c478bd9Sstevel@tonic-gate 			return (ufs_fault(vp,
4520*7c478bd9Sstevel@tonic-gate 				    "ufs_getpage_miss: page_create == NULL"));
4521*7c478bd9Sstevel@tonic-gate 		}
4522*7c478bd9Sstevel@tonic-gate 
4523*7c478bd9Sstevel@tonic-gate 		if (rw != S_CREATE)
4524*7c478bd9Sstevel@tonic-gate 			pagezero(pp, 0, PAGESIZE);
4525*7c478bd9Sstevel@tonic-gate 		io_len = PAGESIZE;
4526*7c478bd9Sstevel@tonic-gate 	} else {
4527*7c478bd9Sstevel@tonic-gate 		u_offset_t	io_off;
4528*7c478bd9Sstevel@tonic-gate 		uint_t	xlen;
4529*7c478bd9Sstevel@tonic-gate 		struct buf	*bp;
4530*7c478bd9Sstevel@tonic-gate 		ufsvfs_t	*ufsvfsp = ip->i_ufsvfs;
4531*7c478bd9Sstevel@tonic-gate 
4532*7c478bd9Sstevel@tonic-gate 		/*
4533*7c478bd9Sstevel@tonic-gate 		 * If access is not in sequential order, we read from disk
4534*7c478bd9Sstevel@tonic-gate 		 * in bsize units.
4535*7c478bd9Sstevel@tonic-gate 		 *
4536*7c478bd9Sstevel@tonic-gate 		 * We limit the size of the transfer to bsize if we are reading
4537*7c478bd9Sstevel@tonic-gate 		 * from the beginning of the file. Note in this situation we
4538*7c478bd9Sstevel@tonic-gate 		 * will hedge our bets and initiate an async read ahead of
4539*7c478bd9Sstevel@tonic-gate 		 * the second block.
4540*7c478bd9Sstevel@tonic-gate 		 */
4541*7c478bd9Sstevel@tonic-gate 		if (!seq || off == 0)
4542*7c478bd9Sstevel@tonic-gate 			contig = MIN(contig, bsize);
4543*7c478bd9Sstevel@tonic-gate 
4544*7c478bd9Sstevel@tonic-gate 		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
4545*7c478bd9Sstevel@tonic-gate 		    &io_len, off, contig, 0);
4546*7c478bd9Sstevel@tonic-gate 
4547*7c478bd9Sstevel@tonic-gate 		/*
4548*7c478bd9Sstevel@tonic-gate 		 * Some other thread has entered the page.
4549*7c478bd9Sstevel@tonic-gate 		 * ufs_getpage will retry page_lookup.
4550*7c478bd9Sstevel@tonic-gate 		 */
4551*7c478bd9Sstevel@tonic-gate 		if (pp == NULL) {
4552*7c478bd9Sstevel@tonic-gate 			pl[0] = NULL;
4553*7c478bd9Sstevel@tonic-gate 			return (0);
4554*7c478bd9Sstevel@tonic-gate 		}
4555*7c478bd9Sstevel@tonic-gate 
4556*7c478bd9Sstevel@tonic-gate 		/*
4557*7c478bd9Sstevel@tonic-gate 		 * Zero part of the page which we are not
4558*7c478bd9Sstevel@tonic-gate 		 * going to read from the disk.
4559*7c478bd9Sstevel@tonic-gate 		 */
4560*7c478bd9Sstevel@tonic-gate 		xlen = io_len & PAGEOFFSET;
4561*7c478bd9Sstevel@tonic-gate 		if (xlen != 0)
4562*7c478bd9Sstevel@tonic-gate 			pagezero(pp->p_prev, xlen, PAGESIZE - xlen);
4563*7c478bd9Sstevel@tonic-gate 
4564*7c478bd9Sstevel@tonic-gate 		bp = pageio_setup(pp, io_len, ip->i_devvp, B_READ);
4565*7c478bd9Sstevel@tonic-gate 		bp->b_edev = ip->i_dev;
4566*7c478bd9Sstevel@tonic-gate 		bp->b_dev = cmpdev(ip->i_dev);
4567*7c478bd9Sstevel@tonic-gate 		bp->b_blkno = bn;
4568*7c478bd9Sstevel@tonic-gate 		bp->b_un.b_addr = (caddr_t)0;
4569*7c478bd9Sstevel@tonic-gate 		bp->b_file = ip->i_vnode;
4570*7c478bd9Sstevel@tonic-gate 		bp->b_offset = off;
4571*7c478bd9Sstevel@tonic-gate 
4572*7c478bd9Sstevel@tonic-gate 		if (ufsvfsp->vfs_log) {
4573*7c478bd9Sstevel@tonic-gate 			lufs_read_strategy(ufsvfsp->vfs_log, bp);
4574*7c478bd9Sstevel@tonic-gate 		} else if (ufsvfsp->vfs_snapshot) {
4575*7c478bd9Sstevel@tonic-gate 			fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
4576*7c478bd9Sstevel@tonic-gate 		} else {
4577*7c478bd9Sstevel@tonic-gate 			ufsvfsp->vfs_iotstamp = lbolt;
4578*7c478bd9Sstevel@tonic-gate 			ub.ub_getpages.value.ul++;
4579*7c478bd9Sstevel@tonic-gate 			(void) bdev_strategy(bp);
4580*7c478bd9Sstevel@tonic-gate 			lwp_stat_update(LWP_STAT_INBLK, 1);
4581*7c478bd9Sstevel@tonic-gate 		}
4582*7c478bd9Sstevel@tonic-gate 
4583*7c478bd9Sstevel@tonic-gate 		ip->i_nextrio = off + ((io_len + PAGESIZE - 1) & PAGEMASK);
4584*7c478bd9Sstevel@tonic-gate 
4585*7c478bd9Sstevel@tonic-gate 		/*
4586*7c478bd9Sstevel@tonic-gate 		 * If the file access is sequential, initiate read ahead
4587*7c478bd9Sstevel@tonic-gate 		 * of the next cluster.
4588*7c478bd9Sstevel@tonic-gate 		 */
4589*7c478bd9Sstevel@tonic-gate 		if (seq && ip->i_nextrio < ip->i_size)
4590*7c478bd9Sstevel@tonic-gate 			(void) ufs_getpage_ra(vp, off, seg, addr);
4591*7c478bd9Sstevel@tonic-gate 		err = biowait(bp);
4592*7c478bd9Sstevel@tonic-gate 		pageio_done(bp);
4593*7c478bd9Sstevel@tonic-gate 
4594*7c478bd9Sstevel@tonic-gate 		if (err) {
4595*7c478bd9Sstevel@tonic-gate 			pvn_read_done(pp, B_ERROR);
4596*7c478bd9Sstevel@tonic-gate 			return (err);
4597*7c478bd9Sstevel@tonic-gate 		}
4598*7c478bd9Sstevel@tonic-gate 	}
4599*7c478bd9Sstevel@tonic-gate 
4600*7c478bd9Sstevel@tonic-gate 	pvn_plist_init(pp, pl, plsz, off, io_len, rw);
4601*7c478bd9Sstevel@tonic-gate 	return (0);
4602*7c478bd9Sstevel@tonic-gate }
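
/*
 * Worked example of the tail zeroing above (the 8K page size is an
 * assumption): if pvn_read_kluster() returns io_len = 20K, then
 * xlen = 20K & PAGEOFFSET = 4K, and pagezero(pp->p_prev, 4K, 4K)
 * clears the second half of the last page, which the 20K disk read
 * will not fill.
 */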
4603*7c478bd9Sstevel@tonic-gate 
4604*7c478bd9Sstevel@tonic-gate /*
4605*7c478bd9Sstevel@tonic-gate  * Read ahead a cluster from the disk. Returns the length in bytes.
4606*7c478bd9Sstevel@tonic-gate  */
4607*7c478bd9Sstevel@tonic-gate static int
4608*7c478bd9Sstevel@tonic-gate ufs_getpage_ra(struct vnode *vp, u_offset_t off, struct seg *seg, caddr_t addr)
4609*7c478bd9Sstevel@tonic-gate {
4610*7c478bd9Sstevel@tonic-gate 	struct inode	*ip = VTOI(vp);
4611*7c478bd9Sstevel@tonic-gate 	page_t		*pp;
4612*7c478bd9Sstevel@tonic-gate 	u_offset_t	io_off = ip->i_nextrio;
4613*7c478bd9Sstevel@tonic-gate 	ufsvfs_t	*ufsvfsp;
4614*7c478bd9Sstevel@tonic-gate 	caddr_t		addr2 = addr + (io_off - off);
4615*7c478bd9Sstevel@tonic-gate 	struct buf	*bp;
4616*7c478bd9Sstevel@tonic-gate 	daddr_t		bn;
4617*7c478bd9Sstevel@tonic-gate 	size_t		io_len;
4618*7c478bd9Sstevel@tonic-gate 	int		contig;
4619*7c478bd9Sstevel@tonic-gate 	int		xlen;
4620*7c478bd9Sstevel@tonic-gate 	int		bsize = ip->i_fs->fs_bsize;
4621*7c478bd9Sstevel@tonic-gate 
4622*7c478bd9Sstevel@tonic-gate 	/*
4623*7c478bd9Sstevel@tonic-gate 	 * If the directio advisory is in effect on this file,
4624*7c478bd9Sstevel@tonic-gate 	 * then do not do buffered read ahead. Read ahead makes
4625*7c478bd9Sstevel@tonic-gate 	 * life harder for threads using directio, as they
4626*7c478bd9Sstevel@tonic-gate 	 * will be forced to flush the pages from this vnode.
4627*7c478bd9Sstevel@tonic-gate 	 */
4628*7c478bd9Sstevel@tonic-gate 	if ((ufsvfsp = ip->i_ufsvfs) == NULL)
4629*7c478bd9Sstevel@tonic-gate 		return (0);
4630*7c478bd9Sstevel@tonic-gate 	if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio)
4631*7c478bd9Sstevel@tonic-gate 		return (0);
4632*7c478bd9Sstevel@tonic-gate 
4633*7c478bd9Sstevel@tonic-gate 	/*
4634*7c478bd9Sstevel@tonic-gate 	 * Is this test needed?
4635*7c478bd9Sstevel@tonic-gate 	 */
4636*7c478bd9Sstevel@tonic-gate 	if (addr2 >= seg->s_base + seg->s_size)
4637*7c478bd9Sstevel@tonic-gate 		return (0);
4638*7c478bd9Sstevel@tonic-gate 
4639*7c478bd9Sstevel@tonic-gate 	contig = 0;
4640*7c478bd9Sstevel@tonic-gate 	if (bmap_read(ip, io_off, &bn, &contig) != 0 || bn == UFS_HOLE)
4641*7c478bd9Sstevel@tonic-gate 		return (0);
4642*7c478bd9Sstevel@tonic-gate 
4643*7c478bd9Sstevel@tonic-gate 	/*
4644*7c478bd9Sstevel@tonic-gate 	 * Limit the transfer size to bsize if this is the 2nd block.
4645*7c478bd9Sstevel@tonic-gate 	 */
4646*7c478bd9Sstevel@tonic-gate 	if (io_off == (u_offset_t)bsize)
4647*7c478bd9Sstevel@tonic-gate 		contig = MIN(contig, bsize);
4648*7c478bd9Sstevel@tonic-gate 
4649*7c478bd9Sstevel@tonic-gate 	if ((pp = pvn_read_kluster(vp, io_off, seg, addr2, &io_off,
4650*7c478bd9Sstevel@tonic-gate 	    &io_len, io_off, contig, 1)) == NULL)
4651*7c478bd9Sstevel@tonic-gate 		return (0);
4652*7c478bd9Sstevel@tonic-gate 
4653*7c478bd9Sstevel@tonic-gate 	/*
4654*7c478bd9Sstevel@tonic-gate 	 * Zero part of page which we are not going to read from disk
4655*7c478bd9Sstevel@tonic-gate 	 */
4656*7c478bd9Sstevel@tonic-gate 	if ((xlen = (io_len & PAGEOFFSET)) > 0)
4657*7c478bd9Sstevel@tonic-gate 		pagezero(pp->p_prev, xlen, PAGESIZE - xlen);
4658*7c478bd9Sstevel@tonic-gate 
4659*7c478bd9Sstevel@tonic-gate 	ip->i_nextrio = (io_off + io_len + PAGESIZE - 1) & PAGEMASK;
4660*7c478bd9Sstevel@tonic-gate 
4661*7c478bd9Sstevel@tonic-gate 	bp = pageio_setup(pp, io_len, ip->i_devvp, B_READ | B_ASYNC);
4662*7c478bd9Sstevel@tonic-gate 	bp->b_edev = ip->i_dev;
4663*7c478bd9Sstevel@tonic-gate 	bp->b_dev = cmpdev(ip->i_dev);
4664*7c478bd9Sstevel@tonic-gate 	bp->b_blkno = bn;
4665*7c478bd9Sstevel@tonic-gate 	bp->b_un.b_addr = (caddr_t)0;
4666*7c478bd9Sstevel@tonic-gate 	bp->b_file = ip->i_vnode;
4667*7c478bd9Sstevel@tonic-gate 	bp->b_offset = off;
4668*7c478bd9Sstevel@tonic-gate 
4669*7c478bd9Sstevel@tonic-gate 	if (ufsvfsp->vfs_log) {
4670*7c478bd9Sstevel@tonic-gate 		lufs_read_strategy(ufsvfsp->vfs_log, bp);
4671*7c478bd9Sstevel@tonic-gate 	} else if (ufsvfsp->vfs_snapshot) {
4672*7c478bd9Sstevel@tonic-gate 		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
4673*7c478bd9Sstevel@tonic-gate 	} else {
4674*7c478bd9Sstevel@tonic-gate 		ufsvfsp->vfs_iotstamp = lbolt;
4675*7c478bd9Sstevel@tonic-gate 		ub.ub_getras.value.ul++;
4676*7c478bd9Sstevel@tonic-gate 		(void) bdev_strategy(bp);
4677*7c478bd9Sstevel@tonic-gate 		lwp_stat_update(LWP_STAT_INBLK, 1);
4678*7c478bd9Sstevel@tonic-gate 	}
4679*7c478bd9Sstevel@tonic-gate 
4680*7c478bd9Sstevel@tonic-gate 	return (io_len);
4681*7c478bd9Sstevel@tonic-gate }
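
/*
 * Worked example of the i_nextrio rounding above (the 8K page size is
 * an assumption): io_off = 16K and io_len = 20K give
 * (16K + 20K + 8K - 1) & PAGEMASK = 40K, i.e. the next read ahead
 * starts at the first page boundary at or beyond the end of this I/O.
 */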
4682*7c478bd9Sstevel@tonic-gate 
4683*7c478bd9Sstevel@tonic-gate int	ufs_delay = 1;
4684*7c478bd9Sstevel@tonic-gate /*
4685*7c478bd9Sstevel@tonic-gate  * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE, B_ASYNC}
4686*7c478bd9Sstevel@tonic-gate  *
4687*7c478bd9Sstevel@tonic-gate  * LMXXX - the inode really ought to contain a pointer to one of these
4688*7c478bd9Sstevel@tonic-gate  * async args.  Stuff gunk in there and just hand the whole mess off.
4689*7c478bd9Sstevel@tonic-gate  * This would replace i_delaylen, i_delayoff.
4690*7c478bd9Sstevel@tonic-gate  */
4691*7c478bd9Sstevel@tonic-gate /*ARGSUSED*/
4692*7c478bd9Sstevel@tonic-gate static int
4693*7c478bd9Sstevel@tonic-gate ufs_putpage(struct vnode *vp, offset_t off, size_t len, int flags,
4694*7c478bd9Sstevel@tonic-gate 	struct cred *cr)
4695*7c478bd9Sstevel@tonic-gate {
4696*7c478bd9Sstevel@tonic-gate 	struct inode *ip = VTOI(vp);
4697*7c478bd9Sstevel@tonic-gate 	int err = 0;
4698*7c478bd9Sstevel@tonic-gate 
4699*7c478bd9Sstevel@tonic-gate 	if (vp->v_count == 0) {
4700*7c478bd9Sstevel@tonic-gate 		return (ufs_fault(vp, "ufs_putpage: bad v_count == 0"));
4701*7c478bd9Sstevel@tonic-gate 	}
4702*7c478bd9Sstevel@tonic-gate 
4703*7c478bd9Sstevel@tonic-gate 	TRACE_1(TR_FAC_UFS, TR_UFS_PUTPAGE_START,
4704*7c478bd9Sstevel@tonic-gate 		"ufs_putpage_start:vp %p", vp);
4705*7c478bd9Sstevel@tonic-gate 
4706*7c478bd9Sstevel@tonic-gate 	/*
4707*7c478bd9Sstevel@tonic-gate 	 * XXX - Why should this check be made here?
4708*7c478bd9Sstevel@tonic-gate 	 */
4709*7c478bd9Sstevel@tonic-gate 	if (vp->v_flag & VNOMAP) {
4710*7c478bd9Sstevel@tonic-gate 		err = ENOSYS;
4711*7c478bd9Sstevel@tonic-gate 		goto errout;
4712*7c478bd9Sstevel@tonic-gate 	}
4713*7c478bd9Sstevel@tonic-gate 
4714*7c478bd9Sstevel@tonic-gate 	if (ip->i_ufsvfs == NULL) {
4715*7c478bd9Sstevel@tonic-gate 		err = EIO;
4716*7c478bd9Sstevel@tonic-gate 		goto errout;
4717*7c478bd9Sstevel@tonic-gate 	}
4718*7c478bd9Sstevel@tonic-gate 
4719*7c478bd9Sstevel@tonic-gate 	if (flags & B_ASYNC) {
4720*7c478bd9Sstevel@tonic-gate 		if (ufs_delay && len &&
4721*7c478bd9Sstevel@tonic-gate 		    (flags & ~(B_ASYNC|B_DONTNEED|B_FREE)) == 0) {
4722*7c478bd9Sstevel@tonic-gate 			mutex_enter(&ip->i_tlock);
4723*7c478bd9Sstevel@tonic-gate 			/*
4724*7c478bd9Sstevel@tonic-gate 			 * If nobody stalled, start a new cluster.
4725*7c478bd9Sstevel@tonic-gate 			 */
4726*7c478bd9Sstevel@tonic-gate 			if (ip->i_delaylen == 0) {
4727*7c478bd9Sstevel@tonic-gate 				ip->i_delayoff = off;
4728*7c478bd9Sstevel@tonic-gate 				ip->i_delaylen = len;
4729*7c478bd9Sstevel@tonic-gate 				mutex_exit(&ip->i_tlock);
4730*7c478bd9Sstevel@tonic-gate 				goto errout;
4731*7c478bd9Sstevel@tonic-gate 			}
4732*7c478bd9Sstevel@tonic-gate 			/*
4733*7c478bd9Sstevel@tonic-gate 			 * If we have a full cluster or they are not contig,
4734*7c478bd9Sstevel@tonic-gate 			 * then push last cluster and start over.
4735*7c478bd9Sstevel@tonic-gate 			 */
4736*7c478bd9Sstevel@tonic-gate 			if (ip->i_delaylen >= CLUSTSZ(ip) ||
4737*7c478bd9Sstevel@tonic-gate 			    ip->i_delayoff + ip->i_delaylen != off) {
4738*7c478bd9Sstevel@tonic-gate 				u_offset_t doff;
4739*7c478bd9Sstevel@tonic-gate 				size_t dlen;
4740*7c478bd9Sstevel@tonic-gate 
4741*7c478bd9Sstevel@tonic-gate 				doff = ip->i_delayoff;
4742*7c478bd9Sstevel@tonic-gate 				dlen = ip->i_delaylen;
4743*7c478bd9Sstevel@tonic-gate 				ip->i_delayoff = off;
4744*7c478bd9Sstevel@tonic-gate 				ip->i_delaylen = len;
4745*7c478bd9Sstevel@tonic-gate 				mutex_exit(&ip->i_tlock);
4746*7c478bd9Sstevel@tonic-gate 				err = ufs_putpages(vp, doff, dlen,
4747*7c478bd9Sstevel@tonic-gate 				    flags, cr);
4748*7c478bd9Sstevel@tonic-gate 				/* LMXXX - flags are new val, not old */
4749*7c478bd9Sstevel@tonic-gate 				goto errout;
4750*7c478bd9Sstevel@tonic-gate 			}
4751*7c478bd9Sstevel@tonic-gate 			/*
4752*7c478bd9Sstevel@tonic-gate 			 * There is something there, it's not full, and
4753*7c478bd9Sstevel@tonic-gate 			 * it is contig.
4754*7c478bd9Sstevel@tonic-gate 			 */
4755*7c478bd9Sstevel@tonic-gate 			ip->i_delaylen += len;
4756*7c478bd9Sstevel@tonic-gate 			mutex_exit(&ip->i_tlock);
4757*7c478bd9Sstevel@tonic-gate 			goto errout;
4758*7c478bd9Sstevel@tonic-gate 		}
4759*7c478bd9Sstevel@tonic-gate 		/*
4760*7c478bd9Sstevel@tonic-gate 		 * Must have weird flags or we are not clustering.
4761*7c478bd9Sstevel@tonic-gate 		 */
4762*7c478bd9Sstevel@tonic-gate 	}
4763*7c478bd9Sstevel@tonic-gate 
4764*7c478bd9Sstevel@tonic-gate 	err = ufs_putpages(vp, off, len, flags, cr);
4765*7c478bd9Sstevel@tonic-gate 
4766*7c478bd9Sstevel@tonic-gate errout:
4767*7c478bd9Sstevel@tonic-gate 	TRACE_2(TR_FAC_UFS, TR_UFS_PUTPAGE_END,
4768*7c478bd9Sstevel@tonic-gate 		"ufs_putpage_end:vp %p error %d", vp, err);
4769*7c478bd9Sstevel@tonic-gate 	return (err);
4770*7c478bd9Sstevel@tonic-gate }
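
/*
 * Illustrative sketch of the delayed-write clustering above, not part
 * of the original source (offsets assume an 8K page size):
 *
 *	VOP_PUTPAGE(vp, 0, 8K, B_ASYNC, cr);	starts a cluster
 *	VOP_PUTPAGE(vp, 8K, 8K, B_ASYNC, cr);	contig: extends it to 16K
 *	VOP_PUTPAGE(vp, 24K, 8K, B_ASYNC, cr);	not contig: pushes 0-16K
 *						and starts over at 24K
 *
 * Small asynchronous pushes are thus batched up to CLUSTSZ(ip) before
 * any real I/O is issued.
 */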
4771*7c478bd9Sstevel@tonic-gate 
4772*7c478bd9Sstevel@tonic-gate /*
4773*7c478bd9Sstevel@tonic-gate  * If len == 0, do from off to EOF.
4774*7c478bd9Sstevel@tonic-gate  *
4775*7c478bd9Sstevel@tonic-gate  * The normal cases should be len == 0 & off == 0 (entire vp list),
4776*7c478bd9Sstevel@tonic-gate  * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
4777*7c478bd9Sstevel@tonic-gate  * (from pageout).
4778*7c478bd9Sstevel@tonic-gate  */
4779*7c478bd9Sstevel@tonic-gate /*ARGSUSED*/
4780*7c478bd9Sstevel@tonic-gate static int
4781*7c478bd9Sstevel@tonic-gate ufs_putpages(
4782*7c478bd9Sstevel@tonic-gate 	struct vnode *vp,
4783*7c478bd9Sstevel@tonic-gate 	offset_t off,
4784*7c478bd9Sstevel@tonic-gate 	size_t len,
4785*7c478bd9Sstevel@tonic-gate 	int flags,
4786*7c478bd9Sstevel@tonic-gate 	struct cred *cr)
4787*7c478bd9Sstevel@tonic-gate {
4788*7c478bd9Sstevel@tonic-gate 	u_offset_t io_off;
4789*7c478bd9Sstevel@tonic-gate 	u_offset_t eoff;
4790*7c478bd9Sstevel@tonic-gate 	struct inode *ip = VTOI(vp);
4791*7c478bd9Sstevel@tonic-gate 	page_t *pp;
4792*7c478bd9Sstevel@tonic-gate 	size_t io_len;
4793*7c478bd9Sstevel@tonic-gate 	int err = 0;
4794*7c478bd9Sstevel@tonic-gate 	int dolock;
4795*7c478bd9Sstevel@tonic-gate 
4796*7c478bd9Sstevel@tonic-gate 	if (vp->v_count == 0)
4797*7c478bd9Sstevel@tonic-gate 		return (ufs_fault(vp, "ufs_putpages: v_count == 0"));
4798*7c478bd9Sstevel@tonic-gate 	/*
4799*7c478bd9Sstevel@tonic-gate 	 * Acquire the readers/writer inode lock before locking
4800*7c478bd9Sstevel@tonic-gate 	 * any pages in this inode.
4801*7c478bd9Sstevel@tonic-gate 	 * The inode lock is held during i/o.
4802*7c478bd9Sstevel@tonic-gate 	 */
4803*7c478bd9Sstevel@tonic-gate 	if (len == 0) {
4804*7c478bd9Sstevel@tonic-gate 		mutex_enter(&ip->i_tlock);
4805*7c478bd9Sstevel@tonic-gate 		ip->i_delayoff = ip->i_delaylen = 0;
4806*7c478bd9Sstevel@tonic-gate 		mutex_exit(&ip->i_tlock);
4807*7c478bd9Sstevel@tonic-gate 	}
4808*7c478bd9Sstevel@tonic-gate 	dolock = (rw_owner(&ip->i_contents) != curthread);
4809*7c478bd9Sstevel@tonic-gate 	if (dolock) {
4810*7c478bd9Sstevel@tonic-gate 		/*
4811*7c478bd9Sstevel@tonic-gate 		 * Must synchronize this thread and any possible thread
4812*7c478bd9Sstevel@tonic-gate 		 * operating in the window of vulnerability in wrip().
4813*7c478bd9Sstevel@tonic-gate 		 * It is dangerous to allow both a thread doing a putpage
4814*7c478bd9Sstevel@tonic-gate 		 * and a thread writing, so serialize them.  The exception
4815*7c478bd9Sstevel@tonic-gate 		 * is when the thread in wrip() does something which causes
4816*7c478bd9Sstevel@tonic-gate 		 * a putpage operation.  Then, the thread must be allowed
4817*7c478bd9Sstevel@tonic-gate 		 * to continue.  It may encounter a bmap_read problem in
4818*7c478bd9Sstevel@tonic-gate 		 * ufs_putapage, but that is handled in ufs_putapage.
4819*7c478bd9Sstevel@tonic-gate 		 * Allow async writers to proceed, we don't want to block
4820*7c478bd9Sstevel@tonic-gate 		 * the pageout daemon.
4821*7c478bd9Sstevel@tonic-gate 		 */
4822*7c478bd9Sstevel@tonic-gate 		if (ip->i_writer == curthread)
4823*7c478bd9Sstevel@tonic-gate 			rw_enter(&ip->i_contents, RW_READER);
4824*7c478bd9Sstevel@tonic-gate 		else {
4825*7c478bd9Sstevel@tonic-gate 			for (;;) {
4826*7c478bd9Sstevel@tonic-gate 				rw_enter(&ip->i_contents, RW_READER);
4827*7c478bd9Sstevel@tonic-gate 				mutex_enter(&ip->i_tlock);
4828*7c478bd9Sstevel@tonic-gate 				/*
4829*7c478bd9Sstevel@tonic-gate 				 * If there is no thread in the critical
4830*7c478bd9Sstevel@tonic-gate 				 * section of wrip(), then proceed.
4831*7c478bd9Sstevel@tonic-gate 				 * Otherwise, wait until there isn't one.
4832*7c478bd9Sstevel@tonic-gate 				 */
4833*7c478bd9Sstevel@tonic-gate 				if (ip->i_writer == NULL) {
4834*7c478bd9Sstevel@tonic-gate 					mutex_exit(&ip->i_tlock);
4835*7c478bd9Sstevel@tonic-gate 					break;
4836*7c478bd9Sstevel@tonic-gate 				}
4837*7c478bd9Sstevel@tonic-gate 				rw_exit(&ip->i_contents);
4838*7c478bd9Sstevel@tonic-gate 				/*
4839*7c478bd9Sstevel@tonic-gate 				 * Bounce async writers when we have a writer
4840*7c478bd9Sstevel@tonic-gate 				 * working on this file so we don't deadlock
4841*7c478bd9Sstevel@tonic-gate 				 * the pageout daemon.
4842*7c478bd9Sstevel@tonic-gate 				 */
4843*7c478bd9Sstevel@tonic-gate 				if (flags & B_ASYNC) {
4844*7c478bd9Sstevel@tonic-gate 					mutex_exit(&ip->i_tlock);
4845*7c478bd9Sstevel@tonic-gate 					return (0);
4846*7c478bd9Sstevel@tonic-gate 				}
4847*7c478bd9Sstevel@tonic-gate 				cv_wait(&ip->i_wrcv, &ip->i_tlock);
4848*7c478bd9Sstevel@tonic-gate 				mutex_exit(&ip->i_tlock);
4849*7c478bd9Sstevel@tonic-gate 			}
4850*7c478bd9Sstevel@tonic-gate 		}
4851*7c478bd9Sstevel@tonic-gate 	}
4852*7c478bd9Sstevel@tonic-gate 
4853*7c478bd9Sstevel@tonic-gate 	if (!vn_has_cached_data(vp)) {
4854*7c478bd9Sstevel@tonic-gate 		if (dolock)
4855*7c478bd9Sstevel@tonic-gate 			rw_exit(&ip->i_contents);
4856*7c478bd9Sstevel@tonic-gate 		return (0);
4857*7c478bd9Sstevel@tonic-gate 	}
4858*7c478bd9Sstevel@tonic-gate 
4859*7c478bd9Sstevel@tonic-gate 	if (len == 0) {
4860*7c478bd9Sstevel@tonic-gate 		/*
4861*7c478bd9Sstevel@tonic-gate 		 * Search the entire vp list for pages >= off.
4862*7c478bd9Sstevel@tonic-gate 		 */
4863*7c478bd9Sstevel@tonic-gate 		err = pvn_vplist_dirty(vp, (u_offset_t)off, ufs_putapage,
4864*7c478bd9Sstevel@tonic-gate 					flags, cr);
4865*7c478bd9Sstevel@tonic-gate 	} else {
4866*7c478bd9Sstevel@tonic-gate 		/*
4867*7c478bd9Sstevel@tonic-gate 		 * Loop over all offsets in the range looking for
4868*7c478bd9Sstevel@tonic-gate 		 * pages to deal with.
4869*7c478bd9Sstevel@tonic-gate 		 */
4870*7c478bd9Sstevel@tonic-gate 		if ((eoff = blkroundup(ip->i_fs, ip->i_size)) != 0)
4871*7c478bd9Sstevel@tonic-gate 			eoff = MIN(off + len, eoff);
4872*7c478bd9Sstevel@tonic-gate 		else
4873*7c478bd9Sstevel@tonic-gate 			eoff = off + len;
4874*7c478bd9Sstevel@tonic-gate 
4875*7c478bd9Sstevel@tonic-gate 		for (io_off = off; io_off < eoff; io_off += io_len) {
4876*7c478bd9Sstevel@tonic-gate 			/*
4877*7c478bd9Sstevel@tonic-gate 			 * If we are not invalidating, synchronously
4878*7c478bd9Sstevel@tonic-gate 			 * freeing or writing pages, use the routine
4879*7c478bd9Sstevel@tonic-gate 			 * page_lookup_nowait() to prevent reclaiming
4880*7c478bd9Sstevel@tonic-gate 			 * them from the free list.
4881*7c478bd9Sstevel@tonic-gate 			 */
4882*7c478bd9Sstevel@tonic-gate 			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
4883*7c478bd9Sstevel@tonic-gate 				pp = page_lookup(vp, io_off,
4884*7c478bd9Sstevel@tonic-gate 					(flags & (B_INVAL | B_FREE)) ?
4885*7c478bd9Sstevel@tonic-gate 					    SE_EXCL : SE_SHARED);
4886*7c478bd9Sstevel@tonic-gate 			} else {
4887*7c478bd9Sstevel@tonic-gate 				pp = page_lookup_nowait(vp, io_off,
4888*7c478bd9Sstevel@tonic-gate 					(flags & B_FREE) ? SE_EXCL : SE_SHARED);
4889*7c478bd9Sstevel@tonic-gate 			}
4890*7c478bd9Sstevel@tonic-gate 
4891*7c478bd9Sstevel@tonic-gate 			if (pp == NULL || pvn_getdirty(pp, flags) == 0)
4892*7c478bd9Sstevel@tonic-gate 				io_len = PAGESIZE;
4893*7c478bd9Sstevel@tonic-gate 			else {
4894*7c478bd9Sstevel@tonic-gate 				u_offset_t *io_offp = &io_off;
4895*7c478bd9Sstevel@tonic-gate 
4896*7c478bd9Sstevel@tonic-gate 				err = ufs_putapage(vp, pp, io_offp, &io_len,
4897*7c478bd9Sstevel@tonic-gate 				    flags, cr);
4898*7c478bd9Sstevel@tonic-gate 				if (err != 0)
4899*7c478bd9Sstevel@tonic-gate 					break;
4900*7c478bd9Sstevel@tonic-gate 				/*
4901*7c478bd9Sstevel@tonic-gate 				 * "io_off" and "io_len" are returned as
4902*7c478bd9Sstevel@tonic-gate 				 * the range of pages we actually wrote.
4903*7c478bd9Sstevel@tonic-gate 				 * This allows us to skip ahead more quickly
4904*7c478bd9Sstevel@tonic-gate 				 * since several pages may've been dealt
4905*7c478bd9Sstevel@tonic-gate 				 * with by this iteration of the loop.
4906*7c478bd9Sstevel@tonic-gate 				 */
4907*7c478bd9Sstevel@tonic-gate 			}
4908*7c478bd9Sstevel@tonic-gate 		}
4909*7c478bd9Sstevel@tonic-gate 	}
4910*7c478bd9Sstevel@tonic-gate 	if (err == 0 && off == 0 && (len == 0 || len >= ip->i_size)) {
4911*7c478bd9Sstevel@tonic-gate 		/*
4912*7c478bd9Sstevel@tonic-gate 		 * We have just sync'ed back all the pages on
4913*7c478bd9Sstevel@tonic-gate 		 * the inode, turn off the IMODTIME flag.
4914*7c478bd9Sstevel@tonic-gate 		 */
4915*7c478bd9Sstevel@tonic-gate 		mutex_enter(&ip->i_tlock);
4916*7c478bd9Sstevel@tonic-gate 		ip->i_flag &= ~IMODTIME;
4917*7c478bd9Sstevel@tonic-gate 		mutex_exit(&ip->i_tlock);
4918*7c478bd9Sstevel@tonic-gate 	}
4919*7c478bd9Sstevel@tonic-gate 	if (dolock)
4920*7c478bd9Sstevel@tonic-gate 		rw_exit(&ip->i_contents);
4921*7c478bd9Sstevel@tonic-gate 	return (err);
4922*7c478bd9Sstevel@tonic-gate }
4923*7c478bd9Sstevel@tonic-gate 
4924*7c478bd9Sstevel@tonic-gate static void
4925*7c478bd9Sstevel@tonic-gate ufs_iodone(buf_t *bp)
4926*7c478bd9Sstevel@tonic-gate {
4927*7c478bd9Sstevel@tonic-gate 	struct inode *ip;
4928*7c478bd9Sstevel@tonic-gate 
4929*7c478bd9Sstevel@tonic-gate 	ASSERT((bp->b_pages->p_vnode != NULL) && !(bp->b_flags & B_READ));
4930*7c478bd9Sstevel@tonic-gate 
4931*7c478bd9Sstevel@tonic-gate 	bp->b_iodone = NULL;
4932*7c478bd9Sstevel@tonic-gate 
4933*7c478bd9Sstevel@tonic-gate 	ip = VTOI(bp->b_pages->p_vnode);
4934*7c478bd9Sstevel@tonic-gate 
4935*7c478bd9Sstevel@tonic-gate 	mutex_enter(&ip->i_tlock);
4936*7c478bd9Sstevel@tonic-gate 	if (ip->i_writes >= ufs_LW) {
4937*7c478bd9Sstevel@tonic-gate 		if ((ip->i_writes -= bp->b_bcount) <= ufs_LW)
4938*7c478bd9Sstevel@tonic-gate 			if (ufs_WRITES)
4939*7c478bd9Sstevel@tonic-gate 				cv_broadcast(&ip->i_wrcv); /* wake all up */
4940*7c478bd9Sstevel@tonic-gate 	} else {
4941*7c478bd9Sstevel@tonic-gate 		ip->i_writes -= bp->b_bcount;
4942*7c478bd9Sstevel@tonic-gate 	}
4943*7c478bd9Sstevel@tonic-gate 
4944*7c478bd9Sstevel@tonic-gate 	mutex_exit(&ip->i_tlock);
4945*7c478bd9Sstevel@tonic-gate 	iodone(bp);
4946*7c478bd9Sstevel@tonic-gate }
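
/*
 * Note on the wakeup above: ufs_LW and ufs_HW are the low and high
 * water marks, in bytes, for outstanding write I/O per inode. The
 * write path stalls new writers on i_wrcv once i_writes climbs past
 * the high water mark, and the cv_broadcast() here releases them when
 * completions bring i_writes back down to ufs_LW or below.
 */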
4947*7c478bd9Sstevel@tonic-gate 
4948*7c478bd9Sstevel@tonic-gate /*
4949*7c478bd9Sstevel@tonic-gate  * Write out a single page, possibly klustering adjacent
4950*7c478bd9Sstevel@tonic-gate  * dirty pages.  The inode lock must be held.
4951*7c478bd9Sstevel@tonic-gate  *
4952*7c478bd9Sstevel@tonic-gate  * LMXXX - bsize < pagesize not done.
4953*7c478bd9Sstevel@tonic-gate  */
4954*7c478bd9Sstevel@tonic-gate /*ARGSUSED*/
4955*7c478bd9Sstevel@tonic-gate int
4956*7c478bd9Sstevel@tonic-gate ufs_putapage(
4957*7c478bd9Sstevel@tonic-gate 	struct vnode *vp,
4958*7c478bd9Sstevel@tonic-gate 	page_t *pp,
4959*7c478bd9Sstevel@tonic-gate 	u_offset_t *offp,
4960*7c478bd9Sstevel@tonic-gate 	size_t *lenp,		/* return values */
4961*7c478bd9Sstevel@tonic-gate 	int flags,
4962*7c478bd9Sstevel@tonic-gate 	struct cred *cr)
4963*7c478bd9Sstevel@tonic-gate {
4964*7c478bd9Sstevel@tonic-gate 	u_offset_t io_off;
4965*7c478bd9Sstevel@tonic-gate 	u_offset_t off;
4966*7c478bd9Sstevel@tonic-gate 	struct inode *ip = VTOI(vp);
4967*7c478bd9Sstevel@tonic-gate 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
4968*7c478bd9Sstevel@tonic-gate 	struct fs *fs;
4969*7c478bd9Sstevel@tonic-gate 	struct buf *bp;
4970*7c478bd9Sstevel@tonic-gate 	size_t io_len;
4971*7c478bd9Sstevel@tonic-gate 	daddr_t bn;
4972*7c478bd9Sstevel@tonic-gate 	int err;
4973*7c478bd9Sstevel@tonic-gate 	int contig;
4974*7c478bd9Sstevel@tonic-gate 
4975*7c478bd9Sstevel@tonic-gate 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
4976*7c478bd9Sstevel@tonic-gate 
4977*7c478bd9Sstevel@tonic-gate 	TRACE_1(TR_FAC_UFS, TR_UFS_PUTAPAGE_START,
4978*7c478bd9Sstevel@tonic-gate 		"ufs_putapage_start:vp %p", vp);
4979*7c478bd9Sstevel@tonic-gate 
4980*7c478bd9Sstevel@tonic-gate 	if (ufsvfsp == NULL) {
4981*7c478bd9Sstevel@tonic-gate 		err = EIO;
4982*7c478bd9Sstevel@tonic-gate 		goto out_trace;
4983*7c478bd9Sstevel@tonic-gate 	}
4984*7c478bd9Sstevel@tonic-gate 
4985*7c478bd9Sstevel@tonic-gate 	fs = ip->i_fs;
4986*7c478bd9Sstevel@tonic-gate 	ASSERT(fs->fs_ronly == 0);
4987*7c478bd9Sstevel@tonic-gate 
4988*7c478bd9Sstevel@tonic-gate 	/*
4989*7c478bd9Sstevel@tonic-gate 	 * If the modified time on the inode has not already been
4990*7c478bd9Sstevel@tonic-gate 	 * set elsewhere (e.g. for write/setattr) we set the time now.
4991*7c478bd9Sstevel@tonic-gate 	 * This gives us approximate modified times for mmap'ed files
4992*7c478bd9Sstevel@tonic-gate 	 * which are modified via stores in the user address space.
4993*7c478bd9Sstevel@tonic-gate 	 */
4994*7c478bd9Sstevel@tonic-gate 	if ((ip->i_flag & IMODTIME) == 0) {
4995*7c478bd9Sstevel@tonic-gate 		mutex_enter(&ip->i_tlock);
4996*7c478bd9Sstevel@tonic-gate 		ip->i_flag |= IUPD;
4997*7c478bd9Sstevel@tonic-gate 		ip->i_seq++;
4998*7c478bd9Sstevel@tonic-gate 		ITIMES_NOLOCK(ip);
4999*7c478bd9Sstevel@tonic-gate 		mutex_exit(&ip->i_tlock);
5000*7c478bd9Sstevel@tonic-gate 	}
5001*7c478bd9Sstevel@tonic-gate 
5002*7c478bd9Sstevel@tonic-gate 	/*
5003*7c478bd9Sstevel@tonic-gate 	 * Align the request to a block boundary (for old file systems),
5004*7c478bd9Sstevel@tonic-gate 	 * and go ask bmap() how contiguous things are for this file.
5005*7c478bd9Sstevel@tonic-gate 	 */
5006*7c478bd9Sstevel@tonic-gate 	off = pp->p_offset & (offset_t)fs->fs_bmask;	/* block align it */
5007*7c478bd9Sstevel@tonic-gate 	contig = 0;
5008*7c478bd9Sstevel@tonic-gate 	err = bmap_read(ip, off, &bn, &contig);
5009*7c478bd9Sstevel@tonic-gate 	if (err)
5010*7c478bd9Sstevel@tonic-gate 		goto out;
5011*7c478bd9Sstevel@tonic-gate 	if (bn == UFS_HOLE) {			/* putpage never allocates */
5012*7c478bd9Sstevel@tonic-gate 		/*
5013*7c478bd9Sstevel@tonic-gate 		 * logging device is in error mode; simply return EIO
5014*7c478bd9Sstevel@tonic-gate 		 */
5015*7c478bd9Sstevel@tonic-gate 		if (TRANS_ISERROR(ufsvfsp)) {
5016*7c478bd9Sstevel@tonic-gate 			err = EIO;
5017*7c478bd9Sstevel@tonic-gate 			goto out;
5018*7c478bd9Sstevel@tonic-gate 		}
5019*7c478bd9Sstevel@tonic-gate 		/*
5020*7c478bd9Sstevel@tonic-gate 		 * Oops, the thread in the window in wrip() did some
5021*7c478bd9Sstevel@tonic-gate 		 * sort of operation which caused a putpage in the bad
5022*7c478bd9Sstevel@tonic-gate 		 * range.  In this case, just return an error, which will
5023*7c478bd9Sstevel@tonic-gate 		 * cause the software-modified bit on the page to be set
5024*7c478bd9Sstevel@tonic-gate 		 * so that the page gets written out again later.
5025*7c478bd9Sstevel@tonic-gate 		 */
5026*7c478bd9Sstevel@tonic-gate 		if (ip->i_writer == curthread) {
5027*7c478bd9Sstevel@tonic-gate 			err = EIO;
5028*7c478bd9Sstevel@tonic-gate 			goto out;
5029*7c478bd9Sstevel@tonic-gate 		}
5030*7c478bd9Sstevel@tonic-gate 		/*
5031*7c478bd9Sstevel@tonic-gate 		 * If the pager is trying to push a page in the bad range
5032*7c478bd9Sstevel@tonic-gate 		 * just tell it to try again later when things are better.
5033*7c478bd9Sstevel@tonic-gate 		 */
5034*7c478bd9Sstevel@tonic-gate 		if (flags & B_ASYNC) {
5035*7c478bd9Sstevel@tonic-gate 			err = EAGAIN;
5036*7c478bd9Sstevel@tonic-gate 			goto out;
5037*7c478bd9Sstevel@tonic-gate 		}
5038*7c478bd9Sstevel@tonic-gate 		err = ufs_fault(ITOV(ip), "ufs_putapage: bn == UFS_HOLE");
5039*7c478bd9Sstevel@tonic-gate 		goto out;
5040*7c478bd9Sstevel@tonic-gate 	}
5041*7c478bd9Sstevel@tonic-gate 
5042*7c478bd9Sstevel@tonic-gate 	/*
5043*7c478bd9Sstevel@tonic-gate 	 * Take the length (of contiguous bytes) passed back from bmap()
5044*7c478bd9Sstevel@tonic-gate 	 * and _try_ to get a set of pages covering that extent.
5045*7c478bd9Sstevel@tonic-gate 	 */
5046*7c478bd9Sstevel@tonic-gate 	pp = pvn_write_kluster(vp, pp, &io_off, &io_len, off, contig, flags);
5047*7c478bd9Sstevel@tonic-gate 
5048*7c478bd9Sstevel@tonic-gate 	/*
5049*7c478bd9Sstevel@tonic-gate 	 * May have run out of memory and not clustered backwards.
5050*7c478bd9Sstevel@tonic-gate 	 * off		p_offset
5051*7c478bd9Sstevel@tonic-gate 	 * [  pp - 1  ][   pp   ]
5052*7c478bd9Sstevel@tonic-gate 	 * [	block		]
5053*7c478bd9Sstevel@tonic-gate 	 * We told bmap off, so we have to adjust the bn accordingly.
5054*7c478bd9Sstevel@tonic-gate 	 */
5055*7c478bd9Sstevel@tonic-gate 	if (io_off > off) {
5056*7c478bd9Sstevel@tonic-gate 		bn += btod(io_off - off);
5057*7c478bd9Sstevel@tonic-gate 		contig -= (io_off - off);
5058*7c478bd9Sstevel@tonic-gate 	}
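	/*
	 * Illustrative arithmetic (not in the original source): with an 8K
	 * file system block, if the kluster starts one 4K page into the
	 * block then io_off - off == 4096, so bn advances by btod(4096) ==
	 * 8 DEV_BSIZE sectors and contig shrinks by the same 4096 bytes.
	 */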
5059*7c478bd9Sstevel@tonic-gate 
5060*7c478bd9Sstevel@tonic-gate 	/*
5061*7c478bd9Sstevel@tonic-gate 	 * bmap was careful to tell us the right size, so use that.
5062*7c478bd9Sstevel@tonic-gate 	 * There might be unallocated frags at the end.
5063*7c478bd9Sstevel@tonic-gate 	 * LMXXX - bzero the end of the page?  We must be writing after EOF.
5064*7c478bd9Sstevel@tonic-gate 	 */
5065*7c478bd9Sstevel@tonic-gate 	if (io_len > contig) {
5066*7c478bd9Sstevel@tonic-gate 		ASSERT(io_len - contig < fs->fs_bsize);
5067*7c478bd9Sstevel@tonic-gate 		io_len -= (io_len - contig);
5068*7c478bd9Sstevel@tonic-gate 	}
5069*7c478bd9Sstevel@tonic-gate 
5070*7c478bd9Sstevel@tonic-gate 	/*
5071*7c478bd9Sstevel@tonic-gate 	 * Handle the case where we are writing the last page after EOF.
5072*7c478bd9Sstevel@tonic-gate 	 *
5073*7c478bd9Sstevel@tonic-gate 	 * XXX - just a patch for i-mt3.
5074*7c478bd9Sstevel@tonic-gate 	 */
5075*7c478bd9Sstevel@tonic-gate 	if (io_len == 0) {
5076*7c478bd9Sstevel@tonic-gate 		ASSERT(pp->p_offset >= (u_offset_t)(roundup(ip->i_size,
5077*7c478bd9Sstevel@tonic-gate 							    PAGESIZE)));
5078*7c478bd9Sstevel@tonic-gate 		io_len = PAGESIZE;
5079*7c478bd9Sstevel@tonic-gate 	}
5080*7c478bd9Sstevel@tonic-gate 
5081*7c478bd9Sstevel@tonic-gate 	bp = pageio_setup(pp, io_len, ip->i_devvp, B_WRITE | flags);
5082*7c478bd9Sstevel@tonic-gate 
5083*7c478bd9Sstevel@tonic-gate 	ULOCKFS_SET_MOD(ITOUL(ip));
5084*7c478bd9Sstevel@tonic-gate 
5085*7c478bd9Sstevel@tonic-gate 	bp->b_edev = ip->i_dev;
5086*7c478bd9Sstevel@tonic-gate 	bp->b_dev = cmpdev(ip->i_dev);
5087*7c478bd9Sstevel@tonic-gate 	bp->b_blkno = bn;
5088*7c478bd9Sstevel@tonic-gate 	bp->b_un.b_addr = (caddr_t)0;
5089*7c478bd9Sstevel@tonic-gate 	bp->b_file = ip->i_vnode;
5090*7c478bd9Sstevel@tonic-gate 
5091*7c478bd9Sstevel@tonic-gate 	if (TRANS_ISTRANS(ufsvfsp)) {
5092*7c478bd9Sstevel@tonic-gate 		if ((ip->i_mode & IFMT) == IFSHAD) {
5093*7c478bd9Sstevel@tonic-gate 			TRANS_BUF(ufsvfsp, 0, io_len, bp, DT_SHAD);
5094*7c478bd9Sstevel@tonic-gate 		} else if (ufsvfsp->vfs_qinod == ip) {
5095*7c478bd9Sstevel@tonic-gate 			TRANS_DELTA(ufsvfsp, ldbtob(bn), bp->b_bcount, DT_QR,
5096*7c478bd9Sstevel@tonic-gate 			    0, 0);
5097*7c478bd9Sstevel@tonic-gate 		}
5098*7c478bd9Sstevel@tonic-gate 	}
5099*7c478bd9Sstevel@tonic-gate 
5100*7c478bd9Sstevel@tonic-gate 	/* write throttle */
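	/*
	 * Charge the bytes about to be issued against this inode's
	 * outstanding-write count; the matching decrement and i_wrcv
	 * broadcast happen in ufs_iodone() above once the I/O completes,
	 * letting throttled writers resume when the backlog drains.
	 */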
5101*7c478bd9Sstevel@tonic-gate 
5102*7c478bd9Sstevel@tonic-gate 	ASSERT(bp->b_iodone == NULL);
5103*7c478bd9Sstevel@tonic-gate 	bp->b_iodone = (int (*)())ufs_iodone;
5104*7c478bd9Sstevel@tonic-gate 	mutex_enter(&ip->i_tlock);
5105*7c478bd9Sstevel@tonic-gate 	ip->i_writes += bp->b_bcount;
5106*7c478bd9Sstevel@tonic-gate 	mutex_exit(&ip->i_tlock);
5107*7c478bd9Sstevel@tonic-gate 
5108*7c478bd9Sstevel@tonic-gate 	if (bp->b_flags & B_ASYNC) {
5109*7c478bd9Sstevel@tonic-gate 		if (ufsvfsp->vfs_log) {
5110*7c478bd9Sstevel@tonic-gate 			lufs_write_strategy(ufsvfsp->vfs_log, bp);
5111*7c478bd9Sstevel@tonic-gate 		} else if (ufsvfsp->vfs_snapshot) {
5112*7c478bd9Sstevel@tonic-gate 			fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
5113*7c478bd9Sstevel@tonic-gate 		} else {
5114*7c478bd9Sstevel@tonic-gate 			ufsvfsp->vfs_iotstamp = lbolt;
5115*7c478bd9Sstevel@tonic-gate 			ub.ub_putasyncs.value.ul++;
5116*7c478bd9Sstevel@tonic-gate 			(void) bdev_strategy(bp);
5117*7c478bd9Sstevel@tonic-gate 			lwp_stat_update(LWP_STAT_OUBLK, 1);
5118*7c478bd9Sstevel@tonic-gate 		}
5119*7c478bd9Sstevel@tonic-gate 	} else {
5120*7c478bd9Sstevel@tonic-gate 		if (ufsvfsp->vfs_log) {
5121*7c478bd9Sstevel@tonic-gate 			lufs_write_strategy(ufsvfsp->vfs_log, bp);
5122*7c478bd9Sstevel@tonic-gate 		} else if (ufsvfsp->vfs_snapshot) {
5123*7c478bd9Sstevel@tonic-gate 			fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
5124*7c478bd9Sstevel@tonic-gate 		} else {
5125*7c478bd9Sstevel@tonic-gate 			ufsvfsp->vfs_iotstamp = lbolt;
5126*7c478bd9Sstevel@tonic-gate 			ub.ub_putsyncs.value.ul++;
5127*7c478bd9Sstevel@tonic-gate 			(void) bdev_strategy(bp);
5128*7c478bd9Sstevel@tonic-gate 			lwp_stat_update(LWP_STAT_OUBLK, 1);
5129*7c478bd9Sstevel@tonic-gate 		}
5130*7c478bd9Sstevel@tonic-gate 		err = biowait(bp);
5131*7c478bd9Sstevel@tonic-gate 		pageio_done(bp);
5132*7c478bd9Sstevel@tonic-gate 		pvn_write_done(pp, ((err) ? B_ERROR : 0) | B_WRITE | flags);
5133*7c478bd9Sstevel@tonic-gate 	}
5134*7c478bd9Sstevel@tonic-gate 
5135*7c478bd9Sstevel@tonic-gate 	pp = NULL;
5136*7c478bd9Sstevel@tonic-gate 
5137*7c478bd9Sstevel@tonic-gate out:
5138*7c478bd9Sstevel@tonic-gate 	if (err != 0 && pp != NULL)
5139*7c478bd9Sstevel@tonic-gate 		pvn_write_done(pp, B_ERROR | B_WRITE | flags);
5140*7c478bd9Sstevel@tonic-gate 
5141*7c478bd9Sstevel@tonic-gate 	if (offp)
5142*7c478bd9Sstevel@tonic-gate 		*offp = io_off;
5143*7c478bd9Sstevel@tonic-gate 	if (lenp)
5144*7c478bd9Sstevel@tonic-gate 		*lenp = io_len;
5145*7c478bd9Sstevel@tonic-gate out_trace:
5146*7c478bd9Sstevel@tonic-gate 	TRACE_2(TR_FAC_UFS, TR_UFS_PUTAPAGE_END,
5147*7c478bd9Sstevel@tonic-gate 		"ufs_putapage_end:vp %p error %d", vp, err);
5148*7c478bd9Sstevel@tonic-gate 	return (err);
5149*7c478bd9Sstevel@tonic-gate }
5150*7c478bd9Sstevel@tonic-gate 
5151*7c478bd9Sstevel@tonic-gate /* ARGSUSED */
5152*7c478bd9Sstevel@tonic-gate static int
5153*7c478bd9Sstevel@tonic-gate ufs_map(struct vnode *vp,
5154*7c478bd9Sstevel@tonic-gate 	offset_t off,
5155*7c478bd9Sstevel@tonic-gate 	struct as *as,
5156*7c478bd9Sstevel@tonic-gate 	caddr_t *addrp,
5157*7c478bd9Sstevel@tonic-gate 	size_t len,
5158*7c478bd9Sstevel@tonic-gate 	uchar_t prot,
5159*7c478bd9Sstevel@tonic-gate 	uchar_t maxprot,
5160*7c478bd9Sstevel@tonic-gate 	uint_t flags,
5161*7c478bd9Sstevel@tonic-gate 	struct cred *cr)
5162*7c478bd9Sstevel@tonic-gate {
5163*7c478bd9Sstevel@tonic-gate 	struct segvn_crargs vn_a;
5164*7c478bd9Sstevel@tonic-gate 	struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs;
5165*7c478bd9Sstevel@tonic-gate 	struct ulockfs *ulp;
5166*7c478bd9Sstevel@tonic-gate 	int error;
5167*7c478bd9Sstevel@tonic-gate 
5168*7c478bd9Sstevel@tonic-gate 	TRACE_1(TR_FAC_UFS, TR_UFS_MAP_START,
5169*7c478bd9Sstevel@tonic-gate 		"ufs_map_start:vp %p", vp);
5170*7c478bd9Sstevel@tonic-gate 
5171*7c478bd9Sstevel@tonic-gate 	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MAP_MASK);
5172*7c478bd9Sstevel@tonic-gate 	if (error)
5173*7c478bd9Sstevel@tonic-gate 		goto out;
5174*7c478bd9Sstevel@tonic-gate 
5175*7c478bd9Sstevel@tonic-gate 	if (vp->v_flag & VNOMAP) {
5176*7c478bd9Sstevel@tonic-gate 		error = ENOSYS;
5177*7c478bd9Sstevel@tonic-gate 		goto unlock;
5178*7c478bd9Sstevel@tonic-gate 	}
5179*7c478bd9Sstevel@tonic-gate 
5180*7c478bd9Sstevel@tonic-gate 	if (off < (offset_t)0 || (offset_t)(off + len) < (offset_t)0) {
5181*7c478bd9Sstevel@tonic-gate 		error = ENXIO;
5182*7c478bd9Sstevel@tonic-gate 		goto unlock;
5183*7c478bd9Sstevel@tonic-gate 	}
5184*7c478bd9Sstevel@tonic-gate 
5185*7c478bd9Sstevel@tonic-gate 	if (vp->v_type != VREG) {
5186*7c478bd9Sstevel@tonic-gate 		error = ENODEV;
5187*7c478bd9Sstevel@tonic-gate 		goto unlock;
5188*7c478bd9Sstevel@tonic-gate 	}
5189*7c478bd9Sstevel@tonic-gate 
5190*7c478bd9Sstevel@tonic-gate 	/*
5191*7c478bd9Sstevel@tonic-gate 	 * If the file is subject to mandatory file locking, disallow mapping.
5192*7c478bd9Sstevel@tonic-gate 	 */
5193*7c478bd9Sstevel@tonic-gate 	if (vn_has_mandatory_locks(vp, VTOI(vp)->i_mode)) {
5194*7c478bd9Sstevel@tonic-gate 		error = EAGAIN;
5195*7c478bd9Sstevel@tonic-gate 		goto unlock;
5196*7c478bd9Sstevel@tonic-gate 	}
5197*7c478bd9Sstevel@tonic-gate 
5198*7c478bd9Sstevel@tonic-gate 	as_rangelock(as);
5199*7c478bd9Sstevel@tonic-gate 	if ((flags & MAP_FIXED) == 0) {
5200*7c478bd9Sstevel@tonic-gate 		map_addr(addrp, len, off, 1, flags);
5201*7c478bd9Sstevel@tonic-gate 		if (*addrp == NULL) {
5202*7c478bd9Sstevel@tonic-gate 			as_rangeunlock(as);
5203*7c478bd9Sstevel@tonic-gate 			error = ENOMEM;
5204*7c478bd9Sstevel@tonic-gate 			goto unlock;
5205*7c478bd9Sstevel@tonic-gate 		}
5206*7c478bd9Sstevel@tonic-gate 	} else {
5207*7c478bd9Sstevel@tonic-gate 		/*
5208*7c478bd9Sstevel@tonic-gate 		 * User specified address - blow away any previous mappings
5209*7c478bd9Sstevel@tonic-gate 		 */
5210*7c478bd9Sstevel@tonic-gate 		(void) as_unmap(as, *addrp, len);
5211*7c478bd9Sstevel@tonic-gate 	}
5212*7c478bd9Sstevel@tonic-gate 
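	/*
	 * Fill in the segvn creation arguments: the backing vnode and file
	 * offset, the MAP_SHARED/MAP_PRIVATE type, current and maximum
	 * protections, and the caller's credentials.  No anonymous map is
	 * supplied (amp == NULL); segvn faults pages in from this vnode.
	 */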
5213*7c478bd9Sstevel@tonic-gate 	vn_a.vp = vp;
5214*7c478bd9Sstevel@tonic-gate 	vn_a.offset = (u_offset_t)off;
5215*7c478bd9Sstevel@tonic-gate 	vn_a.type = flags & MAP_TYPE;
5216*7c478bd9Sstevel@tonic-gate 	vn_a.prot = prot;
5217*7c478bd9Sstevel@tonic-gate 	vn_a.maxprot = maxprot;
5218*7c478bd9Sstevel@tonic-gate 	vn_a.cred = cr;
5219*7c478bd9Sstevel@tonic-gate 	vn_a.amp = NULL;
5220*7c478bd9Sstevel@tonic-gate 	vn_a.flags = flags & ~MAP_TYPE;
5221*7c478bd9Sstevel@tonic-gate 	vn_a.szc = 0;
5222*7c478bd9Sstevel@tonic-gate 	vn_a.lgrp_mem_policy_flags = 0;
5223*7c478bd9Sstevel@tonic-gate 
5224*7c478bd9Sstevel@tonic-gate 	error = as_map(as, *addrp, len, segvn_create, &vn_a);
5225*7c478bd9Sstevel@tonic-gate 	as_rangeunlock(as);
5226*7c478bd9Sstevel@tonic-gate 
5227*7c478bd9Sstevel@tonic-gate unlock:
5228*7c478bd9Sstevel@tonic-gate 	if (ulp) {
5229*7c478bd9Sstevel@tonic-gate 		ufs_lockfs_end(ulp);
5230*7c478bd9Sstevel@tonic-gate 	}
5231*7c478bd9Sstevel@tonic-gate out:
5232*7c478bd9Sstevel@tonic-gate 	TRACE_2(TR_FAC_UFS, TR_UFS_MAP_END,
5233*7c478bd9Sstevel@tonic-gate 		"ufs_map_end:vp %p error %d", vp, error);
5234*7c478bd9Sstevel@tonic-gate 	return (error);
5235*7c478bd9Sstevel@tonic-gate }
5236*7c478bd9Sstevel@tonic-gate 
5237*7c478bd9Sstevel@tonic-gate /* ARGSUSED */
5238*7c478bd9Sstevel@tonic-gate static int
5239*7c478bd9Sstevel@tonic-gate ufs_addmap(struct vnode *vp,
5240*7c478bd9Sstevel@tonic-gate 	offset_t off,
5241*7c478bd9Sstevel@tonic-gate 	struct as *as,
5242*7c478bd9Sstevel@tonic-gate 	caddr_t addr,
5243*7c478bd9Sstevel@tonic-gate 	size_t	len,
5244*7c478bd9Sstevel@tonic-gate 	uchar_t  prot,
5245*7c478bd9Sstevel@tonic-gate 	uchar_t  maxprot,
5246*7c478bd9Sstevel@tonic-gate 	uint_t    flags,
5247*7c478bd9Sstevel@tonic-gate 	struct cred *cr)
5248*7c478bd9Sstevel@tonic-gate {
5249*7c478bd9Sstevel@tonic-gate 	struct inode *ip = VTOI(vp);
5250*7c478bd9Sstevel@tonic-gate 
5251*7c478bd9Sstevel@tonic-gate 	if (vp->v_flag & VNOMAP) {
5252*7c478bd9Sstevel@tonic-gate 		return (ENOSYS);
5253*7c478bd9Sstevel@tonic-gate 	}
5254*7c478bd9Sstevel@tonic-gate 
5255*7c478bd9Sstevel@tonic-gate 	mutex_enter(&ip->i_tlock);
5256*7c478bd9Sstevel@tonic-gate 	ip->i_mapcnt += btopr(len);
5257*7c478bd9Sstevel@tonic-gate 	mutex_exit(&ip->i_tlock);
5258*7c478bd9Sstevel@tonic-gate 	return (0);
5259*7c478bd9Sstevel@tonic-gate }
5260*7c478bd9Sstevel@tonic-gate 
5261*7c478bd9Sstevel@tonic-gate /*ARGSUSED*/
5262*7c478bd9Sstevel@tonic-gate static int
5263*7c478bd9Sstevel@tonic-gate ufs_delmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
5264*7c478bd9Sstevel@tonic-gate 	size_t len, uint_t prot,  uint_t maxprot,  uint_t flags,
5265*7c478bd9Sstevel@tonic-gate 	struct cred *cr)
5266*7c478bd9Sstevel@tonic-gate {
5267*7c478bd9Sstevel@tonic-gate 	struct inode *ip = VTOI(vp);
5268*7c478bd9Sstevel@tonic-gate 
5269*7c478bd9Sstevel@tonic-gate 	if (vp->v_flag & VNOMAP) {
5270*7c478bd9Sstevel@tonic-gate 		return (ENOSYS);
5271*7c478bd9Sstevel@tonic-gate 	}
5272*7c478bd9Sstevel@tonic-gate 
5273*7c478bd9Sstevel@tonic-gate 	mutex_enter(&ip->i_tlock);
5274*7c478bd9Sstevel@tonic-gate 	ip->i_mapcnt -= btopr(len); 	/* Count released mappings */
5275*7c478bd9Sstevel@tonic-gate 	ASSERT(ip->i_mapcnt >= 0);
5276*7c478bd9Sstevel@tonic-gate 	mutex_exit(&ip->i_tlock);
5277*7c478bd9Sstevel@tonic-gate 	return (0);
5278*7c478bd9Sstevel@tonic-gate }
5279*7c478bd9Sstevel@tonic-gate /*
5280*7c478bd9Sstevel@tonic-gate  * Return the answer requested to poll() for non-device files
5281*7c478bd9Sstevel@tonic-gate  */
5282*7c478bd9Sstevel@tonic-gate struct pollhead ufs_pollhd;
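/*
 * A single file-system-wide pollhead suffices: ufs_poll() below reports
 * regular files as ready for the requested events unless the file system
 * is hard-locked or error-locked, so the pollhead is handed back for the
 * caller to wait on only when no events could be reported.
 */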
5283*7c478bd9Sstevel@tonic-gate 
5284*7c478bd9Sstevel@tonic-gate /* ARGSUSED */
5285*7c478bd9Sstevel@tonic-gate int
5286*7c478bd9Sstevel@tonic-gate ufs_poll(vnode_t *vp, short ev, int any, short *revp, struct pollhead **phpp)
5287*7c478bd9Sstevel@tonic-gate {
5288*7c478bd9Sstevel@tonic-gate 	struct ufsvfs	*ufsvfsp;
5289*7c478bd9Sstevel@tonic-gate 
5290*7c478bd9Sstevel@tonic-gate 	*revp = 0;
5291*7c478bd9Sstevel@tonic-gate 	ufsvfsp = VTOI(vp)->i_ufsvfs;
5292*7c478bd9Sstevel@tonic-gate 
5293*7c478bd9Sstevel@tonic-gate 	if (!ufsvfsp) {
5294*7c478bd9Sstevel@tonic-gate 		*revp = POLLHUP;
5295*7c478bd9Sstevel@tonic-gate 		goto out;
5296*7c478bd9Sstevel@tonic-gate 	}
5297*7c478bd9Sstevel@tonic-gate 
5298*7c478bd9Sstevel@tonic-gate 	if (ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs) ||
5299*7c478bd9Sstevel@tonic-gate 	    ULOCKFS_IS_ELOCK(&ufsvfsp->vfs_ulockfs)) {
5300*7c478bd9Sstevel@tonic-gate 		*revp |= POLLERR;
5301*7c478bd9Sstevel@tonic-gate 
5302*7c478bd9Sstevel@tonic-gate 	} else {
5303*7c478bd9Sstevel@tonic-gate 		if ((ev & POLLOUT) && !ufsvfsp->vfs_fs->fs_ronly &&
5304*7c478bd9Sstevel@tonic-gate 		    !ULOCKFS_IS_WLOCK(&ufsvfsp->vfs_ulockfs))
5305*7c478bd9Sstevel@tonic-gate 			*revp |= POLLOUT;
5306*7c478bd9Sstevel@tonic-gate 
5307*7c478bd9Sstevel@tonic-gate 		if ((ev & POLLWRBAND) && !ufsvfsp->vfs_fs->fs_ronly &&
5308*7c478bd9Sstevel@tonic-gate 		    !ULOCKFS_IS_WLOCK(&ufsvfsp->vfs_ulockfs))
5309*7c478bd9Sstevel@tonic-gate 			*revp |= POLLWRBAND;
5310*7c478bd9Sstevel@tonic-gate 
5311*7c478bd9Sstevel@tonic-gate 		if (ev & POLLIN)
5312*7c478bd9Sstevel@tonic-gate 			*revp |= POLLIN;
5313*7c478bd9Sstevel@tonic-gate 
5314*7c478bd9Sstevel@tonic-gate 		if (ev & POLLRDNORM)
5315*7c478bd9Sstevel@tonic-gate 			*revp |= POLLRDNORM;
5316*7c478bd9Sstevel@tonic-gate 
5317*7c478bd9Sstevel@tonic-gate 		if (ev & POLLRDBAND)
5318*7c478bd9Sstevel@tonic-gate 			*revp |= POLLRDBAND;
5319*7c478bd9Sstevel@tonic-gate 	}
5320*7c478bd9Sstevel@tonic-gate 
5321*7c478bd9Sstevel@tonic-gate 	if ((ev & POLLPRI) && (*revp & (POLLERR|POLLHUP)))
5322*7c478bd9Sstevel@tonic-gate 		*revp |= POLLPRI;
5323*7c478bd9Sstevel@tonic-gate out:
5324*7c478bd9Sstevel@tonic-gate 	*phpp = !any && !*revp ? &ufs_pollhd : (struct pollhead *)NULL;
5325*7c478bd9Sstevel@tonic-gate 
5326*7c478bd9Sstevel@tonic-gate 	return (0);
5327*7c478bd9Sstevel@tonic-gate }
5328*7c478bd9Sstevel@tonic-gate 
5329*7c478bd9Sstevel@tonic-gate /* ARGSUSED */
5330*7c478bd9Sstevel@tonic-gate static int
5331*7c478bd9Sstevel@tonic-gate ufs_l_pathconf(struct vnode *vp, int cmd, ulong_t *valp, struct cred *cr)
5332*7c478bd9Sstevel@tonic-gate {
5333*7c478bd9Sstevel@tonic-gate 	struct ufsvfs	*ufsvfsp = VTOI(vp)->i_ufsvfs;
5334*7c478bd9Sstevel@tonic-gate 	struct ulockfs	*ulp = NULL;
5335*7c478bd9Sstevel@tonic-gate 	struct inode 	*sip = NULL;
5336*7c478bd9Sstevel@tonic-gate 	int		error;
5337*7c478bd9Sstevel@tonic-gate 	struct inode 	*ip = VTOI(vp);
5338*7c478bd9Sstevel@tonic-gate 	int		issync;
5339*7c478bd9Sstevel@tonic-gate 
5340*7c478bd9Sstevel@tonic-gate 	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_PATHCONF_MASK);
5341*7c478bd9Sstevel@tonic-gate 	if (error)
5342*7c478bd9Sstevel@tonic-gate 		return (error);
5343*7c478bd9Sstevel@tonic-gate 
5344*7c478bd9Sstevel@tonic-gate 	switch (cmd) {
5345*7c478bd9Sstevel@tonic-gate 		/*
5346*7c478bd9Sstevel@tonic-gate 		 * Have to handle _PC_NAME_MAX here, because the normal way
5347*7c478bd9Sstevel@tonic-gate 		 * [fs_pathconf() -> VOP_STATVFS() -> ufs_statvfs()]
5348*7c478bd9Sstevel@tonic-gate 		 * results in a lock ordering reversal between
5349*7c478bd9Sstevel@tonic-gate 		 * ufs_lockfs_{begin,end}() and
5350*7c478bd9Sstevel@tonic-gate 		 * ufs_thread_{suspend,continue}().
5351*7c478bd9Sstevel@tonic-gate 		 *
5352*7c478bd9Sstevel@tonic-gate 		 * Keep in sync with ufs_statvfs().
5353*7c478bd9Sstevel@tonic-gate 		 */
5354*7c478bd9Sstevel@tonic-gate 	case _PC_NAME_MAX:
5355*7c478bd9Sstevel@tonic-gate 		*valp = MAXNAMLEN;
5356*7c478bd9Sstevel@tonic-gate 		break;
5357*7c478bd9Sstevel@tonic-gate 
5358*7c478bd9Sstevel@tonic-gate 	case _PC_FILESIZEBITS:
5359*7c478bd9Sstevel@tonic-gate 		if (ufsvfsp->vfs_lfflags & UFS_LARGEFILES)
5360*7c478bd9Sstevel@tonic-gate 			*valp = UFS_FILESIZE_BITS;
5361*7c478bd9Sstevel@tonic-gate 		else
5362*7c478bd9Sstevel@tonic-gate 			*valp = 32;
5363*7c478bd9Sstevel@tonic-gate 		break;
5364*7c478bd9Sstevel@tonic-gate 
5365*7c478bd9Sstevel@tonic-gate 	case _PC_XATTR_EXISTS:
5366*7c478bd9Sstevel@tonic-gate 		if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
5367*7c478bd9Sstevel@tonic-gate 
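			/*
			 * Look up the extended attribute (shadow)
			 * directory.  If one exists but turns out to be
			 * empty, detach it from the inode and report that
			 * no xattrs exist; otherwise report 1.  ENOENT
			 * from the lookup simply means the answer is 0.
			 */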
5368*7c478bd9Sstevel@tonic-gate 			error = ufs_xattr_getattrdir(vp, &sip, LOOKUP_XATTR,
5369*7c478bd9Sstevel@tonic-gate 			    cr);
5370*7c478bd9Sstevel@tonic-gate 			if (error ==  0 && sip != NULL) {
5371*7c478bd9Sstevel@tonic-gate 				/* Start transaction */
5372*7c478bd9Sstevel@tonic-gate 				if (ulp) {
5373*7c478bd9Sstevel@tonic-gate 					TRANS_BEGIN_CSYNC(ufsvfsp, issync,
5374*7c478bd9Sstevel@tonic-gate 					    TOP_RMDIR, TOP_RMDIR_SIZE);
5375*7c478bd9Sstevel@tonic-gate 				}
5376*7c478bd9Sstevel@tonic-gate 				/*
5377*7c478bd9Sstevel@tonic-gate 				 * Is the directory empty?
5378*7c478bd9Sstevel@tonic-gate 				 */
5379*7c478bd9Sstevel@tonic-gate 				rw_enter(&sip->i_rwlock, RW_WRITER);
5380*7c478bd9Sstevel@tonic-gate 				rw_enter(&sip->i_contents, RW_WRITER);
5381*7c478bd9Sstevel@tonic-gate 				if (ufs_xattrdirempty(sip,
5382*7c478bd9Sstevel@tonic-gate 				    sip->i_number, CRED())) {
5383*7c478bd9Sstevel@tonic-gate 					rw_enter(&ip->i_contents, RW_WRITER);
5384*7c478bd9Sstevel@tonic-gate 					ufs_unhook_shadow(ip, sip);
5385*7c478bd9Sstevel@tonic-gate 					rw_exit(&ip->i_contents);
5386*7c478bd9Sstevel@tonic-gate 
5387*7c478bd9Sstevel@tonic-gate 					*valp = 0;
5388*7c478bd9Sstevel@tonic-gate 
5389*7c478bd9Sstevel@tonic-gate 				} else
5390*7c478bd9Sstevel@tonic-gate 					*valp = 1;
5391*7c478bd9Sstevel@tonic-gate 				rw_exit(&sip->i_contents);
5392*7c478bd9Sstevel@tonic-gate 				rw_exit(&sip->i_rwlock);
5393*7c478bd9Sstevel@tonic-gate 				if (ulp) {
5394*7c478bd9Sstevel@tonic-gate 					TRANS_END_CSYNC(ufsvfsp, error, issync,
5395*7c478bd9Sstevel@tonic-gate 					    TOP_RMDIR, TOP_RMDIR_SIZE);
5396*7c478bd9Sstevel@tonic-gate 				}
5397*7c478bd9Sstevel@tonic-gate 				VN_RELE(ITOV(sip));
5398*7c478bd9Sstevel@tonic-gate 			} else if (error == ENOENT) {
5399*7c478bd9Sstevel@tonic-gate 				*valp = 0;
5400*7c478bd9Sstevel@tonic-gate 				error = 0;
5401*7c478bd9Sstevel@tonic-gate 			}
5402*7c478bd9Sstevel@tonic-gate 		} else {
5403*7c478bd9Sstevel@tonic-gate 			error = fs_pathconf(vp, cmd, valp, cr);
5404*7c478bd9Sstevel@tonic-gate 		}
5405*7c478bd9Sstevel@tonic-gate 		break;
5406*7c478bd9Sstevel@tonic-gate 
5407*7c478bd9Sstevel@tonic-gate 	case _PC_ACL_ENABLED:
5408*7c478bd9Sstevel@tonic-gate 		*valp = _ACL_ACLENT_ENABLED;
5409*7c478bd9Sstevel@tonic-gate 		break;
5410*7c478bd9Sstevel@tonic-gate 
5411*7c478bd9Sstevel@tonic-gate 	case _PC_MIN_HOLE_SIZE:
5412*7c478bd9Sstevel@tonic-gate 		*valp = (ulong_t)ip->i_fs->fs_bsize;
5413*7c478bd9Sstevel@tonic-gate 		break;
5414*7c478bd9Sstevel@tonic-gate 
5415*7c478bd9Sstevel@tonic-gate 	default:
5416*7c478bd9Sstevel@tonic-gate 		error = fs_pathconf(vp, cmd, valp, cr);
5417*7c478bd9Sstevel@tonic-gate 	}
5418*7c478bd9Sstevel@tonic-gate 
5419*7c478bd9Sstevel@tonic-gate 	if (ulp != NULL) {
5420*7c478bd9Sstevel@tonic-gate 		ufs_lockfs_end(ulp);
5421*7c478bd9Sstevel@tonic-gate 	}
5422*7c478bd9Sstevel@tonic-gate 	return (error);
5423*7c478bd9Sstevel@tonic-gate }
5424*7c478bd9Sstevel@tonic-gate 
5425*7c478bd9Sstevel@tonic-gate int ufs_pageio_writes, ufs_pageio_reads;
5426*7c478bd9Sstevel@tonic-gate 
5427*7c478bd9Sstevel@tonic-gate /*ARGSUSED*/
5428*7c478bd9Sstevel@tonic-gate static int
5429*7c478bd9Sstevel@tonic-gate ufs_pageio(struct vnode *vp, page_t *pp, u_offset_t io_off, size_t io_len,
5430*7c478bd9Sstevel@tonic-gate 	int flags, struct cred *cr)
5431*7c478bd9Sstevel@tonic-gate {
5432*7c478bd9Sstevel@tonic-gate 	struct inode *ip = VTOI(vp);
5433*7c478bd9Sstevel@tonic-gate 	struct ufsvfs *ufsvfsp;
5434*7c478bd9Sstevel@tonic-gate 	page_t *npp = NULL, *opp = NULL, *cpp = pp;
5435*7c478bd9Sstevel@tonic-gate 	struct buf *bp;
5436*7c478bd9Sstevel@tonic-gate 	daddr_t bn;
5437*7c478bd9Sstevel@tonic-gate 	size_t done_len = 0, cur_len = 0;
5438*7c478bd9Sstevel@tonic-gate 	int err = 0;
5439*7c478bd9Sstevel@tonic-gate 	int contig = 0;
5440*7c478bd9Sstevel@tonic-gate 	int dolock;
5441*7c478bd9Sstevel@tonic-gate 	int vmpss = 0;
5442*7c478bd9Sstevel@tonic-gate 
5443*7c478bd9Sstevel@tonic-gate 	if ((flags & B_READ) && pp != NULL && pp->p_vnode == vp &&
5444*7c478bd9Sstevel@tonic-gate 	    vp->v_mpssdata != NULL) {
5445*7c478bd9Sstevel@tonic-gate 		vmpss = 1;
5446*7c478bd9Sstevel@tonic-gate 	}
5447*7c478bd9Sstevel@tonic-gate 
5448*7c478bd9Sstevel@tonic-gate 	dolock = (rw_owner(&ip->i_contents) != curthread);
5449*7c478bd9Sstevel@tonic-gate 	/*
5450*7c478bd9Sstevel@tonic-gate 	 * We need a better check.  Ideally, we would use another
5451*7c478bd9Sstevel@tonic-gate 	 * vnodeops so that hlocked and forcibly unmounted file
5452*7c478bd9Sstevel@tonic-gate 	 * systems would return EIO where appropriate and w/o the
5453*7c478bd9Sstevel@tonic-gate 	 * need for these checks.
5454*7c478bd9Sstevel@tonic-gate 	 */
5455*7c478bd9Sstevel@tonic-gate 	if ((ufsvfsp = ip->i_ufsvfs) == NULL)
5456*7c478bd9Sstevel@tonic-gate 		return (EIO);
5457*7c478bd9Sstevel@tonic-gate 
5458*7c478bd9Sstevel@tonic-gate 	if (dolock) {
5459*7c478bd9Sstevel@tonic-gate 		/*
5460*7c478bd9Sstevel@tonic-gate 		 * segvn may call VOP_PAGEIO() instead of VOP_GETPAGE() to
5461*7c478bd9Sstevel@tonic-gate 		 * handle a fault against a segment that maps vnode pages with
5462*7c478bd9Sstevel@tonic-gate 		 * large mappings.  Segvn creates pages and holds them locked
5463*7c478bd9Sstevel@tonic-gate 		 * SE_EXCL during VOP_PAGEIO() call. In this case we have to
5464*7c478bd9Sstevel@tonic-gate 		 * use rw_tryenter() to avoid a potential deadlock since in
5465*7c478bd9Sstevel@tonic-gate 		 * lock order i_contents needs to be taken first.
5466*7c478bd9Sstevel@tonic-gate 		 * Segvn will retry via VOP_GETPAGE() if VOP_PAGEIO() fails.
5467*7c478bd9Sstevel@tonic-gate 		 */
5468*7c478bd9Sstevel@tonic-gate 		if (!vmpss) {
5469*7c478bd9Sstevel@tonic-gate 			rw_enter(&ip->i_contents, RW_READER);
5470*7c478bd9Sstevel@tonic-gate 		} else if (!rw_tryenter(&ip->i_contents, RW_READER)) {
5471*7c478bd9Sstevel@tonic-gate 			return (EDEADLK);
5472*7c478bd9Sstevel@tonic-gate 		}
5473*7c478bd9Sstevel@tonic-gate 	}
5474*7c478bd9Sstevel@tonic-gate 
5475*7c478bd9Sstevel@tonic-gate 	if (pp == NULL) {
5476*7c478bd9Sstevel@tonic-gate 		if (bmap_has_holes(ip)) {
5477*7c478bd9Sstevel@tonic-gate 			err = ENOSYS;
5478*7c478bd9Sstevel@tonic-gate 		} else {
5479*7c478bd9Sstevel@tonic-gate 			err = EINVAL;
5480*7c478bd9Sstevel@tonic-gate 		}
5481*7c478bd9Sstevel@tonic-gate 		if (dolock)
5482*7c478bd9Sstevel@tonic-gate 			rw_exit(&ip->i_contents);
5483*7c478bd9Sstevel@tonic-gate 		return (err);
5484*7c478bd9Sstevel@tonic-gate 	}
5485*7c478bd9Sstevel@tonic-gate 
5486*7c478bd9Sstevel@tonic-gate 	/*
5487*7c478bd9Sstevel@tonic-gate 	 * Break the io request into chunks, one for each contiguous
5488*7c478bd9Sstevel@tonic-gate 	 * stretch of disk blocks in the target file.
5489*7c478bd9Sstevel@tonic-gate 	 */
5490*7c478bd9Sstevel@tonic-gate 	while (done_len < io_len) {
5491*7c478bd9Sstevel@tonic-gate 		ASSERT(cpp);
5492*7c478bd9Sstevel@tonic-gate 		contig = 0;
5493*7c478bd9Sstevel@tonic-gate 		if (err = bmap_read(ip, (u_offset_t)(io_off + done_len),
5494*7c478bd9Sstevel@tonic-gate 				    &bn, &contig))
5495*7c478bd9Sstevel@tonic-gate 			break;
5496*7c478bd9Sstevel@tonic-gate 
5497*7c478bd9Sstevel@tonic-gate 		if (bn == UFS_HOLE) {	/* No holey swapfiles */
5498*7c478bd9Sstevel@tonic-gate 			if (vmpss) {
5499*7c478bd9Sstevel@tonic-gate 				err = EFAULT;
5500*7c478bd9Sstevel@tonic-gate 				break;
5501*7c478bd9Sstevel@tonic-gate 			}
5502*7c478bd9Sstevel@tonic-gate 			err = ufs_fault(ITOV(ip), "ufs_pageio: bn == UFS_HOLE");
5503*7c478bd9Sstevel@tonic-gate 			break;
5504*7c478bd9Sstevel@tonic-gate 		}
5505*7c478bd9Sstevel@tonic-gate 
5506*7c478bd9Sstevel@tonic-gate 		cur_len = MIN(io_len - done_len, contig);
5507*7c478bd9Sstevel@tonic-gate 		/*
5508*7c478bd9Sstevel@tonic-gate 		 * Zero out a page beyond EOF, when the last block of
5509*7c478bd9Sstevel@tonic-gate 		 * a file is a UFS fragment so that ufs_pageio() can be used
5510*7c478bd9Sstevel@tonic-gate 		 * instead of ufs_getpage() to handle faults against
5511*7c478bd9Sstevel@tonic-gate 		 * segvn segments that use large pages.
5512*7c478bd9Sstevel@tonic-gate 		 */
5513*7c478bd9Sstevel@tonic-gate 		page_list_break(&cpp, &npp, btopr(cur_len));
5514*7c478bd9Sstevel@tonic-gate 		if ((flags & B_READ) && (cur_len & PAGEOFFSET)) {
5515*7c478bd9Sstevel@tonic-gate 			size_t xlen = cur_len & PAGEOFFSET;
5516*7c478bd9Sstevel@tonic-gate 			pagezero(cpp->p_prev, xlen, PAGESIZE - xlen);
5517*7c478bd9Sstevel@tonic-gate 		}
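		/*
		 * Illustrative example (assuming 4K pages): if the last
		 * block of the file is a 3K fragment, cur_len is 3072, so
		 * xlen == 3072 and pagezero() above cleared bytes
		 * 3072..4095 of the final page; the device read that
		 * follows fills in only the first 3072 bytes.
		 */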
5518*7c478bd9Sstevel@tonic-gate 
5519*7c478bd9Sstevel@tonic-gate 		bp = pageio_setup(cpp, cur_len, ip->i_devvp, flags);
5520*7c478bd9Sstevel@tonic-gate 		ASSERT(bp != NULL);
5521*7c478bd9Sstevel@tonic-gate 
5522*7c478bd9Sstevel@tonic-gate 		bp->b_edev = ip->i_dev;
5523*7c478bd9Sstevel@tonic-gate 		bp->b_dev = cmpdev(ip->i_dev);
5524*7c478bd9Sstevel@tonic-gate 		bp->b_blkno = bn;
5525*7c478bd9Sstevel@tonic-gate 		bp->b_un.b_addr = (caddr_t)0;
5526*7c478bd9Sstevel@tonic-gate 		bp->b_file = ip->i_vnode;
5527*7c478bd9Sstevel@tonic-gate 
5528*7c478bd9Sstevel@tonic-gate 		ufsvfsp->vfs_iotstamp = lbolt;
5529*7c478bd9Sstevel@tonic-gate 		ub.ub_pageios.value.ul++;
5530*7c478bd9Sstevel@tonic-gate 		if (ufsvfsp->vfs_snapshot)
5531*7c478bd9Sstevel@tonic-gate 			fssnap_strategy(&(ufsvfsp->vfs_snapshot), bp);
5532*7c478bd9Sstevel@tonic-gate 		else
5533*7c478bd9Sstevel@tonic-gate 			(void) bdev_strategy(bp);
5534*7c478bd9Sstevel@tonic-gate 
5535*7c478bd9Sstevel@tonic-gate 		if (flags & B_READ)
5536*7c478bd9Sstevel@tonic-gate 			ufs_pageio_reads++;
5537*7c478bd9Sstevel@tonic-gate 		else
5538*7c478bd9Sstevel@tonic-gate 			ufs_pageio_writes++;
5539*7c478bd9Sstevel@tonic-gate 		if (flags & B_READ)
5540*7c478bd9Sstevel@tonic-gate 			lwp_stat_update(LWP_STAT_INBLK, 1);
5541*7c478bd9Sstevel@tonic-gate 		else
5542*7c478bd9Sstevel@tonic-gate 			lwp_stat_update(LWP_STAT_OUBLK, 1);
5543*7c478bd9Sstevel@tonic-gate 		/*
5544*7c478bd9Sstevel@tonic-gate 		 * If the request is not B_ASYNC, wait for i/o to complete
5545*7c478bd9Sstevel@tonic-gate 		 * and re-assemble the page list to return to the caller.
5546*7c478bd9Sstevel@tonic-gate 		 * If it is B_ASYNC we leave the page list in pieces and
5547*7c478bd9Sstevel@tonic-gate 		 * cleanup() will dispose of them.
5548*7c478bd9Sstevel@tonic-gate 		 */
5549*7c478bd9Sstevel@tonic-gate 		if ((flags & B_ASYNC) == 0) {
5550*7c478bd9Sstevel@tonic-gate 			err = biowait(bp);
5551*7c478bd9Sstevel@tonic-gate 			pageio_done(bp);
5552*7c478bd9Sstevel@tonic-gate 			if (err)
5553*7c478bd9Sstevel@tonic-gate 				break;
5554*7c478bd9Sstevel@tonic-gate 			page_list_concat(&opp, &cpp);
5555*7c478bd9Sstevel@tonic-gate 		}
5556*7c478bd9Sstevel@tonic-gate 		cpp = npp;
5557*7c478bd9Sstevel@tonic-gate 		npp = NULL;
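		/*
		 * For reads, advance by the full page even when the chunk
		 * ended on a fragment boundary; the tail of that page was
		 * zeroed above, so the entire page is now valid and
		 * done_len can reach the page-aligned io_len.
		 */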
5558*7c478bd9Sstevel@tonic-gate 		if (flags & B_READ)
5559*7c478bd9Sstevel@tonic-gate 			cur_len = P2ROUNDUP_TYPED(cur_len, PAGESIZE, size_t);
5560*7c478bd9Sstevel@tonic-gate 		done_len += cur_len;
5561*7c478bd9Sstevel@tonic-gate 	}
5562*7c478bd9Sstevel@tonic-gate 	ASSERT(err || (cpp == NULL && npp == NULL && done_len == io_len));
5563*7c478bd9Sstevel@tonic-gate 	if (err) {
5564*7c478bd9Sstevel@tonic-gate 		if (flags & B_ASYNC) {
5565*7c478bd9Sstevel@tonic-gate 			/* Cleanup unprocessed parts of list */
5566*7c478bd9Sstevel@tonic-gate 			page_list_concat(&cpp, &npp);
5567*7c478bd9Sstevel@tonic-gate 			if (flags & B_READ)
5568*7c478bd9Sstevel@tonic-gate 				pvn_read_done(cpp, B_ERROR);
5569*7c478bd9Sstevel@tonic-gate 			else
5570*7c478bd9Sstevel@tonic-gate 				pvn_write_done(cpp, B_ERROR);
5571*7c478bd9Sstevel@tonic-gate 		} else {
5572*7c478bd9Sstevel@tonic-gate 			/* Re-assemble list and let caller clean up */
5573*7c478bd9Sstevel@tonic-gate 			page_list_concat(&opp, &cpp);
5574*7c478bd9Sstevel@tonic-gate 			page_list_concat(&opp, &npp);
5575*7c478bd9Sstevel@tonic-gate 		}
5576*7c478bd9Sstevel@tonic-gate 	}
5577*7c478bd9Sstevel@tonic-gate 	if (dolock)
5578*7c478bd9Sstevel@tonic-gate 		rw_exit(&ip->i_contents);
5579*7c478bd9Sstevel@tonic-gate 	return (err);
5580*7c478bd9Sstevel@tonic-gate }
5581*7c478bd9Sstevel@tonic-gate 
5582*7c478bd9Sstevel@tonic-gate /*
5583*7c478bd9Sstevel@tonic-gate  * Called when the kernel is in a frozen state to dump data
5584*7c478bd9Sstevel@tonic-gate  * directly to the device. It uses a private dump data structure,
5585*7c478bd9Sstevel@tonic-gate  * set up by dump_ctl, to locate the correct disk block to which to dump.
5586*7c478bd9Sstevel@tonic-gate  */
5587*7c478bd9Sstevel@tonic-gate static int
5588*7c478bd9Sstevel@tonic-gate ufs_dump(vnode_t *vp, caddr_t addr, int ldbn, int dblks)
5589*7c478bd9Sstevel@tonic-gate {
5590*7c478bd9Sstevel@tonic-gate 	u_offset_t	file_size;
5591*7c478bd9Sstevel@tonic-gate 	struct inode    *ip = VTOI(vp);
5592*7c478bd9Sstevel@tonic-gate 	struct fs	*fs = ip->i_fs;
5593*7c478bd9Sstevel@tonic-gate 	daddr_t		dbn, lfsbn;
5594*7c478bd9Sstevel@tonic-gate 	int		disk_blks = fs->fs_bsize >> DEV_BSHIFT;
5595*7c478bd9Sstevel@tonic-gate 	int		error = 0;
5596*7c478bd9Sstevel@tonic-gate 	int		ndbs, nfsbs;
5597*7c478bd9Sstevel@tonic-gate 
5598*7c478bd9Sstevel@tonic-gate 	/*
5599*7c478bd9Sstevel@tonic-gate 	 * forced unmount case
5600*7c478bd9Sstevel@tonic-gate 	 */
5601*7c478bd9Sstevel@tonic-gate 	if (ip->i_ufsvfs == NULL)
5602*7c478bd9Sstevel@tonic-gate 		return (EIO);
5603*7c478bd9Sstevel@tonic-gate 	/*
5604*7c478bd9Sstevel@tonic-gate 	 * Validate that the inode has not been modified since
5605*7c478bd9Sstevel@tonic-gate 	 * the dump structure was allocated.
5606*7c478bd9Sstevel@tonic-gate 	 */
5607*7c478bd9Sstevel@tonic-gate 	mutex_enter(&ip->i_tlock);
5608*7c478bd9Sstevel@tonic-gate 	if ((dump_info == NULL) ||
5609*7c478bd9Sstevel@tonic-gate 	    (dump_info->ip != ip) ||
5610*7c478bd9Sstevel@tonic-gate 	    (dump_info->time.tv_sec != ip->i_mtime.tv_sec) ||
5611*7c478bd9Sstevel@tonic-gate 	    (dump_info->time.tv_usec != ip->i_mtime.tv_usec)) {
5612*7c478bd9Sstevel@tonic-gate 		mutex_exit(&ip->i_tlock);
5613*7c478bd9Sstevel@tonic-gate 		return (-1);
5614*7c478bd9Sstevel@tonic-gate 	}
5615*7c478bd9Sstevel@tonic-gate 	mutex_exit(&ip->i_tlock);
5616*7c478bd9Sstevel@tonic-gate 
5617*7c478bd9Sstevel@tonic-gate 	/*
5618*7c478bd9Sstevel@tonic-gate 	 * See that the file has room for this write
5619*7c478bd9Sstevel@tonic-gate 	 */
5620*7c478bd9Sstevel@tonic-gate 	UFS_GET_ISIZE(&file_size, ip);
5621*7c478bd9Sstevel@tonic-gate 
5622*7c478bd9Sstevel@tonic-gate 	if (ldbtob((offset_t)(ldbn + dblks)) > file_size)
5623*7c478bd9Sstevel@tonic-gate 		return (ENOSPC);
5624*7c478bd9Sstevel@tonic-gate 
5625*7c478bd9Sstevel@tonic-gate 	/*
5626*7c478bd9Sstevel@tonic-gate 	 * Find the physical disk block numbers from the dump
5627*7c478bd9Sstevel@tonic-gate 	 * private data structure directly and write out the data
5628*7c478bd9Sstevel@tonic-gate 	 * in contiguous block lumps
5629*7c478bd9Sstevel@tonic-gate 	 */
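	/*
	 * Illustrative example (assuming an 8K fs_bsize, so disk_blks ==
	 * 16): a request starting at ldbn 20 maps to fs block 1 at sector
	 * offset 20 % 16 == 4 within it, leaving ndbs == 12 sectors in
	 * that block; the inner loop below then grows ndbs for as long as
	 * successive fs blocks are physically adjacent on disk.
	 */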
5630*7c478bd9Sstevel@tonic-gate 	while (dblks > 0 && !error) {
5631*7c478bd9Sstevel@tonic-gate 		lfsbn = (daddr_t)lblkno(fs, ldbtob((offset_t)ldbn));
5632*7c478bd9Sstevel@tonic-gate 		dbn = fsbtodb(fs, dump_info->dblk[lfsbn]) + ldbn % disk_blks;
5633*7c478bd9Sstevel@tonic-gate 		nfsbs = 1;
5634*7c478bd9Sstevel@tonic-gate 		ndbs = disk_blks - ldbn % disk_blks;
5635*7c478bd9Sstevel@tonic-gate 		while (ndbs < dblks && fsbtodb(fs, dump_info->dblk[lfsbn +
5636*7c478bd9Sstevel@tonic-gate 		    nfsbs]) == dbn + ndbs) {
5637*7c478bd9Sstevel@tonic-gate 			nfsbs++;
5638*7c478bd9Sstevel@tonic-gate 			ndbs += disk_blks;
5639*7c478bd9Sstevel@tonic-gate 		}
5640*7c478bd9Sstevel@tonic-gate 		if (ndbs > dblks)
5641*7c478bd9Sstevel@tonic-gate 			ndbs = dblks;
5642*7c478bd9Sstevel@tonic-gate 		error = bdev_dump(ip->i_dev, addr, dbn, ndbs);
5643*7c478bd9Sstevel@tonic-gate 		addr += ldbtob((offset_t)ndbs);
5644*7c478bd9Sstevel@tonic-gate 		dblks -= ndbs;
5645*7c478bd9Sstevel@tonic-gate 		ldbn += ndbs;
5646*7c478bd9Sstevel@tonic-gate 	}
5647*7c478bd9Sstevel@tonic-gate 	return (error);
5648*7c478bd9Sstevel@tonic-gate 
5649*7c478bd9Sstevel@tonic-gate }
5650*7c478bd9Sstevel@tonic-gate 
5651*7c478bd9Sstevel@tonic-gate /*
5652*7c478bd9Sstevel@tonic-gate  * Prepare the file system before and after the dump operation.
5653*7c478bd9Sstevel@tonic-gate  *
5654*7c478bd9Sstevel@tonic-gate  * action = DUMP_ALLOC:
5655*7c478bd9Sstevel@tonic-gate  * Preparation before dump, allocate dump private data structure
5656*7c478bd9Sstevel@tonic-gate  * to hold all the direct and indirect block info for dump.
5657*7c478bd9Sstevel@tonic-gate  *
5658*7c478bd9Sstevel@tonic-gate  * action = DUMP_FREE:
5659*7c478bd9Sstevel@tonic-gate  * Clean up after dump, deallocate the dump private data structure.
5660*7c478bd9Sstevel@tonic-gate  *
5661*7c478bd9Sstevel@tonic-gate  * action = DUMP_SCAN:
5662*7c478bd9Sstevel@tonic-gate  * Scan dump_info for *blkp DEV_BSIZE blocks of contig fs space;
5663*7c478bd9Sstevel@tonic-gate  * if found, the starting file-relative DEV_BSIZE lbn is written
5664*7c478bd9Sstevel@tonic-gate  * to *blkp; that lbn is intended for use with VOP_DUMP()
5665*7c478bd9Sstevel@tonic-gate  */
5666*7c478bd9Sstevel@tonic-gate static int
5667*7c478bd9Sstevel@tonic-gate ufs_dumpctl(vnode_t *vp, int action, int *blkp)
5668*7c478bd9Sstevel@tonic-gate {
5669*7c478bd9Sstevel@tonic-gate 	struct inode	*ip = VTOI(vp);
5670*7c478bd9Sstevel@tonic-gate 	ufsvfs_t	*ufsvfsp = ip->i_ufsvfs;
5671*7c478bd9Sstevel@tonic-gate 	struct fs	*fs;
5672*7c478bd9Sstevel@tonic-gate 	daddr32_t	*dblk, *storeblk;
5673*7c478bd9Sstevel@tonic-gate 	daddr32_t	*nextblk, *endblk;
5674*7c478bd9Sstevel@tonic-gate 	struct buf	*bp;
5675*7c478bd9Sstevel@tonic-gate 	int		i, entry, entries;
5676*7c478bd9Sstevel@tonic-gate 	int		n, ncontig;
5677*7c478bd9Sstevel@tonic-gate 
5678*7c478bd9Sstevel@tonic-gate 	/*
5679*7c478bd9Sstevel@tonic-gate 	 * check for forced unmount
5680*7c478bd9Sstevel@tonic-gate 	 */
5681*7c478bd9Sstevel@tonic-gate 	if (ufsvfsp == NULL)
5682*7c478bd9Sstevel@tonic-gate 		return (EIO);
5683*7c478bd9Sstevel@tonic-gate 
5684*7c478bd9Sstevel@tonic-gate 	if (action == DUMP_ALLOC) {
5685*7c478bd9Sstevel@tonic-gate 		/*
5686*7c478bd9Sstevel@tonic-gate 		 * alloc and record dump_info
5687*7c478bd9Sstevel@tonic-gate 		 */
5688*7c478bd9Sstevel@tonic-gate 		if (dump_info != NULL)
5689*7c478bd9Sstevel@tonic-gate 			return (EINVAL);
5690*7c478bd9Sstevel@tonic-gate 
5691*7c478bd9Sstevel@tonic-gate 		ASSERT(vp->v_type == VREG);
5692*7c478bd9Sstevel@tonic-gate 		fs = ufsvfsp->vfs_fs;
5693*7c478bd9Sstevel@tonic-gate 
5694*7c478bd9Sstevel@tonic-gate 		rw_enter(&ip->i_contents, RW_READER);
5695*7c478bd9Sstevel@tonic-gate 
5696*7c478bd9Sstevel@tonic-gate 		if (bmap_has_holes(ip)) {
5697*7c478bd9Sstevel@tonic-gate 			rw_exit(&ip->i_contents);
5698*7c478bd9Sstevel@tonic-gate 			return (EFAULT);
5699*7c478bd9Sstevel@tonic-gate 		}
5700*7c478bd9Sstevel@tonic-gate 
5701*7c478bd9Sstevel@tonic-gate 		/*
5702*7c478bd9Sstevel@tonic-gate 		 * calculate and allocate space needed according to i_size
5703*7c478bd9Sstevel@tonic-gate 		 */
5704*7c478bd9Sstevel@tonic-gate 		entries = (int)lblkno(fs, blkroundup(fs, ip->i_size));
5705*7c478bd9Sstevel@tonic-gate 		if ((dump_info = (struct dump *)
5706*7c478bd9Sstevel@tonic-gate 		    kmem_alloc(sizeof (struct dump) +
5707*7c478bd9Sstevel@tonic-gate 		    (entries - 1) * sizeof (daddr32_t), KM_NOSLEEP)) == NULL) {
5708*7c478bd9Sstevel@tonic-gate 			rw_exit(&ip->i_contents);
5709*7c478bd9Sstevel@tonic-gate 			return (ENOMEM);
5710*7c478bd9Sstevel@tonic-gate 		}
5711*7c478bd9Sstevel@tonic-gate 
5712*7c478bd9Sstevel@tonic-gate 		/* Start saving the info */
5713*7c478bd9Sstevel@tonic-gate 		dump_info->fsbs = entries;
5714*7c478bd9Sstevel@tonic-gate 		dump_info->ip = ip;
5715*7c478bd9Sstevel@tonic-gate 		storeblk = &dump_info->dblk[0];
5716*7c478bd9Sstevel@tonic-gate 
5717*7c478bd9Sstevel@tonic-gate 		/* Direct Blocks */
5718*7c478bd9Sstevel@tonic-gate 		for (entry = 0; entry < NDADDR && entry < entries; entry++)
5719*7c478bd9Sstevel@tonic-gate 			*storeblk++ = ip->i_db[entry];
5720*7c478bd9Sstevel@tonic-gate 
5721*7c478bd9Sstevel@tonic-gate 		/* Indirect Blocks */
5722*7c478bd9Sstevel@tonic-gate 		for (i = 0; i < NIADDR; i++) {
5723*7c478bd9Sstevel@tonic-gate 			int error = 0;
5724*7c478bd9Sstevel@tonic-gate 
5725*7c478bd9Sstevel@tonic-gate 			bp = UFS_BREAD(ufsvfsp,
5726*7c478bd9Sstevel@tonic-gate 				ip->i_dev, fsbtodb(fs, ip->i_ib[i]),
5727*7c478bd9Sstevel@tonic-gate 				fs->fs_bsize);
5728*7c478bd9Sstevel@tonic-gate 			if (bp->b_flags & B_ERROR)
5729*7c478bd9Sstevel@tonic-gate 				error = EIO;
5730*7c478bd9Sstevel@tonic-gate 			else {
5731*7c478bd9Sstevel@tonic-gate 				dblk = bp->b_un.b_daddr;
5732*7c478bd9Sstevel@tonic-gate 				if ((storeblk = save_dblks(ip, ufsvfsp,
5733*7c478bd9Sstevel@tonic-gate 				    storeblk, dblk, i, entries)) == NULL)
5734*7c478bd9Sstevel@tonic-gate 					error = EIO;
5735*7c478bd9Sstevel@tonic-gate 			}
5736*7c478bd9Sstevel@tonic-gate 
5737*7c478bd9Sstevel@tonic-gate 			brelse(bp);
5738*7c478bd9Sstevel@tonic-gate 
5739*7c478bd9Sstevel@tonic-gate 			if (error != 0) {
5740*7c478bd9Sstevel@tonic-gate 				kmem_free(dump_info, sizeof (struct dump) +
5741*7c478bd9Sstevel@tonic-gate 				    (entries - 1) * sizeof (daddr32_t));
5742*7c478bd9Sstevel@tonic-gate 				rw_exit(&ip->i_contents);
5743*7c478bd9Sstevel@tonic-gate 				dump_info = NULL;
5744*7c478bd9Sstevel@tonic-gate 				return (error);
5745*7c478bd9Sstevel@tonic-gate 			}
5746*7c478bd9Sstevel@tonic-gate 		}
5747*7c478bd9Sstevel@tonic-gate 		/* and time stamp the information */
5748*7c478bd9Sstevel@tonic-gate 		mutex_enter(&ip->i_tlock);
5749*7c478bd9Sstevel@tonic-gate 		dump_info->time = ip->i_mtime;
5750*7c478bd9Sstevel@tonic-gate 		mutex_exit(&ip->i_tlock);
5751*7c478bd9Sstevel@tonic-gate 
5752*7c478bd9Sstevel@tonic-gate 		rw_exit(&ip->i_contents);
5753*7c478bd9Sstevel@tonic-gate 	} else if (action == DUMP_FREE) {
5754*7c478bd9Sstevel@tonic-gate 		/*
5755*7c478bd9Sstevel@tonic-gate 		 * free dump_info
5756*7c478bd9Sstevel@tonic-gate 		 */
5757*7c478bd9Sstevel@tonic-gate 		if (dump_info == NULL)
5758*7c478bd9Sstevel@tonic-gate 			return (EINVAL);
5759*7c478bd9Sstevel@tonic-gate 		entries = dump_info->fsbs - 1;
5760*7c478bd9Sstevel@tonic-gate 		kmem_free(dump_info, sizeof (struct dump) +
5761*7c478bd9Sstevel@tonic-gate 		    entries * sizeof (daddr32_t));
5762*7c478bd9Sstevel@tonic-gate 		dump_info = NULL;
5763*7c478bd9Sstevel@tonic-gate 	} else if (action == DUMP_SCAN) {
5764*7c478bd9Sstevel@tonic-gate 		/*
5765*7c478bd9Sstevel@tonic-gate 		 * scan dump_info
5766*7c478bd9Sstevel@tonic-gate 		 */
5767*7c478bd9Sstevel@tonic-gate 		if (dump_info == NULL)
5768*7c478bd9Sstevel@tonic-gate 			return (EINVAL);
5769*7c478bd9Sstevel@tonic-gate 
5770*7c478bd9Sstevel@tonic-gate 		dblk = dump_info->dblk;
5771*7c478bd9Sstevel@tonic-gate 		nextblk = dblk + 1;
5772*7c478bd9Sstevel@tonic-gate 		endblk = dblk + dump_info->fsbs - 1;
5773*7c478bd9Sstevel@tonic-gate 		fs = ufsvfsp->vfs_fs;
5774*7c478bd9Sstevel@tonic-gate 		ncontig = *blkp >> (fs->fs_bshift - DEV_BSHIFT);
5775*7c478bd9Sstevel@tonic-gate 
5776*7c478bd9Sstevel@tonic-gate 		/*
5777*7c478bd9Sstevel@tonic-gate 		 * scan dblk[] entries; contig fs space is found when:
5778*7c478bd9Sstevel@tonic-gate 		 * ((current blkno + frags per block) == next blkno)
5779*7c478bd9Sstevel@tonic-gate 		 */
5780*7c478bd9Sstevel@tonic-gate 		n = 0;
5781*7c478bd9Sstevel@tonic-gate 		while (n < ncontig && dblk < endblk) {
5782*7c478bd9Sstevel@tonic-gate 			if ((*dblk + fs->fs_frag) == *nextblk)
5783*7c478bd9Sstevel@tonic-gate 				n++;
5784*7c478bd9Sstevel@tonic-gate 			else
5785*7c478bd9Sstevel@tonic-gate 				n = 0;
5786*7c478bd9Sstevel@tonic-gate 			dblk++;
5787*7c478bd9Sstevel@tonic-gate 			nextblk++;
5788*7c478bd9Sstevel@tonic-gate 		}
5789*7c478bd9Sstevel@tonic-gate 
5790*7c478bd9Sstevel@tonic-gate 		/*
5791*7c478bd9Sstevel@tonic-gate 		 * index is where size bytes of contig space begins;
5792*7c478bd9Sstevel@tonic-gate 		 * conversion from index to the file's DEV_BSIZE lbn
5793*7c478bd9Sstevel@tonic-gate 		 * is equivalent to:  (index * fs_bsize) / DEV_BSIZE
5794*7c478bd9Sstevel@tonic-gate 		 */
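		/*
		 * E.g. (assuming fs_bshift == 13, i.e. 8K blocks): an
		 * index of 3 yields lbn 3 << (13 - 9) == 48, since each
		 * fs block spans 16 DEV_BSIZE sectors.
		 */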
5795*7c478bd9Sstevel@tonic-gate 		if (n == ncontig) {
5796*7c478bd9Sstevel@tonic-gate 			i = (dblk - dump_info->dblk) - ncontig;
5797*7c478bd9Sstevel@tonic-gate 			*blkp = i << (fs->fs_bshift - DEV_BSHIFT);
5798*7c478bd9Sstevel@tonic-gate 		} else
5799*7c478bd9Sstevel@tonic-gate 			return (EFAULT);
5800*7c478bd9Sstevel@tonic-gate 	}
5801*7c478bd9Sstevel@tonic-gate 	return (0);
5802*7c478bd9Sstevel@tonic-gate }
5803*7c478bd9Sstevel@tonic-gate 
5804*7c478bd9Sstevel@tonic-gate /*
5805*7c478bd9Sstevel@tonic-gate  * Recursive helper function for ufs_dumpctl().  It follows the indirect file
5806*7c478bd9Sstevel@tonic-gate  * system blocks until it reaches the disk block addresses, which are
5807*7c478bd9Sstevel@tonic-gate  * then stored into the given buffer, storeblk.
5808*7c478bd9Sstevel@tonic-gate  */
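/*
 * The recursion depth mirrors the indirection level: ufs_dumpctl() calls
 * this with level 0 for the single indirect block (whose entries are
 * already data block addresses and are copied directly), level 1 for the
 * double indirect block, and so on up to NIADDR - 1.
 */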
5809*7c478bd9Sstevel@tonic-gate static daddr32_t *
5810*7c478bd9Sstevel@tonic-gate save_dblks(struct inode *ip, struct ufsvfs *ufsvfsp,  daddr32_t *storeblk,
5811*7c478bd9Sstevel@tonic-gate     daddr32_t *dblk, int level, int entries)
5812*7c478bd9Sstevel@tonic-gate {
5813*7c478bd9Sstevel@tonic-gate 	struct fs	*fs = ufsvfsp->vfs_fs;
5814*7c478bd9Sstevel@tonic-gate 	struct buf	*bp;
5815*7c478bd9Sstevel@tonic-gate 	int		i;
5816*7c478bd9Sstevel@tonic-gate 
5817*7c478bd9Sstevel@tonic-gate 	if (level == 0) {
5818*7c478bd9Sstevel@tonic-gate 		for (i = 0; i < NINDIR(fs); i++) {
5819*7c478bd9Sstevel@tonic-gate 			if (storeblk - dump_info->dblk >= entries)
5820*7c478bd9Sstevel@tonic-gate 				break;
5821*7c478bd9Sstevel@tonic-gate 			*storeblk++ = dblk[i];
5822*7c478bd9Sstevel@tonic-gate 		}
5823*7c478bd9Sstevel@tonic-gate 		return (storeblk);
5824*7c478bd9Sstevel@tonic-gate 	}
5825*7c478bd9Sstevel@tonic-gate 	for (i = 0; i < NINDIR(fs); i++) {
5826*7c478bd9Sstevel@tonic-gate 		if (storeblk - dump_info->dblk >= entries)
5827*7c478bd9Sstevel@tonic-gate 			break;
5828*7c478bd9Sstevel@tonic-gate 		bp = UFS_BREAD(ufsvfsp,
5829*7c478bd9Sstevel@tonic-gate 				ip->i_dev, fsbtodb(fs, dblk[i]), fs->fs_bsize);
5830*7c478bd9Sstevel@tonic-gate 		if (bp->b_flags & B_ERROR) {
5831*7c478bd9Sstevel@tonic-gate 			brelse(bp);
5832*7c478bd9Sstevel@tonic-gate 			return (NULL);
5833*7c478bd9Sstevel@tonic-gate 		}
5834*7c478bd9Sstevel@tonic-gate 		storeblk = save_dblks(ip, ufsvfsp, storeblk, bp->b_un.b_daddr,
5835*7c478bd9Sstevel@tonic-gate 		    level - 1, entries);
5836*7c478bd9Sstevel@tonic-gate 		brelse(bp);
5837*7c478bd9Sstevel@tonic-gate 
5838*7c478bd9Sstevel@tonic-gate 		if (storeblk == NULL)
5839*7c478bd9Sstevel@tonic-gate 			return (NULL);
5840*7c478bd9Sstevel@tonic-gate 	}
5841*7c478bd9Sstevel@tonic-gate 	return (storeblk);
5842*7c478bd9Sstevel@tonic-gate }
5843*7c478bd9Sstevel@tonic-gate 
5844*7c478bd9Sstevel@tonic-gate /* ARGSUSED */
5845*7c478bd9Sstevel@tonic-gate static int
5846*7c478bd9Sstevel@tonic-gate ufs_getsecattr(struct vnode *vp, vsecattr_t *vsap, int flag,
5847*7c478bd9Sstevel@tonic-gate 	struct cred *cr)
5848*7c478bd9Sstevel@tonic-gate {
5849*7c478bd9Sstevel@tonic-gate 	struct inode	*ip = VTOI(vp);
5850*7c478bd9Sstevel@tonic-gate 	struct ulockfs	*ulp;
5851*7c478bd9Sstevel@tonic-gate 	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
5852*7c478bd9Sstevel@tonic-gate 	ulong_t		vsa_mask = vsap->vsa_mask;
5853*7c478bd9Sstevel@tonic-gate 	int		err = EINVAL;
5854*7c478bd9Sstevel@tonic-gate 
5855*7c478bd9Sstevel@tonic-gate 	TRACE_3(TR_FAC_UFS, TR_UFS_GETSECATTR_START,
5856*7c478bd9Sstevel@tonic-gate 	    "ufs_getsecattr_start:vp %p, vsap %p, flags %x", vp, vsap, flag);
5857*7c478bd9Sstevel@tonic-gate 
5858*7c478bd9Sstevel@tonic-gate 	vsa_mask &= (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT);
5859*7c478bd9Sstevel@tonic-gate 
5860*7c478bd9Sstevel@tonic-gate 	/*
5861*7c478bd9Sstevel@tonic-gate 	 * Only grab locks if needed: they are not needed to check vsa_mask,
5862*7c478bd9Sstevel@tonic-gate 	 * nor if the mask contains no ACL flags.
5863*7c478bd9Sstevel@tonic-gate 	 */
5864*7c478bd9Sstevel@tonic-gate 	if (vsa_mask != 0) {
5865*7c478bd9Sstevel@tonic-gate 		if (err = ufs_lockfs_begin(ufsvfsp, &ulp,
5866*7c478bd9Sstevel@tonic-gate 		    ULOCKFS_GETATTR_MASK))
5867*7c478bd9Sstevel@tonic-gate 			return (err);
5868*7c478bd9Sstevel@tonic-gate 
5869*7c478bd9Sstevel@tonic-gate 		rw_enter(&ip->i_contents, RW_READER);
5870*7c478bd9Sstevel@tonic-gate 		err = ufs_acl_get(ip, vsap, flag, cr);
5871*7c478bd9Sstevel@tonic-gate 		rw_exit(&ip->i_contents);
5872*7c478bd9Sstevel@tonic-gate 
5873*7c478bd9Sstevel@tonic-gate 		if (ulp)
5874*7c478bd9Sstevel@tonic-gate 			ufs_lockfs_end(ulp);
5875*7c478bd9Sstevel@tonic-gate 	}
5876*7c478bd9Sstevel@tonic-gate 	TRACE_1(TR_FAC_UFS, TR_UFS_GETSECATTR_END,
5877*7c478bd9Sstevel@tonic-gate 	    "ufs_getsecattr_end:vp %p", vp);
5878*7c478bd9Sstevel@tonic-gate 	return (err);
5879*7c478bd9Sstevel@tonic-gate }
5880*7c478bd9Sstevel@tonic-gate 
5881*7c478bd9Sstevel@tonic-gate /* ARGSUSED */
5882*7c478bd9Sstevel@tonic-gate static int
5883*7c478bd9Sstevel@tonic-gate ufs_setsecattr(struct vnode *vp, vsecattr_t *vsap, int flag, struct cred *cr)
5884*7c478bd9Sstevel@tonic-gate {
5885*7c478bd9Sstevel@tonic-gate 	struct inode	*ip = VTOI(vp);
5886*7c478bd9Sstevel@tonic-gate 	struct ulockfs	*ulp = NULL;
5887*7c478bd9Sstevel@tonic-gate 	struct ufsvfs	*ufsvfsp = VTOI(vp)->i_ufsvfs;
5888*7c478bd9Sstevel@tonic-gate 	ulong_t		vsa_mask = vsap->vsa_mask;
5889*7c478bd9Sstevel@tonic-gate 	int		err;
5890*7c478bd9Sstevel@tonic-gate 	int		haverwlock = 1;
5891*7c478bd9Sstevel@tonic-gate 	int		trans_size;
5892*7c478bd9Sstevel@tonic-gate 	int		donetrans = 0;
5893*7c478bd9Sstevel@tonic-gate 	int		retry = 1;
5894*7c478bd9Sstevel@tonic-gate 
5895*7c478bd9Sstevel@tonic-gate 
5896*7c478bd9Sstevel@tonic-gate 	TRACE_3(TR_FAC_UFS, TR_UFS_SETSECATTR_START,
5897*7c478bd9Sstevel@tonic-gate 	    "ufs_setsecattr_start:vp %p, vsap %p, flags %x", vp, vsap, flag);
5898*7c478bd9Sstevel@tonic-gate 
5899*7c478bd9Sstevel@tonic-gate 	ASSERT(RW_LOCK_HELD(&ip->i_rwlock));
5900*7c478bd9Sstevel@tonic-gate 
5901*7c478bd9Sstevel@tonic-gate 	/* Abort now if the request is either empty or invalid. */
5902*7c478bd9Sstevel@tonic-gate 	vsa_mask &= (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT);
5903*7c478bd9Sstevel@tonic-gate 	if ((vsa_mask == 0) ||
5904*7c478bd9Sstevel@tonic-gate 	    ((vsap->vsa_aclentp == NULL) &&
5905*7c478bd9Sstevel@tonic-gate 	    (vsap->vsa_dfaclentp == NULL))) {
5906*7c478bd9Sstevel@tonic-gate 		err = EINVAL;
5907*7c478bd9Sstevel@tonic-gate 		goto out;
5908*7c478bd9Sstevel@tonic-gate 	}
5909*7c478bd9Sstevel@tonic-gate 
5910*7c478bd9Sstevel@tonic-gate 	/*
5911*7c478bd9Sstevel@tonic-gate 	 * Following convention, if this is a directory then we acquire the
5912*7c478bd9Sstevel@tonic-gate 	 * inode's i_rwlock after starting a UFS logging transaction;
5913*7c478bd9Sstevel@tonic-gate 	 * otherwise, we acquire it beforehand. Since we were called (and
5914*7c478bd9Sstevel@tonic-gate 	 * must therefore return) with the lock held, we will have to drop it,
5915*7c478bd9Sstevel@tonic-gate 	 * and later reacquire it, if operating on a directory.
5916*7c478bd9Sstevel@tonic-gate 	 */
5917*7c478bd9Sstevel@tonic-gate 	if (vp->v_type == VDIR) {
5918*7c478bd9Sstevel@tonic-gate 		rw_exit(&ip->i_rwlock);
5919*7c478bd9Sstevel@tonic-gate 		haverwlock = 0;
5920*7c478bd9Sstevel@tonic-gate 	} else {
5921*7c478bd9Sstevel@tonic-gate 		/* Upgrade the lock if required. */
5922*7c478bd9Sstevel@tonic-gate 		if (!rw_write_held(&ip->i_rwlock)) {
5923*7c478bd9Sstevel@tonic-gate 			rw_exit(&ip->i_rwlock);
5924*7c478bd9Sstevel@tonic-gate 			rw_enter(&ip->i_rwlock, RW_WRITER);
5925*7c478bd9Sstevel@tonic-gate 		}
5926*7c478bd9Sstevel@tonic-gate 	}
5927*7c478bd9Sstevel@tonic-gate 
5928*7c478bd9Sstevel@tonic-gate again:
5929*7c478bd9Sstevel@tonic-gate 	ASSERT(!(vp->v_type == VDIR && haverwlock));
5930*7c478bd9Sstevel@tonic-gate 	if (err = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SETATTR_MASK)) {
5931*7c478bd9Sstevel@tonic-gate 		ulp = NULL;
5932*7c478bd9Sstevel@tonic-gate 		retry = 0;
5933*7c478bd9Sstevel@tonic-gate 		goto out;
5934*7c478bd9Sstevel@tonic-gate 	}
5935*7c478bd9Sstevel@tonic-gate 
5936*7c478bd9Sstevel@tonic-gate 	/*
5937*7c478bd9Sstevel@tonic-gate 	 * Check that the file system supports this operation. Note that
5938*7c478bd9Sstevel@tonic-gate 	 * ufs_lockfs_begin() will have checked that the file system had
5939*7c478bd9Sstevel@tonic-gate 	 * not been forcibly unmounted.
5940*7c478bd9Sstevel@tonic-gate 	 */
5941*7c478bd9Sstevel@tonic-gate 	if (ufsvfsp->vfs_fs->fs_ronly) {
5942*7c478bd9Sstevel@tonic-gate 		err = EROFS;
5943*7c478bd9Sstevel@tonic-gate 		goto out;
5944*7c478bd9Sstevel@tonic-gate 	}
5945*7c478bd9Sstevel@tonic-gate 	if (ufsvfsp->vfs_nosetsec) {
5946*7c478bd9Sstevel@tonic-gate 		err = ENOSYS;
5947*7c478bd9Sstevel@tonic-gate 		goto out;
5948*7c478bd9Sstevel@tonic-gate 	}
5949*7c478bd9Sstevel@tonic-gate 
5950*7c478bd9Sstevel@tonic-gate 	if (ulp) {
5951*7c478bd9Sstevel@tonic-gate 		TRANS_BEGIN_ASYNC(ufsvfsp, TOP_SETSECATTR,
5952*7c478bd9Sstevel@tonic-gate 			trans_size = TOP_SETSECATTR_SIZE(VTOI(vp)));
5953*7c478bd9Sstevel@tonic-gate 		donetrans = 1;
5954*7c478bd9Sstevel@tonic-gate 	}
5955*7c478bd9Sstevel@tonic-gate 
5956*7c478bd9Sstevel@tonic-gate 	if (vp->v_type == VDIR) {
5957*7c478bd9Sstevel@tonic-gate 		rw_enter(&ip->i_rwlock, RW_WRITER);
5958*7c478bd9Sstevel@tonic-gate 		haverwlock = 1;
5959*7c478bd9Sstevel@tonic-gate 	}
5960*7c478bd9Sstevel@tonic-gate 
5961*7c478bd9Sstevel@tonic-gate 	ASSERT(haverwlock);
5962*7c478bd9Sstevel@tonic-gate 
5963*7c478bd9Sstevel@tonic-gate 	/* Do the actual work. */
5964*7c478bd9Sstevel@tonic-gate 	rw_enter(&ip->i_contents, RW_WRITER);
5965*7c478bd9Sstevel@tonic-gate 	/*
5966*7c478bd9Sstevel@tonic-gate 	 * Suppress out of inodes messages if we will retry.
5967*7c478bd9Sstevel@tonic-gate 	 */
5968*7c478bd9Sstevel@tonic-gate 	if (retry)
5969*7c478bd9Sstevel@tonic-gate 		ip->i_flag |= IQUIET;
5970*7c478bd9Sstevel@tonic-gate 	err = ufs_acl_set(ip, vsap, flag, cr);
5971*7c478bd9Sstevel@tonic-gate 	ip->i_flag &= ~IQUIET;
5972*7c478bd9Sstevel@tonic-gate 	rw_exit(&ip->i_contents);
5973*7c478bd9Sstevel@tonic-gate 
5974*7c478bd9Sstevel@tonic-gate out:
5975*7c478bd9Sstevel@tonic-gate 	if (ulp) {
5976*7c478bd9Sstevel@tonic-gate 		if (donetrans) {
5977*7c478bd9Sstevel@tonic-gate 			/*
5978*7c478bd9Sstevel@tonic-gate 			 * top_end_async() can eventually call
5979*7c478bd9Sstevel@tonic-gate 			 * top_end_sync(), which can block. We must
5980*7c478bd9Sstevel@tonic-gate 			 * therefore observe the lock-ordering protocol
5981*7c478bd9Sstevel@tonic-gate 			 * here as well.
5982*7c478bd9Sstevel@tonic-gate 			 */
5983*7c478bd9Sstevel@tonic-gate 			if (vp->v_type == VDIR) {
5984*7c478bd9Sstevel@tonic-gate 				rw_exit(&ip->i_rwlock);
5985*7c478bd9Sstevel@tonic-gate 				haverwlock = 0;
5986*7c478bd9Sstevel@tonic-gate 			}
5987*7c478bd9Sstevel@tonic-gate 			TRANS_END_ASYNC(ufsvfsp, TOP_SETSECATTR, trans_size);
5988*7c478bd9Sstevel@tonic-gate 		}
5989*7c478bd9Sstevel@tonic-gate 		ufs_lockfs_end(ulp);
5990*7c478bd9Sstevel@tonic-gate 	}
5991*7c478bd9Sstevel@tonic-gate 	/*
5992*7c478bd9Sstevel@tonic-gate 	 * If no inodes available, try scaring a logically-
5993*7c478bd9Sstevel@tonic-gate 	 * free one out of the delete queue to someplace
5994*7c478bd9Sstevel@tonic-gate 	 * that we can find it.
5995*7c478bd9Sstevel@tonic-gate 	 */
5996*7c478bd9Sstevel@tonic-gate 	if ((err == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
5997*7c478bd9Sstevel@tonic-gate 		ufs_delete_drain_wait(ufsvfsp, 1);
5998*7c478bd9Sstevel@tonic-gate 		retry = 0;
5999*7c478bd9Sstevel@tonic-gate 		if (vp->v_type == VDIR && haverwlock) {
6000*7c478bd9Sstevel@tonic-gate 			rw_exit(&ip->i_rwlock);
6001*7c478bd9Sstevel@tonic-gate 			haverwlock = 0;
6002*7c478bd9Sstevel@tonic-gate 		}
6003*7c478bd9Sstevel@tonic-gate 		goto again;
6004*7c478bd9Sstevel@tonic-gate 	}
6005*7c478bd9Sstevel@tonic-gate 	/*
6006*7c478bd9Sstevel@tonic-gate 	 * If we need to reacquire the lock then it is safe to do so
6007*7c478bd9Sstevel@tonic-gate 	 * as a reader. This is because ufs_rwunlock(), which will be
6008*7c478bd9Sstevel@tonic-gate 	 * called by our caller after we return, does not differentiate
6009*7c478bd9Sstevel@tonic-gate 	 * between shared and exclusive locks.
6010*7c478bd9Sstevel@tonic-gate 	 */
6011*7c478bd9Sstevel@tonic-gate 	if (!haverwlock) {
6012*7c478bd9Sstevel@tonic-gate 		ASSERT(vp->v_type == VDIR);
6013*7c478bd9Sstevel@tonic-gate 		rw_enter(&ip->i_rwlock, RW_READER);
6014*7c478bd9Sstevel@tonic-gate 	}
6015*7c478bd9Sstevel@tonic-gate 
6016*7c478bd9Sstevel@tonic-gate 	TRACE_1(TR_FAC_UFS, TR_UFS_SETSECATTR_END,
6017*7c478bd9Sstevel@tonic-gate 	    "ufs_setsecattr_end:vp %p", vp);
6018*7c478bd9Sstevel@tonic-gate 	return (err);
6019*7c478bd9Sstevel@tonic-gate }
6020