xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs4_vnops.c (revision 6dc7d057)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2016 STRATO AG. All rights reserved.
24  */
25 
26 /*
27  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
28  */
29 
30 /*
31  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
32  * Use is subject to license terms.
33  */
34 
35 /*
36  *	Copyright 1983,1984,1985,1986,1987,1988,1989 AT&T.
37  *	All Rights Reserved
38  */
39 
40 /*
41  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
42  */
43 
44 #include <sys/param.h>
45 #include <sys/types.h>
46 #include <sys/systm.h>
47 #include <sys/cred.h>
48 #include <sys/time.h>
49 #include <sys/vnode.h>
50 #include <sys/vfs.h>
51 #include <sys/vfs_opreg.h>
52 #include <sys/file.h>
53 #include <sys/filio.h>
54 #include <sys/uio.h>
55 #include <sys/buf.h>
56 #include <sys/mman.h>
57 #include <sys/pathname.h>
58 #include <sys/dirent.h>
59 #include <sys/debug.h>
60 #include <sys/vmsystm.h>
61 #include <sys/fcntl.h>
62 #include <sys/flock.h>
63 #include <sys/swap.h>
64 #include <sys/errno.h>
65 #include <sys/strsubr.h>
66 #include <sys/sysmacros.h>
67 #include <sys/kmem.h>
68 #include <sys/cmn_err.h>
69 #include <sys/pathconf.h>
70 #include <sys/utsname.h>
71 #include <sys/dnlc.h>
72 #include <sys/acl.h>
73 #include <sys/systeminfo.h>
74 #include <sys/policy.h>
75 #include <sys/sdt.h>
76 #include <sys/list.h>
77 #include <sys/stat.h>
78 #include <sys/zone.h>
79 
80 #include <rpc/types.h>
81 #include <rpc/auth.h>
82 #include <rpc/clnt.h>
83 
84 #include <nfs/nfs.h>
85 #include <nfs/nfs_clnt.h>
86 #include <nfs/nfs_acl.h>
87 #include <nfs/lm.h>
88 #include <nfs/nfs4.h>
89 #include <nfs/nfs4_kprot.h>
90 #include <nfs/rnode4.h>
91 #include <nfs/nfs4_clnt.h>
92 
93 #include <vm/hat.h>
94 #include <vm/as.h>
95 #include <vm/page.h>
96 #include <vm/pvn.h>
97 #include <vm/seg.h>
98 #include <vm/seg_map.h>
99 #include <vm/seg_kpm.h>
100 #include <vm/seg_vn.h>
101 
102 #include <fs/fs_subr.h>
103 
104 #include <sys/ddi.h>
105 #include <sys/int_fmtio.h>
106 #include <sys/fs/autofs.h>
107 
/*
 * Directory attribute information handed to nfs4_update_dircaches()
 * (see its prototype in this file).
 */
typedef struct {
	/* GETATTR results; presumably those of the directory -- confirm */
	nfs4_ga_res_t	*di_garp;
	/* credential that accompanies the attributes */
	cred_t		*di_cred;
	/* NOTE(review): looks like the time the OTW call was made -- confirm */
	hrtime_t	di_time_call;
} dirattr_info_t;
113 
/*
 * Identifies the ACL operation (get or set) being performed; this is the
 * second argument of nfs4_is_acl_mask_valid().
 */
typedef enum nfs4_acl_op {
	NFS4_ACL_GET = 0,
	NFS4_ACL_SET = 1
} nfs4_acl_op_t;
118 
119 static struct lm_sysid *nfs4_find_sysid(mntinfo4_t *);
120 
121 static void	nfs4_update_dircaches(change_info4 *, vnode_t *, vnode_t *,
122 			char *, dirattr_info_t *);
123 
124 static void	nfs4close_otw(rnode4_t *, cred_t *, nfs4_open_owner_t *,
125 		    nfs4_open_stream_t *, int *, int *, nfs4_close_type_t,
126 		    nfs4_error_t *, int *);
127 static int	nfs4_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
128 			cred_t *);
129 static int	nfs4write(vnode_t *, caddr_t, u_offset_t, int, cred_t *,
130 			stable_how4 *);
131 static int	nfs4read(vnode_t *, caddr_t, offset_t, int, size_t *,
132 			cred_t *, bool_t, struct uio *);
133 static int	nfs4setattr(vnode_t *, struct vattr *, int, cred_t *,
134 			vsecattr_t *);
135 static int	nfs4openattr(vnode_t *, vnode_t **, int, cred_t *);
136 static int	nfs4lookup(vnode_t *, char *, vnode_t **, cred_t *, int);
137 static int	nfs4lookup_xattr(vnode_t *, char *, vnode_t **, int, cred_t *);
138 static int	nfs4lookupvalidate_otw(vnode_t *, char *, vnode_t **, cred_t *);
139 static int	nfs4lookupnew_otw(vnode_t *, char *, vnode_t **, cred_t *);
140 static int	nfs4mknod(vnode_t *, char *, struct vattr *, enum vcexcl,
141 			int, vnode_t **, cred_t *);
142 static int	nfs4open_otw(vnode_t *, char *, struct vattr *, vnode_t **,
143 			cred_t *, int, int, enum createmode4, int);
144 static int	nfs4rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
145 			caller_context_t *);
146 static int	nfs4rename_persistent_fh(vnode_t *, char *, vnode_t *,
147 			vnode_t *, char *, cred_t *, nfsstat4 *);
148 static int	nfs4rename_volatile_fh(vnode_t *, char *, vnode_t *,
149 			vnode_t *, char *, cred_t *, nfsstat4 *);
150 static int	do_nfs4readdir(vnode_t *, rddir4_cache *, cred_t *);
151 static void	nfs4readdir(vnode_t *, rddir4_cache *, cred_t *);
152 static int	nfs4_bio(struct buf *, stable_how4 *, cred_t *, bool_t);
153 static int	nfs4_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
154 			page_t *[], size_t, struct seg *, caddr_t,
155 			enum seg_rw, cred_t *);
156 static void	nfs4_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
157 			cred_t *);
158 static int	nfs4_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
159 			int, cred_t *);
160 static int	nfs4_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
161 			int, cred_t *);
162 static int	nfs4_commit(vnode_t *, offset4, count4, cred_t *);
163 static void	nfs4_set_mod(vnode_t *);
164 static void	nfs4_get_commit(vnode_t *);
165 static void	nfs4_get_commit_range(vnode_t *, u_offset_t, size_t);
166 static int	nfs4_putpage_commit(vnode_t *, offset_t, size_t, cred_t *);
167 static int	nfs4_commit_vp(vnode_t *, u_offset_t, size_t, cred_t *, int);
168 static int	nfs4_sync_commit(vnode_t *, page_t *, offset3, count3,
169 			cred_t *);
170 static void	do_nfs4_async_commit(vnode_t *, page_t *, offset3, count3,
171 			cred_t *);
172 static int	nfs4_update_attrcache(nfsstat4, nfs4_ga_res_t *,
173 			hrtime_t, vnode_t *, cred_t *);
174 static int	nfs4_open_non_reg_file(vnode_t **, int, cred_t *);
175 static int	nfs4_safelock(vnode_t *, const struct flock64 *, cred_t *);
176 static void	nfs4_register_lock_locally(vnode_t *, struct flock64 *, int,
177 			u_offset_t);
178 static int	nfs4_lockrelease(vnode_t *, int, offset_t, cred_t *);
179 static int	nfs4_block_and_wait(clock_t *, rnode4_t *);
180 static cred_t  *state_to_cred(nfs4_open_stream_t *);
181 static void	denied_to_flk(LOCK4denied *, flock64_t *, LOCKT4args *);
182 static pid_t	lo_to_pid(lock_owner4 *);
183 static void	nfs4_reinstitute_local_lock_state(vnode_t *, flock64_t *,
184 			cred_t *, nfs4_lock_owner_t *);
185 static void	push_reinstate(vnode_t *, int, flock64_t *, cred_t *,
186 			nfs4_lock_owner_t *);
187 static int	open_and_get_osp(vnode_t *, cred_t *, nfs4_open_stream_t **);
188 static void	nfs4_delmap_callback(struct as *, void *, uint_t);
189 static void	nfs4_free_delmapcall(nfs4_delmapcall_t *);
190 static nfs4_delmapcall_t	*nfs4_init_delmapcall();
191 static int	nfs4_find_and_delete_delmapcall(rnode4_t *, int *);
192 static int	nfs4_is_acl_mask_valid(uint_t, nfs4_acl_op_t);
193 static int	nfs4_create_getsecattr_return(vsecattr_t *, vsecattr_t *,
194 			uid_t, gid_t, int);
195 
196 /*
197  * Routines that implement the setting of v4 args for the misc. ops
198  */
199 static void	nfs4args_lock_free(nfs_argop4 *);
200 static void	nfs4args_lockt_free(nfs_argop4 *);
201 static void	nfs4args_setattr(nfs_argop4 *, vattr_t *, vsecattr_t *,
202 			int, rnode4_t *, cred_t *, bitmap4, int *,
203 			nfs4_stateid_types_t *);
204 static void	nfs4args_setattr_free(nfs_argop4 *);
205 static int	nfs4args_verify(nfs_argop4 *, vattr_t *, enum nfs_opnum4,
206 			bitmap4);
207 static void	nfs4args_verify_free(nfs_argop4 *);
208 static void	nfs4args_write(nfs_argop4 *, stable_how4, rnode4_t *, cred_t *,
209 			WRITE4args **, nfs4_stateid_types_t *);
210 
211 /*
212  * These are the vnode ops functions that implement the vnode interface to
213  * the networked file system.  See more comments below at nfs4_vnodeops.
214  */
215 static int	nfs4_open(vnode_t **, int, cred_t *, caller_context_t *);
216 static int	nfs4_close(vnode_t *, int, int, offset_t, cred_t *,
217 			caller_context_t *);
218 static int	nfs4_read(vnode_t *, struct uio *, int, cred_t *,
219 			caller_context_t *);
220 static int	nfs4_write(vnode_t *, struct uio *, int, cred_t *,
221 			caller_context_t *);
222 static int	nfs4_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *,
223 			caller_context_t *);
224 static int	nfs4_setattr(vnode_t *, struct vattr *, int, cred_t *,
225 			caller_context_t *);
226 static int	nfs4_access(vnode_t *, int, int, cred_t *, caller_context_t *);
227 static int	nfs4_readlink(vnode_t *, struct uio *, cred_t *,
228 			caller_context_t *);
229 static int	nfs4_fsync(vnode_t *, int, cred_t *, caller_context_t *);
230 static int	nfs4_create(vnode_t *, char *, struct vattr *, enum vcexcl,
231 			int, vnode_t **, cred_t *, int, caller_context_t *,
232 			vsecattr_t *);
233 static int	nfs4_remove(vnode_t *, char *, cred_t *, caller_context_t *,
234 			int);
235 static int	nfs4_link(vnode_t *, vnode_t *, char *, cred_t *,
236 			caller_context_t *, int);
237 static int	nfs4_rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
238 			caller_context_t *, int);
239 static int	nfs4_mkdir(vnode_t *, char *, struct vattr *, vnode_t **,
240 			cred_t *, caller_context_t *, int, vsecattr_t *);
241 static int	nfs4_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
242 			caller_context_t *, int);
243 static int	nfs4_symlink(vnode_t *, char *, struct vattr *, char *,
244 			cred_t *, caller_context_t *, int);
245 static int	nfs4_readdir(vnode_t *, struct uio *, cred_t *, int *,
246 			caller_context_t *, int);
247 static int	nfs4_seek(vnode_t *, offset_t, offset_t *, caller_context_t *);
248 static int	nfs4_getpage(vnode_t *, offset_t, size_t, uint_t *,
249 			page_t *[], size_t, struct seg *, caddr_t,
250 			enum seg_rw, cred_t *, caller_context_t *);
251 static int	nfs4_putpage(vnode_t *, offset_t, size_t, int, cred_t *,
252 			caller_context_t *);
253 static int	nfs4_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t,
254 			uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
255 static int	nfs4_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
256 			uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
257 static int	nfs4_cmp(vnode_t *, vnode_t *, caller_context_t *);
258 static int	nfs4_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
259 			struct flk_callback *, cred_t *, caller_context_t *);
260 static int	nfs4_space(vnode_t *, int, struct flock64 *, int, offset_t,
261 			cred_t *, caller_context_t *);
262 static int	nfs4_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
263 			uint_t, uint_t, uint_t, cred_t *, caller_context_t *);
264 static int	nfs4_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
265 			cred_t *, caller_context_t *);
266 static void	nfs4_dispose(vnode_t *, page_t *, int, int, cred_t *,
267 			caller_context_t *);
268 static int	nfs4_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
269 			caller_context_t *);
270 /*
271  * These vnode ops are required to be called from outside this source file,
272  * e.g. by ephemeral mount stub vnode ops, and so may not be declared
273  * as static.
274  */
275 int	nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *,
276 	    caller_context_t *);
277 void	nfs4_inactive(vnode_t *, cred_t *, caller_context_t *);
278 int	nfs4_lookup(vnode_t *, char *, vnode_t **,
279 	    struct pathname *, int, vnode_t *, cred_t *,
280 	    caller_context_t *, int *, pathname_t *);
281 int	nfs4_fid(vnode_t *, fid_t *, caller_context_t *);
282 int	nfs4_rwlock(vnode_t *, int, caller_context_t *);
283 void	nfs4_rwunlock(vnode_t *, int, caller_context_t *);
284 int	nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *);
285 int	nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *,
286 	    caller_context_t *);
287 int	nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
288 	    caller_context_t *);
289 int	nfs4_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
290 	    caller_context_t *);
291 
292 /*
293  * Used for nfs4_commit_vp() to indicate if we should
294  * wait on pending writes.
295  */
296 #define	NFS4_WRITE_NOWAIT	0
297 #define	NFS4_WRITE_WAIT		1
298 
299 #define	NFS4_BASE_WAIT_TIME 1	/* 1 second */
300 
301 /*
302  * Error flags used to pass information about certain special errors
303  * which need to be handled specially.
304  */
305 #define	NFS_EOF			-98
306 #define	NFS_VERF_MISMATCH	-97
307 
308 /*
309  * Flags used to differentiate between which operation drove the
310  * potential CLOSE OTW. (see nfs4_close_otw_if_necessary)
311  */
312 #define	NFS4_CLOSE_OP		0x1
313 #define	NFS4_DELMAP_OP		0x2
314 #define	NFS4_INACTIVE_OP	0x3
315 
/*
 * True when vnode type t is VBLK, VCHR or VFIFO.  The argument is
 * parenthesized so that expressions containing low-precedence operators
 * (e.g. a conditional expression) compare as a whole; note that t may
 * still be evaluated up to three times.
 */
#define	ISVDEV(t) (((t) == VBLK) || ((t) == VCHR) || ((t) == VFIFO))
317 
/*
 * ALIGN64 advances ptr to the next 64-bit (sizeof (uint64_t)) boundary and
 * reduces sz by the same amount.  x is scratch storage: afterwards it holds
 * the adjustment that was applied (0 when ptr was already aligned).
 * The adjustment is added to ptr with ordinary pointer arithmetic, so this
 * assumes ptr is a byte-sized pointer type -- TODO confirm at call sites.
 *
 * NOTE: this expands to several statements (an assignment followed by an
 * if block), not a single statement, so it must not be used as the
 * unbraced body of an if/else or loop.
 */
#define	ALIGN64(x, ptr, sz)						\
	x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1);		\
	if (x) {							\
		x = sizeof (uint64_t) - (x);				\
		sz -= (x);						\
		ptr += (x);						\
	}
326 
327 #ifdef DEBUG
328 int nfs4_client_attr_debug = 0;
329 int nfs4_client_state_debug = 0;
330 int nfs4_client_shadow_debug = 0;
331 int nfs4_client_lock_debug = 0;
332 int nfs4_seqid_sync = 0;
333 int nfs4_client_map_debug = 0;
334 static int nfs4_pageio_debug = 0;
335 int nfs4_client_inactive_debug = 0;
336 int nfs4_client_recov_debug = 0;
337 int nfs4_client_failover_debug = 0;
338 int nfs4_client_call_debug = 0;
339 int nfs4_client_lookup_debug = 0;
340 int nfs4_client_zone_debug = 0;
341 int nfs4_lost_rqst_debug = 0;
342 int nfs4_rdattrerr_debug = 0;
343 int nfs4_open_stream_debug = 0;
344 
345 int nfs4read_error_inject;
346 
347 static int nfs4_create_misses = 0;
348 
349 static int nfs4_readdir_cache_shorts = 0;
350 static int nfs4_readdir_readahead = 0;
351 
352 static int nfs4_bio_do_stop = 0;
353 
354 static int nfs4_lostpage = 0;	/* number of times we lost original page */
355 
356 int nfs4_mmap_debug = 0;
357 
358 static int nfs4_pathconf_cache_hits = 0;
359 static int nfs4_pathconf_cache_misses = 0;
360 
361 int nfs4close_all_cnt;
362 int nfs4close_one_debug = 0;
363 int nfs4close_notw_debug = 0;
364 
365 int denied_to_flk_debug = 0;
366 void *lockt_denied_debug;
367 
368 #endif
369 
370 /*
371  * How long to wait before trying again if OPEN_CONFIRM gets ETIMEDOUT
372  * or NFS4ERR_RESOURCE.
373  */
374 static int confirm_retry_sec = 30;
375 
376 static int nfs4_lookup_neg_cache = 1;
377 
378 /*
379  * number of pages to read ahead
380  * optimized for 100 base-T.
381  */
382 static int nfs4_nra = 4;
383 
384 static int nfs4_do_symlink_cache = 1;
385 
386 static int nfs4_pathconf_disable_cache = 0;
387 
388 /*
389  * These are the vnode ops routines which implement the vnode interface to
390  * the networked file system.  These routines just take their parameters,
391  * make them look networkish by putting the right info into interface structs,
392  * and then calling the appropriate remote routine(s) to do the work.
393  *
394  * Note on directory name lookup cacheing:  If we detect a stale fhandle,
395  * we purge the directory cache relative to that vnode.  This way, the
396  * user won't get burned by the cache repeatedly.  See <nfs/rnode4.h> for
397  * more details on rnode locking.
398  */
399 
400 struct vnodeops *nfs4_vnodeops;
401 
/*
 * Table pairing each vnode operation name (VOPNAME_*) with the routine
 * that implements it for NFSv4.  Most entries point at nfs4_* routines
 * declared in this file; the exceptions are VOPNAME_DUMP, which uses
 * nfs_dump, and VOPNAME_VNEVENT, which uses fs_vnevent_support.  The table
 * ends with a NULL/NULL pair.
 *
 * NOTE(review): presumably nfs4_vnodeops is built from this template
 * elsewhere -- confirm against the module initialization code.
 */
const fs_operation_def_t nfs4_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = nfs4_open },
	VOPNAME_CLOSE,		{ .vop_close = nfs4_close },
	VOPNAME_READ,		{ .vop_read = nfs4_read },
	VOPNAME_WRITE,		{ .vop_write = nfs4_write },
	VOPNAME_IOCTL,		{ .vop_ioctl = nfs4_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = nfs4_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = nfs4_setattr },
	VOPNAME_ACCESS,		{ .vop_access = nfs4_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = nfs4_lookup },
	VOPNAME_CREATE,		{ .vop_create = nfs4_create },
	VOPNAME_REMOVE,		{ .vop_remove = nfs4_remove },
	VOPNAME_LINK,		{ .vop_link = nfs4_link },
	VOPNAME_RENAME,		{ .vop_rename = nfs4_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = nfs4_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = nfs4_rmdir },
	VOPNAME_READDIR,	{ .vop_readdir = nfs4_readdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = nfs4_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = nfs4_readlink },
	VOPNAME_FSYNC,		{ .vop_fsync = nfs4_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = nfs4_inactive },
	VOPNAME_FID,		{ .vop_fid = nfs4_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs4_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs4_rwunlock },
	VOPNAME_SEEK,		{ .vop_seek = nfs4_seek },
	VOPNAME_FRLOCK,		{ .vop_frlock = nfs4_frlock },
	VOPNAME_SPACE,		{ .vop_space = nfs4_space },
	VOPNAME_REALVP,		{ .vop_realvp = nfs4_realvp },
	VOPNAME_GETPAGE,	{ .vop_getpage = nfs4_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = nfs4_putpage },
	VOPNAME_MAP,		{ .vop_map = nfs4_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = nfs4_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = nfs4_delmap },
	/* no separate nfs4_dump */
	VOPNAME_DUMP,		{ .vop_dump = nfs_dump },
	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs4_pathconf },
	VOPNAME_PAGEIO,		{ .vop_pageio = nfs4_pageio },
	VOPNAME_DISPOSE,	{ .vop_dispose = nfs4_dispose },
	VOPNAME_SETSECATTR,	{ .vop_setsecattr = nfs4_setsecattr },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs4_getsecattr },
	VOPNAME_SHRLOCK,	{ .vop_shrlock = nfs4_shrlock },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	NULL,			NULL
};
446 
447 /*
448  * The following are subroutines and definitions to set args or get res
449  * for the different nfsv4 ops
450  */
451 
452 void
nfs4args_lookup_free(nfs_argop4 * argop,int arglen)453 nfs4args_lookup_free(nfs_argop4 *argop, int arglen)
454 {
455 	int		i;
456 
457 	for (i = 0; i < arglen; i++) {
458 		if (argop[i].argop == OP_LOOKUP) {
459 			kmem_free(
460 			    argop[i].nfs_argop4_u.oplookup.
461 			    objname.utf8string_val,
462 			    argop[i].nfs_argop4_u.oplookup.
463 			    objname.utf8string_len);
464 		}
465 	}
466 }
467 
468 static void
nfs4args_lock_free(nfs_argop4 * argop)469 nfs4args_lock_free(nfs_argop4 *argop)
470 {
471 	locker4 *locker = &argop->nfs_argop4_u.oplock.locker;
472 
473 	if (locker->new_lock_owner == TRUE) {
474 		open_to_lock_owner4 *open_owner;
475 
476 		open_owner = &locker->locker4_u.open_owner;
477 		if (open_owner->lock_owner.owner_val != NULL) {
478 			kmem_free(open_owner->lock_owner.owner_val,
479 			    open_owner->lock_owner.owner_len);
480 		}
481 	}
482 }
483 
484 static void
nfs4args_lockt_free(nfs_argop4 * argop)485 nfs4args_lockt_free(nfs_argop4 *argop)
486 {
487 	lock_owner4 *lowner = &argop->nfs_argop4_u.oplockt.owner;
488 
489 	if (lowner->owner_val != NULL) {
490 		kmem_free(lowner->owner_val, lowner->owner_len);
491 	}
492 }
493 
494 static void
nfs4args_setattr(nfs_argop4 * argop,vattr_t * vap,vsecattr_t * vsap,int flags,rnode4_t * rp,cred_t * cr,bitmap4 supp,int * error,nfs4_stateid_types_t * sid_types)495 nfs4args_setattr(nfs_argop4 *argop, vattr_t *vap, vsecattr_t *vsap, int flags,
496     rnode4_t *rp, cred_t *cr, bitmap4 supp, int *error,
497     nfs4_stateid_types_t *sid_types)
498 {
499 	fattr4		*attr = &argop->nfs_argop4_u.opsetattr.obj_attributes;
500 	mntinfo4_t	*mi;
501 
502 	argop->argop = OP_SETATTR;
503 	/*
504 	 * The stateid is set to 0 if client is not modifying the size
505 	 * and otherwise to whatever nfs4_get_stateid() returns.
506 	 *
507 	 * XXX Note: nfs4_get_stateid() returns 0 if no lockowner and/or no
508 	 * state struct could be found for the process/file pair.  We may
509 	 * want to change this in the future (by OPENing the file).  See
510 	 * bug # 4474852.
511 	 */
512 	if (vap->va_mask & AT_SIZE) {
513 
514 		ASSERT(rp != NULL);
515 		mi = VTOMI4(RTOV4(rp));
516 
517 		argop->nfs_argop4_u.opsetattr.stateid =
518 		    nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi,
519 		    OP_SETATTR, sid_types, FALSE);
520 	} else {
521 		bzero(&argop->nfs_argop4_u.opsetattr.stateid,
522 		    sizeof (stateid4));
523 	}
524 
525 	*error = vattr_to_fattr4(vap, vsap, attr, flags, OP_SETATTR, supp);
526 	if (*error)
527 		bzero(attr, sizeof (*attr));
528 }
529 
/*
 * Release the attribute data of a SETATTR operation by handing its
 * obj_attributes to nfs4_fattr4_free().
 */
static void
nfs4args_setattr_free(nfs_argop4 *argop)
{
	nfs4_fattr4_free(&argop->nfs_argop4_u.opsetattr.obj_attributes);
}
535 
536 static int
nfs4args_verify(nfs_argop4 * argop,vattr_t * vap,enum nfs_opnum4 op,bitmap4 supp)537 nfs4args_verify(nfs_argop4 *argop, vattr_t *vap, enum nfs_opnum4 op,
538     bitmap4 supp)
539 {
540 	fattr4 *attr;
541 	int error = 0;
542 
543 	argop->argop = op;
544 	switch (op) {
545 	case OP_VERIFY:
546 		attr = &argop->nfs_argop4_u.opverify.obj_attributes;
547 		break;
548 	case OP_NVERIFY:
549 		attr = &argop->nfs_argop4_u.opnverify.obj_attributes;
550 		break;
551 	default:
552 		return (EINVAL);
553 	}
554 	if (!error)
555 		error = vattr_to_fattr4(vap, NULL, attr, 0, op, supp);
556 	if (error)
557 		bzero(attr, sizeof (*attr));
558 	return (error);
559 }
560 
561 static void
nfs4args_verify_free(nfs_argop4 * argop)562 nfs4args_verify_free(nfs_argop4 *argop)
563 {
564 	switch (argop->argop) {
565 	case OP_VERIFY:
566 		nfs4_fattr4_free(&argop->nfs_argop4_u.opverify.obj_attributes);
567 		break;
568 	case OP_NVERIFY:
569 		nfs4_fattr4_free(&argop->nfs_argop4_u.opnverify.obj_attributes);
570 		break;
571 	default:
572 		break;
573 	}
574 }
575 
576 static void
nfs4args_write(nfs_argop4 * argop,stable_how4 stable,rnode4_t * rp,cred_t * cr,WRITE4args ** wargs_pp,nfs4_stateid_types_t * sid_tp)577 nfs4args_write(nfs_argop4 *argop, stable_how4 stable, rnode4_t *rp, cred_t *cr,
578     WRITE4args **wargs_pp, nfs4_stateid_types_t *sid_tp)
579 {
580 	WRITE4args *wargs = &argop->nfs_argop4_u.opwrite;
581 	mntinfo4_t *mi = VTOMI4(RTOV4(rp));
582 
583 	argop->argop = OP_WRITE;
584 	wargs->stable = stable;
585 	wargs->stateid = nfs4_get_w_stateid(cr, rp, curproc->p_pidp->pid_id,
586 	    mi, OP_WRITE, sid_tp);
587 	wargs->mblk = NULL;
588 	*wargs_pp = wargs;
589 }
590 
591 void
nfs4args_copen_free(OPEN4cargs * open_args)592 nfs4args_copen_free(OPEN4cargs *open_args)
593 {
594 	if (open_args->owner.owner_val) {
595 		kmem_free(open_args->owner.owner_val,
596 		    open_args->owner.owner_len);
597 	}
598 	if ((open_args->opentype == OPEN4_CREATE) &&
599 	    (open_args->mode != EXCLUSIVE4)) {
600 		nfs4_fattr4_free(&open_args->createhow4_u.createattrs);
601 	}
602 }
603 
/*
 * Return the current value of the nfs4_vnodeops global.
 *
 * XXX:  This is referenced in modstubs.s
 */
struct vnodeops *
nfs4_getvnodeops(void)
{
	return (nfs4_vnodeops);
}
612 
613 /*
614  * The OPEN operation opens a regular file.
615  */
616 /*ARGSUSED3*/
617 static int
nfs4_open(vnode_t ** vpp,int flag,cred_t * cr,caller_context_t * ct)618 nfs4_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
619 {
620 	vnode_t *dvp = NULL;
621 	rnode4_t *rp, *drp;
622 	int error;
623 	int just_been_created;
624 	char fn[MAXNAMELEN];
625 
626 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4_open: "));
627 	if (nfs_zone() != VTOMI4(*vpp)->mi_zone)
628 		return (EIO);
629 	rp = VTOR4(*vpp);
630 
631 	/*
632 	 * Check to see if opening something besides a regular file;
633 	 * if so skip the OTW call
634 	 */
635 	if ((*vpp)->v_type != VREG) {
636 		error = nfs4_open_non_reg_file(vpp, flag, cr);
637 		return (error);
638 	}
639 
640 	/*
641 	 * XXX - would like a check right here to know if the file is
642 	 * executable or not, so as to skip OTW
643 	 */
644 
645 	if ((error = vtodv(*vpp, &dvp, cr, TRUE)) != 0)
646 		return (error);
647 
648 	drp = VTOR4(dvp);
649 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
650 		return (EINTR);
651 
652 	if ((error = vtoname(*vpp, fn, MAXNAMELEN)) != 0) {
653 		nfs_rw_exit(&drp->r_rwlock);
654 		return (error);
655 	}
656 
657 	/*
658 	 * See if this file has just been CREATEd.
659 	 * If so, clear the flag and update the dnlc, which was previously
660 	 * skipped in nfs4_create.
661 	 * XXX need better serilization on this.
662 	 * XXX move this into the nf4open_otw call, after we have
663 	 * XXX acquired the open owner seqid sync.
664 	 */
665 	mutex_enter(&rp->r_statev4_lock);
666 	if (rp->created_v4) {
667 		rp->created_v4 = 0;
668 		mutex_exit(&rp->r_statev4_lock);
669 
670 		dnlc_update(dvp, fn, *vpp);
671 		/* This is needed so we don't bump the open ref count */
672 		just_been_created = 1;
673 	} else {
674 		mutex_exit(&rp->r_statev4_lock);
675 		just_been_created = 0;
676 	}
677 
678 	/*
679 	 * If caller specified O_TRUNC/FTRUNC, then be sure to set
680 	 * FWRITE (to drive successful setattr(size=0) after open)
681 	 */
682 	if (flag & FTRUNC)
683 		flag |= FWRITE;
684 
685 	error = nfs4open_otw(dvp, fn, NULL, vpp, cr, 0, flag, 0,
686 	    just_been_created);
687 
688 	if (!error && !((*vpp)->v_flag & VROOT))
689 		dnlc_update(dvp, fn, *vpp);
690 
691 	nfs_rw_exit(&drp->r_rwlock);
692 
693 	/* release the hold from vtodv */
694 	VN_RELE(dvp);
695 
696 	/* exchange the shadow for the master vnode, if needed */
697 
698 	if (error == 0 && IS_SHADOW(*vpp, rp))
699 		sv_exchange(vpp);
700 
701 	return (error);
702 }
703 
704 /*
705  * See if there's a "lost open" request to be saved and recovered.
706  */
707 static void
nfs4open_save_lost_rqst(int error,nfs4_lost_rqst_t * lost_rqstp,nfs4_open_owner_t * oop,cred_t * cr,vnode_t * vp,vnode_t * dvp,OPEN4cargs * open_args)708 nfs4open_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
709     nfs4_open_owner_t *oop, cred_t *cr, vnode_t *vp,
710     vnode_t *dvp, OPEN4cargs *open_args)
711 {
712 	vfs_t *vfsp;
713 	char *srccfp;
714 
715 	vfsp = (dvp ? dvp->v_vfsp : vp->v_vfsp);
716 
717 	if (error != ETIMEDOUT && error != EINTR &&
718 	    !NFS4_FRC_UNMT_ERR(error, vfsp)) {
719 		lost_rqstp->lr_op = 0;
720 		return;
721 	}
722 
723 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
724 	    "nfs4open_save_lost_rqst: error %d", error));
725 
726 	lost_rqstp->lr_op = OP_OPEN;
727 
728 	/*
729 	 * The vp (if it is not NULL) and dvp are held and rele'd via
730 	 * the recovery code.  See nfs4_save_lost_rqst.
731 	 */
732 	lost_rqstp->lr_vp = vp;
733 	lost_rqstp->lr_dvp = dvp;
734 	lost_rqstp->lr_oop = oop;
735 	lost_rqstp->lr_osp = NULL;
736 	lost_rqstp->lr_lop = NULL;
737 	lost_rqstp->lr_cr = cr;
738 	lost_rqstp->lr_flk = NULL;
739 	lost_rqstp->lr_oacc = open_args->share_access;
740 	lost_rqstp->lr_odeny = open_args->share_deny;
741 	lost_rqstp->lr_oclaim = open_args->claim;
742 	if (open_args->claim == CLAIM_DELEGATE_CUR) {
743 		lost_rqstp->lr_ostateid =
744 		    open_args->open_claim4_u.delegate_cur_info.delegate_stateid;
745 		srccfp = open_args->open_claim4_u.delegate_cur_info.cfile;
746 	} else {
747 		srccfp = open_args->open_claim4_u.cfile;
748 	}
749 	lost_rqstp->lr_ofile.utf8string_len = 0;
750 	lost_rqstp->lr_ofile.utf8string_val = NULL;
751 	(void) str_to_utf8(srccfp, &lost_rqstp->lr_ofile);
752 	lost_rqstp->lr_putfirst = FALSE;
753 }
754 
/*
 * Verifier built by nfs4open_otw() for EXCLUSIVE4 creates.  It is filled
 * either from the zone hostid plus newnum(), or, when the hostid is 0,
 * from the current time (tv_sec/tv_nsec).
 */
struct nfs4_excl_time {
	uint32 seconds;
	uint32 nseconds;
};
759 
760 /*
761  * The OPEN operation creates and/or opens a regular file
762  *
763  * ARGSUSED
764  */
765 static int
nfs4open_otw(vnode_t * dvp,char * file_name,struct vattr * in_va,vnode_t ** vpp,cred_t * cr,int create_flag,int open_flag,enum createmode4 createmode,int file_just_been_created)766 nfs4open_otw(vnode_t *dvp, char *file_name, struct vattr *in_va,
767     vnode_t **vpp, cred_t *cr, int create_flag, int open_flag,
768     enum createmode4 createmode, int file_just_been_created)
769 {
770 	rnode4_t *rp;
771 	rnode4_t *drp = VTOR4(dvp);
772 	vnode_t *vp = NULL;
773 	vnode_t *vpi = *vpp;
774 	bool_t needrecov = FALSE;
775 
776 	int doqueue = 1;
777 
778 	COMPOUND4args_clnt args;
779 	COMPOUND4res_clnt res;
780 	nfs_argop4 *argop;
781 	nfs_resop4 *resop;
782 	int argoplist_size;
783 	int idx_open, idx_fattr;
784 
785 	GETFH4res *gf_res = NULL;
786 	OPEN4res *op_res = NULL;
787 	nfs4_ga_res_t *garp;
788 	fattr4 *attr = NULL;
789 	struct nfs4_excl_time verf;
790 	bool_t did_excl_setup = FALSE;
791 	int created_osp;
792 
793 	OPEN4cargs *open_args;
794 	nfs4_open_owner_t	*oop = NULL;
795 	nfs4_open_stream_t	*osp = NULL;
796 	seqid4 seqid = 0;
797 	bool_t retry_open = FALSE;
798 	nfs4_recov_state_t recov_state;
799 	nfs4_lost_rqst_t lost_rqst;
800 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
801 	hrtime_t t;
802 	int acc = 0;
803 	cred_t *cred_otw = NULL;	/* cred used to do the RPC call */
804 	cred_t *ncr = NULL;
805 
806 	nfs4_sharedfh_t *otw_sfh;
807 	nfs4_sharedfh_t *orig_sfh;
808 	int fh_differs = 0;
809 	int numops, setgid_flag;
810 	int num_bseqid_retry = NFS4_NUM_RETRY_BAD_SEQID + 1;
811 
812 	/*
813 	 * Make sure we properly deal with setting the right gid on
814 	 * a newly created file to reflect the parent's setgid bit
815 	 */
816 	setgid_flag = 0;
817 	if (create_flag && in_va) {
818 
819 		/*
820 		 * If there is grpid mount flag used or
821 		 * the parent's directory has the setgid bit set
822 		 * _and_ the client was able to get a valid mapping
823 		 * for the parent dir's owner_group, we want to
824 		 * append NVERIFY(owner_group == dva.va_gid) and
825 		 * SETATTR to the CREATE compound.
826 		 */
827 		mutex_enter(&drp->r_statelock);
828 		if ((VTOMI4(dvp)->mi_flags & MI4_GRPID ||
829 		    drp->r_attr.va_mode & VSGID) &&
830 		    drp->r_attr.va_gid != GID_NOBODY) {
831 			in_va->va_mask |= AT_GID;
832 			in_va->va_gid = drp->r_attr.va_gid;
833 			setgid_flag = 1;
834 		}
835 		mutex_exit(&drp->r_statelock);
836 	}
837 
838 	/*
839 	 * Normal/non-create compound:
840 	 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new)
841 	 *
842 	 * Open(create) compound no setgid:
843 	 * PUTFH(dfh) + SAVEFH + OPEN(create) + GETFH + GETATTR(new) +
844 	 * RESTOREFH + GETATTR
845 	 *
846 	 * Open(create) setgid:
847 	 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) +
848 	 * SAVEFH + PUTFH(dfh) + GETATTR(dvp) + RESTOREFH +
849 	 * NVERIFY(grp) + SETATTR
850 	 */
851 	if (setgid_flag) {
852 		numops = 10;
853 		idx_open = 1;
854 		idx_fattr = 3;
855 	} else if (create_flag) {
856 		numops = 7;
857 		idx_open = 2;
858 		idx_fattr = 4;
859 	} else {
860 		numops = 4;
861 		idx_open = 1;
862 		idx_fattr = 3;
863 	}
864 
865 	args.array_len = numops;
866 	argoplist_size = numops * sizeof (nfs_argop4);
867 	argop = kmem_alloc(argoplist_size, KM_SLEEP);
868 
869 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw: "
870 	    "open %s open flag 0x%x cred %p", file_name, open_flag,
871 	    (void *)cr));
872 
873 	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
874 	if (create_flag) {
875 		/*
876 		 * We are to create a file.  Initialize the passed in vnode
877 		 * pointer.
878 		 */
879 		vpi = NULL;
880 	} else {
881 		/*
882 		 * Check to see if the client owns a read delegation and is
883 		 * trying to open for write.  If so, then return the delegation
884 		 * to avoid the server doing a cb_recall and returning DELAY.
885 		 * NB - we don't use the statev4_lock here because we'd have
886 		 * to drop the lock anyway and the result would be stale.
887 		 */
888 		if ((open_flag & FWRITE) &&
889 		    VTOR4(vpi)->r_deleg_type == OPEN_DELEGATE_READ)
890 			(void) nfs4delegreturn(VTOR4(vpi), NFS4_DR_REOPEN);
891 
		/*
		 * If the file has a delegation, then do an access check up
		 * front.  This avoids having to do an access check later after
		 * we've already done start_op, which could deadlock.
		 */
897 		if (VTOR4(vpi)->r_deleg_type != OPEN_DELEGATE_NONE) {
898 			if (open_flag & FREAD &&
899 			    nfs4_access(vpi, VREAD, 0, cr, NULL) == 0)
900 				acc |= VREAD;
901 			if (open_flag & FWRITE &&
902 			    nfs4_access(vpi, VWRITE, 0, cr, NULL) == 0)
903 				acc |= VWRITE;
904 		}
905 	}
906 
907 	drp = VTOR4(dvp);
908 
909 	recov_state.rs_flags = 0;
910 	recov_state.rs_num_retry_despite_err = 0;
911 	cred_otw = cr;
912 
913 recov_retry:
914 	fh_differs = 0;
915 	nfs4_error_zinit(&e);
916 
917 	e.error = nfs4_start_op(VTOMI4(dvp), dvp, vpi, &recov_state);
918 	if (e.error) {
919 		if (ncr != NULL)
920 			crfree(ncr);
921 		kmem_free(argop, argoplist_size);
922 		return (e.error);
923 	}
924 
925 	args.ctag = TAG_OPEN;
926 	args.array_len = numops;
927 	args.array = argop;
928 
929 	/* putfh directory fh */
930 	argop[0].argop = OP_CPUTFH;
931 	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
932 
933 	/* OPEN: either op 1 or op 2 depending upon create/setgid flags */
934 	argop[idx_open].argop = OP_COPEN;
935 	open_args = &argop[idx_open].nfs_argop4_u.opcopen;
936 	open_args->claim = CLAIM_NULL;
937 
938 	/* name of file */
939 	open_args->open_claim4_u.cfile = file_name;
940 	open_args->owner.owner_len = 0;
941 	open_args->owner.owner_val = NULL;
942 
943 	if (create_flag) {
944 		/* CREATE a file */
945 		open_args->opentype = OPEN4_CREATE;
946 		open_args->mode = createmode;
947 		if (createmode == EXCLUSIVE4) {
948 			if (did_excl_setup == FALSE) {
949 				verf.seconds = zone_get_hostid(NULL);
950 				if (verf.seconds != 0)
951 					verf.nseconds = newnum();
952 				else {
953 					timestruc_t now;
954 
955 					gethrestime(&now);
956 					verf.seconds = now.tv_sec;
957 					verf.nseconds = now.tv_nsec;
958 				}
959 				/*
960 				 * Since the server will use this value for the
961 				 * mtime, make sure that it can't overflow. Zero
962 				 * out the MSB. The actual value does not matter
963 				 * here, only its uniqueness.
964 				 */
965 				verf.seconds &= INT32_MAX;
966 				did_excl_setup = TRUE;
967 			}
968 
969 			/* Now copy over verifier to OPEN4args. */
970 			open_args->createhow4_u.createverf = *(uint64_t *)&verf;
971 		} else {
972 			int v_error;
973 			bitmap4 supp_attrs;
974 			servinfo4_t *svp;
975 
976 			attr = &open_args->createhow4_u.createattrs;
977 
978 			svp = drp->r_server;
979 			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
980 			supp_attrs = svp->sv_supp_attrs;
981 			nfs_rw_exit(&svp->sv_lock);
982 
983 			/* GUARDED4 or UNCHECKED4 */
984 			v_error = vattr_to_fattr4(in_va, NULL, attr, 0, OP_OPEN,
985 			    supp_attrs);
986 			if (v_error) {
987 				bzero(attr, sizeof (*attr));
988 				nfs4args_copen_free(open_args);
989 				nfs4_end_op(VTOMI4(dvp), dvp, vpi,
990 				    &recov_state, FALSE);
991 				if (ncr != NULL)
992 					crfree(ncr);
993 				kmem_free(argop, argoplist_size);
994 				return (v_error);
995 			}
996 		}
997 	} else {
998 		/* NO CREATE */
999 		open_args->opentype = OPEN4_NOCREATE;
1000 	}
1001 
1002 	if (recov_state.rs_sp != NULL) {
1003 		mutex_enter(&recov_state.rs_sp->s_lock);
1004 		open_args->owner.clientid = recov_state.rs_sp->clientid;
1005 		mutex_exit(&recov_state.rs_sp->s_lock);
1006 	} else {
1007 		/* XXX should we just fail here? */
1008 		open_args->owner.clientid = 0;
1009 	}
1010 
1011 	/*
1012 	 * This increments oop's ref count or creates a temporary 'just_created'
1013 	 * open owner that will become valid when this OPEN/OPEN_CONFIRM call
1014 	 * completes.
1015 	 */
1016 	mutex_enter(&VTOMI4(dvp)->mi_lock);
1017 
1018 	/* See if a permanent or just created open owner exists */
1019 	oop = find_open_owner_nolock(cr, NFS4_JUST_CREATED, VTOMI4(dvp));
1020 	if (!oop) {
1021 		/*
1022 		 * This open owner does not exist so create a temporary
1023 		 * just created one.
1024 		 */
1025 		oop = create_open_owner(cr, VTOMI4(dvp));
1026 		ASSERT(oop != NULL);
1027 	}
1028 	mutex_exit(&VTOMI4(dvp)->mi_lock);
1029 
1030 	/* this length never changes, do alloc before seqid sync */
1031 	open_args->owner.owner_len = sizeof (oop->oo_name);
1032 	open_args->owner.owner_val =
1033 	    kmem_alloc(open_args->owner.owner_len, KM_SLEEP);
1034 
1035 	e.error = nfs4_start_open_seqid_sync(oop, VTOMI4(dvp));
1036 	if (e.error == EAGAIN) {
1037 		open_owner_rele(oop);
1038 		nfs4args_copen_free(open_args);
1039 		nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE);
1040 		if (ncr != NULL) {
1041 			crfree(ncr);
1042 			ncr = NULL;
1043 		}
1044 		goto recov_retry;
1045 	}
1046 
1047 	/* Check to see if we need to do the OTW call */
1048 	if (!create_flag) {
1049 		if (!nfs4_is_otw_open_necessary(oop, open_flag, vpi,
1050 		    file_just_been_created, &e.error, acc, &recov_state)) {
1051 
1052 			/*
1053 			 * The OTW open is not necessary.  Either
1054 			 * the open can succeed without it (eg.
1055 			 * delegation, error == 0) or the open
1056 			 * must fail due to an access failure
1057 			 * (error != 0).  In either case, tidy
1058 			 * up and return.
1059 			 */
1060 
1061 			nfs4_end_open_seqid_sync(oop);
1062 			open_owner_rele(oop);
1063 			nfs4args_copen_free(open_args);
1064 			nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, FALSE);
1065 			if (ncr != NULL)
1066 				crfree(ncr);
1067 			kmem_free(argop, argoplist_size);
1068 			return (e.error);
1069 		}
1070 	}
1071 
1072 	bcopy(&oop->oo_name, open_args->owner.owner_val,
1073 	    open_args->owner.owner_len);
1074 
1075 	seqid = nfs4_get_open_seqid(oop) + 1;
1076 	open_args->seqid = seqid;
1077 	open_args->share_access = 0;
1078 	if (open_flag & FREAD)
1079 		open_args->share_access |= OPEN4_SHARE_ACCESS_READ;
1080 	if (open_flag & FWRITE)
1081 		open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE;
1082 	open_args->share_deny = OPEN4_SHARE_DENY_NONE;
1083 
1084 
1085 
1086 	/*
1087 	 * getfh w/sanity check for idx_open/idx_fattr
1088 	 */
1089 	ASSERT((idx_open + 1) == (idx_fattr - 1));
1090 	argop[idx_open + 1].argop = OP_GETFH;
1091 
1092 	/* getattr */
1093 	argop[idx_fattr].argop = OP_GETATTR;
1094 	argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1095 	argop[idx_fattr].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1096 
1097 	if (setgid_flag) {
1098 		vattr_t	_v;
1099 		servinfo4_t *svp;
1100 		bitmap4	supp_attrs;
1101 
1102 		svp = drp->r_server;
1103 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1104 		supp_attrs = svp->sv_supp_attrs;
1105 		nfs_rw_exit(&svp->sv_lock);
1106 
1107 		/*
1108 		 * For setgid case, we need to:
1109 		 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new)
1110 		 */
1111 		argop[4].argop = OP_SAVEFH;
1112 
1113 		argop[5].argop = OP_CPUTFH;
1114 		argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
1115 
1116 		argop[6].argop = OP_GETATTR;
1117 		argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1118 		argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1119 
1120 		argop[7].argop = OP_RESTOREFH;
1121 
1122 		/*
1123 		 * nverify
1124 		 */
1125 		_v.va_mask = AT_GID;
1126 		_v.va_gid = in_va->va_gid;
1127 		if (!(e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY,
1128 		    supp_attrs))) {
1129 
1130 			/*
1131 			 * setattr
1132 			 *
1133 			 * We _know_ we're not messing with AT_SIZE or
1134 			 * AT_XTIME, so no need for stateid or flags.
1135 			 * Also we specify NULL rp since we're only
1136 			 * interested in setting owner_group attributes.
1137 			 */
1138 			nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr,
1139 			    supp_attrs, &e.error, 0);
1140 			if (e.error)
1141 				nfs4args_verify_free(&argop[8]);
1142 		}
1143 
1144 		if (e.error) {
1145 			/*
1146 			 * XXX - Revisit the last argument to nfs4_end_op()
1147 			 *	 once 5020486 is fixed.
1148 			 */
1149 			nfs4_end_open_seqid_sync(oop);
1150 			open_owner_rele(oop);
1151 			nfs4args_copen_free(open_args);
1152 			nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE);
1153 			if (ncr != NULL)
1154 				crfree(ncr);
1155 			kmem_free(argop, argoplist_size);
1156 			return (e.error);
1157 		}
1158 	} else if (create_flag) {
1159 		argop[1].argop = OP_SAVEFH;
1160 
1161 		argop[5].argop = OP_RESTOREFH;
1162 
1163 		argop[6].argop = OP_GETATTR;
1164 		argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1165 		argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1166 	}
1167 
1168 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1169 	    "nfs4open_otw: %s call, nm %s, rp %s",
1170 	    needrecov ? "recov" : "first", file_name,
1171 	    rnode4info(VTOR4(dvp))));
1172 
1173 	t = gethrtime();
1174 
1175 	rfs4call(VTOMI4(dvp), &args, &res, cred_otw, &doqueue, 0, &e);
1176 
1177 	if (!e.error && nfs4_need_to_bump_seqid(&res))
1178 		nfs4_set_open_seqid(seqid, oop, args.ctag);
1179 
1180 	needrecov = nfs4_needs_recovery(&e, TRUE, dvp->v_vfsp);
1181 
1182 	if (e.error || needrecov) {
1183 		bool_t abort = FALSE;
1184 
1185 		if (needrecov) {
1186 			nfs4_bseqid_entry_t *bsep = NULL;
1187 
1188 			nfs4open_save_lost_rqst(e.error, &lost_rqst, oop,
1189 			    cred_otw, vpi, dvp, open_args);
1190 
1191 			if (!e.error && res.status == NFS4ERR_BAD_SEQID) {
1192 				bsep = nfs4_create_bseqid_entry(oop, NULL,
1193 				    vpi, 0, args.ctag, open_args->seqid);
1194 				num_bseqid_retry--;
1195 			}
1196 
1197 			abort = nfs4_start_recovery(&e, VTOMI4(dvp), dvp, vpi,
1198 			    NULL, lost_rqst.lr_op == OP_OPEN ?
1199 			    &lost_rqst : NULL, OP_OPEN, bsep, NULL, NULL);
1200 
1201 			if (bsep)
1202 				kmem_free(bsep, sizeof (*bsep));
1203 			/* give up if we keep getting BAD_SEQID */
1204 			if (num_bseqid_retry == 0)
1205 				abort = TRUE;
1206 			if (abort == TRUE && e.error == 0)
1207 				e.error = geterrno4(res.status);
1208 		}
1209 		nfs4_end_open_seqid_sync(oop);
1210 		open_owner_rele(oop);
1211 		nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1212 		nfs4args_copen_free(open_args);
1213 		if (setgid_flag) {
1214 			nfs4args_verify_free(&argop[8]);
1215 			nfs4args_setattr_free(&argop[9]);
1216 		}
1217 		if (!e.error)
1218 			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1219 		if (ncr != NULL) {
1220 			crfree(ncr);
1221 			ncr = NULL;
1222 		}
1223 		if (!needrecov || abort == TRUE || e.error == EINTR ||
1224 		    NFS4_FRC_UNMT_ERR(e.error, dvp->v_vfsp)) {
1225 			kmem_free(argop, argoplist_size);
1226 			return (e.error);
1227 		}
1228 		goto recov_retry;
1229 	}
1230 
1231 	/*
1232 	 * Will check and update lease after checking the rflag for
1233 	 * OPEN_CONFIRM in the successful OPEN call.
1234 	 */
1235 	if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) {
1236 
1237 		/*
1238 		 * XXX what if we're crossing mount points from server1:/drp
1239 		 * to server2:/drp/rp.
1240 		 */
1241 
1242 		/* Signal our end of use of the open seqid */
1243 		nfs4_end_open_seqid_sync(oop);
1244 
1245 		/*
1246 		 * This will destroy the open owner if it was just created,
1247 		 * and no one else has put a reference on it.
1248 		 */
1249 		open_owner_rele(oop);
1250 		if (create_flag && (createmode != EXCLUSIVE4) &&
1251 		    res.status == NFS4ERR_BADOWNER)
1252 			nfs4_log_badowner(VTOMI4(dvp), OP_OPEN);
1253 
1254 		e.error = geterrno4(res.status);
1255 		nfs4args_copen_free(open_args);
1256 		if (setgid_flag) {
1257 			nfs4args_verify_free(&argop[8]);
1258 			nfs4args_setattr_free(&argop[9]);
1259 		}
1260 		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1261 		nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1262 		/*
1263 		 * If the reply is NFS4ERR_ACCESS, it may be because
1264 		 * we are root (no root net access).  If the real uid
1265 		 * is not root, then retry with the real uid instead.
1266 		 */
1267 		if (ncr != NULL) {
1268 			crfree(ncr);
1269 			ncr = NULL;
1270 		}
1271 		if (res.status == NFS4ERR_ACCESS &&
1272 		    (ncr = crnetadjust(cred_otw)) != NULL) {
1273 			cred_otw = ncr;
1274 			goto recov_retry;
1275 		}
1276 		kmem_free(argop, argoplist_size);
1277 		return (e.error);
1278 	}
1279 
1280 	resop = &res.array[idx_open];  /* open res */
1281 	op_res = &resop->nfs_resop4_u.opopen;
1282 
1283 #ifdef DEBUG
1284 	/*
1285 	 * verify attrset bitmap
1286 	 */
1287 	if (create_flag &&
1288 	    (createmode == UNCHECKED4 || createmode == GUARDED4)) {
1289 		/* make sure attrset returned is what we asked for */
1290 		/* XXX Ignore this 'error' for now */
1291 		if (attr->attrmask != op_res->attrset)
1292 			/* EMPTY */;
1293 	}
1294 #endif
1295 
1296 	if (op_res->rflags & OPEN4_RESULT_LOCKTYPE_POSIX) {
1297 		mutex_enter(&VTOMI4(dvp)->mi_lock);
1298 		VTOMI4(dvp)->mi_flags |= MI4_POSIX_LOCK;
1299 		mutex_exit(&VTOMI4(dvp)->mi_lock);
1300 	}
1301 
1302 	resop = &res.array[idx_open + 1];  /* getfh res */
1303 	gf_res = &resop->nfs_resop4_u.opgetfh;
1304 
1305 	otw_sfh = sfh4_get(&gf_res->object, VTOMI4(dvp));
1306 
1307 	/*
1308 	 * The open stateid has been updated on the server but not
1309 	 * on the client yet.  There is a path: makenfs4node->nfs4_attr_cache->
1310 	 * flush_pages->VOP_PUTPAGE->...->nfs4write where we will issue an OTW
1311 	 * WRITE call.  That, however, will use the old stateid, so go ahead
1312 	 * and update the open stateid now, before any call to makenfs4node.
1313 	 */
1314 	if (vpi) {
1315 		nfs4_open_stream_t	*tmp_osp;
1316 		rnode4_t		*tmp_rp = VTOR4(vpi);
1317 
1318 		tmp_osp = find_open_stream(oop, tmp_rp);
1319 		if (tmp_osp) {
1320 			tmp_osp->open_stateid = op_res->stateid;
1321 			mutex_exit(&tmp_osp->os_sync_lock);
1322 			open_stream_rele(tmp_osp, tmp_rp);
1323 		}
1324 
1325 		/*
1326 		 * We must determine if the file handle given by the otw open
1327 		 * is the same as the file handle which was passed in with
1328 		 * *vpp.  This case can be reached if the file we are trying
1329 		 * to open has been removed and another file has been created
1330 		 * having the same file name.  The passed in vnode is released
1331 		 * later.
1332 		 */
1333 		orig_sfh = VTOR4(vpi)->r_fh;
1334 		fh_differs = nfs4cmpfh(&orig_sfh->sfh_fh, &otw_sfh->sfh_fh);
1335 	}
1336 
1337 	garp = &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res;
1338 
1339 	if (create_flag || fh_differs) {
1340 		int rnode_err = 0;
1341 
1342 		vp = makenfs4node(otw_sfh, garp, dvp->v_vfsp, t, cr,
1343 		    dvp, fn_get(VTOSV(dvp)->sv_name, file_name, otw_sfh));
1344 
1345 		if (e.error)
1346 			PURGE_ATTRCACHE4(vp);
1347 		/*
1348 		 * For the newly created vp case, make sure the rnode
1349 		 * isn't bad before using it.
1350 		 */
1351 		mutex_enter(&(VTOR4(vp))->r_statelock);
1352 		if (VTOR4(vp)->r_flags & R4RECOVERR)
1353 			rnode_err = EIO;
1354 		mutex_exit(&(VTOR4(vp))->r_statelock);
1355 
1356 		if (rnode_err) {
1357 			nfs4_end_open_seqid_sync(oop);
1358 			nfs4args_copen_free(open_args);
1359 			if (setgid_flag) {
1360 				nfs4args_verify_free(&argop[8]);
1361 				nfs4args_setattr_free(&argop[9]);
1362 			}
1363 			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1364 			nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state,
1365 			    needrecov);
1366 			open_owner_rele(oop);
1367 			VN_RELE(vp);
1368 			if (ncr != NULL)
1369 				crfree(ncr);
1370 			sfh4_rele(&otw_sfh);
1371 			kmem_free(argop, argoplist_size);
1372 			return (EIO);
1373 		}
1374 	} else {
1375 		vp = vpi;
1376 	}
1377 	sfh4_rele(&otw_sfh);
1378 
1379 	/*
1380 	 * It seems odd to get a full set of attrs and then not update
1381 	 * the object's attrcache in the non-create case.  Create case uses
1382 	 * the attrs since makenfs4node checks to see if the attrs need to
1383 	 * be updated (and then updates them).  The non-create case should
1384 	 * update attrs also.
1385 	 */
1386 	if (! create_flag && ! fh_differs && !e.error) {
1387 		nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
1388 	}
1389 
1390 	nfs4_error_zinit(&e);
1391 	if (op_res->rflags & OPEN4_RESULT_CONFIRM) {
1392 		/* This does not do recovery for vp explicitly. */
1393 		nfs4open_confirm(vp, &seqid, &op_res->stateid, cred_otw, FALSE,
1394 		    &retry_open, oop, FALSE, &e, &num_bseqid_retry);
1395 
1396 		if (e.error || e.stat) {
1397 			nfs4_end_open_seqid_sync(oop);
1398 			nfs4args_copen_free(open_args);
1399 			if (setgid_flag) {
1400 				nfs4args_verify_free(&argop[8]);
1401 				nfs4args_setattr_free(&argop[9]);
1402 			}
1403 			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1404 			nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state,
1405 			    needrecov);
1406 			open_owner_rele(oop);
1407 			if (create_flag || fh_differs) {
1408 				/* rele the makenfs4node */
1409 				VN_RELE(vp);
1410 			}
1411 			if (ncr != NULL) {
1412 				crfree(ncr);
1413 				ncr = NULL;
1414 			}
1415 			if (retry_open == TRUE) {
1416 				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1417 				    "nfs4open_otw: retry the open since OPEN "
1418 				    "CONFIRM failed with error %d stat %d",
1419 				    e.error, e.stat));
1420 				if (create_flag && createmode == GUARDED4) {
1421 					NFS4_DEBUG(nfs4_client_recov_debug,
1422 					    (CE_NOTE, "nfs4open_otw: switch "
1423 					    "createmode from GUARDED4 to "
1424 					    "UNCHECKED4"));
1425 					createmode = UNCHECKED4;
1426 				}
1427 				goto recov_retry;
1428 			}
1429 			if (!e.error) {
1430 				if (create_flag && (createmode != EXCLUSIVE4) &&
1431 				    e.stat == NFS4ERR_BADOWNER)
1432 					nfs4_log_badowner(VTOMI4(dvp), OP_OPEN);
1433 
1434 				e.error = geterrno4(e.stat);
1435 			}
1436 			kmem_free(argop, argoplist_size);
1437 			return (e.error);
1438 		}
1439 	}
1440 
1441 	rp = VTOR4(vp);
1442 
1443 	mutex_enter(&rp->r_statev4_lock);
1444 	if (create_flag)
1445 		rp->created_v4 = 1;
1446 	mutex_exit(&rp->r_statev4_lock);
1447 
1448 	mutex_enter(&oop->oo_lock);
1449 	/* Doesn't matter if 'oo_just_created' already was set as this */
1450 	oop->oo_just_created = NFS4_PERM_CREATED;
1451 	if (oop->oo_cred_otw)
1452 		crfree(oop->oo_cred_otw);
1453 	oop->oo_cred_otw = cred_otw;
1454 	crhold(oop->oo_cred_otw);
1455 	mutex_exit(&oop->oo_lock);
1456 
1457 	/* returns with 'os_sync_lock' held */
1458 	osp = find_or_create_open_stream(oop, rp, &created_osp);
1459 	if (!osp) {
1460 		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
1461 		    "nfs4open_otw: failed to create an open stream"));
1462 		NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE, "nfs4open_otw: "
1463 		    "signal our end of use of the open seqid"));
1464 
1465 		nfs4_end_open_seqid_sync(oop);
1466 		open_owner_rele(oop);
1467 		nfs4args_copen_free(open_args);
1468 		if (setgid_flag) {
1469 			nfs4args_verify_free(&argop[8]);
1470 			nfs4args_setattr_free(&argop[9]);
1471 		}
1472 		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1473 		nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1474 		if (create_flag || fh_differs)
1475 			VN_RELE(vp);
1476 		if (ncr != NULL)
1477 			crfree(ncr);
1478 
1479 		kmem_free(argop, argoplist_size);
1480 		return (EINVAL);
1481 
1482 	}
1483 
1484 	osp->open_stateid = op_res->stateid;
1485 
1486 	if (open_flag & FREAD)
1487 		osp->os_share_acc_read++;
1488 	if (open_flag & FWRITE)
1489 		osp->os_share_acc_write++;
1490 	osp->os_share_deny_none++;
1491 
1492 	/*
1493 	 * Need to reset this bitfield for the possible case where we were
1494 	 * going to OTW CLOSE the file, got a non-recoverable error, and before
1495 	 * we could retry the CLOSE, OPENed the file again.
1496 	 */
1497 	ASSERT(osp->os_open_owner->oo_seqid_inuse);
1498 	osp->os_final_close = 0;
1499 	osp->os_force_close = 0;
1500 #ifdef DEBUG
1501 	if (osp->os_failed_reopen)
1502 		NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, "nfs4open_otw:"
1503 		    " clearing os_failed_reopen for osp %p, cr %p, rp %s",
1504 		    (void *)osp, (void *)cr, rnode4info(rp)));
1505 #endif
1506 	osp->os_failed_reopen = 0;
1507 
1508 	mutex_exit(&osp->os_sync_lock);
1509 
1510 	nfs4_end_open_seqid_sync(oop);
1511 
1512 	if (created_osp && recov_state.rs_sp != NULL) {
1513 		mutex_enter(&recov_state.rs_sp->s_lock);
1514 		nfs4_inc_state_ref_count_nolock(recov_state.rs_sp, VTOMI4(dvp));
1515 		mutex_exit(&recov_state.rs_sp->s_lock);
1516 	}
1517 
1518 	/* get rid of our reference to find oop */
1519 	open_owner_rele(oop);
1520 
1521 	open_stream_rele(osp, rp);
1522 
1523 	/* accept delegation, if any */
1524 	nfs4_delegation_accept(rp, CLAIM_NULL, op_res, garp, cred_otw);
1525 
1526 	nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1527 
1528 	if (createmode == EXCLUSIVE4 &&
1529 	    (in_va->va_mask & ~(AT_GID | AT_SIZE))) {
1530 		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw:"
1531 		    " EXCLUSIVE4: sending a SETATTR"));
1532 		/*
1533 		 * If doing an exclusive create, then generate
1534 		 * a SETATTR to set the initial attributes.
1535 		 * Try to set the mtime and the atime to the
1536 		 * server's current time.  It is somewhat
1537 		 * expected that these fields will be used to
1538 		 * store the exclusive create cookie.  If not,
1539 		 * server implementors will need to know that
1540 		 * a SETATTR will follow an exclusive create
1541 		 * and the cookie should be destroyed if
1542 		 * appropriate.
1543 		 *
1544 		 * The AT_GID and AT_SIZE bits are turned off
1545 		 * so that the SETATTR request will not attempt
1546 		 * to process these.  The gid will be set
1547 		 * separately if appropriate.  The size is turned
1548 		 * off because it is assumed that a new file will
1549 		 * be created empty and if the file wasn't empty,
1550 		 * then the exclusive create will have failed
1551 		 * because the file must have existed already.
1552 		 * Therefore, no truncate operation is needed.
1553 		 */
1554 		in_va->va_mask &= ~(AT_GID | AT_SIZE);
1555 		in_va->va_mask |= (AT_MTIME | AT_ATIME);
1556 
1557 		e.error = nfs4setattr(vp, in_va, 0, cr, NULL);
1558 		if (e.error) {
1559 			nfs4_error_t err;
1560 
1561 			/*
1562 			 * Couldn't correct the attributes of
1563 			 * the newly created file and the
1564 			 * attributes are wrong.  Remove the
1565 			 * file and return an error to the
1566 			 * application.
1567 			 */
1568 			/* XXX will this take care of client state ? */
1569 			NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
1570 			    "nfs4open_otw: EXCLUSIVE4: error %d on SETATTR:"
1571 			    " remove file", e.error));
1572 
1573 			/*
1574 			 * The file is currently open so try to close it first.
1575 			 *
1576 			 * If we do not close the file explicitly here then the
1577 			 * VN_RELE() would do an (implicit and asynchronous)
1578 			 * close for us.  But such async close could race with
1579 			 * the nfs4_remove() below.  If the async close is
1580 			 * slower than nfs4_remove() then nfs4_remove()
1581 			 * wouldn't remove the file but rename it to .nfsXXXX
1582 			 * instead.
1583 			 */
1584 			nfs4close_one(vp, NULL, cr, open_flag, NULL, &err,
1585 			    CLOSE_NORM, 0, 0, 0);
1586 			VN_RELE(vp);
1587 			(void) nfs4_remove(dvp, file_name, cr, NULL, 0);
1588 
1589 			/*
1590 			 * Since we've reled the vnode and removed
1591 			 * the file we now need to return the error.
1592 			 * At this point we don't want to update the
1593 			 * dircaches, call nfs4_waitfor_purge_complete
1594 			 * or set vpp to vp so we need to skip these
1595 			 * as well.
1596 			 */
1597 			goto skip_update_dircaches;
1598 		}
1599 	}
1600 
1601 	/*
1602 	 * If we created or found the correct vnode, due to create_flag or
1603 	 * fh_differs being set, then update directory cache attribute, readdir
1604 	 * and dnlc caches.
1605 	 */
1606 	if (create_flag || fh_differs) {
1607 		dirattr_info_t dinfo, *dinfop;
1608 
1609 		/*
1610 		 * Make sure getattr succeeded before using results.
1611 		 * note: op 7 is getattr(dir) for both flavors of
1612 		 * open(create).
1613 		 */
1614 		if (create_flag && res.status == NFS4_OK) {
1615 			dinfo.di_time_call = t;
1616 			dinfo.di_cred = cr;
1617 			dinfo.di_garp =
1618 			    &res.array[6].nfs_resop4_u.opgetattr.ga_res;
1619 			dinfop = &dinfo;
1620 		} else {
1621 			dinfop = NULL;
1622 		}
1623 
1624 		nfs4_update_dircaches(&op_res->cinfo, dvp, vp, file_name,
1625 		    dinfop);
1626 	}
1627 
1628 	/*
1629 	 * If the page cache for this file was flushed from actions
1630 	 * above, it was done asynchronously and if that is true,
1631 	 * there is a need to wait here for it to complete.  This must
1632 	 * be done outside of start_fop/end_fop.
1633 	 */
1634 	(void) nfs4_waitfor_purge_complete(vp);
1635 
1636 	/*
1637 	 * It is implicit that we are in the open case (create_flag == 0) since
1638 	 * fh_differs can only be set to a non-zero value in the open case.
1639 	 */
1640 	if (fh_differs != 0 && vpi != NULL)
1641 		VN_RELE(vpi);
1642 
1643 	/*
1644 	 * Be sure to set *vpp to the correct value before returning.
1645 	 */
1646 	*vpp = vp;
1647 
1648 skip_update_dircaches:
1649 
1650 	nfs4args_copen_free(open_args);
1651 	if (setgid_flag) {
1652 		nfs4args_verify_free(&argop[8]);
1653 		nfs4args_setattr_free(&argop[9]);
1654 	}
1655 	xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1656 
1657 	if (ncr)
1658 		crfree(ncr);
1659 	kmem_free(argop, argoplist_size);
1660 	return (e.error);
1661 }
1662 
1663 /*
1664  * Reopen an open instance.  cf. nfs4open_otw().
1665  *
1666  * Errors are returned by the nfs4_error_t parameter.
1667  * - ep->error contains an errno value or zero.
1668  * - if it is zero, ep->stat is set to an NFS status code, if any.
1669  *   If the file could not be reopened, but the caller should continue, the
1670  *   file is marked dead and no error values are returned.  If the caller
1671  *   should stop recovering open files and start over, either the ep->error
1672  *   value or ep->stat will indicate an error (either something that requires
1673  *   recovery or EAGAIN).  Note that some recovery (e.g., expired volatile
1674  *   filehandles) may be handled silently by this routine.
1675  * - if it is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, recovery for lost state
1676  *   will be started, so the caller should not do it.
1677  *
1678  * Gotos:
1679  * - kill_file : reopen failed in a way that warrants marking the file
1680  *   dead and setting the open stream's 'os_failed_reopen' to 1.  This
1681  *   is for cases where recovery is not possible.
1682  * - failed_reopen : same as above, except that the file has already been
1683  *   marked dead, so no need to do it again.
1684  * - bailout : reopen failed but we are able to recover and retry the reopen -
1685  *   either within this function immediately or via the calling function.
1686  */
1687 
1688 void
nfs4_reopen(vnode_t * vp,nfs4_open_stream_t * osp,nfs4_error_t * ep,open_claim_type4 claim,bool_t frc_use_claim_previous,bool_t is_recov)1689 nfs4_reopen(vnode_t *vp, nfs4_open_stream_t *osp, nfs4_error_t *ep,
1690     open_claim_type4 claim, bool_t frc_use_claim_previous,
1691     bool_t is_recov)
1692 {
1693 	COMPOUND4args_clnt args;
1694 	COMPOUND4res_clnt res;
1695 	nfs_argop4 argop[4];
1696 	nfs_resop4 *resop;
1697 	OPEN4res *op_res = NULL;
1698 	OPEN4cargs *open_args;
1699 	GETFH4res *gf_res;
1700 	rnode4_t *rp = VTOR4(vp);
1701 	int doqueue = 1;
1702 	cred_t *cr = NULL, *cred_otw = NULL;
1703 	nfs4_open_owner_t *oop = NULL;
1704 	seqid4 seqid;
1705 	nfs4_ga_res_t *garp;
1706 	char fn[MAXNAMELEN];
1707 	nfs4_recov_state_t recov = {NULL, 0};
1708 	nfs4_lost_rqst_t lost_rqst;
1709 	mntinfo4_t *mi = VTOMI4(vp);
1710 	bool_t abort;
1711 	char *failed_msg = "";
1712 	int fh_different;
1713 	hrtime_t t;
1714 	nfs4_bseqid_entry_t *bsep = NULL;
1715 
1716 	ASSERT(nfs4_consistent_type(vp));
1717 	ASSERT(nfs_zone() == mi->mi_zone);
1718 
1719 	nfs4_error_zinit(ep);
1720 
1721 	/* this is the cred used to find the open owner */
1722 	cr = state_to_cred(osp);
1723 	if (cr == NULL) {
1724 		failed_msg = "Couldn't reopen: no cred";
1725 		goto kill_file;
1726 	}
1727 	/* use this cred for OTW operations */
1728 	cred_otw = nfs4_get_otw_cred(cr, mi, osp->os_open_owner);
1729 
1730 top:
1731 	nfs4_error_zinit(ep);
1732 
1733 	if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) {
1734 		/* File system has been unmounted, quit */
1735 		ep->error = EIO;
1736 		failed_msg = "Couldn't reopen: file system has been unmounted";
1737 		goto kill_file;
1738 	}
1739 
1740 	oop = osp->os_open_owner;
1741 
1742 	ASSERT(oop != NULL);
1743 	if (oop == NULL) {	/* be defensive in non-DEBUG */
1744 		failed_msg = "can't reopen: no open owner";
1745 		goto kill_file;
1746 	}
1747 	open_owner_hold(oop);
1748 
1749 	ep->error = nfs4_start_open_seqid_sync(oop, mi);
1750 	if (ep->error) {
1751 		open_owner_rele(oop);
1752 		oop = NULL;
1753 		goto bailout;
1754 	}
1755 
1756 	/*
1757 	 * If the rnode has a delegation and the delegation has been
1758 	 * recovered and the server didn't request a recall and the caller
1759 	 * didn't specifically ask for CLAIM_PREVIOUS (nfs4frlock during
1760 	 * recovery) and the rnode hasn't been marked dead, then install
1761 	 * the delegation stateid in the open stream.  Otherwise, proceed
1762 	 * with a CLAIM_PREVIOUS or CLAIM_NULL OPEN.
1763 	 */
1764 	mutex_enter(&rp->r_statev4_lock);
1765 	if (rp->r_deleg_type != OPEN_DELEGATE_NONE &&
1766 	    !rp->r_deleg_return_pending &&
1767 	    (rp->r_deleg_needs_recovery == OPEN_DELEGATE_NONE) &&
1768 	    !rp->r_deleg_needs_recall &&
1769 	    claim != CLAIM_DELEGATE_CUR && !frc_use_claim_previous &&
1770 	    !(rp->r_flags & R4RECOVERR)) {
1771 		mutex_enter(&osp->os_sync_lock);
1772 		osp->os_delegation = 1;
1773 		osp->open_stateid = rp->r_deleg_stateid;
1774 		mutex_exit(&osp->os_sync_lock);
1775 		mutex_exit(&rp->r_statev4_lock);
1776 		goto bailout;
1777 	}
1778 	mutex_exit(&rp->r_statev4_lock);
1779 
1780 	/*
1781 	 * If the file failed recovery, just quit.  This failure need not
1782 	 * affect other reopens, so don't return an error.
1783 	 */
1784 	mutex_enter(&rp->r_statelock);
1785 	if (rp->r_flags & R4RECOVERR) {
1786 		mutex_exit(&rp->r_statelock);
1787 		ep->error = 0;
1788 		goto failed_reopen;
1789 	}
1790 	mutex_exit(&rp->r_statelock);
1791 
1792 	/*
1793 	 * argop is empty here
1794 	 *
1795 	 * PUTFH, OPEN, GETATTR
1796 	 */
1797 	args.ctag = TAG_REOPEN;
1798 	args.array_len = 4;
1799 	args.array = argop;
1800 
1801 	NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1802 	    "nfs4_reopen: file is type %d, id %s",
1803 	    vp->v_type, rnode4info(VTOR4(vp))));
1804 
1805 	argop[0].argop = OP_CPUTFH;
1806 
1807 	if (claim != CLAIM_PREVIOUS) {
1808 		/*
1809 		 * if this is a file mount then
1810 		 * use the mntinfo parentfh
1811 		 */
1812 		argop[0].nfs_argop4_u.opcputfh.sfh =
1813 		    (vp->v_flag & VROOT) ? mi->mi_srvparentfh :
1814 		    VTOSV(vp)->sv_dfh;
1815 	} else {
1816 		/* putfh fh to reopen */
1817 		argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
1818 	}
1819 
1820 	argop[1].argop = OP_COPEN;
1821 	open_args = &argop[1].nfs_argop4_u.opcopen;
1822 	open_args->claim = claim;
1823 
1824 	if (claim == CLAIM_NULL) {
1825 
1826 		if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) {
1827 			nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname "
1828 			    "failed for vp 0x%p for CLAIM_NULL with %m",
1829 			    (void *)vp);
1830 			failed_msg = "Couldn't reopen: vtoname failed for "
1831 			    "CLAIM_NULL";
1832 			/* nothing allocated yet */
1833 			goto kill_file;
1834 		}
1835 
1836 		open_args->open_claim4_u.cfile = fn;
1837 	} else if (claim == CLAIM_PREVIOUS) {
1838 
1839 		/*
1840 		 * We have two cases to deal with here:
1841 		 * 1) We're being called to reopen files in order to satisfy
1842 		 *    a lock operation request which requires us to explicitly
1843 		 *    reopen files which were opened under a delegation.  If
1844 		 *    we're in recovery, we *must* use CLAIM_PREVIOUS.  In
1845 		 *    that case, frc_use_claim_previous is TRUE and we must
1846 		 *    use the rnode's current delegation type (r_deleg_type).
1847 		 * 2) We're reopening files during some form of recovery.
1848 		 *    In this case, frc_use_claim_previous is FALSE and we
1849 		 *    use the delegation type appropriate for recovery
1850 		 *    (r_deleg_needs_recovery).
1851 		 */
1852 		mutex_enter(&rp->r_statev4_lock);
1853 		open_args->open_claim4_u.delegate_type =
1854 		    frc_use_claim_previous ?
1855 		    rp->r_deleg_type :
1856 		    rp->r_deleg_needs_recovery;
1857 		mutex_exit(&rp->r_statev4_lock);
1858 
1859 	} else if (claim == CLAIM_DELEGATE_CUR) {
1860 
1861 		if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) {
1862 			nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname "
1863 			    "failed for vp 0x%p for CLAIM_DELEGATE_CUR "
1864 			    "with %m", (void *)vp);
1865 			failed_msg = "Couldn't reopen: vtoname failed for "
1866 			    "CLAIM_DELEGATE_CUR";
1867 			/* nothing allocated yet */
1868 			goto kill_file;
1869 		}
1870 
1871 		mutex_enter(&rp->r_statev4_lock);
1872 		open_args->open_claim4_u.delegate_cur_info.delegate_stateid =
1873 		    rp->r_deleg_stateid;
1874 		mutex_exit(&rp->r_statev4_lock);
1875 
1876 		open_args->open_claim4_u.delegate_cur_info.cfile = fn;
1877 	}
1878 	open_args->opentype = OPEN4_NOCREATE;
1879 	open_args->owner.clientid = mi2clientid(mi);
1880 	open_args->owner.owner_len = sizeof (oop->oo_name);
1881 	open_args->owner.owner_val =
1882 	    kmem_alloc(open_args->owner.owner_len, KM_SLEEP);
1883 	bcopy(&oop->oo_name, open_args->owner.owner_val,
1884 	    open_args->owner.owner_len);
1885 	open_args->share_access = 0;
1886 	open_args->share_deny = 0;
1887 
1888 	mutex_enter(&osp->os_sync_lock);
1889 	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, "nfs4_reopen: osp %p rp "
1890 	    "%p: read acc %"PRIu64" write acc %"PRIu64": open ref count %d: "
1891 	    "mmap read %"PRIu64" mmap write %"PRIu64" claim %d ",
1892 	    (void *)osp, (void *)rp, osp->os_share_acc_read,
1893 	    osp->os_share_acc_write, osp->os_open_ref_count,
1894 	    osp->os_mmap_read, osp->os_mmap_write, claim));
1895 
1896 	if (osp->os_share_acc_read || osp->os_mmap_read)
1897 		open_args->share_access |= OPEN4_SHARE_ACCESS_READ;
1898 	if (osp->os_share_acc_write || osp->os_mmap_write)
1899 		open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE;
1900 	if (osp->os_share_deny_read)
1901 		open_args->share_deny |= OPEN4_SHARE_DENY_READ;
1902 	if (osp->os_share_deny_write)
1903 		open_args->share_deny |= OPEN4_SHARE_DENY_WRITE;
1904 	mutex_exit(&osp->os_sync_lock);
1905 
1906 	seqid = nfs4_get_open_seqid(oop) + 1;
1907 	open_args->seqid = seqid;
1908 
1909 	/* Construct the getfh part of the compound */
1910 	argop[2].argop = OP_GETFH;
1911 
1912 	/* Construct the getattr part of the compound */
1913 	argop[3].argop = OP_GETATTR;
1914 	argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1915 	argop[3].nfs_argop4_u.opgetattr.mi = mi;
1916 
1917 	t = gethrtime();
1918 
1919 	rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);
1920 
1921 	if (ep->error) {
1922 		if (!is_recov && !frc_use_claim_previous &&
1923 		    (ep->error == EINTR || ep->error == ETIMEDOUT ||
1924 		    NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))) {
1925 			nfs4open_save_lost_rqst(ep->error, &lost_rqst, oop,
1926 			    cred_otw, vp, NULL, open_args);
1927 			abort = nfs4_start_recovery(ep,
1928 			    VTOMI4(vp), vp, NULL, NULL,
1929 			    lost_rqst.lr_op == OP_OPEN ?
1930 			    &lost_rqst : NULL, OP_OPEN, NULL, NULL, NULL);
1931 			nfs4args_copen_free(open_args);
1932 			goto bailout;
1933 		}
1934 
1935 		nfs4args_copen_free(open_args);
1936 
1937 		if (ep->error == EACCES && cred_otw != cr) {
1938 			crfree(cred_otw);
1939 			cred_otw = cr;
1940 			crhold(cred_otw);
1941 			nfs4_end_open_seqid_sync(oop);
1942 			open_owner_rele(oop);
1943 			oop = NULL;
1944 			goto top;
1945 		}
1946 		if (ep->error == ETIMEDOUT)
1947 			goto bailout;
1948 		failed_msg = "Couldn't reopen: rpc error";
1949 		goto kill_file;
1950 	}
1951 
1952 	if (nfs4_need_to_bump_seqid(&res))
1953 		nfs4_set_open_seqid(seqid, oop, args.ctag);
1954 
1955 	switch (res.status) {
1956 	case NFS4_OK:
1957 		if (recov.rs_flags & NFS4_RS_DELAY_MSG) {
1958 			mutex_enter(&rp->r_statelock);
1959 			rp->r_delay_interval = 0;
1960 			mutex_exit(&rp->r_statelock);
1961 		}
1962 		break;
1963 	case NFS4ERR_BAD_SEQID:
1964 		bsep = nfs4_create_bseqid_entry(oop, NULL, vp, 0,
1965 		    args.ctag, open_args->seqid);
1966 
1967 		abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL,
1968 		    NULL, lost_rqst.lr_op == OP_OPEN ? &lost_rqst :
1969 		    NULL, OP_OPEN, bsep, NULL, NULL);
1970 
1971 		nfs4args_copen_free(open_args);
1972 		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1973 		nfs4_end_open_seqid_sync(oop);
1974 		open_owner_rele(oop);
1975 		oop = NULL;
1976 		kmem_free(bsep, sizeof (*bsep));
1977 
1978 		goto kill_file;
1979 	case NFS4ERR_NO_GRACE:
1980 		nfs4args_copen_free(open_args);
1981 		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1982 		nfs4_end_open_seqid_sync(oop);
1983 		open_owner_rele(oop);
1984 		oop = NULL;
1985 		if (claim == CLAIM_PREVIOUS) {
1986 			/*
1987 			 * Retry as a plain open. We don't need to worry about
1988 			 * checking the changeinfo: it is acceptable for a
1989 			 * client to re-open a file and continue processing
1990 			 * (in the absence of locks).
1991 			 */
1992 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1993 			    "nfs4_reopen: CLAIM_PREVIOUS: NFS4ERR_NO_GRACE; "
1994 			    "will retry as CLAIM_NULL"));
1995 			claim = CLAIM_NULL;
1996 			nfs4_mi_kstat_inc_no_grace(mi);
1997 			goto top;
1998 		}
1999 		failed_msg =
2000 		    "Couldn't reopen: tried reclaim outside grace period. ";
2001 		goto kill_file;
2002 	case NFS4ERR_GRACE:
2003 		nfs4_set_grace_wait(mi);
2004 		nfs4args_copen_free(open_args);
2005 		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2006 		nfs4_end_open_seqid_sync(oop);
2007 		open_owner_rele(oop);
2008 		oop = NULL;
2009 		ep->error = nfs4_wait_for_grace(mi, &recov);
2010 		if (ep->error != 0)
2011 			goto bailout;
2012 		goto top;
2013 	case NFS4ERR_DELAY:
2014 		nfs4_set_delay_wait(vp);
2015 		nfs4args_copen_free(open_args);
2016 		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2017 		nfs4_end_open_seqid_sync(oop);
2018 		open_owner_rele(oop);
2019 		oop = NULL;
2020 		ep->error = nfs4_wait_for_delay(vp, &recov);
2021 		nfs4_mi_kstat_inc_delay(mi);
2022 		if (ep->error != 0)
2023 			goto bailout;
2024 		goto top;
2025 	case NFS4ERR_FHEXPIRED:
2026 		/* recover filehandle and retry */
2027 		abort = nfs4_start_recovery(ep,
2028 		    mi, vp, NULL, NULL, NULL, OP_OPEN, NULL, NULL, NULL);
2029 		nfs4args_copen_free(open_args);
2030 		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2031 		nfs4_end_open_seqid_sync(oop);
2032 		open_owner_rele(oop);
2033 		oop = NULL;
2034 		if (abort == FALSE)
2035 			goto top;
2036 		failed_msg = "Couldn't reopen: recovery aborted";
2037 		goto kill_file;
2038 	case NFS4ERR_RESOURCE:
2039 	case NFS4ERR_STALE_CLIENTID:
2040 	case NFS4ERR_WRONGSEC:
2041 	case NFS4ERR_EXPIRED:
2042 		/*
2043 		 * Do not mark the file dead and let the calling
2044 		 * function initiate recovery.
2045 		 */
2046 		nfs4args_copen_free(open_args);
2047 		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2048 		nfs4_end_open_seqid_sync(oop);
2049 		open_owner_rele(oop);
2050 		oop = NULL;
2051 		goto bailout;
2052 	case NFS4ERR_ACCESS:
2053 		if (cred_otw != cr) {
2054 			crfree(cred_otw);
2055 			cred_otw = cr;
2056 			crhold(cred_otw);
2057 			nfs4args_copen_free(open_args);
2058 			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2059 			nfs4_end_open_seqid_sync(oop);
2060 			open_owner_rele(oop);
2061 			oop = NULL;
2062 			goto top;
2063 		}
2064 		/* fall through */
2065 	default:
2066 		NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
2067 		    "nfs4_reopen: r_server 0x%p, mi_curr_serv 0x%p, rnode %s",
2068 		    (void*)VTOR4(vp)->r_server, (void*)mi->mi_curr_serv,
2069 		    rnode4info(VTOR4(vp))));
2070 		failed_msg = "Couldn't reopen: NFSv4 error";
2071 		nfs4args_copen_free(open_args);
2072 		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2073 		goto kill_file;
2074 	}
2075 
2076 	resop = &res.array[1];  /* open res */
2077 	op_res = &resop->nfs_resop4_u.opopen;
2078 
2079 	garp = &res.array[3].nfs_resop4_u.opgetattr.ga_res;
2080 
2081 	/*
2082 	 * Check if the path we reopened really is the same
2083 	 * file. We could end up in a situation where the file
2084 	 * was removed and a new file created with the same name.
2085 	 */
2086 	resop = &res.array[2];
2087 	gf_res = &resop->nfs_resop4_u.opgetfh;
2088 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
2089 	fh_different = (nfs4cmpfh(&rp->r_fh->sfh_fh, &gf_res->object) != 0);
2090 	if (fh_different) {
2091 		if (mi->mi_fh_expire_type == FH4_PERSISTENT ||
2092 		    mi->mi_fh_expire_type & FH4_NOEXPIRE_WITH_OPEN) {
2093 			/* Oops, we don't have the same file */
2094 			if (mi->mi_fh_expire_type == FH4_PERSISTENT)
2095 				failed_msg = "Couldn't reopen: Persistent "
2096 				    "file handle changed";
2097 			else
2098 				failed_msg = "Couldn't reopen: Volatile "
2099 				    "(no expire on open) file handle changed";
2100 
2101 			nfs4args_copen_free(open_args);
2102 			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2103 			nfs_rw_exit(&mi->mi_fh_lock);
2104 			goto kill_file;
2105 
2106 		} else {
2107 			/*
2108 			 * We have volatile file handles that don't compare.
2109 			 * If the fids are the same then we assume that the
2110 			 * file handle expired but the rnode still refers to
2111 			 * the same file object.
2112 			 *
2113 			 * First check that we have fids or not.
2114 			 * If we don't we have a dumb server so we will
2115 			 * just assume every thing is ok for now.
2116 			 */
2117 			if (!ep->error && garp->n4g_va.va_mask & AT_NODEID &&
2118 			    rp->r_attr.va_mask & AT_NODEID &&
2119 			    rp->r_attr.va_nodeid != garp->n4g_va.va_nodeid) {
2120 				/*
2121 				 * We have fids, but they don't
2122 				 * compare. So kill the file.
2123 				 */
2124 				failed_msg =
2125 				    "Couldn't reopen: file handle changed"
2126 				    " due to mismatched fids";
2127 				nfs4args_copen_free(open_args);
2128 				xdr_free(xdr_COMPOUND4res_clnt,
2129 				    (caddr_t)&res);
2130 				nfs_rw_exit(&mi->mi_fh_lock);
2131 				goto kill_file;
2132 			} else {
2133 				/*
2134 				 * We have volatile file handles that refers
2135 				 * to the same file (at least they have the
2136 				 * same fid) or we don't have fids so we
2137 				 * can't tell. :(. We'll be a kind and accepting
2138 				 * client so we'll update the rnode's file
2139 				 * handle with the otw handle.
2140 				 *
2141 				 * We need to drop mi->mi_fh_lock since
2142 				 * sh4_update acquires it. Since there is
2143 				 * only one recovery thread there is no
2144 				 * race.
2145 				 */
2146 				nfs_rw_exit(&mi->mi_fh_lock);
2147 				sfh4_update(rp->r_fh, &gf_res->object);
2148 			}
2149 		}
2150 	} else {
2151 		nfs_rw_exit(&mi->mi_fh_lock);
2152 	}
2153 
2154 	ASSERT(nfs4_consistent_type(vp));
2155 
2156 	/*
2157 	 * If the server wanted an OPEN_CONFIRM but that fails, just start
2158 	 * over.  Presumably if there is a persistent error it will show up
2159 	 * when we resend the OPEN.
2160 	 */
2161 	if (op_res->rflags & OPEN4_RESULT_CONFIRM) {
2162 		bool_t retry_open = FALSE;
2163 
2164 		nfs4open_confirm(vp, &seqid, &op_res->stateid,
2165 		    cred_otw, is_recov, &retry_open,
2166 		    oop, FALSE, ep, NULL);
2167 		if (ep->error || ep->stat) {
2168 			nfs4args_copen_free(open_args);
2169 			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2170 			nfs4_end_open_seqid_sync(oop);
2171 			open_owner_rele(oop);
2172 			oop = NULL;
2173 			goto top;
2174 		}
2175 	}
2176 
2177 	mutex_enter(&osp->os_sync_lock);
2178 	osp->open_stateid = op_res->stateid;
2179 	osp->os_delegation = 0;
2180 	/*
2181 	 * Need to reset this bitfield for the possible case where we were
2182 	 * going to OTW CLOSE the file, got a non-recoverable error, and before
2183 	 * we could retry the CLOSE, OPENed the file again.
2184 	 */
2185 	ASSERT(osp->os_open_owner->oo_seqid_inuse);
2186 	osp->os_final_close = 0;
2187 	osp->os_force_close = 0;
2188 	if (claim == CLAIM_DELEGATE_CUR || claim == CLAIM_PREVIOUS)
2189 		osp->os_dc_openacc = open_args->share_access;
2190 	mutex_exit(&osp->os_sync_lock);
2191 
2192 	nfs4_end_open_seqid_sync(oop);
2193 
2194 	/* accept delegation, if any */
2195 	nfs4_delegation_accept(rp, claim, op_res, garp, cred_otw);
2196 
2197 	nfs4args_copen_free(open_args);
2198 
2199 	nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
2200 
2201 	xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2202 
2203 	ASSERT(nfs4_consistent_type(vp));
2204 
2205 	open_owner_rele(oop);
2206 	crfree(cr);
2207 	crfree(cred_otw);
2208 	return;
2209 
2210 kill_file:
2211 	nfs4_fail_recov(vp, failed_msg, ep->error, ep->stat);
2212 failed_reopen:
2213 	NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE,
2214 	    "nfs4_reopen: setting os_failed_reopen for osp %p, cr %p, rp %s",
2215 	    (void *)osp, (void *)cr, rnode4info(rp)));
2216 	mutex_enter(&osp->os_sync_lock);
2217 	osp->os_failed_reopen = 1;
2218 	mutex_exit(&osp->os_sync_lock);
2219 bailout:
2220 	if (oop != NULL) {
2221 		nfs4_end_open_seqid_sync(oop);
2222 		open_owner_rele(oop);
2223 	}
2224 	if (cr != NULL)
2225 		crfree(cr);
2226 	if (cred_otw != NULL)
2227 		crfree(cred_otw);
2228 }
2229 
2230 /* for . and .. OPENs */
2231 /* ARGSUSED */
2232 static int
nfs4_open_non_reg_file(vnode_t ** vpp,int flag,cred_t * cr)2233 nfs4_open_non_reg_file(vnode_t **vpp, int flag, cred_t *cr)
2234 {
2235 	rnode4_t *rp;
2236 	nfs4_ga_res_t gar;
2237 
2238 	ASSERT(nfs_zone() == VTOMI4(*vpp)->mi_zone);
2239 
2240 	/*
2241 	 * If close-to-open consistency checking is turned off or
2242 	 * if there is no cached data, we can avoid
2243 	 * the over the wire getattr.  Otherwise, force a
2244 	 * call to the server to get fresh attributes and to
2245 	 * check caches. This is required for close-to-open
2246 	 * consistency.
2247 	 */
2248 	rp = VTOR4(*vpp);
2249 	if (VTOMI4(*vpp)->mi_flags & MI4_NOCTO ||
2250 	    (rp->r_dir == NULL && !nfs4_has_pages(*vpp)))
2251 		return (0);
2252 
2253 	return (nfs4_getattr_otw(*vpp, &gar, cr, 0));
2254 }
2255 
2256 /*
2257  * CLOSE a file
2258  */
2259 /* ARGSUSED */
2260 static int
nfs4_close(vnode_t * vp,int flag,int count,offset_t offset,cred_t * cr,caller_context_t * ct)2261 nfs4_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
2262     caller_context_t *ct)
2263 {
2264 	rnode4_t	*rp;
2265 	int		 error = 0;
2266 	int		 r_error = 0;
2267 	int		 n4error = 0;
2268 	nfs4_error_t	 e = { 0, NFS4_OK, RPC_SUCCESS };
2269 
2270 	/*
2271 	 * Remove client state for this (lockowner, file) pair.
2272 	 * Issue otw v4 call to have the server do the same.
2273 	 */
2274 
2275 	rp = VTOR4(vp);
2276 
2277 	/*
2278 	 * zone_enter(2) prevents processes from changing zones with NFS files
2279 	 * open; if we happen to get here from the wrong zone we can't do
2280 	 * anything over the wire.
2281 	 */
2282 	if (VTOMI4(vp)->mi_zone != nfs_zone()) {
2283 		/*
2284 		 * We could attempt to clean up locks, except we're sure
2285 		 * that the current process didn't acquire any locks on
2286 		 * the file: any attempt to lock a file belong to another zone
2287 		 * will fail, and one can't lock an NFS file and then change
2288 		 * zones, as that fails too.
2289 		 *
2290 		 * Returning an error here is the sane thing to do.  A
2291 		 * subsequent call to VN_RELE() which translates to a
2292 		 * nfs4_inactive() will clean up state: if the zone of the
2293 		 * vnode's origin is still alive and kicking, the inactive
2294 		 * thread will handle the request (from the correct zone), and
2295 		 * everything (minus the OTW close call) should be OK.  If the
2296 		 * zone is going away nfs4_async_inactive() will throw away
2297 		 * delegations, open streams and cached pages inline.
2298 		 */
2299 		return (EIO);
2300 	}
2301 
2302 	/*
2303 	 * If we are using local locking for this filesystem, then
2304 	 * release all of the SYSV style record locks.  Otherwise,
2305 	 * we are doing network locking and we need to release all
2306 	 * of the network locks.  All of the locks held by this
2307 	 * process on this file are released no matter what the
2308 	 * incoming reference count is.
2309 	 */
2310 	if (VTOMI4(vp)->mi_flags & MI4_LLOCK) {
2311 		cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
2312 		cleanshares(vp, ttoproc(curthread)->p_pid);
2313 	} else
2314 		e.error = nfs4_lockrelease(vp, flag, offset, cr);
2315 
2316 	if (e.error) {
2317 		struct lm_sysid *lmsid;
2318 		lmsid = nfs4_find_sysid(VTOMI4(vp));
2319 		if (lmsid == NULL) {
2320 			DTRACE_PROBE2(unknown__sysid, int, e.error,
2321 			    vnode_t *, vp);
2322 		} else {
2323 			cleanlocks(vp, ttoproc(curthread)->p_pid,
2324 			    (lm_sysidt(lmsid) | LM_SYSID_CLIENT));
2325 
2326 			lm_rel_sysid(lmsid);
2327 		}
2328 		return (e.error);
2329 	}
2330 
2331 	if (count > 1)
2332 		return (0);
2333 
2334 	/*
2335 	 * If the file has been `unlinked', then purge the
2336 	 * DNLC so that this vnode will get reycled quicker
2337 	 * and the .nfs* file on the server will get removed.
2338 	 */
2339 	if (rp->r_unldvp != NULL)
2340 		dnlc_purge_vp(vp);
2341 
2342 	/*
2343 	 * If the file was open for write and there are pages,
2344 	 * do a synchronous flush and commit of all of the
2345 	 * dirty and uncommitted pages.
2346 	 */
2347 	ASSERT(!e.error);
2348 	if ((flag & FWRITE) && nfs4_has_pages(vp))
2349 		error = nfs4_putpage_commit(vp, 0, 0, cr);
2350 
2351 	mutex_enter(&rp->r_statelock);
2352 	r_error = rp->r_error;
2353 	rp->r_error = 0;
2354 	mutex_exit(&rp->r_statelock);
2355 
2356 	/*
2357 	 * If this file type is one for which no explicit 'open' was
2358 	 * done, then bail now (ie. no need for protocol 'close'). If
2359 	 * there was an error w/the vm subsystem, return _that_ error,
2360 	 * otherwise, return any errors that may've been reported via
2361 	 * the rnode.
2362 	 */
2363 	if (vp->v_type != VREG)
2364 		return (error ? error : r_error);
2365 
2366 	/*
2367 	 * The sync putpage commit may have failed above, but since
2368 	 * we're working w/a regular file, we need to do the protocol
2369 	 * 'close' (nfs4close_one will figure out if an otw close is
2370 	 * needed or not). Report any errors _after_ doing the protocol
2371 	 * 'close'.
2372 	 */
2373 	nfs4close_one(vp, NULL, cr, flag, NULL, &e, CLOSE_NORM, 0, 0, 0);
2374 	n4error = e.error ? e.error : geterrno4(e.stat);
2375 
2376 	/*
2377 	 * Error reporting prio (Hi -> Lo)
2378 	 *
2379 	 *   i) nfs4_putpage_commit (error)
2380 	 *  ii) rnode's (r_error)
2381 	 * iii) nfs4close_one (n4error)
2382 	 */
2383 	return (error ? error : (r_error ? r_error : n4error));
2384 }
2385 
2386 /*
2387  * Initialize *lost_rqstp.
2388  */
2389 
2390 static void
nfs4close_save_lost_rqst(int error,nfs4_lost_rqst_t * lost_rqstp,nfs4_open_owner_t * oop,nfs4_open_stream_t * osp,cred_t * cr,vnode_t * vp)2391 nfs4close_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
2392     nfs4_open_owner_t *oop, nfs4_open_stream_t *osp, cred_t *cr,
2393     vnode_t *vp)
2394 {
2395 	if (error != ETIMEDOUT && error != EINTR &&
2396 	    !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
2397 		lost_rqstp->lr_op = 0;
2398 		return;
2399 	}
2400 
2401 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2402 	    "nfs4close_save_lost_rqst: error %d", error));
2403 
2404 	lost_rqstp->lr_op = OP_CLOSE;
2405 	/*
2406 	 * The vp is held and rele'd via the recovery code.
2407 	 * See nfs4_save_lost_rqst.
2408 	 */
2409 	lost_rqstp->lr_vp = vp;
2410 	lost_rqstp->lr_dvp = NULL;
2411 	lost_rqstp->lr_oop = oop;
2412 	lost_rqstp->lr_osp = osp;
2413 	ASSERT(osp != NULL);
2414 	ASSERT(mutex_owned(&osp->os_sync_lock));
2415 	osp->os_pending_close = 1;
2416 	lost_rqstp->lr_lop = NULL;
2417 	lost_rqstp->lr_cr = cr;
2418 	lost_rqstp->lr_flk = NULL;
2419 	lost_rqstp->lr_putfirst = FALSE;
2420 }
2421 
2422 /*
2423  * Assumes you already have the open seqid sync grabbed as well as the
2424  * 'os_sync_lock'.  Note: this will release the open seqid sync and
2425  * 'os_sync_lock' if client recovery starts.  Calling functions have to
2426  * be prepared to handle this.
2427  *
2428  * 'recov' is returned as 1 if the CLOSE operation detected client recovery
2429  * was needed and was started, and that the calling function should retry
2430  * this function; otherwise it is returned as 0.
2431  *
2432  * Errors are returned via the nfs4_error_t parameter.
2433  */
2434 static void
nfs4close_otw(rnode4_t * rp,cred_t * cred_otw,nfs4_open_owner_t * oop,nfs4_open_stream_t * osp,int * recov,int * did_start_seqid_syncp,nfs4_close_type_t close_type,nfs4_error_t * ep,int * have_sync_lockp)2435 nfs4close_otw(rnode4_t *rp, cred_t *cred_otw, nfs4_open_owner_t *oop,
2436     nfs4_open_stream_t *osp, int *recov, int *did_start_seqid_syncp,
2437     nfs4_close_type_t close_type, nfs4_error_t *ep, int *have_sync_lockp)
2438 {
2439 	COMPOUND4args_clnt args;
2440 	COMPOUND4res_clnt res;
2441 	CLOSE4args *close_args;
2442 	nfs_resop4 *resop;
2443 	nfs_argop4 argop[3];
2444 	int doqueue = 1;
2445 	mntinfo4_t *mi;
2446 	seqid4 seqid;
2447 	vnode_t *vp;
2448 	bool_t needrecov = FALSE;
2449 	nfs4_lost_rqst_t lost_rqst;
2450 	hrtime_t t;
2451 
2452 	ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone);
2453 
2454 	ASSERT(MUTEX_HELD(&osp->os_sync_lock));
2455 
2456 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw"));
2457 
2458 	/* Only set this to 1 if recovery is started */
2459 	*recov = 0;
2460 
2461 	/* do the OTW call to close the file */
2462 
2463 	if (close_type == CLOSE_RESEND)
2464 		args.ctag = TAG_CLOSE_LOST;
2465 	else if (close_type == CLOSE_AFTER_RESEND)
2466 		args.ctag = TAG_CLOSE_UNDO;
2467 	else
2468 		args.ctag = TAG_CLOSE;
2469 
2470 	args.array_len = 3;
2471 	args.array = argop;
2472 
2473 	vp = RTOV4(rp);
2474 
2475 	mi = VTOMI4(vp);
2476 
2477 	/* putfh target fh */
2478 	argop[0].argop = OP_CPUTFH;
2479 	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
2480 
2481 	argop[1].argop = OP_GETATTR;
2482 	argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
2483 	argop[1].nfs_argop4_u.opgetattr.mi = mi;
2484 
2485 	argop[2].argop = OP_CLOSE;
2486 	close_args = &argop[2].nfs_argop4_u.opclose;
2487 
2488 	seqid = nfs4_get_open_seqid(oop) + 1;
2489 
2490 	close_args->seqid = seqid;
2491 	close_args->open_stateid = osp->open_stateid;
2492 
2493 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
2494 	    "nfs4close_otw: %s call, rp %s", needrecov ? "recov" : "first",
2495 	    rnode4info(rp)));
2496 
2497 	t = gethrtime();
2498 
2499 	rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);
2500 
2501 	if (!ep->error && nfs4_need_to_bump_seqid(&res)) {
2502 		nfs4_set_open_seqid(seqid, oop, args.ctag);
2503 	}
2504 
2505 	needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
2506 	if (ep->error && !needrecov) {
2507 		/*
2508 		 * if there was an error and no recovery is to be done
2509 		 * then then set up the file to flush its cache if
2510 		 * needed for the next caller.
2511 		 */
2512 		mutex_enter(&rp->r_statelock);
2513 		PURGE_ATTRCACHE4_LOCKED(rp);
2514 		rp->r_flags &= ~R4WRITEMODIFIED;
2515 		mutex_exit(&rp->r_statelock);
2516 		return;
2517 	}
2518 
2519 	if (needrecov) {
2520 		bool_t abort;
2521 		nfs4_bseqid_entry_t *bsep = NULL;
2522 
2523 		if (close_type != CLOSE_RESEND)
2524 			nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop,
2525 			    osp, cred_otw, vp);
2526 
2527 		if (!ep->error && res.status == NFS4ERR_BAD_SEQID)
2528 			bsep = nfs4_create_bseqid_entry(oop, NULL, vp,
2529 			    0, args.ctag, close_args->seqid);
2530 
2531 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2532 		    "nfs4close_otw: initiating recovery. error %d "
2533 		    "res.status %d", ep->error, res.status));
2534 
2535 		/*
2536 		 * Drop the 'os_sync_lock' here so we don't hit
2537 		 * a potential recursive mutex_enter via an
2538 		 * 'open_stream_hold()'.
2539 		 */
2540 		mutex_exit(&osp->os_sync_lock);
2541 		*have_sync_lockp = 0;
2542 		abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL,
2543 		    (close_type != CLOSE_RESEND &&
2544 		    lost_rqst.lr_op == OP_CLOSE) ? &lost_rqst : NULL,
2545 		    OP_CLOSE, bsep, NULL, NULL);
2546 
2547 		/* drop open seq sync, and let the calling function regrab it */
2548 		nfs4_end_open_seqid_sync(oop);
2549 		*did_start_seqid_syncp = 0;
2550 
2551 		if (bsep)
2552 			kmem_free(bsep, sizeof (*bsep));
2553 		/*
2554 		 * For signals, the caller wants to quit, so don't say to
2555 		 * retry.  For forced unmount, if it's a user thread, it
2556 		 * wants to quit.  If it's a recovery thread, the retry
2557 		 * will happen higher-up on the call stack.  Either way,
2558 		 * don't say to retry.
2559 		 */
2560 		if (abort == FALSE && ep->error != EINTR &&
2561 		    !NFS4_FRC_UNMT_ERR(ep->error, mi->mi_vfsp) &&
2562 		    close_type != CLOSE_RESEND &&
2563 		    close_type != CLOSE_AFTER_RESEND)
2564 			*recov = 1;
2565 		else
2566 			*recov = 0;
2567 
2568 		if (!ep->error)
2569 			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2570 		return;
2571 	}
2572 
2573 	if (res.status) {
2574 		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2575 		return;
2576 	}
2577 
2578 	mutex_enter(&rp->r_statev4_lock);
2579 	rp->created_v4 = 0;
2580 	mutex_exit(&rp->r_statev4_lock);
2581 
2582 	resop = &res.array[2];
2583 	osp->open_stateid = resop->nfs_resop4_u.opclose.open_stateid;
2584 	osp->os_valid = 0;
2585 
2586 	/*
2587 	 * This removes the reference obtained at OPEN; ie, when the
2588 	 * open stream structure was created.
2589 	 *
2590 	 * We don't have to worry about calling 'open_stream_rele'
2591 	 * since we our currently holding a reference to the open
2592 	 * stream which means the count cannot go to 0 with this
2593 	 * decrement.
2594 	 */
2595 	ASSERT(osp->os_ref_count >= 2);
2596 	osp->os_ref_count--;
2597 
2598 	if (ep->error == 0) {
2599 		mutex_exit(&osp->os_sync_lock);
2600 		*have_sync_lockp = 0;
2601 
2602 		nfs4_attr_cache(vp,
2603 		    &res.array[1].nfs_resop4_u.opgetattr.ga_res,
2604 		    t, cred_otw, TRUE, NULL);
2605 	}
2606 
2607 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw:"
2608 	    " returning %d", ep->error));
2609 
2610 	xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2611 }
2612 
2613 /* ARGSUSED */
2614 static int
nfs4_read(vnode_t * vp,struct uio * uiop,int ioflag,cred_t * cr,caller_context_t * ct)2615 nfs4_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
2616     caller_context_t *ct)
2617 {
2618 	rnode4_t *rp;
2619 	u_offset_t off;
2620 	offset_t diff;
2621 	uint_t on;
2622 	uint_t n;
2623 	caddr_t base;
2624 	uint_t flags;
2625 	int error;
2626 	mntinfo4_t *mi;
2627 
2628 	rp = VTOR4(vp);
2629 
2630 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
2631 
2632 	if (IS_SHADOW(vp, rp))
2633 		vp = RTOV4(rp);
2634 
2635 	if (vp->v_type != VREG)
2636 		return (EISDIR);
2637 
2638 	mi = VTOMI4(vp);
2639 
2640 	if (nfs_zone() != mi->mi_zone)
2641 		return (EIO);
2642 
2643 	if (uiop->uio_resid == 0)
2644 		return (0);
2645 
2646 	if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0)
2647 		return (EINVAL);
2648 
2649 	mutex_enter(&rp->r_statelock);
2650 	if (rp->r_flags & R4RECOVERRP)
2651 		error = (rp->r_error ? rp->r_error : EIO);
2652 	else
2653 		error = 0;
2654 	mutex_exit(&rp->r_statelock);
2655 	if (error)
2656 		return (error);
2657 
2658 	/*
2659 	 * Bypass VM if caching has been disabled (e.g., locking) or if
2660 	 * using client-side direct I/O and the file is not mmap'd and
2661 	 * there are no cached pages.
2662 	 */
2663 	if ((vp->v_flag & VNOCACHE) ||
2664 	    (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
2665 	    rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) {
2666 		size_t resid = 0;
2667 
2668 		return (nfs4read(vp, NULL, uiop->uio_loffset,
2669 		    uiop->uio_resid, &resid, cr, FALSE, uiop));
2670 	}
2671 
2672 	error = 0;
2673 
2674 	do {
2675 		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
2676 		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
2677 		n = MIN(MAXBSIZE - on, uiop->uio_resid);
2678 
2679 		if (error = nfs4_validate_caches(vp, cr))
2680 			break;
2681 
2682 		mutex_enter(&rp->r_statelock);
2683 		while (rp->r_flags & R4INCACHEPURGE) {
2684 			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
2685 				mutex_exit(&rp->r_statelock);
2686 				return (EINTR);
2687 			}
2688 		}
2689 		diff = rp->r_size - uiop->uio_loffset;
2690 		mutex_exit(&rp->r_statelock);
2691 		if (diff <= 0)
2692 			break;
2693 		if (diff < n)
2694 			n = (uint_t)diff;
2695 
2696 		if (vpm_enable) {
2697 			/*
2698 			 * Copy data.
2699 			 */
2700 			error = vpm_data_copy(vp, off + on, n, uiop,
2701 			    1, NULL, 0, S_READ);
2702 		} else {
2703 			base = segmap_getmapflt(segkmap, vp, off + on, n, 1,
2704 			    S_READ);
2705 
2706 			error = uiomove(base + on, n, UIO_READ, uiop);
2707 		}
2708 
2709 		if (!error) {
2710 			/*
2711 			 * If read a whole block or read to eof,
2712 			 * won't need this buffer again soon.
2713 			 */
2714 			mutex_enter(&rp->r_statelock);
2715 			if (n + on == MAXBSIZE ||
2716 			    uiop->uio_loffset == rp->r_size)
2717 				flags = SM_DONTNEED;
2718 			else
2719 				flags = 0;
2720 			mutex_exit(&rp->r_statelock);
2721 			if (vpm_enable) {
2722 				error = vpm_sync_pages(vp, off, n, flags);
2723 			} else {
2724 				error = segmap_release(segkmap, base, flags);
2725 			}
2726 		} else {
2727 			if (vpm_enable) {
2728 				(void) vpm_sync_pages(vp, off, n, 0);
2729 			} else {
2730 				(void) segmap_release(segkmap, base, 0);
2731 			}
2732 		}
2733 	} while (!error && uiop->uio_resid > 0);
2734 
2735 	return (error);
2736 }
2737 
2738 /* ARGSUSED */
2739 static int
nfs4_write(vnode_t * vp,struct uio * uiop,int ioflag,cred_t * cr,caller_context_t * ct)2740 nfs4_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
2741     caller_context_t *ct)
2742 {
2743 	rlim64_t limit = uiop->uio_llimit;
2744 	rnode4_t *rp;
2745 	u_offset_t off;
2746 	caddr_t base;
2747 	uint_t flags;
2748 	int remainder;
2749 	size_t n;
2750 	int on;
2751 	int error;
2752 	int resid;
2753 	u_offset_t offset;
2754 	mntinfo4_t *mi;
2755 	uint_t bsize;
2756 
2757 	rp = VTOR4(vp);
2758 
2759 	if (IS_SHADOW(vp, rp))
2760 		vp = RTOV4(rp);
2761 
2762 	if (vp->v_type != VREG)
2763 		return (EISDIR);
2764 
2765 	mi = VTOMI4(vp);
2766 
2767 	if (nfs_zone() != mi->mi_zone)
2768 		return (EIO);
2769 
2770 	if (uiop->uio_resid == 0)
2771 		return (0);
2772 
2773 	mutex_enter(&rp->r_statelock);
2774 	if (rp->r_flags & R4RECOVERRP)
2775 		error = (rp->r_error ? rp->r_error : EIO);
2776 	else
2777 		error = 0;
2778 	mutex_exit(&rp->r_statelock);
2779 	if (error)
2780 		return (error);
2781 
2782 	if (ioflag & FAPPEND) {
2783 		struct vattr va;
2784 
2785 		/*
2786 		 * Must serialize if appending.
2787 		 */
2788 		if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
2789 			nfs_rw_exit(&rp->r_rwlock);
2790 			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
2791 			    INTR4(vp)))
2792 				return (EINTR);
2793 		}
2794 
2795 		va.va_mask = AT_SIZE;
2796 		error = nfs4getattr(vp, &va, cr);
2797 		if (error)
2798 			return (error);
2799 		uiop->uio_loffset = va.va_size;
2800 	}
2801 
2802 	offset = uiop->uio_loffset + uiop->uio_resid;
2803 
2804 	if (uiop->uio_loffset < (offset_t)0 || offset < 0)
2805 		return (EINVAL);
2806 
2807 	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
2808 		limit = MAXOFFSET_T;
2809 
2810 	/*
2811 	 * Check to make sure that the process will not exceed
2812 	 * its limit on file size.  It is okay to write up to
2813 	 * the limit, but not beyond.  Thus, the write which
2814 	 * reaches the limit will be short and the next write
2815 	 * will return an error.
2816 	 */
2817 	remainder = 0;
2818 	if (offset > uiop->uio_llimit) {
2819 		remainder = offset - uiop->uio_llimit;
2820 		uiop->uio_resid = uiop->uio_llimit - uiop->uio_loffset;
2821 		if (uiop->uio_resid <= 0) {
2822 			proc_t *p = ttoproc(curthread);
2823 
2824 			uiop->uio_resid += remainder;
2825 			mutex_enter(&p->p_lock);
2826 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
2827 			    p->p_rctls, p, RCA_UNSAFE_SIGINFO);
2828 			mutex_exit(&p->p_lock);
2829 			return (EFBIG);
2830 		}
2831 	}
2832 
2833 	/* update the change attribute, if we have a write delegation */
2834 
2835 	mutex_enter(&rp->r_statev4_lock);
2836 	if (rp->r_deleg_type == OPEN_DELEGATE_WRITE)
2837 		rp->r_deleg_change++;
2838 
2839 	mutex_exit(&rp->r_statev4_lock);
2840 
2841 	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp)))
2842 		return (EINTR);
2843 
2844 	/*
2845 	 * Bypass VM if caching has been disabled (e.g., locking) or if
2846 	 * using client-side direct I/O and the file is not mmap'd and
2847 	 * there are no cached pages.
2848 	 */
2849 	if ((vp->v_flag & VNOCACHE) ||
2850 	    (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
2851 	    rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) {
2852 		size_t bufsize;
2853 		int count;
2854 		u_offset_t org_offset;
2855 		stable_how4 stab_comm;
2856 nfs4_fwrite:
2857 		if (rp->r_flags & R4STALE) {
2858 			resid = uiop->uio_resid;
2859 			offset = uiop->uio_loffset;
2860 			error = rp->r_error;
2861 			/*
2862 			 * A close may have cleared r_error, if so,
2863 			 * propagate ESTALE error return properly
2864 			 */
2865 			if (error == 0)
2866 				error = ESTALE;
2867 			goto bottom;
2868 		}
2869 
2870 		bufsize = MIN(uiop->uio_resid, mi->mi_stsize);
2871 		base = kmem_alloc(bufsize, KM_SLEEP);
2872 		do {
2873 			if (ioflag & FDSYNC)
2874 				stab_comm = DATA_SYNC4;
2875 			else
2876 				stab_comm = FILE_SYNC4;
2877 			resid = uiop->uio_resid;
2878 			offset = uiop->uio_loffset;
2879 			count = MIN(uiop->uio_resid, bufsize);
2880 			org_offset = uiop->uio_loffset;
2881 			error = uiomove(base, count, UIO_WRITE, uiop);
2882 			if (!error) {
2883 				error = nfs4write(vp, base, org_offset,
2884 				    count, cr, &stab_comm);
2885 				if (!error) {
2886 					mutex_enter(&rp->r_statelock);
2887 					if (rp->r_size < uiop->uio_loffset)
2888 						rp->r_size = uiop->uio_loffset;
2889 					mutex_exit(&rp->r_statelock);
2890 				}
2891 			}
2892 		} while (!error && uiop->uio_resid > 0);
2893 		kmem_free(base, bufsize);
2894 		goto bottom;
2895 	}
2896 
2897 	bsize = vp->v_vfsp->vfs_bsize;
2898 
2899 	do {
2900 		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
2901 		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
2902 		n = MIN(MAXBSIZE - on, uiop->uio_resid);
2903 
2904 		resid = uiop->uio_resid;
2905 		offset = uiop->uio_loffset;
2906 
2907 		if (rp->r_flags & R4STALE) {
2908 			error = rp->r_error;
2909 			/*
2910 			 * A close may have cleared r_error, if so,
2911 			 * propagate ESTALE error return properly
2912 			 */
2913 			if (error == 0)
2914 				error = ESTALE;
2915 			break;
2916 		}
2917 
2918 		/*
2919 		 * Don't create dirty pages faster than they
2920 		 * can be cleaned so that the system doesn't
2921 		 * get imbalanced.  If the async queue is
2922 		 * maxed out, then wait for it to drain before
2923 		 * creating more dirty pages.  Also, wait for
2924 		 * any threads doing pagewalks in the vop_getattr
2925 		 * entry points so that they don't block for
2926 		 * long periods.
2927 		 */
2928 		mutex_enter(&rp->r_statelock);
2929 		while ((mi->mi_max_threads != 0 &&
2930 		    rp->r_awcount > 2 * mi->mi_max_threads) ||
2931 		    rp->r_gcount > 0) {
2932 			if (INTR4(vp)) {
2933 				klwp_t *lwp = ttolwp(curthread);
2934 
2935 				if (lwp != NULL)
2936 					lwp->lwp_nostop++;
2937 				if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
2938 					mutex_exit(&rp->r_statelock);
2939 					if (lwp != NULL)
2940 						lwp->lwp_nostop--;
2941 					error = EINTR;
2942 					goto bottom;
2943 				}
2944 				if (lwp != NULL)
2945 					lwp->lwp_nostop--;
2946 			} else
2947 				cv_wait(&rp->r_cv, &rp->r_statelock);
2948 		}
2949 		mutex_exit(&rp->r_statelock);
2950 
2951 		/*
2952 		 * Touch the page and fault it in if it is not in core
2953 		 * before segmap_getmapflt or vpm_data_copy can lock it.
2954 		 * This is to avoid the deadlock if the buffer is mapped
2955 		 * to the same file through mmap which we want to write.
2956 		 */
2957 		uio_prefaultpages((long)n, uiop);
2958 
2959 		if (vpm_enable) {
2960 			/*
2961 			 * It will use kpm mappings, so no need to
2962 			 * pass an address.
2963 			 */
2964 			error = writerp4(rp, NULL, n, uiop, 0);
2965 		} else  {
2966 			if (segmap_kpm) {
2967 				int pon = uiop->uio_loffset & PAGEOFFSET;
2968 				size_t pn = MIN(PAGESIZE - pon,
2969 				    uiop->uio_resid);
2970 				int pagecreate;
2971 
2972 				mutex_enter(&rp->r_statelock);
2973 				pagecreate = (pon == 0) && (pn == PAGESIZE ||
2974 				    uiop->uio_loffset + pn >= rp->r_size);
2975 				mutex_exit(&rp->r_statelock);
2976 
2977 				base = segmap_getmapflt(segkmap, vp, off + on,
2978 				    pn, !pagecreate, S_WRITE);
2979 
2980 				error = writerp4(rp, base + pon, n, uiop,
2981 				    pagecreate);
2982 
2983 			} else {
2984 				base = segmap_getmapflt(segkmap, vp, off + on,
2985 				    n, 0, S_READ);
2986 				error = writerp4(rp, base + on, n, uiop, 0);
2987 			}
2988 		}
2989 
2990 		if (!error) {
2991 			if (mi->mi_flags & MI4_NOAC)
2992 				flags = SM_WRITE;
2993 			else if ((uiop->uio_loffset % bsize) == 0 ||
2994 			    IS_SWAPVP(vp)) {
2995 				/*
2996 				 * Have written a whole block.
2997 				 * Start an asynchronous write
2998 				 * and mark the buffer to
2999 				 * indicate that it won't be
3000 				 * needed again soon.
3001 				 */
3002 				flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
3003 			} else
3004 				flags = 0;
3005 			if ((ioflag & (FSYNC|FDSYNC)) ||
3006 			    (rp->r_flags & R4OUTOFSPACE)) {
3007 				flags &= ~SM_ASYNC;
3008 				flags |= SM_WRITE;
3009 			}
3010 			if (vpm_enable) {
3011 				error = vpm_sync_pages(vp, off, n, flags);
3012 			} else {
3013 				error = segmap_release(segkmap, base, flags);
3014 			}
3015 		} else {
3016 			if (vpm_enable) {
3017 				(void) vpm_sync_pages(vp, off, n, 0);
3018 			} else {
3019 				(void) segmap_release(segkmap, base, 0);
3020 			}
3021 			/*
3022 			 * In the event that we got an access error while
3023 			 * faulting in a page for a write-only file just
3024 			 * force a write.
3025 			 */
3026 			if (error == EACCES)
3027 				goto nfs4_fwrite;
3028 		}
3029 	} while (!error && uiop->uio_resid > 0);
3030 
3031 bottom:
3032 	if (error) {
3033 		uiop->uio_resid = resid + remainder;
3034 		uiop->uio_loffset = offset;
3035 	} else {
3036 		uiop->uio_resid += remainder;
3037 
3038 		mutex_enter(&rp->r_statev4_lock);
3039 		if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) {
3040 			gethrestime(&rp->r_attr.va_mtime);
3041 			rp->r_attr.va_ctime = rp->r_attr.va_mtime;
3042 		}
3043 		mutex_exit(&rp->r_statev4_lock);
3044 	}
3045 
3046 	nfs_rw_exit(&rp->r_lkserlock);
3047 
3048 	return (error);
3049 }
3050 
3051 /*
3052  * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
3053  */
3054 static int
nfs4_rdwrlbn(vnode_t * vp,page_t * pp,u_offset_t off,size_t len,int flags,cred_t * cr)3055 nfs4_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
3056     int flags, cred_t *cr)
3057 {
3058 	struct buf *bp;
3059 	int error;
3060 	page_t *savepp;
3061 	uchar_t fsdata;
3062 	stable_how4 stab_comm;
3063 
3064 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
3065 	bp = pageio_setup(pp, len, vp, flags);
3066 	ASSERT(bp != NULL);
3067 
3068 	/*
3069 	 * pageio_setup should have set b_addr to 0.  This
3070 	 * is correct since we want to do I/O on a page
3071 	 * boundary.  bp_mapin will use this addr to calculate
3072 	 * an offset, and then set b_addr to the kernel virtual
3073 	 * address it allocated for us.
3074 	 */
3075 	ASSERT(bp->b_un.b_addr == 0);
3076 
3077 	bp->b_edev = 0;
3078 	bp->b_dev = 0;
3079 	bp->b_lblkno = lbtodb(off);
3080 	bp->b_file = vp;
3081 	bp->b_offset = (offset_t)off;
3082 	bp_mapin(bp);
3083 
3084 	if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) &&
3085 	    freemem > desfree)
3086 		stab_comm = UNSTABLE4;
3087 	else
3088 		stab_comm = FILE_SYNC4;
3089 
3090 	error = nfs4_bio(bp, &stab_comm, cr, FALSE);
3091 
3092 	bp_mapout(bp);
3093 	pageio_done(bp);
3094 
3095 	if (stab_comm == UNSTABLE4)
3096 		fsdata = C_DELAYCOMMIT;
3097 	else
3098 		fsdata = C_NOCOMMIT;
3099 
3100 	savepp = pp;
3101 	do {
3102 		pp->p_fsdata = fsdata;
3103 	} while ((pp = pp->p_next) != savepp);
3104 
3105 	return (error);
3106 }
3107 
3108 /*
3109  */
3110 static int
nfs4rdwr_check_osid(vnode_t * vp,nfs4_error_t * ep,cred_t * cr)3111 nfs4rdwr_check_osid(vnode_t *vp, nfs4_error_t *ep, cred_t *cr)
3112 {
3113 	nfs4_open_owner_t	*oop;
3114 	nfs4_open_stream_t	*osp;
3115 	rnode4_t		*rp = VTOR4(vp);
3116 	mntinfo4_t		*mi = VTOMI4(vp);
3117 	int			reopen_needed;
3118 
3119 	ASSERT(nfs_zone() == mi->mi_zone);
3120 
3121 
3122 	oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
3123 	if (!oop)
3124 		return (EIO);
3125 
3126 	/* returns with 'os_sync_lock' held */
3127 	osp = find_open_stream(oop, rp);
3128 	if (!osp) {
3129 		open_owner_rele(oop);
3130 		return (EIO);
3131 	}
3132 
3133 	if (osp->os_failed_reopen) {
3134 		mutex_exit(&osp->os_sync_lock);