1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2016 STRATO AG. All rights reserved.
24 */
25
26/*
27 * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
28 */
29
30/*
31 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
32 * Use is subject to license terms.
33 */
34
35/*
36 *	Copyright 1983,1984,1985,1986,1987,1988,1989 AT&T.
37 *	All Rights Reserved
38 */
39
40/*
41 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
42 */
43
44#include <sys/param.h>
45#include <sys/types.h>
46#include <sys/systm.h>
47#include <sys/cred.h>
48#include <sys/time.h>
49#include <sys/vnode.h>
50#include <sys/vfs.h>
51#include <sys/vfs_opreg.h>
52#include <sys/file.h>
53#include <sys/filio.h>
54#include <sys/uio.h>
55#include <sys/buf.h>
56#include <sys/mman.h>
57#include <sys/pathname.h>
58#include <sys/dirent.h>
59#include <sys/debug.h>
60#include <sys/vmsystm.h>
61#include <sys/fcntl.h>
62#include <sys/flock.h>
63#include <sys/swap.h>
64#include <sys/errno.h>
65#include <sys/strsubr.h>
66#include <sys/sysmacros.h>
67#include <sys/kmem.h>
68#include <sys/cmn_err.h>
69#include <sys/pathconf.h>
70#include <sys/utsname.h>
71#include <sys/dnlc.h>
72#include <sys/acl.h>
73#include <sys/systeminfo.h>
74#include <sys/policy.h>
75#include <sys/sdt.h>
76#include <sys/list.h>
77#include <sys/stat.h>
78#include <sys/zone.h>
79
80#include <rpc/types.h>
81#include <rpc/auth.h>
82#include <rpc/clnt.h>
83
84#include <nfs/nfs.h>
85#include <nfs/nfs_clnt.h>
86#include <nfs/nfs_acl.h>
87#include <nfs/lm.h>
88#include <nfs/nfs4.h>
89#include <nfs/nfs4_kprot.h>
90#include <nfs/rnode4.h>
91#include <nfs/nfs4_clnt.h>
92
93#include <vm/hat.h>
94#include <vm/as.h>
95#include <vm/page.h>
96#include <vm/pvn.h>
97#include <vm/seg.h>
98#include <vm/seg_map.h>
99#include <vm/seg_kpm.h>
100#include <vm/seg_vn.h>
101
102#include <fs/fs_subr.h>
103
104#include <sys/ddi.h>
105#include <sys/int_fmtio.h>
106#include <sys/fs/autofs.h>
107
/*
 * Bundle of directory attribute information.  A pointer to one of these
 * is the last argument of nfs4_update_dircaches() (see its prototype
 * below).
 */
typedef struct {
	nfs4_ga_res_t	*di_garp;	/* getattr results; presumably the dir's */
	cred_t		*di_cred;	/* credentials accompanying di_garp */
	hrtime_t	di_time_call;	/* hrtime stamp; presumably of the OTW call */
} dirattr_info_t;
113
/*
 * Selects which ACL operation is being performed.  It is the second
 * argument of nfs4_is_acl_mask_valid() (see its prototype below).
 */
typedef enum nfs4_acl_op {
	NFS4_ACL_GET,	/* ACL is being fetched */
	NFS4_ACL_SET	/* ACL is being stored */
} nfs4_acl_op_t;
118
119static struct lm_sysid *nfs4_find_sysid(mntinfo4_t *);
120
121static void	nfs4_update_dircaches(change_info4 *, vnode_t *, vnode_t *,
122			char *, dirattr_info_t *);
123
124static void	nfs4close_otw(rnode4_t *, cred_t *, nfs4_open_owner_t *,
125		    nfs4_open_stream_t *, int *, int *, nfs4_close_type_t,
126		    nfs4_error_t *, int *);
127static int	nfs4_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
128			cred_t *);
129static int	nfs4write(vnode_t *, caddr_t, u_offset_t, int, cred_t *,
130			stable_how4 *);
131static int	nfs4read(vnode_t *, caddr_t, offset_t, int, size_t *,
132			cred_t *, bool_t, struct uio *);
133static int	nfs4setattr(vnode_t *, struct vattr *, int, cred_t *,
134			vsecattr_t *);
135static int	nfs4openattr(vnode_t *, vnode_t **, int, cred_t *);
136static int	nfs4lookup(vnode_t *, char *, vnode_t **, cred_t *, int);
137static int	nfs4lookup_xattr(vnode_t *, char *, vnode_t **, int, cred_t *);
138static int	nfs4lookupvalidate_otw(vnode_t *, char *, vnode_t **, cred_t *);
139static int	nfs4lookupnew_otw(vnode_t *, char *, vnode_t **, cred_t *);
140static int	nfs4mknod(vnode_t *, char *, struct vattr *, enum vcexcl,
141			int, vnode_t **, cred_t *);
142static int	nfs4open_otw(vnode_t *, char *, struct vattr *, vnode_t **,
143			cred_t *, int, int, enum createmode4, int);
144static int	nfs4rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
145			caller_context_t *);
146static int	nfs4rename_persistent_fh(vnode_t *, char *, vnode_t *,
147			vnode_t *, char *, cred_t *, nfsstat4 *);
148static int	nfs4rename_volatile_fh(vnode_t *, char *, vnode_t *,
149			vnode_t *, char *, cred_t *, nfsstat4 *);
150static int	do_nfs4readdir(vnode_t *, rddir4_cache *, cred_t *);
151static void	nfs4readdir(vnode_t *, rddir4_cache *, cred_t *);
152static int	nfs4_bio(struct buf *, stable_how4 *, cred_t *, bool_t);
153static int	nfs4_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
154			page_t *[], size_t, struct seg *, caddr_t,
155			enum seg_rw, cred_t *);
156static void	nfs4_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
157			cred_t *);
158static int	nfs4_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
159			int, cred_t *);
160static int	nfs4_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
161			int, cred_t *);
162static int	nfs4_commit(vnode_t *, offset4, count4, cred_t *);
163static void	nfs4_set_mod(vnode_t *);
164static void	nfs4_get_commit(vnode_t *);
165static void	nfs4_get_commit_range(vnode_t *, u_offset_t, size_t);
166static int	nfs4_putpage_commit(vnode_t *, offset_t, size_t, cred_t *);
167static int	nfs4_commit_vp(vnode_t *, u_offset_t, size_t, cred_t *, int);
168static int	nfs4_sync_commit(vnode_t *, page_t *, offset3, count3,
169			cred_t *);
170static void	do_nfs4_async_commit(vnode_t *, page_t *, offset3, count3,
171			cred_t *);
172static int	nfs4_update_attrcache(nfsstat4, nfs4_ga_res_t *,
173			hrtime_t, vnode_t *, cred_t *);
174static int	nfs4_open_non_reg_file(vnode_t **, int, cred_t *);
175static int	nfs4_safelock(vnode_t *, const struct flock64 *, cred_t *);
176static void	nfs4_register_lock_locally(vnode_t *, struct flock64 *, int,
177			u_offset_t);
178static int	nfs4_lockrelease(vnode_t *, int, offset_t, cred_t *);
179static int	nfs4_block_and_wait(clock_t *, rnode4_t *);
180static cred_t  *state_to_cred(nfs4_open_stream_t *);
181static void	denied_to_flk(LOCK4denied *, flock64_t *, LOCKT4args *);
182static pid_t	lo_to_pid(lock_owner4 *);
183static void	nfs4_reinstitute_local_lock_state(vnode_t *, flock64_t *,
184			cred_t *, nfs4_lock_owner_t *);
185static void	push_reinstate(vnode_t *, int, flock64_t *, cred_t *,
186			nfs4_lock_owner_t *);
187static int	open_and_get_osp(vnode_t *, cred_t *, nfs4_open_stream_t **);
188static void	nfs4_delmap_callback(struct as *, void *, uint_t);
189static void	nfs4_free_delmapcall(nfs4_delmapcall_t *);
190static nfs4_delmapcall_t	*nfs4_init_delmapcall();
191static int	nfs4_find_and_delete_delmapcall(rnode4_t *, int *);
192static int	nfs4_is_acl_mask_valid(uint_t, nfs4_acl_op_t);
193static int	nfs4_create_getsecattr_return(vsecattr_t *, vsecattr_t *,
194			uid_t, gid_t, int);
195
196/*
197 * Routines that implement the setting of v4 args for the misc. ops
198 */
199static void	nfs4args_lock_free(nfs_argop4 *);
200static void	nfs4args_lockt_free(nfs_argop4 *);
201static void	nfs4args_setattr(nfs_argop4 *, vattr_t *, vsecattr_t *,
202			int, rnode4_t *, cred_t *, bitmap4, int *,
203			nfs4_stateid_types_t *);
204static void	nfs4args_setattr_free(nfs_argop4 *);
205static int	nfs4args_verify(nfs_argop4 *, vattr_t *, enum nfs_opnum4,
206			bitmap4);
207static void	nfs4args_verify_free(nfs_argop4 *);
208static void	nfs4args_write(nfs_argop4 *, stable_how4, rnode4_t *, cred_t *,
209			WRITE4args **, nfs4_stateid_types_t *);
210
211/*
212 * These are the vnode ops functions that implement the vnode interface to
213 * the networked file system.  See more comments below at nfs4_vnodeops.
214 */
215static int	nfs4_open(vnode_t **, int, cred_t *, caller_context_t *);
216static int	nfs4_close(vnode_t *, int, int, offset_t, cred_t *,
217			caller_context_t *);
218static int	nfs4_read(vnode_t *, struct uio *, int, cred_t *,
219			caller_context_t *);
220static int	nfs4_write(vnode_t *, struct uio *, int, cred_t *,
221			caller_context_t *);
222static int	nfs4_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *,
223			caller_context_t *);
224static int	nfs4_setattr(vnode_t *, struct vattr *, int, cred_t *,
225			caller_context_t *);
226static int	nfs4_access(vnode_t *, int, int, cred_t *, caller_context_t *);
227static int	nfs4_readlink(vnode_t *, struct uio *, cred_t *,
228			caller_context_t *);
229static int	nfs4_fsync(vnode_t *, int, cred_t *, caller_context_t *);
230static int	nfs4_create(vnode_t *, char *, struct vattr *, enum vcexcl,
231			int, vnode_t **, cred_t *, int, caller_context_t *,
232			vsecattr_t *);
233static int	nfs4_remove(vnode_t *, char *, cred_t *, caller_context_t *,
234			int);
235static int	nfs4_link(vnode_t *, vnode_t *, char *, cred_t *,
236			caller_context_t *, int);
237static int	nfs4_rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
238			caller_context_t *, int);
239static int	nfs4_mkdir(vnode_t *, char *, struct vattr *, vnode_t **,
240			cred_t *, caller_context_t *, int, vsecattr_t *);
241static int	nfs4_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
242			caller_context_t *, int);
243static int	nfs4_symlink(vnode_t *, char *, struct vattr *, char *,
244			cred_t *, caller_context_t *, int);
245static int	nfs4_readdir(vnode_t *, struct uio *, cred_t *, int *,
246			caller_context_t *, int);
247static int	nfs4_seek(vnode_t *, offset_t, offset_t *, caller_context_t *);
248static int	nfs4_getpage(vnode_t *, offset_t, size_t, uint_t *,
249			page_t *[], size_t, struct seg *, caddr_t,
250			enum seg_rw, cred_t *, caller_context_t *);
251static int	nfs4_putpage(vnode_t *, offset_t, size_t, int, cred_t *,
252			caller_context_t *);
253static int	nfs4_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t,
254			uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
255static int	nfs4_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
256			uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
257static int	nfs4_cmp(vnode_t *, vnode_t *, caller_context_t *);
258static int	nfs4_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
259			struct flk_callback *, cred_t *, caller_context_t *);
260static int	nfs4_space(vnode_t *, int, struct flock64 *, int, offset_t,
261			cred_t *, caller_context_t *);
262static int	nfs4_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
263			uint_t, uint_t, uint_t, cred_t *, caller_context_t *);
264static int	nfs4_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
265			cred_t *, caller_context_t *);
266static void	nfs4_dispose(vnode_t *, page_t *, int, int, cred_t *,
267			caller_context_t *);
268static int	nfs4_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
269			caller_context_t *);
270/*
271 * These vnode ops are required to be called from outside this source file,
272 * e.g. by ephemeral mount stub vnode ops, and so may not be declared
273 * as static.
274 */
275int	nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *,
276	    caller_context_t *);
277void	nfs4_inactive(vnode_t *, cred_t *, caller_context_t *);
278int	nfs4_lookup(vnode_t *, char *, vnode_t **,
279	    struct pathname *, int, vnode_t *, cred_t *,
280	    caller_context_t *, int *, pathname_t *);
281int	nfs4_fid(vnode_t *, fid_t *, caller_context_t *);
282int	nfs4_rwlock(vnode_t *, int, caller_context_t *);
283void	nfs4_rwunlock(vnode_t *, int, caller_context_t *);
284int	nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *);
285int	nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *,
286	    caller_context_t *);
287int	nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
288	    caller_context_t *);
289int	nfs4_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
290	    caller_context_t *);
291
/*
 * Used for nfs4_commit_vp() to indicate if we should
 * wait on pending writes.
 */
#define	NFS4_WRITE_NOWAIT	0
#define	NFS4_WRITE_WAIT		1

/* Base wait interval; the unit is seconds (see the trailing note). */
#define	NFS4_BASE_WAIT_TIME 1	/* 1 second */

/*
 * Error flags used to pass information about certain special errors
 * which need to be handled specially.  Both values are negative.
 */
#define	NFS_EOF			-98
#define	NFS_VERF_MISMATCH	-97

/*
 * Flags used to differentiate between which operation drove the
 * potential CLOSE OTW. (see nfs4_close_otw_if_necessary)
 *
 * NOTE(review): the values are 0x1, 0x2 and 0x3, and 0x3 == 0x1 | 0x2, so
 * these look like distinct selectors rather than independent bits --
 * confirm that users compare them by equality.
 */
#define	NFS4_CLOSE_OP		0x1
#define	NFS4_DELMAP_OP		0x2
#define	NFS4_INACTIVE_OP	0x3
315
/*
 * ISVDEV(t): nonzero when the vnode type t is VBLK, VCHR or VFIFO.
 * The argument is parenthesized so that any expression (for example a
 * conditional expression) can be passed safely.  t is evaluated up to
 * three times, so it must be free of side effects.
 */
#define	ISVDEV(t) (((t) == VBLK) || ((t) == VCHR) || ((t) == VFIFO))
317
/*
 * ALIGN64(x, ptr, sz): advance ptr to the next 64-bit boundary and shrink
 * sz by the number of bytes skipped.  x is caller-supplied scratch; on
 * completion it holds the number of bytes skipped (0 when ptr was already
 * aligned).
 *
 * The expansion is wrapped in do { } while (0) so that the macro behaves
 * as a single statement (e.g. as the body of an un-braced if/else), and
 * every argument is parenthesized.  The arguments are evaluated more than
 * once and must be side-effect-free lvalues.
 */
#define	ALIGN64(x, ptr, sz)						\
	do {								\
		(x) = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1);	\
		if (x) {						\
			(x) = sizeof (uint64_t) - (x);			\
			(sz) -= (x);					\
			(ptr) += (x);					\
		}							\
	} while (0)
326
/*
 * Debug-only switches and (judging by their names) counters; they exist
 * only when DEBUG is defined.  The switches are handed to NFS4_DEBUG() to
 * gate its messages, e.g. nfs4_client_state_debug in nfs4_open().
 * Everything not declared static here is visible to other source files.
 */
#ifdef DEBUG
int nfs4_client_attr_debug = 0;
int nfs4_client_state_debug = 0;
int nfs4_client_shadow_debug = 0;
int nfs4_client_lock_debug = 0;
int nfs4_seqid_sync = 0;
int nfs4_client_map_debug = 0;
static int nfs4_pageio_debug = 0;
int nfs4_client_inactive_debug = 0;
int nfs4_client_recov_debug = 0;
int nfs4_client_failover_debug = 0;
int nfs4_client_call_debug = 0;
int nfs4_client_lookup_debug = 0;
int nfs4_client_zone_debug = 0;
int nfs4_lost_rqst_debug = 0;
int nfs4_rdattrerr_debug = 0;
int nfs4_open_stream_debug = 0;

/* NOTE(review): presumably a fault-injection knob for nfs4read() -- confirm */
int nfs4read_error_inject;

static int nfs4_create_misses = 0;

static int nfs4_readdir_cache_shorts = 0;
static int nfs4_readdir_readahead = 0;

static int nfs4_bio_do_stop = 0;

static int nfs4_lostpage = 0;	/* number of times we lost original page */

int nfs4_mmap_debug = 0;

static int nfs4_pathconf_cache_hits = 0;
static int nfs4_pathconf_cache_misses = 0;

int nfs4close_all_cnt;
int nfs4close_one_debug = 0;
int nfs4close_notw_debug = 0;

int denied_to_flk_debug = 0;
void *lockt_denied_debug;

#endif	/* DEBUG */
369
/*
 * How long to wait before trying again if OPEN_CONFIRM gets ETIMEDOUT
 * or NFS4ERR_RESOURCE.  The name indicates that the unit is seconds.
 */
static int confirm_retry_sec = 30;

/*
 * NOTE(review): presumably nonzero enables caching of negative (failed)
 * name lookups; the code consulting it is not in this part of the file.
 */
static int nfs4_lookup_neg_cache = 1;

/*
 * number of pages to read ahead
 * optimized for 100 base-T.
 */
static int nfs4_nra = 4;

/*
 * NOTE(review): presumably nonzero enables caching of symlink contents --
 * confirm at the point of use.
 */
static int nfs4_do_symlink_cache = 1;

/*
 * NOTE(review): presumably nonzero turns the pathconf cache off (the cache
 * is on by default, since this is 0) -- confirm at the point of use.
 */
static int nfs4_pathconf_disable_cache = 0;
387
388/*
389 * These are the vnode ops routines which implement the vnode interface to
390 * the networked file system.  These routines just take their parameters,
391 * make them look networkish by putting the right info into interface structs,
392 * and then calling the appropriate remote routine(s) to do the work.
393 *
 * Note on directory name lookup caching:  If we detect a stale fhandle,
395 * we purge the directory cache relative to that vnode.  This way, the
396 * user won't get burned by the cache repeatedly.  See <nfs/rnode4.h> for
397 * more details on rnode locking.
398 */
399
/*
 * The vnode operations vector for NFSv4 client vnodes; this is the value
 * handed out by nfs4_getvnodeops().  Where it gets filled in is not
 * visible in this part of the file (presumably it is built from the
 * template below -- confirm).
 */
struct vnodeops *nfs4_vnodeops;

/*
 * Table pairing each vnode operation name with the routine in this file
 * that implements it.  Two entries do not use an nfs4_* routine:
 * VOPNAME_DUMP uses nfs_dump and VOPNAME_VNEVENT uses fs_vnevent_support.
 * The table ends with a NULL, NULL pair.
 */
const fs_operation_def_t nfs4_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = nfs4_open },
	VOPNAME_CLOSE,		{ .vop_close = nfs4_close },
	VOPNAME_READ,		{ .vop_read = nfs4_read },
	VOPNAME_WRITE,		{ .vop_write = nfs4_write },
	VOPNAME_IOCTL,		{ .vop_ioctl = nfs4_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = nfs4_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = nfs4_setattr },
	VOPNAME_ACCESS,		{ .vop_access = nfs4_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = nfs4_lookup },
	VOPNAME_CREATE,		{ .vop_create = nfs4_create },
	VOPNAME_REMOVE,		{ .vop_remove = nfs4_remove },
	VOPNAME_LINK,		{ .vop_link = nfs4_link },
	VOPNAME_RENAME,		{ .vop_rename = nfs4_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = nfs4_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = nfs4_rmdir },
	VOPNAME_READDIR,	{ .vop_readdir = nfs4_readdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = nfs4_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = nfs4_readlink },
	VOPNAME_FSYNC,		{ .vop_fsync = nfs4_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = nfs4_inactive },
	VOPNAME_FID,		{ .vop_fid = nfs4_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs4_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs4_rwunlock },
	VOPNAME_SEEK,		{ .vop_seek = nfs4_seek },
	VOPNAME_FRLOCK,		{ .vop_frlock = nfs4_frlock },
	VOPNAME_SPACE,		{ .vop_space = nfs4_space },
	VOPNAME_REALVP,		{ .vop_realvp = nfs4_realvp },
	VOPNAME_GETPAGE,	{ .vop_getpage = nfs4_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = nfs4_putpage },
	VOPNAME_MAP,		{ .vop_map = nfs4_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = nfs4_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = nfs4_delmap },
	/* no separate nfs4_dump */
	VOPNAME_DUMP,		{ .vop_dump = nfs_dump },
	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs4_pathconf },
	VOPNAME_PAGEIO,		{ .vop_pageio = nfs4_pageio },
	VOPNAME_DISPOSE,	{ .vop_dispose = nfs4_dispose },
	VOPNAME_SETSECATTR,	{ .vop_setsecattr = nfs4_setsecattr },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs4_getsecattr },
	VOPNAME_SHRLOCK,	{ .vop_shrlock = nfs4_shrlock },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	/* end-of-table marker */
	NULL,			NULL
};
446
447/*
448 * The following are subroutines and definitions to set args or get res
449 * for the different nfsv4 ops
450 */
451
452void
453nfs4args_lookup_free(nfs_argop4 *argop, int arglen)
454{
455	int		i;
456
457	for (i = 0; i < arglen; i++) {
458		if (argop[i].argop == OP_LOOKUP) {
459			kmem_free(
460			    argop[i].nfs_argop4_u.oplookup.
461			    objname.utf8string_val,
462			    argop[i].nfs_argop4_u.oplookup.
463			    objname.utf8string_len);
464		}
465	}
466}
467
468static void
469nfs4args_lock_free(nfs_argop4 *argop)
470{
471	locker4 *locker = &argop->nfs_argop4_u.oplock.locker;
472
473	if (locker->new_lock_owner == TRUE) {
474		open_to_lock_owner4 *open_owner;
475
476		open_owner = &locker->locker4_u.open_owner;
477		if (open_owner->lock_owner.owner_val != NULL) {
478			kmem_free(open_owner->lock_owner.owner_val,
479			    open_owner->lock_owner.owner_len);
480		}
481	}
482}
483
484static void
485nfs4args_lockt_free(nfs_argop4 *argop)
486{
487	lock_owner4 *lowner = &argop->nfs_argop4_u.oplockt.owner;
488
489	if (lowner->owner_val != NULL) {
490		kmem_free(lowner->owner_val, lowner->owner_len);
491	}
492}
493
494static void
495nfs4args_setattr(nfs_argop4 *argop, vattr_t *vap, vsecattr_t *vsap, int flags,
496    rnode4_t *rp, cred_t *cr, bitmap4 supp, int *error,
497    nfs4_stateid_types_t *sid_types)
498{
499	fattr4		*attr = &argop->nfs_argop4_u.opsetattr.obj_attributes;
500	mntinfo4_t	*mi;
501
502	argop->argop = OP_SETATTR;
503	/*
504	 * The stateid is set to 0 if client is not modifying the size
505	 * and otherwise to whatever nfs4_get_stateid() returns.
506	 *
507	 * XXX Note: nfs4_get_stateid() returns 0 if no lockowner and/or no
508	 * state struct could be found for the process/file pair.  We may
509	 * want to change this in the future (by OPENing the file).  See
510	 * bug # 4474852.
511	 */
512	if (vap->va_mask & AT_SIZE) {
513
514		ASSERT(rp != NULL);
515		mi = VTOMI4(RTOV4(rp));
516
517		argop->nfs_argop4_u.opsetattr.stateid =
518		    nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi,
519		    OP_SETATTR, sid_types, FALSE);
520	} else {
521		bzero(&argop->nfs_argop4_u.opsetattr.stateid,
522		    sizeof (stateid4));
523	}
524
525	*error = vattr_to_fattr4(vap, vsap, attr, flags, OP_SETATTR, supp);
526	if (*error)
527		bzero(attr, sizeof (*attr));
528}
529
/*
 * Counterpart of nfs4args_setattr(): hands the obj_attributes of the
 * opsetattr arguments in argop to nfs4_fattr4_free().
 */
static void
nfs4args_setattr_free(nfs_argop4 *argop)
{
	nfs4_fattr4_free(&argop->nfs_argop4_u.opsetattr.obj_attributes);
}
535
536static int
537nfs4args_verify(nfs_argop4 *argop, vattr_t *vap, enum nfs_opnum4 op,
538    bitmap4 supp)
539{
540	fattr4 *attr;
541	int error = 0;
542
543	argop->argop = op;
544	switch (op) {
545	case OP_VERIFY:
546		attr = &argop->nfs_argop4_u.opverify.obj_attributes;
547		break;
548	case OP_NVERIFY:
549		attr = &argop->nfs_argop4_u.opnverify.obj_attributes;
550		break;
551	default:
552		return (EINVAL);
553	}
554	if (!error)
555		error = vattr_to_fattr4(vap, NULL, attr, 0, op, supp);
556	if (error)
557		bzero(attr, sizeof (*attr));
558	return (error);
559}
560
561static void
562nfs4args_verify_free(nfs_argop4 *argop)
563{
564	switch (argop->argop) {
565	case OP_VERIFY:
566		nfs4_fattr4_free(&argop->nfs_argop4_u.opverify.obj_attributes);
567		break;
568	case OP_NVERIFY:
569		nfs4_fattr4_free(&argop->nfs_argop4_u.opnverify.obj_attributes);
570		break;
571	default:
572		break;
573	}
574}
575
576static void
577nfs4args_write(nfs_argop4 *argop, stable_how4 stable, rnode4_t *rp, cred_t *cr,
578    WRITE4args **wargs_pp, nfs4_stateid_types_t *sid_tp)
579{
580	WRITE4args *wargs = &argop->nfs_argop4_u.opwrite;
581	mntinfo4_t *mi = VTOMI4(RTOV4(rp));
582
583	argop->argop = OP_WRITE;
584	wargs->stable = stable;
585	wargs->stateid = nfs4_get_w_stateid(cr, rp, curproc->p_pidp->pid_id,
586	    mi, OP_WRITE, sid_tp);
587	wargs->mblk = NULL;
588	*wargs_pp = wargs;
589}
590
591void
592nfs4args_copen_free(OPEN4cargs *open_args)
593{
594	if (open_args->owner.owner_val) {
595		kmem_free(open_args->owner.owner_val,
596		    open_args->owner.owner_len);
597	}
598	if ((open_args->opentype == OPEN4_CREATE) &&
599	    (open_args->mode != EXCLUSIVE4)) {
600		nfs4_fattr4_free(&open_args->createhow4_u.createattrs);
601	}
602}
603
/*
 * Return the current value of the nfs4_vnodeops pointer.
 *
 * XXX:  This is referenced in modstubs.s
 */
struct vnodeops *
nfs4_getvnodeops(void)
{
	return (nfs4_vnodeops);
}
612
613/*
614 * The OPEN operation opens a regular file.
615 */
616/*ARGSUSED3*/
617static int
618nfs4_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
619{
620	vnode_t *dvp = NULL;
621	rnode4_t *rp, *drp;
622	int error;
623	int just_been_created;
624	char fn[MAXNAMELEN];
625
626	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4_open: "));
627	if (nfs_zone() != VTOMI4(*vpp)->mi_zone)
628		return (EIO);
629	rp = VTOR4(*vpp);
630
631	/*
632	 * Check to see if opening something besides a regular file;
633	 * if so skip the OTW call
634	 */
635	if ((*vpp)->v_type != VREG) {
636		error = nfs4_open_non_reg_file(vpp, flag, cr);
637		return (error);
638	}
639
640	/*
641	 * XXX - would like a check right here to know if the file is
642	 * executable or not, so as to skip OTW
643	 */
644
645	if ((error = vtodv(*vpp, &dvp, cr, TRUE)) != 0)
646		return (error);
647
648	drp = VTOR4(dvp);
649	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
650		return (EINTR);
651
652	if ((error = vtoname(*vpp, fn, MAXNAMELEN)) != 0) {
653		nfs_rw_exit(&drp->r_rwlock);
654		return (error);
655	}
656
657	/*
658	 * See if this file has just been CREATEd.
659	 * If so, clear the flag and update the dnlc, which was previously
660	 * skipped in nfs4_create.
661	 * XXX need better serilization on this.
662	 * XXX move this into the nf4open_otw call, after we have
663	 * XXX acquired the open owner seqid sync.
664	 */
665	mutex_enter(&rp->r_statev4_lock);
666	if (rp->created_v4) {
667		rp->created_v4 = 0;
668		mutex_exit(&rp->r_statev4_lock);
669
670		dnlc_update(dvp, fn, *vpp);
671		/* This is needed so we don't bump the open ref count */
672		just_been_created = 1;
673	} else {
674		mutex_exit(&rp->r_statev4_lock);
675		just_been_created = 0;
676	}
677
678	/*
679	 * If caller specified O_TRUNC/FTRUNC, then be sure to set
680	 * FWRITE (to drive successful setattr(size=0) after open)
681	 */
682	if (flag & FTRUNC)
683		flag |= FWRITE;
684
685	error = nfs4open_otw(dvp, fn, NULL, vpp, cr, 0, flag, 0,
686	    just_been_created);
687
688	if (!error && !((*vpp)->v_flag & VROOT))
689		dnlc_update(dvp, fn, *vpp);
690
691	nfs_rw_exit(&drp->r_rwlock);
692
693	/* release the hold from vtodv */
694	VN_RELE(dvp);
695
696	/* exchange the shadow for the master vnode, if needed */
697
698	if (error == 0 && IS_SHADOW(*vpp, rp))
699		sv_exchange(vpp);
700
701	return (error);
702}
703
704/*
705 * See if there's a "lost open" request to be saved and recovered.
706 */
707static void
708nfs4open_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
709    nfs4_open_owner_t *oop, cred_t *cr, vnode_t *vp,
710    vnode_t *dvp, OPEN4cargs *open_args)
711{
712	vfs_t *vfsp;
713	char *srccfp;
714
715	vfsp = (dvp ? dvp->v_vfsp : vp->v_vfsp);
716
717	if (error != ETIMEDOUT && error != EINTR &&
718	    !NFS4_FRC_UNMT_ERR(error, vfsp)) {
719		lost_rqstp->lr_op = 0;
720		return;
721	}
722
723	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
724	    "nfs4open_save_lost_rqst: error %d", error));
725
726	lost_rqstp->lr_op = OP_OPEN;
727
728	/*
729	 * The vp (if it is not NULL) and dvp are held and rele'd via
730	 * the recovery code.  See nfs4_save_lost_rqst.
731	 */
732	lost_rqstp->lr_vp = vp;
733	lost_rqstp->lr_dvp = dvp;
734	lost_rqstp->lr_oop = oop;
735	lost_rqstp->lr_osp = NULL;
736	lost_rqstp->lr_lop = NULL;
737	lost_rqstp->lr_cr = cr;
738	lost_rqstp->lr_flk = NULL;
739	lost_rqstp->lr_oacc = open_args->share_access;
740	lost_rqstp->lr_odeny = open_args->share_deny;
741	lost_rqstp->lr_oclaim = open_args->claim;
742	if (open_args->claim == CLAIM_DELEGATE_CUR) {
743		lost_rqstp->lr_ostateid =
744		    open_args->open_claim4_u.delegate_cur_info.delegate_stateid;
745		srccfp = open_args->open_claim4_u.delegate_cur_info.cfile;
746	} else {
747		srccfp = open_args->open_claim4_u.cfile;
748	}
749	lost_rqstp->lr_ofile.utf8string_len = 0;
750	lost_rqstp->lr_ofile.utf8string_val = NULL;
751	(void) str_to_utf8(srccfp, &lost_rqstp->lr_ofile);
752	lost_rqstp->lr_putfirst = FALSE;
753}
754
/*
 * Two 32-bit words that nfs4open_otw() fills in and then copies, as one
 * 64-bit value, into the create verifier of an EXCLUSIVE4 OPEN.
 */
struct nfs4_excl_time {
	uint32 seconds;		/* hostid, or tv_sec if hostid is 0; MSB cleared */
	uint32 nseconds;	/* newnum() value, or tv_nsec if hostid is 0 */
};
759
760/*
761 * The OPEN operation creates and/or opens a regular file
762 *
763 * ARGSUSED
764 */
765static int
766nfs4open_otw(vnode_t *dvp, char *file_name, struct vattr *in_va,
767    vnode_t **vpp, cred_t *cr, int create_flag, int open_flag,
768    enum createmode4 createmode, int file_just_been_created)
769{
770	rnode4_t *rp;
771	rnode4_t *drp = VTOR4(dvp);
772	vnode_t *vp = NULL;
773	vnode_t *vpi = *vpp;
774	bool_t needrecov = FALSE;
775
776	int doqueue = 1;
777
778	COMPOUND4args_clnt args;
779	COMPOUND4res_clnt res;
780	nfs_argop4 *argop;
781	nfs_resop4 *resop;
782	int argoplist_size;
783	int idx_open, idx_fattr;
784
785	GETFH4res *gf_res = NULL;
786	OPEN4res *op_res = NULL;
787	nfs4_ga_res_t *garp;
788	fattr4 *attr = NULL;
789	struct nfs4_excl_time verf;
790	bool_t did_excl_setup = FALSE;
791	int created_osp;
792
793	OPEN4cargs *open_args;
794	nfs4_open_owner_t	*oop = NULL;
795	nfs4_open_stream_t	*osp = NULL;
796	seqid4 seqid = 0;
797	bool_t retry_open = FALSE;
798	nfs4_recov_state_t recov_state;
799	nfs4_lost_rqst_t lost_rqst;
800	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
801	hrtime_t t;
802	int acc = 0;
803	cred_t *cred_otw = NULL;	/* cred used to do the RPC call */
804	cred_t *ncr = NULL;
805
806	nfs4_sharedfh_t *otw_sfh;
807	nfs4_sharedfh_t *orig_sfh;
808	int fh_differs = 0;
809	int numops, setgid_flag;
810	int num_bseqid_retry = NFS4_NUM_RETRY_BAD_SEQID + 1;
811
812	/*
813	 * Make sure we properly deal with setting the right gid on
814	 * a newly created file to reflect the parent's setgid bit
815	 */
816	setgid_flag = 0;
817	if (create_flag && in_va) {
818
819		/*
820		 * If there is grpid mount flag used or
821		 * the parent's directory has the setgid bit set
822		 * _and_ the client was able to get a valid mapping
823		 * for the parent dir's owner_group, we want to
824		 * append NVERIFY(owner_group == dva.va_gid) and
825		 * SETATTR to the CREATE compound.
826		 */
827		mutex_enter(&drp->r_statelock);
828		if ((VTOMI4(dvp)->mi_flags & MI4_GRPID ||
829		    drp->r_attr.va_mode & VSGID) &&
830		    drp->r_attr.va_gid != GID_NOBODY) {
831			in_va->va_mask |= AT_GID;
832			in_va->va_gid = drp->r_attr.va_gid;
833			setgid_flag = 1;
834		}
835		mutex_exit(&drp->r_statelock);
836	}
837
838	/*
839	 * Normal/non-create compound:
840	 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new)
841	 *
842	 * Open(create) compound no setgid:
843	 * PUTFH(dfh) + SAVEFH + OPEN(create) + GETFH + GETATTR(new) +
844	 * RESTOREFH + GETATTR
845	 *
846	 * Open(create) setgid:
847	 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) +
848	 * SAVEFH + PUTFH(dfh) + GETATTR(dvp) + RESTOREFH +
849	 * NVERIFY(grp) + SETATTR
850	 */
851	if (setgid_flag) {
852		numops = 10;
853		idx_open = 1;
854		idx_fattr = 3;
855	} else if (create_flag) {
856		numops = 7;
857		idx_open = 2;
858		idx_fattr = 4;
859	} else {
860		numops = 4;
861		idx_open = 1;
862		idx_fattr = 3;
863	}
864
865	args.array_len = numops;
866	argoplist_size = numops * sizeof (nfs_argop4);
867	argop = kmem_alloc(argoplist_size, KM_SLEEP);
868
869	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw: "
870	    "open %s open flag 0x%x cred %p", file_name, open_flag,
871	    (void *)cr));
872
873	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
874	if (create_flag) {
875		/*
876		 * We are to create a file.  Initialize the passed in vnode
877		 * pointer.
878		 */
879		vpi = NULL;
880	} else {
881		/*
882		 * Check to see if the client owns a read delegation and is
883		 * trying to open for write.  If so, then return the delegation
884		 * to avoid the server doing a cb_recall and returning DELAY.
885		 * NB - we don't use the statev4_lock here because we'd have
886		 * to drop the lock anyway and the result would be stale.
887		 */
888		if ((open_flag & FWRITE) &&
889		    VTOR4(vpi)->r_deleg_type == OPEN_DELEGATE_READ)
890			(void) nfs4delegreturn(VTOR4(vpi), NFS4_DR_REOPEN);
891
892		/*
893		 * If the file has a delegation, then do an access check up
		 * front.  This avoids having to do an access check later after
895		 * we've already done start_op, which could deadlock.
896		 */
897		if (VTOR4(vpi)->r_deleg_type != OPEN_DELEGATE_NONE) {
898			if (open_flag & FREAD &&
899			    nfs4_access(vpi, VREAD, 0, cr, NULL) == 0)
900				acc |= VREAD;
901			if (open_flag & FWRITE &&
902			    nfs4_access(vpi, VWRITE, 0, cr, NULL) == 0)
903				acc |= VWRITE;
904		}
905	}
906
907	drp = VTOR4(dvp);
908
909	recov_state.rs_flags = 0;
910	recov_state.rs_num_retry_despite_err = 0;
911	cred_otw = cr;
912
913recov_retry:
914	fh_differs = 0;
915	nfs4_error_zinit(&e);
916
917	e.error = nfs4_start_op(VTOMI4(dvp), dvp, vpi, &recov_state);
918	if (e.error) {
919		if (ncr != NULL)
920			crfree(ncr);
921		kmem_free(argop, argoplist_size);
922		return (e.error);
923	}
924
925	args.ctag = TAG_OPEN;
926	args.array_len = numops;
927	args.array = argop;
928
929	/* putfh directory fh */
930	argop[0].argop = OP_CPUTFH;
931	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
932
933	/* OPEN: either op 1 or op 2 depending upon create/setgid flags */
934	argop[idx_open].argop = OP_COPEN;
935	open_args = &argop[idx_open].nfs_argop4_u.opcopen;
936	open_args->claim = CLAIM_NULL;
937
938	/* name of file */
939	open_args->open_claim4_u.cfile = file_name;
940	open_args->owner.owner_len = 0;
941	open_args->owner.owner_val = NULL;
942
943	if (create_flag) {
944		/* CREATE a file */
945		open_args->opentype = OPEN4_CREATE;
946		open_args->mode = createmode;
947		if (createmode == EXCLUSIVE4) {
948			if (did_excl_setup == FALSE) {
949				verf.seconds = zone_get_hostid(NULL);
950				if (verf.seconds != 0)
951					verf.nseconds = newnum();
952				else {
953					timestruc_t now;
954
955					gethrestime(&now);
956					verf.seconds = now.tv_sec;
957					verf.nseconds = now.tv_nsec;
958				}
959				/*
960				 * Since the server will use this value for the
961				 * mtime, make sure that it can't overflow. Zero
962				 * out the MSB. The actual value does not matter
				 * here, only its uniqueness.
964				 */
965				verf.seconds &= INT32_MAX;
966				did_excl_setup = TRUE;
967			}
968
969			/* Now copy over verifier to OPEN4args. */
970			open_args->createhow4_u.createverf = *(uint64_t *)&verf;
971		} else {
972			int v_error;
973			bitmap4 supp_attrs;
974			servinfo4_t *svp;
975
976			attr = &open_args->createhow4_u.createattrs;
977
978			svp = drp->r_server;
979			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
980			supp_attrs = svp->sv_supp_attrs;
981			nfs_rw_exit(&svp->sv_lock);
982
983			/* GUARDED4 or UNCHECKED4 */
984			v_error = vattr_to_fattr4(in_va, NULL, attr, 0, OP_OPEN,
985			    supp_attrs);
986			if (v_error) {
987				bzero(attr, sizeof (*attr));
988				nfs4args_copen_free(open_args);
989				nfs4_end_op(VTOMI4(dvp), dvp, vpi,
990				    &recov_state, FALSE);
991				if (ncr != NULL)
992					crfree(ncr);
993				kmem_free(argop, argoplist_size);
994				return (v_error);
995			}
996		}
997	} else {
998		/* NO CREATE */
999		open_args->opentype = OPEN4_NOCREATE;
1000	}
1001
1002	if (recov_state.rs_sp != NULL) {
1003		mutex_enter(&recov_state.rs_sp->s_lock);
1004		open_args->owner.clientid = recov_state.rs_sp->clientid;
1005		mutex_exit(&recov_state.rs_sp->s_lock);
1006	} else {
1007		/* XXX should we just fail here? */
1008		open_args->owner.clientid = 0;
1009	}
1010
1011	/*
1012	 * This increments oop's ref count or creates a temporary 'just_created'
1013	 * open owner that will become valid when this OPEN/OPEN_CONFIRM call
1014	 * completes.
1015	 */
1016	mutex_enter(&VTOMI4(dvp)->mi_lock);
1017
1018	/* See if a permanent or just created open owner exists */
1019	oop = find_open_owner_nolock(cr, NFS4_JUST_CREATED, VTOMI4(dvp));
1020	if (!oop) {
1021		/*
1022		 * This open owner does not exist so create a temporary
1023		 * just created one.
1024		 */
1025		oop = create_open_owner(cr, VTOMI4(dvp));
1026		ASSERT(oop != NULL);
1027	}
1028	mutex_exit(&VTOMI4(dvp)->mi_lock);
1029
1030	/* this length never changes, do alloc before seqid sync */
1031	open_args->owner.owner_len = sizeof (oop->oo_name);
1032	open_args->owner.owner_val =
1033	    kmem_alloc(open_args->owner.owner_len, KM_SLEEP);
1034
1035	e.error = nfs4_start_open_seqid_sync(oop, VTOMI4(dvp));
1036	if (e.error == EAGAIN) {
1037		open_owner_rele(oop);
1038		nfs4args_copen_free(open_args);
1039		nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE);
1040		if (ncr != NULL) {
1041			crfree(ncr);
1042			ncr = NULL;
1043		}
1044		goto recov_retry;
1045	}
1046
1047	/* Check to see if we need to do the OTW call */
1048	if (!create_flag) {
1049		if (!nfs4_is_otw_open_necessary(oop, open_flag, vpi,
1050		    file_just_been_created, &e.error, acc, &recov_state)) {
1051
1052			/*
1053			 * The OTW open is not necessary.  Either
1054			 * the open can succeed without it (eg.
1055			 * delegation, error == 0) or the open
1056			 * must fail due to an access failure
1057			 * (error != 0).  In either case, tidy
1058			 * up and return.
1059			 */
1060
1061			nfs4_end_open_seqid_sync(oop);
1062			open_owner_rele(oop);
1063			nfs4args_copen_free(open_args);
1064			nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, FALSE);
1065			if (ncr != NULL)
1066				crfree(ncr);
1067			kmem_free(argop, argoplist_size);
1068			return (e.error);
1069		}
1070	}
1071
1072	bcopy(&oop->oo_name, open_args->owner.owner_val,
1073	    open_args->owner.owner_len);
1074
1075	seqid = nfs4_get_open_seqid(oop) + 1;
1076	open_args->seqid = seqid;
1077	open_args->share_access = 0;
1078	if (open_flag & FREAD)
1079		open_args->share_access |= OPEN4_SHARE_ACCESS_READ;
1080	if (open_flag & FWRITE)
1081		open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE;
1082	open_args->share_deny = OPEN4_SHARE_DENY_NONE;
1083
1084
1085
1086	/*
1087	 * getfh w/sanity check for idx_open/idx_fattr
1088	 */
1089	ASSERT((idx_open + 1) == (idx_fattr - 1));
1090	argop[idx_open + 1].argop = OP_GETFH;
1091
1092	/* getattr */
1093	argop[idx_fattr].argop = OP_GETATTR;
1094	argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1095	argop[idx_fattr].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1096
1097	if (setgid_flag) {
1098		vattr_t	_v;
1099		servinfo4_t *svp;
1100		bitmap4	supp_attrs;
1101
1102		svp = drp->r_server;
1103		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1104		supp_attrs = svp->sv_supp_attrs;
1105		nfs_rw_exit(&svp->sv_lock);
1106
1107		/*
1108		 * For setgid case, we need to:
1109		 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new)
1110		 */
1111		argop[4].argop = OP_SAVEFH;
1112
1113		argop[5].argop = OP_CPUTFH;
1114		argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
1115
1116		argop[6].argop = OP_GETATTR;
1117		argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1118		argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1119
1120		argop[7].argop = OP_RESTOREFH;
1121
1122		/*
1123		 * nverify
1124		 */
1125		_v.va_mask = AT_GID;
1126		_v.va_gid = in_va->va_gid;
1127		if (!(e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY,
1128		    supp_attrs))) {
1129
1130			/*
1131			 * setattr
1132			 *
1133			 * We _know_ we're not messing with AT_SIZE or
1134			 * AT_XTIME, so no need for stateid or flags.
1135			 * Also we specify NULL rp since we're only
1136			 * interested in setting owner_group attributes.
1137			 */
1138			nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr,
1139			    supp_attrs, &e.error, 0);
1140			if (e.error)
1141				nfs4args_verify_free(&argop[8]);
1142		}
1143
1144		if (e.error) {
1145			/*
1146			 * XXX - Revisit the last argument to nfs4_end_op()
1147			 *	 once 5020486 is fixed.
1148			 */
1149			nfs4_end_open_seqid_sync(oop);
1150			open_owner_rele(oop);
1151			nfs4args_copen_free(open_args);
1152			nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE);
1153			if (ncr != NULL)
1154				crfree(ncr);
1155			kmem_free(argop, argoplist_size);
1156			return (e.error);
1157		}
1158	} else if (create_flag) {
1159		argop[1].argop = OP_SAVEFH;
1160
1161		argop[5].argop = OP_RESTOREFH;
1162
1163		argop[6].argop = OP_GETATTR;
1164		argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1165		argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1166	}
1167
1168	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1169	    "nfs4open_otw: %s call, nm %s, rp %s",
1170	    needrecov ? "recov" : "first", file_name,
1171	    rnode4info(VTOR4(dvp))));
1172
1173	t = gethrtime();
1174
1175	rfs4call(VTOMI4(dvp), &args, &res, cred_otw, &doqueue, 0, &e);
1176
1177	if (!e.error && nfs4_need_to_bump_seqid(&res))
1178		nfs4_set_open_seqid(seqid, oop, args.ctag);
1179
1180	needrecov = nfs4_needs_recovery(&e, TRUE, dvp->v_vfsp);
1181
1182	if (e.error || needrecov) {
1183		bool_t abort = FALSE;
1184
1185		if (needrecov) {
1186			nfs4_bseqid_entry_t *bsep = NULL;
1187
1188			nfs4open_save_lost_rqst(e.error, &lost_rqst, oop,
1189			    cred_otw, vpi, dvp, open_args);
1190
1191			if (!e.error && res.status == NFS4ERR_BAD_SEQID) {
1192				bsep = nfs4_create_bseqid_entry(oop, NULL,
1193				    vpi, 0, args.ctag, open_args->seqid);
1194				num_bseqid_retry--;
1195			}
1196
1197			abort = nfs4_start_recovery(&e, VTOMI4(dvp), dvp, vpi,
1198			    NULL, lost_rqst.lr_op == OP_OPEN ?
1199			    &lost_rqst : NULL, OP_OPEN, bsep, NULL, NULL);
1200
1201			if (bsep)
1202				kmem_free(bsep, sizeof (*bsep));
1203			/* give up if we keep getting BAD_SEQID */
1204			if (num_bseqid_retry == 0)
1205				abort = TRUE;
1206			if (abort == TRUE && e.error == 0)
1207				e.error = geterrno4(res.status);
1208		}
1209		nfs4_end_open_seqid_sync(oop);
1210		open_owner_rele(oop);
1211		nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1212		nfs4args_copen_free(open_args);
1213		if (setgid_flag) {
1214			nfs4args_verify_free(&argop[8]);
1215			nfs4args_setattr_free(&argop[9]);
1216		}
1217		if (!e.error)
1218			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1219		if (ncr != NULL) {
1220			crfree(ncr);
1221			ncr = NULL;
1222		}
1223		if (!needrecov || abort == TRUE || e.error == EINTR ||
1224		    NFS4_FRC_UNMT_ERR(e.error, dvp->v_vfsp)) {
1225			kmem_free(argop, argoplist_size);
1226			return (e.error);
1227		}
1228		goto recov_retry;
1229	}
1230
1231	/*
1232	 * Will check and update lease after checking the rflag for
1233	 * OPEN_CONFIRM in the successful OPEN call.
1234	 */
1235	if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) {
1236
1237		/*
1238		 * XXX what if we're crossing mount points from server1:/drp
1239		 * to server2:/drp/rp.
1240		 */
1241
1242		/* Signal our end of use of the open seqid */
1243		nfs4_end_open_seqid_sync(oop);
1244
1245		/*
1246		 * This will destroy the open owner if it was just created,
1247		 * and no one else has put a reference on it.
1248		 */
1249		open_owner_rele(oop);
1250		if (create_flag && (createmode != EXCLUSIVE4) &&
1251		    res.status == NFS4ERR_BADOWNER)
1252			nfs4_log_badowner(VTOMI4(dvp), OP_OPEN);
1253
1254		e.error = geterrno4(res.status);
1255		nfs4args_copen_free(open_args);
1256		if (setgid_flag) {
1257			nfs4args_verify_free(&argop[8]);
1258			nfs4args_setattr_free(&argop[9]);
1259		}
1260		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1261		nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1262		/*
1263		 * If the reply is NFS4ERR_ACCESS, it may be because
1264		 * we are root (no root net access).  If the real uid
1265		 * is not root, then retry with the real uid instead.
1266		 */
1267		if (ncr != NULL) {
1268			crfree(ncr);
1269			ncr = NULL;
1270		}
1271		if (res.status == NFS4ERR_ACCESS &&
1272		    (ncr = crnetadjust(cred_otw)) != NULL) {
1273			cred_otw = ncr;
1274			goto recov_retry;
1275		}
1276		kmem_free(argop, argoplist_size);
1277		return (e.error);
1278	}
1279
1280	resop = &res.array[idx_open];  /* open res */
1281	op_res = &resop->nfs_resop4_u.opopen;
1282
1283#ifdef DEBUG
1284	/*
1285	 * verify attrset bitmap
1286	 */
1287	if (create_flag &&
1288	    (createmode == UNCHECKED4 || createmode == GUARDED4)) {
1289		/* make sure attrset returned is what we asked for */
1290		/* XXX Ignore this 'error' for now */
1291		if (attr->attrmask != op_res->attrset)
1292			/* EMPTY */;
1293	}
1294#endif
1295
1296	if (op_res->rflags & OPEN4_RESULT_LOCKTYPE_POSIX) {
1297		mutex_enter(&VTOMI4(dvp)->mi_lock);
1298		VTOMI4(dvp)->mi_flags |= MI4_POSIX_LOCK;
1299		mutex_exit(&VTOMI4(dvp)->mi_lock);
1300	}
1301
1302	resop = &res.array[idx_open + 1];  /* getfh res */
1303	gf_res = &resop->nfs_resop4_u.opgetfh;
1304
1305	otw_sfh = sfh4_get(&gf_res->object, VTOMI4(dvp));
1306
1307	/*
1308	 * The open stateid has been updated on the server but not
1309	 * on the client yet.  There is a path: makenfs4node->nfs4_attr_cache->
1310	 * flush_pages->VOP_PUTPAGE->...->nfs4write where we will issue an OTW
1311	 * WRITE call.  That, however, will use the old stateid, so go ahead
	 * and update the open stateid now, before any call to makenfs4node.
1313	 */
1314	if (vpi) {
1315		nfs4_open_stream_t	*tmp_osp;
1316		rnode4_t		*tmp_rp = VTOR4(vpi);
1317
1318		tmp_osp = find_open_stream(oop, tmp_rp);
1319		if (tmp_osp) {
1320			tmp_osp->open_stateid = op_res->stateid;
1321			mutex_exit(&tmp_osp->os_sync_lock);
1322			open_stream_rele(tmp_osp, tmp_rp);
1323		}
1324
1325		/*
1326		 * We must determine if the file handle given by the otw open
1327		 * is the same as the file handle which was passed in with
1328		 * *vpp.  This case can be reached if the file we are trying
1329		 * to open has been removed and another file has been created
1330		 * having the same file name.  The passed in vnode is released
1331		 * later.
1332		 */
1333		orig_sfh = VTOR4(vpi)->r_fh;
1334		fh_differs = nfs4cmpfh(&orig_sfh->sfh_fh, &otw_sfh->sfh_fh);
1335	}
1336
1337	garp = &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res;
1338
1339	if (create_flag || fh_differs) {
1340		int rnode_err = 0;
1341
1342		vp = makenfs4node(otw_sfh, garp, dvp->v_vfsp, t, cr,
1343		    dvp, fn_get(VTOSV(dvp)->sv_name, file_name, otw_sfh));
1344
1345		if (e.error)
1346			PURGE_ATTRCACHE4(vp);
1347		/*
1348		 * For the newly created vp case, make sure the rnode
1349		 * isn't bad before using it.
1350		 */
1351		mutex_enter(&(VTOR4(vp))->r_statelock);
1352		if (VTOR4(vp)->r_flags & R4RECOVERR)
1353			rnode_err = EIO;
1354		mutex_exit(&(VTOR4(vp))->r_statelock);
1355
1356		if (rnode_err) {
1357			nfs4_end_open_seqid_sync(oop);
1358			nfs4args_copen_free(open_args);
1359			if (setgid_flag) {
1360				nfs4args_verify_free(&argop[8]);
1361				nfs4args_setattr_free(&argop[9]);
1362			}
1363			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1364			nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state,
1365			    needrecov);
1366			open_owner_rele(oop);
1367			VN_RELE(vp);
1368			if (ncr != NULL)
1369				crfree(ncr);
1370			sfh4_rele(&otw_sfh);
1371			kmem_free(argop, argoplist_size);
1372			return (EIO);
1373		}
1374	} else {
1375		vp = vpi;
1376	}
1377	sfh4_rele(&otw_sfh);
1378
1379	/*
1380	 * It seems odd to get a full set of attrs and then not update
1381	 * the object's attrcache in the non-create case.  Create case uses
1382	 * the attrs since makenfs4node checks to see if the attrs need to
1383	 * be updated (and then updates them).  The non-create case should
1384	 * update attrs also.
1385	 */
1386	if (! create_flag && ! fh_differs && !e.error) {
1387		nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
1388	}
1389
1390	nfs4_error_zinit(&e);
1391	if (op_res->rflags & OPEN4_RESULT_CONFIRM) {
1392		/* This does not do recovery for vp explicitly. */
1393		nfs4open_confirm(vp, &seqid, &op_res->stateid, cred_otw, FALSE,
1394		    &retry_open, oop, FALSE, &e, &num_bseqid_retry);
1395
1396		if (e.error || e.stat) {
1397			nfs4_end_open_seqid_sync(oop);
1398			nfs4args_copen_free(open_args);
1399			if (setgid_flag) {
1400				nfs4args_verify_free(&argop[8]);
1401				nfs4args_setattr_free(&argop[9]);
1402			}
1403			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1404			nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state,
1405			    needrecov);
1406			open_owner_rele(oop);
1407			if (create_flag || fh_differs) {
1408				/* rele the makenfs4node */
1409				VN_RELE(vp);
1410			}
1411			if (ncr != NULL) {
1412				crfree(ncr);
1413				ncr = NULL;
1414			}
1415			if (retry_open == TRUE) {
1416				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1417				    "nfs4open_otw: retry the open since OPEN "
1418				    "CONFIRM failed with error %d stat %d",
1419				    e.error, e.stat));
1420				if (create_flag && createmode == GUARDED4) {
1421					NFS4_DEBUG(nfs4_client_recov_debug,
1422					    (CE_NOTE, "nfs4open_otw: switch "
1423					    "createmode from GUARDED4 to "
1424					    "UNCHECKED4"));
1425					createmode = UNCHECKED4;
1426				}
1427				goto recov_retry;
1428			}
1429			if (!e.error) {
1430				if (create_flag && (createmode != EXCLUSIVE4) &&
1431				    e.stat == NFS4ERR_BADOWNER)
1432					nfs4_log_badowner(VTOMI4(dvp), OP_OPEN);
1433
1434				e.error = geterrno4(e.stat);
1435			}
1436			kmem_free(argop, argoplist_size);
1437			return (e.error);
1438		}
1439	}
1440
1441	rp = VTOR4(vp);
1442
1443	mutex_enter(&rp->r_statev4_lock);
1444	if (create_flag)
1445		rp->created_v4 = 1;
1446	mutex_exit(&rp->r_statev4_lock);
1447
1448	mutex_enter(&oop->oo_lock);
1449	/* Doesn't matter if 'oo_just_created' already was set as this */
1450	oop->oo_just_created = NFS4_PERM_CREATED;
1451	if (oop->oo_cred_otw)
1452		crfree(oop->oo_cred_otw);
1453	oop->oo_cred_otw = cred_otw;
1454	crhold(oop->oo_cred_otw);
1455	mutex_exit(&oop->oo_lock);
1456
1457	/* returns with 'os_sync_lock' held */
1458	osp = find_or_create_open_stream(oop, rp, &created_osp);
1459	if (!osp) {
1460		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
1461		    "nfs4open_otw: failed to create an open stream"));
1462		NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE, "nfs4open_otw: "
1463		    "signal our end of use of the open seqid"));
1464
1465		nfs4_end_open_seqid_sync(oop);
1466		open_owner_rele(oop);
1467		nfs4args_copen_free(open_args);
1468		if (setgid_flag) {
1469			nfs4args_verify_free(&argop[8]);
1470			nfs4args_setattr_free(&argop[9]);
1471		}
1472		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1473		nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1474		if (create_flag || fh_differs)
1475			VN_RELE(vp);
1476		if (ncr != NULL)
1477			crfree(ncr);
1478
1479		kmem_free(argop, argoplist_size);
1480		return (EINVAL);
1481
1482	}
1483
1484	osp->open_stateid = op_res->stateid;
1485
1486	if (open_flag & FREAD)
1487		osp->os_share_acc_read++;
1488	if (open_flag & FWRITE)
1489		osp->os_share_acc_write++;
1490	osp->os_share_deny_none++;
1491
1492	/*
1493	 * Need to reset this bitfield for the possible case where we were
1494	 * going to OTW CLOSE the file, got a non-recoverable error, and before
1495	 * we could retry the CLOSE, OPENed the file again.
1496	 */
1497	ASSERT(osp->os_open_owner->oo_seqid_inuse);
1498	osp->os_final_close = 0;
1499	osp->os_force_close = 0;
1500#ifdef DEBUG
1501	if (osp->os_failed_reopen)
1502		NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, "nfs4open_otw:"
1503		    " clearing os_failed_reopen for osp %p, cr %p, rp %s",
1504		    (void *)osp, (void *)cr, rnode4info(rp)));
1505#endif
1506	osp->os_failed_reopen = 0;
1507
1508	mutex_exit(&osp->os_sync_lock);
1509
1510	nfs4_end_open_seqid_sync(oop);
1511
1512	if (created_osp && recov_state.rs_sp != NULL) {
1513		mutex_enter(&recov_state.rs_sp->s_lock);
1514		nfs4_inc_state_ref_count_nolock(recov_state.rs_sp, VTOMI4(dvp));
1515		mutex_exit(&recov_state.rs_sp->s_lock);
1516	}
1517
1518	/* get rid of our reference to find oop */
1519	open_owner_rele(oop);
1520
1521	open_stream_rele(osp, rp);
1522
1523	/* accept delegation, if any */
1524	nfs4_delegation_accept(rp, CLAIM_NULL, op_res, garp, cred_otw);
1525
1526	nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1527
1528	if (createmode == EXCLUSIVE4 &&
1529	    (in_va->va_mask & ~(AT_GID | AT_SIZE))) {
1530		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw:"
1531		    " EXCLUSIVE4: sending a SETATTR"));
1532		/*
1533		 * If doing an exclusive create, then generate
1534		 * a SETATTR to set the initial attributes.
1535		 * Try to set the mtime and the atime to the
1536		 * server's current time.  It is somewhat
1537		 * expected that these fields will be used to
1538		 * store the exclusive create cookie.  If not,
1539		 * server implementors will need to know that
1540		 * a SETATTR will follow an exclusive create
1541		 * and the cookie should be destroyed if
1542		 * appropriate.
1543		 *
1544		 * The AT_GID and AT_SIZE bits are turned off
1545		 * so that the SETATTR request will not attempt
1546		 * to process these.  The gid will be set
1547		 * separately if appropriate.  The size is turned
1548		 * off because it is assumed that a new file will
1549		 * be created empty and if the file wasn't empty,
1550		 * then the exclusive create will have failed
1551		 * because the file must have existed already.
1552		 * Therefore, no truncate operation is needed.
1553		 */
1554		in_va->va_mask &= ~(AT_GID | AT_SIZE);
1555		in_va->va_mask |= (AT_MTIME | AT_ATIME);
1556
1557		e.error = nfs4setattr(vp, in_va, 0, cr, NULL);
1558		if (e.error) {
1559			nfs4_error_t err;
1560
1561			/*
1562			 * Couldn't correct the attributes of
1563			 * the newly created file and the
1564			 * attributes are wrong.  Remove the
1565			 * file and return an error to the
1566			 * application.
1567			 */
1568			/* XXX will this take care of client state ? */
1569			NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
1570			    "nfs4open_otw: EXCLUSIVE4: error %d on SETATTR:"
1571			    " remove file", e.error));
1572
1573			/*
1574			 * The file is currently open so try to close it first.
1575			 *
1576			 * If we do not close the file explicitly here then the
1577			 * VN_RELE() would do an (implicit and asynchronous)
1578			 * close for us.  But such async close could race with
1579			 * the nfs4_remove() below.  If the async close is
1580			 * slower than nfs4_remove() then nfs4_remove()
1581			 * wouldn't remove the file but rename it to .nfsXXXX
1582			 * instead.
1583			 */
1584			nfs4close_one(vp, NULL, cr, open_flag, NULL, &err,
1585			    CLOSE_NORM, 0, 0, 0);
1586			VN_RELE(vp);
1587			(void) nfs4_remove(dvp, file_name, cr, NULL, 0);
1588
1589			/*
1590			 * Since we've reled the vnode and removed
1591			 * the file we now need to return the error.
1592			 * At this point we don't want to update the
1593			 * dircaches, call nfs4_waitfor_purge_complete
1594			 * or set vpp to vp so we need to skip these
1595			 * as well.
1596			 */
1597			goto skip_update_dircaches;
1598		}
1599	}
1600
1601	/*
1602	 * If we created or found the correct vnode, due to create_flag or
1603	 * fh_differs being set, then update directory cache attribute, readdir
1604	 * and dnlc caches.
1605	 */
1606	if (create_flag || fh_differs) {
1607		dirattr_info_t dinfo, *dinfop;
1608
		/*
		 * Make sure getattr succeeded before using results.
		 * Note: res.array[6] (the seventh op in the compound) is
		 * getattr(dir) for both flavors of open(create).
		 */
1614		if (create_flag && res.status == NFS4_OK) {
1615			dinfo.di_time_call = t;
1616			dinfo.di_cred = cr;
1617			dinfo.di_garp =
1618			    &res.array[6].nfs_resop4_u.opgetattr.ga_res;
1619			dinfop = &dinfo;
1620		} else {
1621			dinfop = NULL;
1622		}
1623
1624		nfs4_update_dircaches(&op_res->cinfo, dvp, vp, file_name,
1625		    dinfop);
1626	}
1627
1628	/*
1629	 * If the page cache for this file was flushed from actions
1630	 * above, it was done asynchronously and if that is true,
1631	 * there is a need to wait here for it to complete.  This must
1632	 * be done outside of start_fop/end_fop.
1633	 */
1634	(void) nfs4_waitfor_purge_complete(vp);
1635
1636	/*
1637	 * It is implicit that we are in the open case (create_flag == 0) since
1638	 * fh_differs can only be set to a non-zero value in the open case.
1639	 */
1640	if (fh_differs != 0 && vpi != NULL)
1641		VN_RELE(vpi);
1642
1643	/*
1644	 * Be sure to set *vpp to the correct value before returning.
1645	 */
1646	*vpp = vp;
1647
1648skip_update_dircaches:
1649
1650	nfs4args_copen_free(open_args);
1651	if (setgid_flag) {
1652		nfs4args_verify_free(&argop[8]);
1653		nfs4args_setattr_free(&argop[9]);
1654	}
1655	xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1656
1657	if (ncr)
1658		crfree(ncr);
1659	kmem_free(argop, argoplist_size);
1660	return (e.error);
1661}
1662
1663/*
1664 * Reopen an open instance.  cf. nfs4open_otw().
1665 *
1666 * Errors are returned by the nfs4_error_t parameter.
1667 * - ep->error contains an errno value or zero.
1668 * - if it is zero, ep->stat is set to an NFS status code, if any.
1669 *   If the file could not be reopened, but the caller should continue, the
1670 *   file is marked dead and no error values are returned.  If the caller
1671 *   should stop recovering open files and start over, either the ep->error
1672 *   value or ep->stat will indicate an error (either something that requires
1673 *   recovery or EAGAIN).  Note that some recovery (e.g., expired volatile
1674 *   filehandles) may be handled silently by this routine.
1675 * - if it is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, recovery for lost state
1676 *   will be started, so the caller should not do it.
1677 *
 * Gotos:
 * - kill_file : the reopen failed in a way that requires marking the file
 *   dead and setting the open stream's 'os_failed_reopen' to 1.  This is
 *   for cases where recovery is not possible.
 * - failed_reopen : same as above, except that the file has already been
 *   marked dead, so there is no need to do it again.
 * - bailout : the reopen failed but we are able to recover and retry it,
 *   either within this function immediately or via the calling function.
1686 */
1687
1688void
1689nfs4_reopen(vnode_t *vp, nfs4_open_stream_t *osp, nfs4_error_t *ep,
1690    open_claim_type4 claim, bool_t frc_use_claim_previous,
1691    bool_t is_recov)
1692{
1693	COMPOUND4args_clnt args;
1694	COMPOUND4res_clnt res;
1695	nfs_argop4 argop[4];
1696	nfs_resop4 *resop;
1697	OPEN4res *op_res = NULL;
1698	OPEN4cargs *open_args;
1699	GETFH4res *gf_res;
1700	rnode4_t *rp = VTOR4(vp);
1701	int doqueue = 1;
1702	cred_t *cr = NULL, *cred_otw = NULL;
1703	nfs4_open_owner_t *oop = NULL;
1704	seqid4 seqid;
1705	nfs4_ga_res_t *garp;
1706	char fn[MAXNAMELEN];
1707	nfs4_recov_state_t recov = {NULL, 0};
1708	nfs4_lost_rqst_t lost_rqst;
1709	mntinfo4_t *mi = VTOMI4(vp);
1710	bool_t abort;
1711	char *failed_msg = "";
1712	int fh_different;
1713	hrtime_t t;
1714	nfs4_bseqid_entry_t *bsep = NULL;
1715
1716	ASSERT(nfs4_consistent_type(vp));
1717	ASSERT(nfs_zone() == mi->mi_zone);
1718
1719	nfs4_error_zinit(ep);
1720
1721	/* this is the cred used to find the open owner */
1722	cr = state_to_cred(osp);
1723	if (cr == NULL) {
1724		failed_msg = "Couldn't reopen: no cred";
1725		goto kill_file;
1726	}
1727	/* use this cred for OTW operations */
1728	cred_otw = nfs4_get_otw_cred(cr, mi, osp->os_open_owner);
1729
1730top:
1731	nfs4_error_zinit(ep);
1732
1733	if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) {
1734		/* File system has been unmounted, quit */
1735		ep->error = EIO;
1736		failed_msg = "Couldn't reopen: file system has been unmounted";
1737		goto kill_file;
1738	}
1739
1740	oop = osp->os_open_owner;
1741
1742	ASSERT(oop != NULL);
1743	if (oop == NULL) {	/* be defensive in non-DEBUG */
1744		failed_msg = "can't reopen: no open owner";
1745		goto kill_file;
1746	}
1747	open_owner_hold(oop);
1748
1749	ep->error = nfs4_start_open_seqid_sync(oop, mi);
1750	if (ep->error) {
1751		open_owner_rele(oop);
1752		oop = NULL;
1753		goto bailout;
1754	}
1755
1756	/*
1757	 * If the rnode has a delegation and the delegation has been
1758	 * recovered and the server didn't request a recall and the caller
1759	 * didn't specifically ask for CLAIM_PREVIOUS (nfs4frlock during
1760	 * recovery) and the rnode hasn't been marked dead, then install
1761	 * the delegation stateid in the open stream.  Otherwise, proceed
1762	 * with a CLAIM_PREVIOUS or CLAIM_NULL OPEN.
1763	 */
1764	mutex_enter(&rp->r_statev4_lock);
1765	if (rp->r_deleg_type != OPEN_DELEGATE_NONE &&
1766	    !rp->r_deleg_return_pending &&
1767	    (rp->r_deleg_needs_recovery == OPEN_DELEGATE_NONE) &&
1768	    !rp->r_deleg_needs_recall &&
1769	    claim != CLAIM_DELEGATE_CUR && !frc_use_claim_previous &&
1770	    !(rp->r_flags & R4RECOVERR)) {
1771		mutex_enter(&osp->os_sync_lock);
1772		osp->os_delegation = 1;
1773		osp->open_stateid = rp->r_deleg_stateid;
1774		mutex_exit(&osp->os_sync_lock);
1775		mutex_exit(&rp->r_statev4_lock);
1776		goto bailout;
1777	}
1778	mutex_exit(&rp->r_statev4_lock);
1779
1780	/*
1781	 * If the file failed recovery, just quit.  This failure need not
1782	 * affect other reopens, so don't return an error.
1783	 */
1784	mutex_enter(&rp->r_statelock);
1785	if (rp->r_flags & R4RECOVERR) {
1786		mutex_exit(&rp->r_statelock);
1787		ep->error = 0;
1788		goto failed_reopen;
1789	}
1790	mutex_exit(&rp->r_statelock);
1791
	/*
	 * argop is empty here
	 *
	 * PUTFH, OPEN, GETFH, GETATTR
	 */
1797	args.ctag = TAG_REOPEN;
1798	args.array_len = 4;
1799	args.array = argop;
1800
1801	NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1802	    "nfs4_reopen: file is type %d, id %s",
1803	    vp->v_type, rnode4info(VTOR4(vp))));
1804
1805	argop[0].argop = OP_CPUTFH;
1806
1807	if (claim != CLAIM_PREVIOUS) {
1808		/*
1809		 * if this is a file mount then
1810		 * use the mntinfo parentfh
1811		 */
1812		argop[0].nfs_argop4_u.opcputfh.sfh =
1813		    (vp->v_flag & VROOT) ? mi->mi_srvparentfh :
1814		    VTOSV(vp)->sv_dfh;
1815	} else {
1816		/* putfh fh to reopen */
1817		argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
1818	}
1819
1820	argop[1].argop = OP_COPEN;
1821	open_args = &argop[1].nfs_argop4_u.opcopen;
1822	open_args->claim = claim;
1823
1824	if (claim == CLAIM_NULL) {
1825
1826		if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) {
1827			nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname "
1828			    "failed for vp 0x%p for CLAIM_NULL with %m",
1829			    (void *)vp);
1830			failed_msg = "Couldn't reopen: vtoname failed for "
1831			    "CLAIM_NULL";
1832			/* nothing allocated yet */
1833			goto kill_file;
1834		}
1835
1836		open_args->open_claim4_u.cfile = fn;
1837	} else if (claim == CLAIM_PREVIOUS) {
1838
1839		/*
1840		 * We have two cases to deal with here:
1841		 * 1) We're being called to reopen files in order to satisfy
1842		 *    a lock operation request which requires us to explicitly
1843		 *    reopen files which were opened under a delegation.  If
1844		 *    we're in recovery, we *must* use CLAIM_PREVIOUS.  In
1845		 *    that case, frc_use_claim_previous is TRUE and we must
1846		 *    use the rnode's current delegation type (r_deleg_type).
1847		 * 2) We're reopening files during some form of recovery.
1848		 *    In this case, frc_use_claim_previous is FALSE and we
1849		 *    use the delegation type appropriate for recovery
1850		 *    (r_deleg_needs_recovery).
1851		 */
1852		mutex_enter(&rp->r_statev4_lock);
1853		open_args->open_claim4_u.delegate_type =
1854		    frc_use_claim_previous ?
1855		    rp->r_deleg_type :
1856		    rp->r_deleg_needs_recovery;
1857		mutex_exit(&rp->r_statev4_lock);
1858
1859	} else if (claim == CLAIM_DELEGATE_CUR) {
1860
1861		if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) {
1862			nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname "
1863			    "failed for vp 0x%p for CLAIM_DELEGATE_CUR "
1864			    "with %m", (void *)vp);
1865			failed_msg = "Couldn't reopen: vtoname failed for "
1866			    "CLAIM_DELEGATE_CUR";
1867			/* nothing allocated yet */
1868			goto kill_file;
1869		}
1870
1871		mutex_enter(&rp->r_statev4_lock);
1872		open_args->open_claim4_u.delegate_cur_info.delegate_stateid =
1873		    rp->r_deleg_stateid;
1874		mutex_exit(&rp->r_statev4_lock);
1875
1876		open_args->open_claim4_u.delegate_cur_info.cfile = fn;
1877	}
1878	open_args->opentype = OPEN4_NOCREATE;
1879	open_args->owner.clientid = mi2clientid(mi);
1880	open_args->owner.owner_len = sizeof (oop->oo_name);
1881	open_args->owner.owner_val =
1882	    kmem_alloc(open_args->owner.owner_len, KM_SLEEP);
1883	bcopy(&oop->oo_name, open_args->owner.owner_val,
1884	    open_args->owner.owner_len);
1885	open_args->share_access = 0;
1886	open_args->share_deny = 0;
1887
1888	mutex_enter(&osp->os_sync_lock);
1889	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, "nfs4_reopen: osp %p rp "
1890	    "%p: read acc %"PRIu64" write acc %"PRIu64": open ref count %d: "
1891	    "mmap read %"PRIu64" mmap write %"PRIu64" claim %d ",
1892	    (void *)osp, (void *)rp, osp->os_share_acc_read,
1893	    osp->os_share_acc_write, osp->os_open_ref_count,
1894	    osp->os_mmap_read, osp->os_mmap_write, claim));
1895
1896	if (osp->os_share_acc_read || osp->os_mmap_read)
1897		open_args->share_access |= OPEN4_SHARE_ACCESS_READ;
1898	if (osp->os_share_acc_write || osp->os_mmap_write)
1899		open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE;
1900	if (osp->os_share_deny_read)
1901		open_args->share_deny |= OPEN4_SHARE_DENY_READ;
1902	if (osp->os_share_deny_write)
1903		open_args->share_deny |= OPEN4_SHARE_DENY_WRITE;
1904	mutex_exit(&osp->os_sync_lock);
1905
1906	seqid = nfs4_get_open_seqid(oop) + 1;
1907	open_args->seqid = seqid;
1908
1909	/* Construct the getfh part of the compound */
1910	argop[2].argop = OP_GETFH;
1911
1912	/* Construct the getattr part of the compound */
1913	argop[3].argop = OP_GETATTR;
1914	argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1915	argop[3].nfs_argop4_u.opgetattr.mi = mi;
1916
1917	t = gethrtime();
1918
1919	rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);
1920
1921	if (ep->error) {
1922		if (!is_recov && !frc_use_claim_previous &&
1923		    (ep->error == EINTR || ep->error == ETIMEDOUT ||
1924		    NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))) {
1925			nfs4open_save_lost_rqst(ep->error, &lost_rqst, oop,
1926			    cred_otw, vp, NULL, open_args);
1927			abort = nfs4_start_recovery(ep,
1928			    VTOMI4(vp), vp, NULL, NULL,
1929			    lost_rqst.lr_op == OP_OPEN ?
1930			    &lost_rqst : NULL, OP_OPEN, NULL, NULL, NULL);
1931			nfs4args_copen_free(open_args);
1932			goto bailout;
1933		}
1934
1935		nfs4args_copen_free(open_args);
1936
1937		if (ep->error == EACCES && cred_otw != cr) {
1938			crfree(cred_otw);
1939			cred_otw = cr;
1940			crhold(cred_otw);
1941			nfs4_end_open_seqid_sync(oop);
1942			open_owner_rele(oop);
1943			oop = NULL;
1944			goto top;
1945		}
1946		if (ep->error == ETIMEDOUT)
1947			goto bailout;
1948		failed_msg = "Couldn't reopen: rpc error";
1949		goto kill_file;
1950	}
1951
1952	if (nfs4_need_to_bump_seqid(&res))
1953		nfs4_set_open_seqid(seqid, oop, args.ctag);
1954
1955	switch (res.status) {
1956	case NFS4_OK:
1957		if (recov.rs_flags & NFS4_RS_DELAY_MSG) {
1958			mutex_enter(&rp->r_statelock);
1959			rp->r_delay_interval = 0;
1960			mutex_exit(&rp->r_statelock);
1961		}
1962		break;
1963	case NFS4ERR_BAD_SEQID:
1964		bsep = nfs4_create_bseqid_entry(oop, NULL, vp, 0,
1965		    args.ctag, open_args->seqid);
1966
1967		abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL,
1968		    NULL, lost_rqst.lr_op == OP_OPEN ? &lost_rqst :
1969		    NULL, OP_OPEN, bsep, NULL, NULL);
1970
1971		nfs4args_copen_free(open_args);
1972		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1973		nfs4_end_open_seqid_sync(oop);
1974		open_owner_rele(oop);
1975		oop = NULL;
1976		kmem_free(bsep, sizeof (*bsep));
1977
1978		goto kill_file;
1979	case NFS4ERR_NO_GRACE:
1980		nfs4args_copen_free(open_args);
1981		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1982		nfs4_end_open_seqid_sync(oop);
1983		open_owner_rele(oop);
1984		oop = NULL;
1985		if (claim == CLAIM_PREVIOUS) {
1986			/*
1987			 * Retry as a plain open. We don't need to worry about
1988			 * checking the changeinfo: it is acceptable for a
1989			 * client to re-open a file and continue processing
1990			 * (in the absence of locks).
1991			 */
1992			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1993			    "nfs4_reopen: CLAIM_PREVIOUS: NFS4ERR_NO_GRACE; "
1994			    "will retry as CLAIM_NULL"));
1995			claim = CLAIM_NULL;
1996			nfs4_mi_kstat_inc_no_grace(mi);
1997			goto top;
1998		}
1999		failed_msg =
2000		    "Couldn't reopen: tried reclaim outside grace period. ";
2001		goto kill_file;
2002	case NFS4ERR_GRACE:
2003		nfs4_set_grace_wait(mi);
2004		nfs4args_copen_free(open_args);
2005		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2006		nfs4_end_open_seqid_sync(oop);
2007		open_owner_rele(oop);
2008		oop = NULL;
2009		ep->error = nfs4_wait_for_grace(mi, &recov);
2010		if (ep->error != 0)
2011			goto bailout;
2012		goto top;
2013	case NFS4ERR_DELAY:
2014		nfs4_set_delay_wait(vp);
2015		nfs4args_copen_free(open_args);
2016		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2017		nfs4_end_open_seqid_sync(oop);
2018		open_owner_rele(oop);
2019		oop = NULL;
2020		ep->error = nfs4_wait_for_delay(vp, &recov);
2021		nfs4_mi_kstat_inc_delay(mi);
2022		if (ep->error != 0)
2023			goto bailout;
2024		goto top;
2025	case NFS4ERR_FHEXPIRED:
2026		/* recover filehandle and retry */
2027		abort = nfs4_start_recovery(ep,
2028		    mi, vp, NULL, NULL, NULL, OP_OPEN, NULL, NULL, NULL);
2029		nfs4args_copen_free(open_args);
2030		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2031		nfs4_end_open_seqid_sync(oop);
2032		open_owner_rele(oop);
2033		oop = NULL;
2034		if (abort == FALSE)
2035			goto top;
2036		failed_msg = "Couldn't reopen: recovery aborted";
2037		goto kill_file;
2038	case NFS4ERR_RESOURCE:
2039	case NFS4ERR_STALE_CLIENTID:
2040	case NFS4ERR_WRONGSEC:
2041	case NFS4ERR_EXPIRED:
2042		/*
2043		 * Do not mark the file dead and let the calling
2044		 * function initiate recovery.
2045		 */
2046		nfs4args_copen_free(open_args);
2047		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2048		nfs4_end_open_seqid_sync(oop);
2049		open_owner_rele(oop);
2050		oop = NULL;
2051		goto bailout;
2052	case NFS4ERR_ACCESS:
2053		if (cred_otw != cr) {
2054			crfree(cred_otw);
2055			cred_otw = cr;
2056			crhold(cred_otw);
2057			nfs4args_copen_free(open_args);
2058			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2059			nfs4_end_open_seqid_sync(oop);
2060			open_owner_rele(oop);
2061			oop = NULL;
2062			goto top;
2063		}
2064		/* fall through */
2065	default:
2066		NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
2067		    "nfs4_reopen: r_server 0x%p, mi_curr_serv 0x%p, rnode %s",
2068		    (void*)VTOR4(vp)->r_server, (void*)mi->mi_curr_serv,
2069		    rnode4info(VTOR4(vp))));
2070		failed_msg = "Couldn't reopen: NFSv4 error";
2071		nfs4args_copen_free(open_args);
2072		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2073		goto kill_file;
2074	}
2075
2076	resop = &res.array[1];  /* open res */
2077	op_res = &resop->nfs_resop4_u.opopen;
2078
2079	garp = &res.array[3].nfs_resop4_u.opgetattr.ga_res;
2080
2081	/*
2082	 * Check if the path we reopened really is the same
2083	 * file. We could end up in a situation where the file
2084	 * was removed and a new file created with the same name.
2085	 */
2086	resop = &res.array[2];
2087	gf_res = &resop->nfs_resop4_u.opgetfh;
2088	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
2089	fh_different = (nfs4cmpfh(&rp->r_fh->sfh_fh, &gf_res->object) != 0);
2090	if (fh_different) {
2091		if (mi->mi_fh_expire_type == FH4_PERSISTENT ||
2092		    mi->mi_fh_expire_type & FH4_NOEXPIRE_WITH_OPEN) {
2093			/* Oops, we don't have the same file */
2094			if (mi->mi_fh_expire_type == FH4_PERSISTENT)
2095				failed_msg = "Couldn't reopen: Persistent "
2096				    "file handle changed";
2097			else
2098				failed_msg = "Couldn't reopen: Volatile "
2099				    "(no expire on open) file handle changed";
2100
2101			nfs4args_copen_free(open_args);
2102			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2103			nfs_rw_exit(&mi->mi_fh_lock);
2104			goto kill_file;
2105
2106		} else {
2107			/*
2108			 * We have volatile file handles that don't compare.
2109			 * If the fids are the same then we assume that the
2110			 * file handle expired but the rnode still refers to
2111			 * the same file object.
2112			 *
2113			 * First check that we have fids or not.
2114			 * If we don't we have a dumb server so we will
2115			 * just assume every thing is ok for now.
2116			 */
2117			if (!ep->error && garp->n4g_va.va_mask & AT_NODEID &&
2118			    rp->r_attr.va_mask & AT_NODEID &&
2119			    rp->r_attr.va_nodeid != garp->n4g_va.va_nodeid) {
2120				/*
2121				 * We have fids, but they don't
2122				 * compare. So kill the file.
2123				 */
2124				failed_msg =
2125				    "Couldn't reopen: file handle changed"
2126				    " due to mismatched fids";
2127				nfs4args_copen_free(open_args);
2128				xdr_free(xdr_COMPOUND4res_clnt,
2129				    (caddr_t)&res);
2130				nfs_rw_exit(&mi->mi_fh_lock);
2131				goto kill_file;
2132			} else {
2133				/*
2134				 * We have volatile file handles that refers
2135				 * to the same file (at least they have the
2136				 * same fid) or we don't have fids so we
2137				 * can't tell. :(. We'll be a kind and accepting
2138				 * client so we'll update the rnode's file
2139				 * handle with the otw handle.
2140				 *
2141				 * We need to drop mi->mi_fh_lock since
2142				 * sh4_update acquires it. Since there is
2143				 * only one recovery thread there is no
2144				 * race.
2145				 */
2146				nfs_rw_exit(&mi->mi_fh_lock);
2147				sfh4_update(rp->r_fh, &gf_res->object);
2148			}
2149		}
2150	} else {
2151		nfs_rw_exit(&mi->mi_fh_lock);
2152	}
2153
2154	ASSERT(nfs4_consistent_type(vp));
2155
2156	/*
2157	 * If the server wanted an OPEN_CONFIRM but that fails, just start
2158	 * over.  Presumably if there is a persistent error it will show up
2159	 * when we resend the OPEN.
2160	 */
2161	if (op_res->rflags & OPEN4_RESULT_CONFIRM) {
2162		bool_t retry_open = FALSE;
2163
2164		nfs4open_confirm(vp, &seqid, &op_res->stateid,
2165		    cred_otw, is_recov, &retry_open,
2166		    oop, FALSE, ep, NULL);
2167		if (ep->error || ep->stat) {
2168			nfs4args_copen_free(open_args);
2169			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2170			nfs4_end_open_seqid_sync(oop);
2171			open_owner_rele(oop);
2172			oop = NULL;
2173			goto top;
2174		}
2175	}
2176
2177	mutex_enter(&osp->os_sync_lock);
2178	osp->open_stateid = op_res->stateid;
2179	osp->os_delegation = 0;
2180	/*
2181	 * Need to reset this bitfield for the possible case where we were
2182	 * going to OTW CLOSE the file, got a non-recoverable error, and before
2183	 * we could retry the CLOSE, OPENed the file again.
2184	 */
2185	ASSERT(osp->os_open_owner->oo_seqid_inuse);
2186	osp->os_final_close = 0;
2187	osp->os_force_close = 0;
2188	if (claim == CLAIM_DELEGATE_CUR || claim == CLAIM_PREVIOUS)
2189		osp->os_dc_openacc = open_args->share_access;
2190	mutex_exit(&osp->os_sync_lock);
2191
2192	nfs4_end_open_seqid_sync(oop);
2193
2194	/* accept delegation, if any */
2195	nfs4_delegation_accept(rp, claim, op_res, garp, cred_otw);
2196
2197	nfs4args_copen_free(open_args);
2198
2199	nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
2200
2201	xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2202
2203	ASSERT(nfs4_consistent_type(vp));
2204
2205	open_owner_rele(oop);
2206	crfree(cr);
2207	crfree(cred_otw);
2208	return;
2209
2210kill_file:
2211	nfs4_fail_recov(vp, failed_msg, ep->error, ep->stat);
2212failed_reopen:
2213	NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE,
2214	    "nfs4_reopen: setting os_failed_reopen for osp %p, cr %p, rp %s",
2215	    (void *)osp, (void *)cr, rnode4info(rp)));
2216	mutex_enter(&osp->os_sync_lock);
2217	osp->os_failed_reopen = 1;
2218	mutex_exit(&osp->os_sync_lock);
2219bailout:
2220	if (oop != NULL) {
2221		nfs4_end_open_seqid_sync(oop);
2222		open_owner_rele(oop);
2223	}
2224	if (cr != NULL)
2225		crfree(cr);
2226	if (cred_otw != NULL)
2227		crfree(cred_otw);
2228}
2229
2230/* for . and .. OPENs */
2231/* ARGSUSED */
2232static int
2233nfs4_open_non_reg_file(vnode_t **vpp, int flag, cred_t *cr)
2234{
2235	rnode4_t *rp;
2236	nfs4_ga_res_t gar;
2237
2238	ASSERT(nfs_zone() == VTOMI4(*vpp)->mi_zone);
2239
2240	/*
2241	 * If close-to-open consistency checking is turned off or
2242	 * if there is no cached data, we can avoid
2243	 * the over the wire getattr.  Otherwise, force a
2244	 * call to the server to get fresh attributes and to
2245	 * check caches. This is required for close-to-open
2246	 * consistency.
2247	 */
2248	rp = VTOR4(*vpp);
2249	if (VTOMI4(*vpp)->mi_flags & MI4_NOCTO ||
2250	    (rp->r_dir == NULL && !nfs4_has_pages(*vpp)))
2251		return (0);
2252
2253	return (nfs4_getattr_otw(*vpp, &gar, cr, 0));
2254}
2255
2256/*
2257 * CLOSE a file
2258 */
2259/* ARGSUSED */
2260static int
2261nfs4_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
2262    caller_context_t *ct)
2263{
2264	rnode4_t	*rp;
2265	int		 error = 0;
2266	int		 r_error = 0;
2267	int		 n4error = 0;
2268	nfs4_error_t	 e = { 0, NFS4_OK, RPC_SUCCESS };
2269
2270	/*
2271	 * Remove client state for this (lockowner, file) pair.
2272	 * Issue otw v4 call to have the server do the same.
2273	 */
2274
2275	rp = VTOR4(vp);
2276
2277	/*
2278	 * zone_enter(2) prevents processes from changing zones with NFS files
2279	 * open; if we happen to get here from the wrong zone we can't do
2280	 * anything over the wire.
2281	 */
2282	if (VTOMI4(vp)->mi_zone != nfs_zone()) {
2283		/*
2284		 * We could attempt to clean up locks, except we're sure
2285		 * that the current process didn't acquire any locks on
2286		 * the file: any attempt to lock a file belong to another zone
2287		 * will fail, and one can't lock an NFS file and then change
2288		 * zones, as that fails too.
2289		 *
2290		 * Returning an error here is the sane thing to do.  A
2291		 * subsequent call to VN_RELE() which translates to a
2292		 * nfs4_inactive() will clean up state: if the zone of the
2293		 * vnode's origin is still alive and kicking, the inactive
2294		 * thread will handle the request (from the correct zone), and
2295		 * everything (minus the OTW close call) should be OK.  If the
2296		 * zone is going away nfs4_async_inactive() will throw away
2297		 * delegations, open streams and cached pages inline.
2298		 */
2299		return (EIO);
2300	}
2301
2302	/*
2303	 * If we are using local locking for this filesystem, then
2304	 * release all of the SYSV style record locks.  Otherwise,
2305	 * we are doing network locking and we need to release all
2306	 * of the network locks.  All of the locks held by this
2307	 * process on this file are released no matter what the
2308	 * incoming reference count is.
2309	 */
2310	if (VTOMI4(vp)->mi_flags & MI4_LLOCK) {
2311		cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
2312		cleanshares(vp, ttoproc(curthread)->p_pid);
2313	} else
2314		e.error = nfs4_lockrelease(vp, flag, offset, cr);
2315
2316	if (e.error) {
2317		struct lm_sysid *lmsid;
2318		lmsid = nfs4_find_sysid(VTOMI4(vp));
2319		if (lmsid == NULL) {
2320			DTRACE_PROBE2(unknown__sysid, int, e.error,
2321			    vnode_t *, vp);
2322		} else {
2323			cleanlocks(vp, ttoproc(curthread)->p_pid,
2324			    (lm_sysidt(lmsid) | LM_SYSID_CLIENT));
2325
2326			lm_rel_sysid(lmsid);
2327		}
2328		return (e.error);
2329	}
2330
2331	if (count > 1)
2332		return (0);
2333
2334	/*
2335	 * If the file has been `unlinked', then purge the
2336	 * DNLC so that this vnode will get reycled quicker
2337	 * and the .nfs* file on the server will get removed.
2338	 */
2339	if (rp->r_unldvp != NULL)
2340		dnlc_purge_vp(vp);
2341
2342	/*
2343	 * If the file was open for write and there are pages,
2344	 * do a synchronous flush and commit of all of the
2345	 * dirty and uncommitted pages.
2346	 */
2347	ASSERT(!e.error);
2348	if ((flag & FWRITE) && nfs4_has_pages(vp))
2349		error = nfs4_putpage_commit(vp, 0, 0, cr);
2350
2351	mutex_enter(&rp->r_statelock);
2352	r_error = rp->r_error;
2353	rp->r_error = 0;
2354	mutex_exit(&rp->r_statelock);
2355
2356	/*
2357	 * If this file type is one for which no explicit 'open' was
2358	 * done, then bail now (ie. no need for protocol 'close'). If
2359	 * there was an error w/the vm subsystem, return _that_ error,
2360	 * otherwise, return any errors that may've been reported via
2361	 * the rnode.
2362	 */
2363	if (vp->v_type != VREG)
2364		return (error ? error : r_error);
2365
2366	/*
2367	 * The sync putpage commit may have failed above, but since
2368	 * we're working w/a regular file, we need to do the protocol
2369	 * 'close' (nfs4close_one will figure out if an otw close is
2370	 * needed or not). Report any errors _after_ doing the protocol
2371	 * 'close'.
2372	 */
2373	nfs4close_one(vp, NULL, cr, flag, NULL, &e, CLOSE_NORM, 0, 0, 0);
2374	n4error = e.error ? e.error : geterrno4(e.stat);
2375
2376	/*
2377	 * Error reporting prio (Hi -> Lo)
2378	 *
2379	 *   i) nfs4_putpage_commit (error)
2380	 *  ii) rnode's (r_error)
2381	 * iii) nfs4close_one (n4error)
2382	 */
2383	return (error ? error : (r_error ? r_error : n4error));
2384}
2385
2386/*
2387 * Initialize *lost_rqstp.
2388 */
2389
2390static void
2391nfs4close_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
2392    nfs4_open_owner_t *oop, nfs4_open_stream_t *osp, cred_t *cr,
2393    vnode_t *vp)
2394{
2395	if (error != ETIMEDOUT && error != EINTR &&
2396	    !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
2397		lost_rqstp->lr_op = 0;
2398		return;
2399	}
2400
2401	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2402	    "nfs4close_save_lost_rqst: error %d", error));
2403
2404	lost_rqstp->lr_op = OP_CLOSE;
2405	/*
2406	 * The vp is held and rele'd via the recovery code.
2407	 * See nfs4_save_lost_rqst.
2408	 */
2409	lost_rqstp->lr_vp = vp;
2410	lost_rqstp->lr_dvp = NULL;
2411	lost_rqstp->lr_oop = oop;
2412	lost_rqstp->lr_osp = osp;
2413	ASSERT(osp != NULL);
2414	ASSERT(mutex_owned(&osp->os_sync_lock));
2415	osp->os_pending_close = 1;
2416	lost_rqstp->lr_lop = NULL;
2417	lost_rqstp->lr_cr = cr;
2418	lost_rqstp->lr_flk = NULL;
2419	lost_rqstp->lr_putfirst = FALSE;
2420}
2421
/*
 * Issue the over-the-wire CLOSE for open stream 'osp' of the file 'rp'.
 *
 * The request is a three-op compound: PUTFH of the file's handle, GETATTR,
 * and CLOSE carrying the open owner's next seqid and the stream's current
 * open stateid.
 *
 * Assumes you already have the open seqid sync grabbed as well as the
 * 'os_sync_lock'.  Note: this will release the open seqid sync and
 * 'os_sync_lock' if client recovery starts.  Calling functions have to
 * be prepared to handle this.  The 'os_sync_lock' is also released on the
 * successful path, before the attribute cache is updated.
 *
 * Parameters:
 *	rp			rnode of the file being closed
 *	cred_otw		credential used for the call
 *	oop, osp		open owner and open stream being closed
 *	recov			set to 1 if the CLOSE operation detected that
 *				client recovery was needed and was started,
 *				and the calling function should retry this
 *				function; otherwise set to 0
 *	did_start_seqid_syncp	set to 0 when the open seqid sync is ended here
 *	close_type		selects the compound tag; CLOSE_RESEND and
 *				CLOSE_AFTER_RESEND never ask for a retry
 *	ep			errors are returned via this nfs4_error_t
 *	have_sync_lockp		set to 0 whenever 'os_sync_lock' is dropped here
 */
static void
nfs4close_otw(rnode4_t *rp, cred_t *cred_otw, nfs4_open_owner_t *oop,
    nfs4_open_stream_t *osp, int *recov, int *did_start_seqid_syncp,
    nfs4_close_type_t close_type, nfs4_error_t *ep, int *have_sync_lockp)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	CLOSE4args *close_args;
	nfs_resop4 *resop;
	nfs_argop4 argop[3];
	int doqueue = 1;
	mntinfo4_t *mi;
	seqid4 seqid;
	vnode_t *vp;
	bool_t needrecov = FALSE;
	nfs4_lost_rqst_t lost_rqst;
	hrtime_t t;

	ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone);

	ASSERT(MUTEX_HELD(&osp->os_sync_lock));

	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw"));

	/* Only set this to 1 if recovery is started */
	*recov = 0;

	/* do the OTW call to close the file */

	/* The compound tag records which kind of CLOSE this is. */
	if (close_type == CLOSE_RESEND)
		args.ctag = TAG_CLOSE_LOST;
	else if (close_type == CLOSE_AFTER_RESEND)
		args.ctag = TAG_CLOSE_UNDO;
	else
		args.ctag = TAG_CLOSE;

	args.array_len = 3;
	args.array = argop;

	vp = RTOV4(rp);

	mi = VTOMI4(vp);

	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/* getattr; its result feeds the attribute cache after the close */
	argop[1].argop = OP_GETATTR;
	argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[1].nfs_argop4_u.opgetattr.mi = mi;

	argop[2].argop = OP_CLOSE;
	close_args = &argop[2].nfs_argop4_u.opclose;

	/* The CLOSE uses the seqid following the open owner's current one. */
	seqid = nfs4_get_open_seqid(oop) + 1;

	close_args->seqid = seqid;
	close_args->open_stateid = osp->open_stateid;

	/* needrecov is still FALSE here, so this always logs "first". */
	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
	    "nfs4close_otw: %s call, rp %s", needrecov ? "recov" : "first",
	    rnode4info(rp)));

	/* Timestamp taken before the call; passed to nfs4_attr_cache(). */
	t = gethrtime();

	rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);

	/* Record the new seqid only when the reply says it was consumed. */
	if (!ep->error && nfs4_need_to_bump_seqid(&res)) {
		nfs4_set_open_seqid(seqid, oop, args.ctag);
	}

	needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
	if (ep->error && !needrecov) {
		/*
		 * If there was an error and no recovery is to be done
		 * then set up the file to flush its cache if
		 * needed for the next caller.
		 */
		mutex_enter(&rp->r_statelock);
		PURGE_ATTRCACHE4_LOCKED(rp);
		rp->r_flags &= ~R4WRITEMODIFIED;
		mutex_exit(&rp->r_statelock);
		return;
	}

	if (needrecov) {
		bool_t abort;
		nfs4_bseqid_entry_t *bsep = NULL;

		/*
		 * A resend is never saved as a lost request again;
		 * lost_rqst is only initialized (and only used below)
		 * when this is not a CLOSE_RESEND.
		 */
		if (close_type != CLOSE_RESEND)
			nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop,
			    osp, cred_otw, vp);

		if (!ep->error && res.status == NFS4ERR_BAD_SEQID)
			bsep = nfs4_create_bseqid_entry(oop, NULL, vp,
			    0, args.ctag, close_args->seqid);

		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4close_otw: initiating recovery. error %d "
		    "res.status %d", ep->error, res.status));

		/*
		 * Drop the 'os_sync_lock' here so we don't hit
		 * a potential recursive mutex_enter via an
		 * 'open_stream_hold()'.
		 */
		mutex_exit(&osp->os_sync_lock);
		*have_sync_lockp = 0;
		abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL,
		    (close_type != CLOSE_RESEND &&
		    lost_rqst.lr_op == OP_CLOSE) ? &lost_rqst : NULL,
		    OP_CLOSE, bsep, NULL, NULL);

		/* drop open seq sync, and let the calling function regrab it */
		nfs4_end_open_seqid_sync(oop);
		*did_start_seqid_syncp = 0;

		if (bsep)
			kmem_free(bsep, sizeof (*bsep));
		/*
		 * For signals, the caller wants to quit, so don't say to
		 * retry.  For forced unmount, if it's a user thread, it
		 * wants to quit.  If it's a recovery thread, the retry
		 * will happen higher-up on the call stack.  Either way,
		 * don't say to retry.
		 */
		if (abort == FALSE && ep->error != EINTR &&
		    !NFS4_FRC_UNMT_ERR(ep->error, mi->mi_vfsp) &&
		    close_type != CLOSE_RESEND &&
		    close_type != CLOSE_AFTER_RESEND)
			*recov = 1;
		else
			*recov = 0;

		/* The results are only freed when the call itself succeeded. */
		if (!ep->error)
			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		return;
	}

	/* The compound failed with a status that needs no recovery. */
	if (res.status) {
		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		return;
	}

	mutex_enter(&rp->r_statev4_lock);
	rp->created_v4 = 0;
	mutex_exit(&rp->r_statev4_lock);

	/* Save the stateid returned by CLOSE and invalidate the stream. */
	resop = &res.array[2];
	osp->open_stateid = resop->nfs_resop4_u.opclose.open_stateid;
	osp->os_valid = 0;

	/*
	 * This removes the reference obtained at OPEN; ie, when the
	 * open stream structure was created.
	 *
	 * We don't have to worry about calling 'open_stream_rele'
	 * since we are currently holding a reference to the open
	 * stream which means the count cannot go to 0 with this
	 * decrement.
	 */
	ASSERT(osp->os_ref_count >= 2);
	osp->os_ref_count--;

	/*
	 * Both paths taken when ep->error is non-zero have already
	 * returned above, so this test is expected to be true here.
	 */
	if (ep->error == 0) {
		/*
		 * Avoid a deadlock with the r_serial thread waiting for
		 * os_sync_lock in nfs4_get_otw_cred_by_osp() which might be
		 * held by us. We will wait in nfs4_attr_cache() for the
		 * completion of the r_serial thread.
		 */
		mutex_exit(&osp->os_sync_lock);
		*have_sync_lockp = 0;

		nfs4_attr_cache(vp,
		    &res.array[1].nfs_resop4_u.opgetattr.ga_res,
		    t, cred_otw, TRUE, NULL);
	}

	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw:"
	    " returning %d", ep->error));

	xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
}
2618
2619/* ARGSUSED */
2620static int
2621nfs4_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
2622    caller_context_t *ct)
2623{
2624	rnode4_t *rp;
2625	u_offset_t off;
2626	offset_t diff;
2627	uint_t on;
2628	uint_t n;
2629	caddr_t base;
2630	uint_t flags;
2631	int error;
2632	mntinfo4_t *mi;
2633
2634	rp = VTOR4(vp);
2635
2636	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
2637
2638	if (IS_SHADOW(vp, rp))
2639		vp = RTOV4(rp);
2640
2641	if (vp->v_type != VREG)
2642		return (EISDIR);
2643
2644	mi = VTOMI4(vp);
2645
2646	if (nfs_zone() != mi->mi_zone)
2647		return (EIO);
2648
2649	if (uiop->uio_resid == 0)
2650		return (0);
2651
2652	if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0)
2653		return (EINVAL);
2654
2655	mutex_enter(&rp->r_statelock);
2656	if (rp->r_flags & R4RECOVERRP)
2657		error = (rp->r_error ? rp->r_error : EIO);
2658	else
2659		error = 0;
2660	mutex_exit(&rp->r_statelock);
2661	if (error)
2662		return (error);
2663
2664	/*
2665	 * Bypass VM if caching has been disabled (e.g., locking) or if
2666	 * using client-side direct I/O and the file is not mmap'd and
2667	 * there are no cached pages.
2668	 */
2669	if ((vp->v_flag & VNOCACHE) ||
2670	    (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
2671	    rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) {
2672		size_t resid = 0;
2673
2674		return (nfs4read(vp, NULL, uiop->uio_loffset,
2675		    uiop->uio_resid, &resid, cr, FALSE, uiop));
2676	}
2677
2678	error = 0;
2679
2680	do {
2681		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
2682		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
2683		n = MIN(MAXBSIZE - on, uiop->uio_resid);
2684
2685		if (error = nfs4_validate_caches(vp, cr))
2686			break;
2687
2688		mutex_enter(&rp->r_statelock);
2689		while (rp->r_flags & R4INCACHEPURGE) {
2690			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
2691				mutex_exit(&rp->r_statelock);
2692				return (EINTR);
2693			}
2694		}
2695		diff = rp->r_size - uiop->uio_loffset;
2696		mutex_exit(&rp->r_statelock);
2697		if (diff <= 0)
2698			break;
2699		if (diff < n)
2700			n = (uint_t)diff;
2701
2702		if (vpm_enable) {
2703			/*
2704			 * Copy data.
2705			 */
2706			error = vpm_data_copy(vp, off + on, n, uiop,
2707			    1, NULL, 0, S_READ);
2708		} else {
2709			base = segmap_getmapflt(segkmap, vp, off + on, n, 1,
2710			    S_READ);
2711
2712			error = uiomove(base + on, n, UIO_READ, uiop);
2713		}
2714
2715		if (!error) {
2716			/*
2717			 * If read a whole block or read to eof,
2718			 * won't need this buffer again soon.
2719			 */
2720			mutex_enter(&rp->r_statelock);
2721			if (n + on == MAXBSIZE ||
2722			    uiop->uio_loffset == rp->r_size)
2723				flags = SM_DONTNEED;
2724			else
2725				flags = 0;
2726			mutex_exit(&rp->r_statelock);
2727			if (vpm_enable) {
2728				error = vpm_sync_pages(vp, off, n, flags);
2729			} else {
2730				error = segmap_release(segkmap, base, flags);
2731			}
2732		} else {
2733			if (vpm_enable) {
2734				(void) vpm_sync_pages(vp, off, n, 0);
2735			} else {
2736				(void) segmap_release(segkmap, base, 0);
2737			}
2738		}
2739	} while (!error && uiop->uio_resid > 0);
2740
2741	return (error);
2742}
2743
2744/* ARGSUSED */
2745static int
2746nfs4_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
2747    caller_context_t *ct)
2748{
2749	rlim64_t limit = uiop->uio_llimit;
2750	rnode4_t *rp;
2751	u_offset_t off;
2752	caddr_t base;
2753	uint_t flags;
2754	int remainder;
2755	size_t n;
2756	int on;
2757	int error;
2758	int resid;
2759	u_offset_t offset;
2760	mntinfo4_t *mi;
2761	uint_t bsize;
2762
2763	rp = VTOR4(vp);
2764
2765	if (IS_SHADOW(vp, rp))
2766		vp = RTOV4(rp);
2767
2768	if (vp->v_type != VREG)
2769		return (EISDIR);
2770
2771	mi = VTOMI4(vp);
2772
2773	if (nfs_zone() != mi->mi_zone)
2774		return (EIO);
2775
2776	if (uiop->uio_resid == 0)
2777		return (0);
2778
2779	mutex_enter(&rp->r_statelock);
2780	if (rp->r_flags & R4RECOVERRP)
2781		error = (rp->r_error ? rp->r_error : EIO);
2782	else
2783		error = 0;
2784	mutex_exit(&rp->r_statelock);
2785	if (error)
2786		return (error);
2787
2788	if (ioflag & FAPPEND) {
2789		struct vattr va;
2790
2791		/*
2792		 * Must serialize if appending.
2793		 */
2794		if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
2795			nfs_rw_exit(&rp->r_rwlock);
2796			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
2797			    INTR4(vp)))
2798				return (EINTR);
2799		}
2800
2801		va.va_mask = AT_SIZE;
2802		error = nfs4getattr(vp, &va, cr);
2803		if (error)
2804			return (error);
2805		uiop->uio_loffset = va.va_size;
2806	}
2807
2808	offset = uiop->uio_loffset + uiop->uio_resid;
2809
2810	if (uiop->uio_loffset < (offset_t)0 || offset < 0)
2811		return (EINVAL);
2812
2813	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
2814		limit = MAXOFFSET_T;
2815
2816	/*
2817	 * Check to make sure that the process will not exceed
2818	 * its limit on file size.  It is okay to write up to
2819	 * the limit, but not beyond.  Thus, the write which
2820	 * reaches the limit will be short and the next write
2821	 * will return an error.
2822	 */
2823	remainder = 0;
2824	if (offset > uiop->uio_llimit) {
2825		remainder = offset - uiop->uio_llimit;
2826		uiop->uio_resid = uiop->uio_llimit - uiop->uio_loffset;
2827		if (uiop->uio_resid <= 0) {
2828			proc_t *p = ttoproc(curthread);
2829
2830			uiop->uio_resid += remainder;
2831			mutex_enter(&p->p_lock);
2832			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
2833			    p->p_rctls, p, RCA_UNSAFE_SIGINFO);
2834			mutex_exit(&p->p_lock);
2835			return (EFBIG);
2836		}
2837	}
2838
2839	/* update the change attribute, if we have a write delegation */
2840
2841	mutex_enter(&rp->r_statev4_lock);
2842	if (rp->r_deleg_type == OPEN_DELEGATE_WRITE)
2843		rp->r_deleg_change++;
2844
2845	mutex_exit(&rp->r_statev4_lock);
2846
2847	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp)))
2848		return (EINTR);
2849
2850	/*
2851	 * Bypass VM if caching has been disabled (e.g., locking) or if
2852	 * using client-side direct I/O and the file is not mmap'd and
2853	 * there are no cached pages.
2854	 */
2855	if ((vp->v_flag & VNOCACHE) ||
2856	    (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
2857	    rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) {
2858		size_t bufsize;
2859		int count;
2860		u_offset_t org_offset;
2861		stable_how4 stab_comm;
2862nfs4_fwrite:
2863		if (rp->r_flags & R4STALE) {
2864			resid = uiop->uio_resid;
2865			offset = uiop->uio_loffset;
2866			error = rp->r_error;
2867			/*
2868			 * A close may have cleared r_error, if so,
2869			 * propagate ESTALE error return properly
2870			 */
2871			if (error == 0)
2872				error = ESTALE;
2873			goto bottom;
2874		}
2875
2876		bufsize = MIN(uiop->uio_resid, mi->mi_stsize);
2877		base = kmem_alloc(bufsize, KM_SLEEP);
2878		do {
2879			if (ioflag & FDSYNC)
2880				stab_comm = DATA_SYNC4;
2881			else
2882				stab_comm = FILE_SYNC4;
2883			resid = uiop->uio_resid;
2884			offset = uiop->uio_loffset;
2885			count = MIN(uiop->uio_resid, bufsize);
2886			org_offset = uiop->uio_loffset;
2887			error = uiomove(base, count, UIO_WRITE, uiop);
2888			if (!error) {
2889				error = nfs4write(vp, base, org_offset,
2890				    count, cr, &stab_comm);
2891				if (!error) {
2892					mutex_enter(&rp->r_statelock);
2893					if (rp->r_size < uiop->uio_loffset)
2894						rp->r_size = uiop->uio_loffset;
2895					mutex_exit(&rp->r_statelock);
2896				}
2897			}
2898		} while (!error && uiop->uio_resid > 0);
2899		kmem_free(base, bufsize);
2900		goto bottom;
2901	}
2902
2903	bsize = vp->v_vfsp->vfs_bsize;
2904
2905	do {
2906		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
2907		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
2908		n = MIN(MAXBSIZE - on, uiop->uio_resid);
2909
2910		resid = uiop->uio_resid;
2911		offset = uiop->uio_loffset;
2912
2913		if (rp->r_flags & R4STALE) {
2914			error = rp->r_error;
2915			/*
2916			 * A close may have cleared r_error, if so,
2917			 * propagate ESTALE error return properly
2918			 */
2919			if (error == 0)
2920				error = ESTALE;
2921			break;
2922		}
2923
2924		/*
2925		 * Don't create dirty pages faster than they
2926		 * can be cleaned so that the system doesn't
2927		 * get imbalanced.  If the async queue is
2928		 * maxed out, then wait for it to drain before
2929		 * creating more dirty pages.  Also, wait for
2930		 * any threads doing pagewalks in the vop_getattr
2931		 * entry points so that they don't block for
2932		 * long periods.
2933		 */
2934		mutex_enter(&rp->r_statelock);
2935		while ((mi->mi_max_threads != 0 &&
2936		    rp->r_awcount > 2 * mi->mi_max_threads) ||
2937		    rp->r_gcount > 0) {
2938			if (INTR4(vp)) {
2939				klwp_t *lwp = ttolwp(curthread);
2940
2941				if (lwp != NULL)
2942					lwp->lwp_nostop++;
2943				if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
2944					mutex_exit(&rp->r_statelock);
2945					if (lwp != NULL)
2946						lwp->lwp_nostop--;
2947					error = EINTR;
2948					goto bottom;
2949				}
2950				if (lwp != NULL)
2951					lwp->lwp_nostop--;
2952			} else
2953				cv_wait(&rp->r_cv, &rp->r_statelock);
2954		}
2955		mutex_exit(&rp->r_statelock);
2956
2957		/*
2958		 * Touch the page and fault it in if it is not in core
2959		 * before segmap_getmapflt or vpm_data_copy can lock it.
2960		 * This is to avoid the deadlock if the buffer is mapped
2961		 * to the same file through mmap which we want to write.
2962		 */
2963		uio_prefaultpages((long)n, uiop);
2964
2965		if (vpm_enable) {
2966			/*
2967			 * It will use kpm mappings, so no need to
2968			 * pass an address.
2969			 */
2970			error = writerp4(rp, NULL, n, uiop, 0);
2971		} else  {
2972			if (segmap_kpm) {
2973				int pon = uiop->uio_loffset & PAGEOFFSET;
2974				size_t pn = MIN(PAGESIZE - pon,
2975				    uiop->uio_resid);
2976				int pagecreate;
2977
2978				mutex_enter(&rp->r_statelock);
2979				pagecreate = (pon == 0) && (pn == PAGESIZE ||
2980				    uiop->uio_loffset + pn >= rp->r_size);
2981				mutex_exit(&rp->r_statelock);
2982
2983				base = segmap_getmapflt(segkmap, vp, off + on,
2984				    pn, !pagecreate, S_WRITE);
2985
2986				error = writerp4(rp, base + pon, n, uiop,
2987				    pagecreate);
2988
2989			} else {
2990				base = segmap_getmapflt(segkmap, vp, off + on,
2991				    n, 0, S_READ);
2992				error = writerp4(rp, base + on, n, uiop, 0);
2993			}
2994		}
2995
2996		if (!error) {
2997			if (mi->mi_flags & MI4_NOAC)
2998				flags = SM_WRITE;
2999			else if ((uiop->uio_loffset % bsize) == 0 ||
3000			    IS_SWAPVP(vp)) {
3001				/*
3002				 * Have written a whole block.
3003				 * Start an asynchronous write
3004				 * and mark the buffer to
3005				 * indicate that it won't be
3006				 * needed again soon.
3007				 */
3008				flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
3009			} else
3010				flags = 0;
3011			if ((ioflag & (FSYNC|FDSYNC)) ||
3012			    (rp->r_flags & R4OUTOFSPACE)) {
3013				flags &= ~SM_ASYNC;
3014				flags |= SM_WRITE;
3015			}
3016			if (vpm_enable) {
3017				error = vpm_sync_pages(vp, off, n, flags);
3018			} else {
3019				error = segmap_release(segkmap, base, flags);
3020			}
3021		} else {
3022			if (vpm_enable) {
3023				(void) vpm_sync_pages(vp, off, n, 0);
3024			} else {
3025				(void) segmap_release(segkmap, base, 0);
3026			}
3027			/*
3028			 * In the event that we got an access error while
3029			 * faulting in a page for a write-only file just
3030			 * force a write.
3031			 */
3032			if (error == EACCES)
3033				goto nfs4_fwrite;
3034		}
3035	} while (!error && uiop->uio_resid > 0);
3036
3037bottom:
3038	if (error) {
3039		uiop->uio_resid = resid + remainder;
3040		uiop->uio_loffset = offset;
3041	} else {
3042		uiop->uio_resid += remainder;
3043
3044		mutex_enter(&rp->r_statev4_lock);
3045		if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) {
3046			gethrestime(&rp->r_attr.va_mtime);
3047			rp->r_attr.va_ctime = rp->r_attr.va_mtime;
3048		}
3049		mutex_exit(&rp->r_statev4_lock);
3050	}
3051
3052	nfs_rw_exit(&rp->r_lkserlock);
3053
3054	return (error);
3055}
3056
3057/*
3058 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
3059 */
3060static int
3061nfs4_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
3062    int flags, cred_t *cr)
3063{
3064	struct buf *bp;
3065	int error;
3066	page_t *savepp;
3067	uchar_t fsdata;
3068	stable_how4 stab_comm;
3069
3070	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
3071	bp = pageio_setup(pp, len, vp, flags);
3072	ASSERT(bp != NULL);
3073
3074	/*
3075	 * pageio_setup should have set b_addr to 0.  This
3076	 * is correct since we want to do I/O on a page
3077	 * boundary.  bp_mapin will use this addr to calculate
3078	 * an offset, and then set b_addr to the kernel virtual
3079	 * address it allocated for us.
3080	 */
3081	ASSERT(bp->b_un.b_addr == 0);
3082
3083	bp->b_edev = 0;
3084	bp->b_dev = 0;
3085	bp->b_lblkno = lbtodb(off);
3086	bp->b_file = vp;
3087	bp->b_offset = (offset_t)off;
3088	bp_mapin(bp);
3089
3090	if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) &&
3091	    freemem > desfree)
3092		stab_comm = UNSTABLE4;
3093	else
3094		stab_comm = FILE_SYNC4;
3095
3096	error = nfs4_bio(bp, &stab_comm, cr, FALSE);
3097
3098	bp_mapout(bp);
3099	pageio_done(bp);
3100
3101	if (stab_comm == UNSTABLE4)
3102		fsdata = C_DELAYCOMMIT;
3103	else
3104		fsdata = C_NOCOMMIT;
3105
3106	savepp = pp;
3107	do {
3108		pp->p_fsdata = fsdata;
3109	} while ((pp = pp->p_next) != savepp);
3110
3111	return (error);
3112}
3113
3114/*
3115 */
3116static int
3117nfs4rdwr_check_osid(vnode_t *vp, nfs4_error_t *ep, cred_t *cr)
3118{
3119	nfs4_open_owner_t	*oop;
3120	nfs4_open_stream_t	*osp;
3121	rnode4_t		*rp = VTOR4(vp);
3122	mntinfo4_t		*mi = VTOMI4(vp);
3123	int			reopen_needed;
3124
3125	ASSERT(nfs_zone() == mi->mi_zone);
3126
3127
3128	oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
3129	if (!oop)
3130		return (EIO);
3131
3132	/* returns with 'os_sync_lock' held */
3133	osp = find_open_stream(oop, rp);
3134	if (!osp) {
3135		open_owner_rele(oop);
3136		return (EIO);
3137	}
3138
3139	if (osp->os_failed_reopen) {
3140		mutex_exit(&osp->os_sync_lock);
3141		open_stream_rele(osp, rp);
3142		open_owner_rele(oop);
3143		return (EIO);
3144	}
3145
3146	/*
3147	 * Determine whether a reopen is needed.  If this
3148	 * is a delegation open stream, then the os_delegation bit
3149	 * should be set.
3150	 */
3151
3152	reopen_needed = osp->os_delegation;
3153
3154	mutex_exit(&osp->os_sync_lock);
3155	open_owner_rele(oop);
3156
3157	if (reopen_needed) {
3158		nfs4_error_zinit(ep);
3159		nfs4_reopen(vp, osp, ep, CLAIM_NULL, FALSE, FALSE);
3160		mutex_enter(&osp->os_sync_lock);
3161		if (ep->error || ep->stat || osp->os_failed_reopen) {
3162			mutex_exit(&osp->os_sync_lock);
3163			open_stream_rele(osp, rp);
3164			return (EIO);
3165		}
3166		mutex_exit(&osp->os_sync_lock);
3167	}
3168	open_stream_rele(osp, rp);
3169
3170	return (0);
3171}
3172
3173/*
3174 * Write to file.  Writes to remote server in largest size
3175 * chunks that the server can handle.  Write is synchronous.
3176 */
3177static int
3178nfs4write(vnode_t *vp, caddr_t base, u_offset_t offset, int count, cred_t *cr,
3179    stable_how4 *stab_comm)
3180{
3181	mntinfo4_t *mi;
3182	COMPOUND4args_clnt args;
3183	COMPOUND4res_clnt res;
3184	WRITE4args *wargs;
3185	WRITE4res *wres;
3186	nfs_argop4 argop[2];
3187	nfs_resop4 *resop;
3188	int tsize;
3189	stable_how4 stable;
3190	rnode4_t *rp;
3191	int doqueue = 1;
3192	bool_t needrecov;
3193	nfs4_recov_state_t recov_state;
3194	nfs4_stateid_types_t sid_types;
3195	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3196	int recov;
3197
3198	rp = VTOR4(vp);
3199	mi = VTOMI4(vp);
3200
3201	ASSERT(nfs_zone() == mi->mi_zone);
3202
3203	stable = *stab_comm;
3204	*stab_comm = FILE_SYNC4;
3205
3206	needrecov = FALSE;
3207	recov_state.rs_flags = 0;
3208	recov_state.rs_num_retry_despite_err = 0;
3209	nfs4_init_stateid_types(&sid_types);
3210
3211	/* Is curthread the recovery thread? */
3212	mutex_enter(&mi->mi_lock);
3213	recov = (mi->mi_recovthread == curthread);
3214	mutex_exit(&mi->mi_lock);
3215
3216recov_retry:
3217	args.ctag = TAG_WRITE;
3218	args.array_len = 2;
3219	args.array = argop;
3220
3221	if (!recov) {
3222		e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3223		    &recov_state, NULL);
3224		if (e.error)
3225			return (e.error);
3226	}
3227
3228	/* 0. putfh target fh */
3229	argop[0].argop = OP_CPUTFH;
3230	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
3231
3232	/* 1. write */
3233	nfs4args_write(&argop[1], stable, rp, cr, &wargs, &sid_types);
3234
3235	do {
3236
3237		wargs->offset = (offset4)offset;
3238		wargs->data_val = base;
3239
3240		if (mi->mi_io_kstats) {
3241			mutex_enter(&mi->mi_lock);
3242			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
3243			mutex_exit(&mi->mi_lock);
3244		}
3245
3246		if ((vp->v_flag & VNOCACHE) ||
3247		    (rp->r_flags & R4DIRECTIO) ||
3248		    (mi->mi_flags & MI4_DIRECTIO))
3249			tsize = MIN(mi->mi_stsize, count);
3250		else
3251			tsize = MIN(mi->mi_curwrite, count);
3252		wargs->data_len = (uint_t)tsize;
3253		rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3254
3255		if (mi->mi_io_kstats) {
3256			mutex_enter(&mi->mi_lock);
3257			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
3258			mutex_exit(&mi->mi_lock);
3259		}
3260
3261		if (!recov) {
3262			needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3263			if (e.error && !needrecov) {
3264				nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3265				    &recov_state, needrecov);
3266				return (e.error);
3267			}
3268		} else {
3269			if (e.error)
3270				return (e.error);
3271		}
3272
3273		/*
3274		 * Do handling of OLD_STATEID outside
3275		 * of the normal recovery framework.
3276		 *
3277		 * If write receives a BAD stateid error while using a
3278		 * delegation stateid, retry using the open stateid (if it
3279		 * exists).  If it doesn't have an open stateid, reopen the
3280		 * file first, then retry.
3281		 */
3282		if (!e.error && res.status == NFS4ERR_OLD_STATEID &&
3283		    sid_types.cur_sid_type != SPEC_SID) {
3284			nfs4_save_stateid(&wargs->stateid, &sid_types);
3285			if (!recov)
3286				nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3287				    &recov_state, needrecov);
3288			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3289			goto recov_retry;
3290		} else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID &&
3291		    sid_types.cur_sid_type == DEL_SID) {
3292			nfs4_save_stateid(&wargs->stateid, &sid_types);
3293			mutex_enter(&rp->r_statev4_lock);
3294			rp->r_deleg_return_pending = TRUE;
3295			mutex_exit(&rp->r_statev4_lock);
3296			if (nfs4rdwr_check_osid(vp, &e, cr)) {
3297				if (!recov)
3298					nfs4_end_fop(mi, vp, NULL, OH_WRITE,
3299					    &recov_state, needrecov);
3300				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3301				return (EIO);
3302			}
3303			if (!recov)
3304				nfs4_end_fop(mi, vp, NULL, OH_WRITE,
3305				    &recov_state, needrecov);
3306			/* hold needed for nfs4delegreturn_thread */
3307			VN_HOLD(vp);
3308			nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN|
3309			    NFS4_DR_DISCARD), FALSE);
3310			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3311			goto recov_retry;
3312		}
3313
3314		if (needrecov) {
3315			bool_t abort;
3316
3317			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3318			    "nfs4write: client got error %d, res.status %d"
3319			    ", so start recovery", e.error, res.status));
3320
3321			abort = nfs4_start_recovery(&e,
3322			    VTOMI4(vp), vp, NULL, &wargs->stateid,
3323			    NULL, OP_WRITE, NULL, NULL, NULL);
3324			if (!e.error) {
3325				e.error = geterrno4(res.status);
3326				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3327			}
3328			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3329			    &recov_state, needrecov);
3330			if (abort == FALSE)
3331				goto recov_retry;
3332			return (e.error);
3333		}
3334
3335		if (res.status) {
3336			e.error = geterrno4(res.status);
3337			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3338			if (!recov)
3339				nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3340				    &recov_state, needrecov);
3341			return (e.error);
3342		}
3343
3344		resop = &res.array[1];	/* write res */
3345		wres = &resop->nfs_resop4_u.opwrite;
3346
3347		if ((int)wres->count > tsize) {
3348			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3349
3350			zcmn_err(getzoneid(), CE_WARN,
3351			    "nfs4write: server wrote %u, requested was %u",
3352			    (int)wres->count, tsize);
3353			if (!recov)
3354				nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3355				    &recov_state, needrecov);
3356			return (EIO);
3357		}
3358		if (wres->committed == UNSTABLE4) {
3359			*stab_comm = UNSTABLE4;
3360			if (wargs->stable == DATA_SYNC4 ||
3361			    wargs->stable == FILE_SYNC4) {
3362				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3363				zcmn_err(getzoneid(), CE_WARN,
3364				    "nfs4write: server %s did not commit "
3365				    "to stable storage",
3366				    rp->r_server->sv_hostname);
3367				if (!recov)
3368					nfs4_end_fop(VTOMI4(vp), vp, NULL,
3369					    OH_WRITE, &recov_state, needrecov);
3370				return (EIO);
3371			}
3372		}
3373
3374		tsize = (int)wres->count;
3375		count -= tsize;
3376		base += tsize;
3377		offset += tsize;
3378		if (mi->mi_io_kstats) {
3379			mutex_enter(&mi->mi_lock);
3380			KSTAT_IO_PTR(mi->mi_io_kstats)->writes++;
3381			KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten +=
3382			    tsize;
3383			mutex_exit(&mi->mi_lock);
3384		}
3385		lwp_stat_update(LWP_STAT_OUBLK, 1);
3386		mutex_enter(&rp->r_statelock);
3387		if (rp->r_flags & R4HAVEVERF) {
3388			if (rp->r_writeverf != wres->writeverf) {
3389				nfs4_set_mod(vp);
3390				rp->r_writeverf = wres->writeverf;
3391			}
3392		} else {
3393			rp->r_writeverf = wres->writeverf;
3394			rp->r_flags |= R4HAVEVERF;
3395		}
3396		PURGE_ATTRCACHE4_LOCKED(rp);
3397		rp->r_flags |= R4WRITEMODIFIED;
3398		gethrestime(&rp->r_attr.va_mtime);
3399		rp->r_attr.va_ctime = rp->r_attr.va_mtime;
3400		mutex_exit(&rp->r_statelock);
3401		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3402	} while (count);
3403
3404	if (!recov)
3405		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, &recov_state,
3406		    needrecov);
3407
3408	return (e.error);
3409}
3410
3411/*
3412 * Read from a file.  Reads data in largest chunks our interface can handle.
3413 */
3414static int
3415nfs4read(vnode_t *vp, caddr_t base, offset_t offset, int count,
3416    size_t *residp, cred_t *cr, bool_t async, struct uio *uiop)
3417{
3418	mntinfo4_t *mi;
3419	COMPOUND4args_clnt args;
3420	COMPOUND4res_clnt res;
3421	READ4args *rargs;
3422	nfs_argop4 argop[2];
3423	int tsize;
3424	int doqueue;
3425	rnode4_t *rp;
3426	int data_len;
3427	bool_t is_eof;
3428	bool_t needrecov = FALSE;
3429	nfs4_recov_state_t recov_state;
3430	nfs4_stateid_types_t sid_types;
3431	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3432
3433	rp = VTOR4(vp);
3434	mi = VTOMI4(vp);
3435	doqueue = 1;
3436
3437	ASSERT(nfs_zone() == mi->mi_zone);
3438
3439	args.ctag = async ? TAG_READAHEAD : TAG_READ;
3440
3441	args.array_len = 2;
3442	args.array = argop;
3443
3444	nfs4_init_stateid_types(&sid_types);
3445
3446	recov_state.rs_flags = 0;
3447	recov_state.rs_num_retry_despite_err = 0;
3448
3449recov_retry:
3450	e.error = nfs4_start_fop(mi, vp, NULL, OH_READ,
3451	    &recov_state, NULL);
3452	if (e.error)
3453		return (e.error);
3454
3455	/* putfh target fh */
3456	argop[0].argop = OP_CPUTFH;
3457	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
3458
3459	/* read */
3460	argop[1].argop = OP_READ;
3461	rargs = &argop[1].nfs_argop4_u.opread;
3462	rargs->stateid = nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi,
3463	    OP_READ, &sid_types, async);
3464
3465	do {
3466		if (mi->mi_io_kstats) {
3467			mutex_enter(&mi->mi_lock);
3468			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
3469			mutex_exit(&mi->mi_lock);
3470		}
3471
3472		NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
3473		    "nfs4read: %s call, rp %s",
3474		    needrecov ? "recov" : "first",
3475		    rnode4info(rp)));
3476
3477		if ((vp->v_flag & VNOCACHE) ||
3478		    (rp->r_flags & R4DIRECTIO) ||
3479		    (mi->mi_flags & MI4_DIRECTIO))
3480			tsize = MIN(mi->mi_tsize, count);
3481		else
3482			tsize = MIN(mi->mi_curread, count);
3483
3484		rargs->offset = (offset4)offset;
3485		rargs->count = (count4)tsize;
3486		rargs->res_data_val_alt = NULL;
3487		rargs->res_mblk = NULL;
3488		rargs->res_uiop = NULL;
3489		rargs->res_maxsize = 0;
3490		rargs->wlist = NULL;
3491
3492		if (uiop)
3493			rargs->res_uiop = uiop;
3494		else
3495			rargs->res_data_val_alt = base;
3496		rargs->res_maxsize = tsize;
3497
3498		rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3499#ifdef	DEBUG
3500		if (nfs4read_error_inject) {
3501			res.status = nfs4read_error_inject;
3502			nfs4read_error_inject = 0;
3503		}
3504#endif
3505
3506		if (mi->mi_io_kstats) {
3507			mutex_enter(&mi->mi_lock);
3508			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
3509			mutex_exit(&mi->mi_lock);
3510		}
3511
3512		needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3513		if (e.error != 0 && !needrecov) {
3514			nfs4_end_fop(mi, vp, NULL, OH_READ,
3515			    &recov_state, needrecov);
3516			return (e.error);
3517		}
3518
3519		/*
3520		 * Do proper retry for OLD and BAD stateid errors outside
3521		 * of the normal recovery framework.  There are two differences
3522		 * between async and sync reads.  The first is that we allow
3523		 * retry on BAD_STATEID for async reads, but not sync reads.
3524		 * The second is that we mark the file dead for a failed
3525		 * attempt with a special stateid for sync reads, but just
3526		 * return EIO for async reads.
3527		 *
3528		 * If a sync read receives a BAD stateid error while using a
3529		 * delegation stateid, retry using the open stateid (if it
3530		 * exists).  If it doesn't have an open stateid, reopen the
3531		 * file first, then retry.
3532		 */
3533		if (e.error == 0 && (res.status == NFS4ERR_OLD_STATEID ||
3534		    res.status == NFS4ERR_BAD_STATEID) && async) {
3535			nfs4_end_fop(mi, vp, NULL, OH_READ,
3536			    &recov_state, needrecov);
3537			if (sid_types.cur_sid_type == SPEC_SID) {
3538				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3539				return (EIO);
3540			}
3541			nfs4_save_stateid(&rargs->stateid, &sid_types);
3542			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3543			goto recov_retry;
3544		} else if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
3545		    !async && sid_types.cur_sid_type != SPEC_SID) {
3546			nfs4_save_stateid(&rargs->stateid, &sid_types);
3547			nfs4_end_fop(mi, vp, NULL, OH_READ,
3548			    &recov_state, needrecov);
3549			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3550			goto recov_retry;
3551		} else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID &&
3552		    sid_types.cur_sid_type == DEL_SID) {
3553			nfs4_save_stateid(&rargs->stateid, &sid_types);
3554			mutex_enter(&rp->r_statev4_lock);
3555			rp->r_deleg_return_pending = TRUE;
3556			mutex_exit(&rp->r_statev4_lock);
3557			if (nfs4rdwr_check_osid(vp, &e, cr)) {
3558				nfs4_end_fop(mi, vp, NULL, OH_READ,
3559				    &recov_state, needrecov);
3560				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3561				return (EIO);
3562			}
3563			nfs4_end_fop(mi, vp, NULL, OH_READ,
3564			    &recov_state, needrecov);
3565			/* hold needed for nfs4delegreturn_thread */
3566			VN_HOLD(vp);
3567			nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN|
3568			    NFS4_DR_DISCARD), FALSE);
3569			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3570			goto recov_retry;
3571		}
3572		if (needrecov) {
3573			bool_t abort;
3574
3575			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3576			    "nfs4read: initiating recovery\n"));
3577			abort = nfs4_start_recovery(&e,
3578			    mi, vp, NULL, &rargs->stateid,
3579			    NULL, OP_READ, NULL, NULL, NULL);
3580			nfs4_end_fop(mi, vp, NULL, OH_READ,
3581			    &recov_state, needrecov);
3582			/*
3583			 * Do not retry if we got OLD_STATEID using a special
3584			 * stateid.  This avoids looping with a broken server.
3585			 */
3586			if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
3587			    sid_types.cur_sid_type == SPEC_SID)
3588				abort = TRUE;
3589
3590			if (abort == FALSE) {
3591				/*
3592				 * Need to retry all possible stateids in
3593				 * case the recovery error wasn't stateid
3594				 * related or the stateids have become
3595				 * stale (server reboot).
3596				 */
3597				nfs4_init_stateid_types(&sid_types);
3598				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3599				goto recov_retry;
3600			}
3601
3602			if (!e.error) {
3603				e.error = geterrno4(res.status);
3604				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3605			}
3606			return (e.error);
3607		}
3608
3609		if (res.status) {
3610			e.error = geterrno4(res.status);
3611			nfs4_end_fop(mi, vp, NULL, OH_READ,
3612			    &recov_state, needrecov);
3613			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3614			return (e.error);
3615		}
3616
3617		data_len = res.array[1].nfs_resop4_u.opread.data_len;
3618		count -= data_len;
3619		if (base)
3620			base += data_len;
3621		offset += data_len;
3622		if (mi->mi_io_kstats) {
3623			mutex_enter(&mi->mi_lock);
3624			KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
3625			KSTAT_IO_PTR(mi->mi_io_kstats)->nread += data_len;
3626			mutex_exit(&mi->mi_lock);
3627		}
3628		lwp_stat_update(LWP_STAT_INBLK, 1);
3629		is_eof = res.array[1].nfs_resop4_u.opread.eof;
3630		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3631
3632	} while (count && !is_eof);
3633
3634	*residp = count;
3635
3636	nfs4_end_fop(mi, vp, NULL, OH_READ, &recov_state, needrecov);
3637
3638	return (e.error);
3639}
3640
3641/* ARGSUSED */
3642static int
3643nfs4_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp,
3644    caller_context_t *ct)
3645{
3646	if (nfs_zone() != VTOMI4(vp)->mi_zone)
3647		return (EIO);
3648	switch (cmd) {
3649		case _FIODIRECTIO:
3650			return (nfs4_directio(vp, (int)arg, cr));
3651		default:
3652			return (ENOTTY);
3653	}
3654}
3655
3656/* ARGSUSED */
3657int
3658nfs4_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
3659    caller_context_t *ct)
3660{
3661	int error;
3662	rnode4_t *rp = VTOR4(vp);
3663
3664	if (nfs_zone() != VTOMI4(vp)->mi_zone)
3665		return (EIO);
3666	/*
3667	 * If it has been specified that the return value will
3668	 * just be used as a hint, and we are only being asked
3669	 * for size, fsid or rdevid, then return the client's
3670	 * notion of these values without checking to make sure
3671	 * that the attribute cache is up to date.
3672	 * The whole point is to avoid an over the wire GETATTR
3673	 * call.
3674	 */
3675	if (flags & ATTR_HINT) {
3676		if (!(vap->va_mask & ~(AT_SIZE | AT_FSID | AT_RDEV))) {
3677			mutex_enter(&rp->r_statelock);
3678			if (vap->va_mask & AT_SIZE)
3679				vap->va_size = rp->r_size;
3680			if (vap->va_mask & AT_FSID)
3681				vap->va_fsid = rp->r_attr.va_fsid;
3682			if (vap->va_mask & AT_RDEV)
3683				vap->va_rdev = rp->r_attr.va_rdev;
3684			mutex_exit(&rp->r_statelock);
3685			return (0);
3686		}
3687	}
3688
3689	/*
3690	 * Only need to flush pages if asking for the mtime
3691	 * and if there any dirty pages or any outstanding
3692	 * asynchronous (write) requests for this file.
3693	 */
3694	if (vap->va_mask & AT_MTIME) {
3695		rp = VTOR4(vp);
3696		if (nfs4_has_pages(vp)) {
3697			mutex_enter(&rp->r_statev4_lock);
3698			if (rp->r_deleg_type != OPEN_DELEGATE_WRITE) {
3699				mutex_exit(&rp->r_statev4_lock);
3700				if (rp->r_flags & R4DIRTY ||
3701				    rp->r_awcount > 0) {
3702					mutex_enter(&rp->r_statelock);
3703					rp->r_gcount++;
3704					mutex_exit(&rp->r_statelock);
3705					error =
3706					    nfs4_putpage(vp, (u_offset_t)0,
3707					    0, 0, cr, NULL);
3708					mutex_enter(&rp->r_statelock);
3709					if (error && (error == ENOSPC ||
3710					    error == EDQUOT)) {
3711						if (!rp->r_error)
3712							rp->r_error = error;
3713					}
3714					if (--rp->r_gcount == 0)
3715						cv_broadcast(&rp->r_cv);
3716					mutex_exit(&rp->r_statelock);
3717				}
3718			} else {
3719				mutex_exit(&rp->r_statev4_lock);
3720			}
3721		}
3722	}
3723	return (nfs4getattr(vp, vap, cr));
3724}
3725
/*
 * Compare the mode reported by the server with the mode held on the
 * client, ignoring the set-uid and set-gid bits of the client's copy.
 *
 * Returns 0 (OK) when the client's mode with S_ISUID and S_ISGID
 * cleared equals the server's mode, and 1 (BAD) otherwise.
 */
int
nfs4_compare_modes(mode_t from_server, mode_t on_client)
{
	mode_t client_sans_setid = on_client & ~(S_ISUID | S_ISGID);

	return ((client_sans_setid == from_server) ? 0 : 1);
}
3740
3741/*ARGSUSED4*/
3742static int
3743nfs4_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
3744    caller_context_t *ct)
3745{
3746	int error;
3747
3748	if (vap->va_mask & AT_NOSET)
3749		return (EINVAL);
3750
3751	if (nfs_zone() != VTOMI4(vp)->mi_zone)
3752		return (EIO);
3753
3754	/*
3755	 * Don't call secpolicy_vnode_setattr, the client cannot
3756	 * use its cached attributes to make security decisions
3757	 * as the server may be faking mode bits or mapping uid/gid.
3758	 * Always just let the server to the checking.
3759	 * If we provide the ability to remove basic priviledges
3760	 * to setattr (e.g. basic without chmod) then we will
3761	 * need to add a check here before calling the server.
3762	 */
3763	error = nfs4setattr(vp, vap, flags, cr, NULL);
3764
3765	if (error == 0 && (vap->va_mask & AT_SIZE) && vap->va_size == 0)
3766		vnevent_truncate(vp, ct);
3767
3768	return (error);
3769}
3770
3771/*
3772 * To replace the "guarded" version 3 setattr, we use two types of compound
3773 * setattr requests:
3774 * 1. The "normal" setattr, used when the size of the file isn't being
 *    changed - { Putfh <fh>; Setattr; Getattr }.
3776 * 2. If the size is changed, precede Setattr with: Getattr; Verify
3777 *    with only ctime as the argument. If the server ctime differs from
3778 *    what is cached on the client, the verify will fail, but we would
3779 *    already have the ctime from the preceding getattr, so just set it
3780 *    and retry. Thus the compound here is - { Putfh <fh>; Getattr; Verify;
3781 *	Setattr; Getattr }.
3782 *
3783 * The vsecattr_t * input parameter will be non-NULL if ACLs are being set in
3784 * this setattr and NULL if they are not.
3785 */
3786static int
3787nfs4setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
3788    vsecattr_t *vsap)
3789{
3790	COMPOUND4args_clnt args;
3791	COMPOUND4res_clnt res, *resp = NULL;
3792	nfs4_ga_res_t *garp = NULL;
3793	int numops = 3;			/* { Putfh; Setattr; Getattr } */
3794	nfs_argop4 argop[5];
3795	int verify_argop = -1;
3796	int setattr_argop = 1;
3797	nfs_resop4 *resop;
3798	vattr_t va;
3799	rnode4_t *rp;
3800	int doqueue = 1;
3801	uint_t mask = vap->va_mask;
3802	mode_t omode;
3803	vsecattr_t *vsp;
3804	timestruc_t ctime;
3805	bool_t needrecov = FALSE;
3806	nfs4_recov_state_t recov_state;
3807	nfs4_stateid_types_t sid_types;
3808	stateid4 stateid;
3809	hrtime_t t;
3810	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3811	servinfo4_t *svp;
3812	bitmap4 supp_attrs;
3813
3814	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
3815	rp = VTOR4(vp);
3816	nfs4_init_stateid_types(&sid_types);
3817
3818	/*
3819	 * Only need to flush pages if there are any pages and
3820	 * if the file is marked as dirty in some fashion.  The
3821	 * file must be flushed so that we can accurately
3822	 * determine the size of the file and the cached data
3823	 * after the SETATTR returns.  A file is considered to
3824	 * be dirty if it is either marked with R4DIRTY, has
3825	 * outstanding i/o's active, or is mmap'd.  In this
3826	 * last case, we can't tell whether there are dirty
3827	 * pages, so we flush just to be sure.
3828	 */
3829	if (nfs4_has_pages(vp) &&
3830	    ((rp->r_flags & R4DIRTY) ||
3831	    rp->r_count > 0 ||
3832	    rp->r_mapcnt > 0)) {
3833		ASSERT(vp->v_type != VCHR);
3834		e.error = nfs4_putpage(vp, (offset_t)0, 0, 0, cr, NULL);
3835		if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) {
3836			mutex_enter(&rp->r_statelock);
3837			if (!rp->r_error)
3838				rp->r_error = e.error;
3839			mutex_exit(&rp->r_statelock);
3840		}
3841	}
3842
3843	if (mask & AT_SIZE) {
3844		/*
3845		 * Verification setattr compound for non-deleg AT_SIZE:
3846		 *	{ Putfh; Getattr; Verify; Setattr; Getattr }
3847		 * Set ctime local here (outside the do_again label)
3848		 * so that subsequent retries (after failed VERIFY)
3849		 * will use ctime from GETATTR results (from failed
3850		 * verify compound) as VERIFY arg.
3851		 * If file has delegation, then VERIFY(time_metadata)
3852		 * is of little added value, so don't bother.
3853		 */
3854		mutex_enter(&rp->r_statev4_lock);
3855		if (rp->r_deleg_type == OPEN_DELEGATE_NONE ||
3856		    rp->r_deleg_return_pending) {
3857			numops = 5;
3858			ctime = rp->r_attr.va_ctime;
3859		}
3860		mutex_exit(&rp->r_statev4_lock);
3861	}
3862
3863	recov_state.rs_flags = 0;
3864	recov_state.rs_num_retry_despite_err = 0;
3865
3866	args.ctag = TAG_SETATTR;
3867do_again:
3868recov_retry:
3869	setattr_argop = numops - 2;
3870
3871	args.array = argop;
3872	args.array_len = numops;
3873
3874	e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state);
3875	if (e.error)
3876		return (e.error);
3877
3878
3879	/* putfh target fh */
3880	argop[0].argop = OP_CPUTFH;
3881	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
3882
3883	if (numops == 5) {
3884		/*
3885		 * We only care about the ctime, but need to get mtime
3886		 * and size for proper cache update.
3887		 */
3888		/* getattr */
3889		argop[1].argop = OP_GETATTR;
3890		argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
3891		argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
3892
3893		/* verify - set later in loop */
3894		verify_argop = 2;
3895	}
3896
3897	/* setattr */
3898	svp = rp->r_server;
3899	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3900	supp_attrs = svp->sv_supp_attrs;
3901	nfs_rw_exit(&svp->sv_lock);
3902
3903	nfs4args_setattr(&argop[setattr_argop], vap, vsap, flags, rp, cr,
3904	    supp_attrs, &e.error, &sid_types);
3905	stateid = argop[setattr_argop].nfs_argop4_u.opsetattr.stateid;
3906	if (e.error) {
3907		/* req time field(s) overflow - return immediately */
3908		nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
3909		nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
3910		    opsetattr.obj_attributes);
3911		return (e.error);
3912	}
3913	omode = rp->r_attr.va_mode;
3914
3915	/* getattr */
3916	argop[numops-1].argop = OP_GETATTR;
3917	argop[numops-1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
3918	/*
3919	 * If we are setting the ACL (indicated only by vsap != NULL), request
3920	 * the ACL in this getattr.  The ACL returned from this getattr will be
3921	 * used in updating the ACL cache.
3922	 */
3923	if (vsap != NULL)
3924		argop[numops-1].nfs_argop4_u.opgetattr.attr_request |=
3925		    FATTR4_ACL_MASK;
3926	argop[numops-1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
3927
3928	/*
3929	 * setattr iterates if the object size is set and the cached ctime
3930	 * does not match the file ctime. In that case, verify the ctime first.
3931	 */
3932
3933	do {
3934		if (verify_argop != -1) {
3935			/*
3936			 * Verify that the ctime match before doing setattr.
3937			 */
3938			va.va_mask = AT_CTIME;
3939			va.va_ctime = ctime;
3940			svp = rp->r_server;
3941			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3942			supp_attrs = svp->sv_supp_attrs;
3943			nfs_rw_exit(&svp->sv_lock);
3944			e.error = nfs4args_verify(&argop[verify_argop], &va,
3945			    OP_VERIFY, supp_attrs);
3946			if (e.error) {
3947				/* req time field(s) overflow - return */
3948				nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
3949				    needrecov);
3950				break;
3951			}
3952		}
3953
3954		doqueue = 1;
3955
3956		t = gethrtime();
3957
3958		rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e);
3959
3960		/*
3961		 * Purge the access cache and ACL cache if changing either the
3962		 * owner of the file, the group owner, or the mode.  These may
3963		 * change the access permissions of the file, so purge old
3964		 * information and start over again.
3965		 */
3966		if (mask & (AT_UID | AT_GID | AT_MODE)) {
3967			(void) nfs4_access_purge_rp(rp);
3968			if (rp->r_secattr != NULL) {
3969				mutex_enter(&rp->r_statelock);
3970				vsp = rp->r_secattr;
3971				rp->r_secattr = NULL;
3972				mutex_exit(&rp->r_statelock);
3973				if (vsp != NULL)
3974					nfs4_acl_free_cache(vsp);
3975			}
3976		}
3977
3978		/*
3979		 * If res.array_len == numops, then everything succeeded,
3980		 * except for possibly the final getattr.  If only the
3981		 * last getattr failed, give up, and don't try recovery.
3982		 */
3983		if (res.array_len == numops) {
3984			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
3985			    needrecov);
3986			if (! e.error)
3987				resp = &res;
3988			break;
3989		}
3990
3991		/*
3992		 * if either rpc call failed or completely succeeded - done
3993		 */
3994		needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
3995		if (e.error) {
3996			PURGE_ATTRCACHE4(vp);
3997			if (!needrecov) {
3998				nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
3999				    needrecov);
4000				break;
4001			}
4002		}
4003
4004		/*
4005		 * Do proper retry for OLD_STATEID outside of the normal
4006		 * recovery framework.
4007		 */
4008		if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
4009		    sid_types.cur_sid_type != SPEC_SID &&
4010		    sid_types.cur_sid_type != NO_SID) {
4011			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
4012			    needrecov);
4013			nfs4_save_stateid(&stateid, &sid_types);
4014			nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4015			    opsetattr.obj_attributes);
4016			if (verify_argop != -1) {
4017				nfs4args_verify_free(&argop[verify_argop]);
4018				verify_argop = -1;
4019			}
4020			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4021			goto recov_retry;
4022		}
4023
4024		if (needrecov) {
4025			bool_t abort;
4026
4027			abort = nfs4_start_recovery(&e,
4028			    VTOMI4(vp), vp, NULL, NULL, NULL,
4029			    OP_SETATTR, NULL, NULL, NULL);
4030			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
4031			    needrecov);
4032			/*
4033			 * Do not retry if we failed with OLD_STATEID using
4034			 * a special stateid.  This is done to avoid looping
4035			 * with a broken server.
4036			 */
4037			if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
4038			    (sid_types.cur_sid_type == SPEC_SID ||
4039			    sid_types.cur_sid_type == NO_SID))
4040				abort = TRUE;
4041			if (!e.error) {
4042				if (res.status == NFS4ERR_BADOWNER)
4043					nfs4_log_badowner(VTOMI4(vp),
4044					    OP_SETATTR);
4045
4046				e.error = geterrno4(res.status);
4047				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4048			}
4049			nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4050			    opsetattr.obj_attributes);
4051			if (verify_argop != -1) {
4052				nfs4args_verify_free(&argop[verify_argop]);
4053				verify_argop = -1;
4054			}
4055			if (abort == FALSE) {
4056				/*
4057				 * Need to retry all possible stateids in
4058				 * case the recovery error wasn't stateid
4059				 * related or the stateids have become
4060				 * stale (server reboot).
4061				 */
4062				nfs4_init_stateid_types(&sid_types);
4063				goto recov_retry;
4064			}
4065			return (e.error);
4066		}
4067
4068		/*
4069		 * Need to call nfs4_end_op before nfs4getattr to
4070		 * avoid potential nfs4_start_op deadlock. See RFE
4071		 * 4777612.  Calls to nfs4_invalidate_pages() and
4072		 * nfs4_purge_stale_fh() might also generate over the
		 * wire calls which may cause nfs4_start_op() deadlock.
4074		 */
4075		nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
4076
4077		/*
4078		 * Check to update lease.
4079		 */
4080		resp = &res;
4081		if (res.status == NFS4_OK) {
4082			break;
4083		}
4084
4085		/*
4086		 * Check if verify failed to see if try again
4087		 */
4088		if ((verify_argop == -1) || (res.array_len != 3)) {
4089			/*
4090			 * can't continue...
4091			 */
4092			if (res.status == NFS4ERR_BADOWNER)
4093				nfs4_log_badowner(VTOMI4(vp), OP_SETATTR);
4094
4095			e.error = geterrno4(res.status);
4096		} else {
4097			/*
4098			 * When the verify request fails, the client ctime is
4099			 * not in sync with the server. This is the same as
4100			 * the version 3 "not synchronized" error, and we
4101			 * handle it in a similar manner (XXX do we need to???).
4102			 * Use the ctime returned in the first getattr for
4103			 * the input to the next verify.
4104			 * If we couldn't get the attributes, then we give up
4105			 * because we can't complete the operation as required.
4106			 */
4107			garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res;
4108		}
4109		if (e.error) {
4110			PURGE_ATTRCACHE4(vp);
4111			nfs4_purge_stale_fh(e.error, vp, cr);
4112		} else {
4113			/*
4114			 * retry with a new verify value
4115			 */
4116			ctime = garp->n4g_va.va_ctime;
4117			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4118			resp = NULL;
4119		}
4120		if (!e.error) {
4121			nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4122			    opsetattr.obj_attributes);
4123			if (verify_argop != -1) {
4124				nfs4args_verify_free(&argop[verify_argop]);
4125				verify_argop = -1;
4126			}
4127			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4128			goto do_again;
4129		}
4130	} while (!e.error);
4131
4132	if (e.error) {
4133		/*
4134		 * If we are here, rfs4call has an irrecoverable error - return
4135		 */
4136		nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4137		    opsetattr.obj_attributes);
4138		if (verify_argop != -1) {
4139			nfs4args_verify_free(&argop[verify_argop]);
4140			verify_argop = -1;
4141		}
4142		if (resp)
4143			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
4144		return (e.error);
4145	}
4146
4147
4148
4149	/*
4150	 * If changing the size of the file, invalidate
4151	 * any local cached data which is no longer part
4152	 * of the file.  We also possibly invalidate the
4153	 * last page in the file.  We could use
4154	 * pvn_vpzero(), but this would mark the page as
4155	 * modified and require it to be written back to
4156	 * the server for no particularly good reason.
4157	 * This way, if we access it, then we bring it
4158	 * back in.  A read should be cheaper than a
4159	 * write.
4160	 */
4161	if (mask & AT_SIZE) {
4162		nfs4_invalidate_pages(vp, (vap->va_size & PAGEMASK), cr);
4163	}
4164
4165	/* either no error or one of the postop getattr failed */
4166
4167	/*
4168	 * XXX Perform a simplified version of wcc checking. Instead of
4169	 * have another getattr to get pre-op, just purge cache if
4170	 * any of the ops prior to and including the getattr failed.
4171	 * If the getattr succeeded then update the attrcache accordingly.
4172	 */
4173
4174	garp = NULL;
4175	if (res.status == NFS4_OK) {
4176		/*
4177		 * Last getattr
4178		 */
4179		resop = &res.array[numops - 1];
4180		garp = &resop->nfs_resop4_u.opgetattr.ga_res;
4181	}
4182	/*
4183	 * In certain cases, nfs4_update_attrcache() will purge the attrcache,
4184	 * rather than filling it.  See the function itself for details.
4185	 */
4186	e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr);
4187	if (garp != NULL) {
4188		if (garp->n4g_resbmap & FATTR4_ACL_MASK) {
4189			nfs4_acl_fill_cache(rp, &garp->n4g_vsa);
4190			vs_ace4_destroy(&garp->n4g_vsa);
4191		} else {
4192			if (vsap != NULL) {
4193				/*
4194				 * The ACL was supposed to be set and to be
4195				 * returned in the last getattr of this
4196				 * compound, but for some reason the getattr
4197				 * result doesn't contain the ACL.  In this
4198				 * case, purge the ACL cache.
4199				 */
4200				if (rp->r_secattr != NULL) {
4201					mutex_enter(&rp->r_statelock);
4202					vsp = rp->r_secattr;
4203					rp->r_secattr = NULL;
4204					mutex_exit(&rp->r_statelock);
4205					if (vsp != NULL)
4206						nfs4_acl_free_cache(vsp);
4207				}
4208			}
4209		}
4210	}
4211
4212	if (res.status == NFS4_OK && (mask & AT_SIZE)) {
4213		/*
4214		 * Set the size, rather than relying on getting it updated
4215		 * via a GETATTR.  With delegations the client tries to
4216		 * suppress GETATTR calls.
4217		 */
4218		mutex_enter(&rp->r_statelock);
4219		rp->r_size = vap->va_size;
4220		mutex_exit(&rp->r_statelock);
4221	}
4222
4223	/*
4224	 * Can free up request args and res
4225	 */
4226	nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4227	    opsetattr.obj_attributes);
4228	if (verify_argop != -1) {
4229		nfs4args_verify_free(&argop[verify_argop]);
4230		verify_argop = -1;
4231	}
4232	xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4233
4234	/*
4235	 * Some servers will change the mode to clear the setuid
4236	 * and setgid bits when changing the uid or gid.  The
4237	 * client needs to compensate appropriately.
4238	 */
4239	if (mask & (AT_UID | AT_GID)) {
4240		int terror, do_setattr;
4241
4242		do_setattr = 0;
4243		va.va_mask = AT_MODE;
4244		terror = nfs4getattr(vp, &va, cr);
4245		if (!terror &&
4246		    (((mask & AT_MODE) && va.va_mode != vap->va_mode) ||
4247		    (!(mask & AT_MODE) && va.va_mode != omode))) {
4248			va.va_mask = AT_MODE;
4249			if (mask & AT_MODE) {
4250				/*
4251				 * We asked the mode to be changed and what
4252				 * we just got from the server in getattr is
4253				 * not what we wanted it to be, so set it now.
4254				 */
4255				va.va_mode = vap->va_mode;
4256				do_setattr = 1;
4257			} else {
4258				/*
4259				 * We did not ask the mode to be changed,
4260				 * Check to see that the server just cleared
4261				 * I_SUID and I_GUID from it. If not then
4262				 * set mode to omode with UID/GID cleared.
4263				 */
4264				if (nfs4_compare_modes(va.va_mode, omode)) {
4265					omode &= ~(S_ISUID|S_ISGID);
4266					va.va_mode = omode;
4267					do_setattr = 1;
4268				}
4269			}
4270
4271			if (do_setattr)
4272				(void) nfs4setattr(vp, &va, 0, cr, NULL);
4273		}
4274	}
4275
4276	return (e.error);
4277}
4278
/*
 * nfs4_access - check whether the caller may access vp in the ways
 * requested by mode (any combination of VREAD, VWRITE and VEXEC).
 *
 * The mode bits are translated into ACCESS4_* bits and first checked with
 * nfs4_access_check().  If that gives neither NFS4_ACCESS_ALLOWED nor
 * NFS4_ACCESS_DENIED, a PUTFH/ACCESS compound (followed by a GETATTR when
 * r_deleg_type is OPEN_DELEGATE_NONE) is sent over the wire and the result
 * is handed to nfs4_access_cache().
 *
 * When access is denied for cr and crnetadjust() returned an adjusted
 * credential, the whole check is repeated once with that credential.
 *
 * Returns 0 when access is allowed, EIO when called from a zone other than
 * the mount's, EROFS for VWRITE on a read-only file system (non-device
 * vnodes only), EACCES when access is denied, or the error reported by
 * nfs4_validate_caches(), nfs4_start_fop() or the over-the-wire call.
 */
/* ARGSUSED */
static int
nfs4_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	int doqueue;
	uint32_t acc, resacc, argacc;
	rnode4_t *rp;
	cred_t *cred, *ncr, *ncrfree = NULL;
	nfs4_access_type_t cacc;
	int num_ops;
	nfs_argop4 argop[3];
	nfs_resop4 *resop;
	bool_t needrecov = FALSE, do_getattr;
	nfs4_recov_state_t recov_state;
	int rpc_error;
	hrtime_t t;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	mntinfo4_t *mi = VTOMI4(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	/*
	 * Translate the requested mode bits into the set of ACCESS4_* bits
	 * that must all be granted for the request to succeed.
	 */
	acc = 0;
	if (mode & VREAD)
		acc |= ACCESS4_READ;
	if (mode & VWRITE) {
		if ((vp->v_vfsp->vfs_flag & VFS_RDONLY) && !ISVDEV(vp->v_type))
			return (EROFS);
		if (vp->v_type == VDIR)
			acc |= ACCESS4_DELETE;
		acc |= ACCESS4_MODIFY | ACCESS4_EXTEND;
	}
	if (mode & VEXEC) {
		if (vp->v_type == VDIR)
			acc |= ACCESS4_LOOKUP;
		else
			acc |= ACCESS4_EXECUTE;
	}

	/*
	 * Only bother validating the caches when there is cached access
	 * information that could be consulted below.
	 */
	if (VTOR4(vp)->r_acache != NULL) {
		e.error = nfs4_validate_caches(vp, cr);
		if (e.error)
			return (e.error);
	}

	/*
	 * The over-the-wire request asks about every access bit that
	 * applies to this vnode type rather than just the bits in acc;
	 * argacc and the server's answer are both passed to
	 * nfs4_access_cache() further down.
	 */
	rp = VTOR4(vp);
	if (vp->v_type == VDIR)
		argacc = ACCESS4_READ | ACCESS4_DELETE | ACCESS4_MODIFY |
		    ACCESS4_EXTEND | ACCESS4_LOOKUP;
	else
		argacc = ACCESS4_READ | ACCESS4_MODIFY | ACCESS4_EXTEND |
		    ACCESS4_EXECUTE;
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	cred = cr;
	/*
	 * ncr and ncrfree both initially
	 * point to the memory area returned
	 * by crnetadjust();
	 * ncrfree not NULL when exiting means
	 * that we need to release it
	 */
	ncr = crnetadjust(cred);
	ncrfree = ncr;

tryagain:
	/*
	 * Consult the cached access information for the current credential.
	 * A definite answer ends the call here (after possibly retrying
	 * once with the adjusted credential); anything else falls through
	 * to the over-the-wire request.
	 */
	cacc = nfs4_access_check(rp, acc, cred);
	if (cacc == NFS4_ACCESS_ALLOWED) {
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (0);
	}
	if (cacc == NFS4_ACCESS_DENIED) {
		/*
		 * If the cred can be adjusted, try again
		 * with the new cred.
		 */
		if (ncr != NULL) {
			cred = ncr;
			ncr = NULL;
			goto tryagain;
		}
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (EACCES);
	}

recov_retry:
	/*
	 * r_statev4_lock is deliberately not taken here: r_deleg_type
	 * could change as soon as the lock is released anyway.  Since it
	 * is an int, there is no atomicity issue.
	 */
	do_getattr = (rp->r_deleg_type == OPEN_DELEGATE_NONE);
	num_ops = do_getattr ? 3 : 2;

	args.ctag = TAG_ACCESS;

	args.array_len = num_ops;
	args.array = argop;

	if (e.error = nfs4_start_fop(mi, vp, NULL, OH_ACCESS,
	    &recov_state, NULL)) {
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (e.error);
	}

	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;

	/* access */
	argop[1].argop = OP_ACCESS;
	argop[1].nfs_argop4_u.opaccess.access = argacc;

	/* getattr */
	if (do_getattr) {
		argop[2].argop = OP_GETATTR;
		argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[2].nfs_argop4_u.opgetattr.mi = mi;
	}

	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
	    "nfs4_access: %s call, rp %s", needrecov ? "recov" : "first",
	    rnode4info(VTOR4(vp))));

	doqueue = 1;
	t = gethrtime();
	rfs4call(VTOMI4(vp), &args, &res, cred, &doqueue, 0, &e);
	/*
	 * Remember whether the RPC itself failed.  e.error is overwritten
	 * below, but res is only freed at "out" when the call returned
	 * without an RPC-level error.
	 */
	rpc_error = e.error;

	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
	if (needrecov) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4_access: initiating recovery\n"));

		/*
		 * A FALSE return from nfs4_start_recovery() means the
		 * request should be retried: end this op, release any
		 * decoded results and start over.
		 */
		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_ACCESS, NULL, NULL, NULL) == FALSE) {
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_ACCESS,
			    &recov_state, needrecov);
			if (!e.error)
				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		}
	}
	nfs4_end_fop(mi, vp, NULL, OH_ACCESS, &recov_state, needrecov);

	if (e.error)
		goto out;

	if (res.status) {
		e.error = geterrno4(res.status);
		/*
		 * nfs4_purge_stale_fh() might generate over-the-wire calls
		 * through nfs4_invalidate_pages().  The op was therefore
		 * already ended with nfs4_end_fop() above, before getting
		 * here, to avoid a deadlock.
		 */
		nfs4_purge_stale_fh(e.error, vp, cr);
		goto out;
	}
	resop = &res.array[1];	/* access res */

	resacc = resop->nfs_resop4_u.opaccess.access;

	if (do_getattr) {
		resop++;	/* getattr res */
		nfs4_attr_cache(vp, &resop->nfs_resop4_u.opgetattr.ga_res,
		    t, cr, FALSE, NULL);
	}

	if (!e.error) {
		nfs4_access_cache(rp, argacc, resacc, cred);
		/*
		 * we just cached results with cred; if cred is the
		 * adjusted credentials from crnetadjust, we do not want
		 * to release them before exiting: hence setting ncrfree
		 * to NULL
		 */
		if (cred != cr)
			ncrfree = NULL;
		/* XXX check the supported bits too? */
		if ((acc & resacc) != acc) {
			/*
			 * The following code implements the semantic
			 * that a setuid root program has *at least* the
			 * permissions of the user that is running the
			 * program.  See rfs3call() for more portions
			 * of the implementation of this functionality.
			 */
			/* XXX-LP */
			if (ncr != NULL) {
				/*
				 * Release this round's results before
				 * retrying with the adjusted credential;
				 * res may be filled in again by the retry.
				 */
				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
				cred = ncr;
				ncr = NULL;
				goto tryagain;
			}
			e.error = EACCES;
		}
	}

out:
	/* res holds decoded results only if the RPC itself succeeded. */
	if (!rpc_error)
		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

	if (ncrfree != NULL)
		crfree(ncrfree);

	return (e.error);
}
4492
4493/* ARGSUSED */
4494static int
4495nfs4_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct)
4496{
4497	COMPOUND4args_clnt args;
4498	COMPOUND4res_clnt res;
4499	int doqueue;
4500	rnode4_t *rp;
4501	nfs_argop4 argop[3];
4502	nfs_resop4 *resop;
4503	READLINK4res *lr_res;
4504	nfs4_ga_res_t *garp;
4505	uint_t len;
4506	char *linkdata;
4507	bool_t needrecov = FALSE;
4508	nfs4_recov_state_t recov_state;
4509	hrtime_t t;
4510	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
4511
4512	if (nfs_zone() != VTOMI4(vp)->mi_zone)
4513		return (EIO);
4514	/*
4515	 * Can't readlink anything other than a symbolic link.
4516	 */
4517	if (vp->v_type != VLNK)
4518		return (EINVAL);
4519
4520	rp = VTOR4(vp);
4521	if (nfs4_do_symlink_cache && rp->r_symlink.contents != NULL) {
4522		e.error = nfs4_validate_caches(vp, cr);
4523		if (e.error)
4524			return (e.error);
4525		mutex_enter(&rp->r_statelock);
4526		if (rp->r_symlink.contents != NULL) {
4527			e.error = uiomove(rp->r_symlink.contents,
4528			    rp->r_symlink.len, UIO_READ, uiop);
4529			mutex_exit(&rp->r_statelock);
4530			return (e.error);
4531		}
4532		mutex_exit(&rp->r_statelock);
4533	}
4534	recov_state.rs_flags = 0;
4535	recov_state.rs_num_retry_despite_err = 0;
4536
4537recov_retry:
4538	args.array_len = 3;
4539	args.array = argop;
4540	args.ctag = TAG_READLINK;
4541
4542	e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state);
4543	if (e.error) {
4544		return (e.error);
4545	}
4546
4547	/* 0. putfh symlink fh */
4548	argop[0].argop = OP_CPUTFH;
4549	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
4550
4551	/* 1. readlink */
4552	argop[1].argop = OP_READLINK;
4553
4554	/* 2. getattr */
4555	argop[2].argop = OP_GETATTR;
4556	argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
4557	argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
4558
4559	doqueue = 1;
4560
4561	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
4562	    "nfs4_readlink: %s call, rp %s", needrecov ? "recov" : "first",
4563	    rnode4info(VTOR4(vp))));
4564
4565	t = gethrtime();
4566
4567	rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e);
4568
4569	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
4570	if (needrecov) {
4571		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
4572		    "nfs4_readlink: initiating recovery\n"));
4573
4574		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
4575		    NULL, OP_READLINK, NULL, NULL, NULL) == FALSE) {
4576			if (!e.error)
4577				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4578
4579			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
4580			    needrecov);
4581			goto recov_retry;
4582		}
4583	}
4584
4585	nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
4586
4587	if (e.error)
4588		return (e.error);
4589
4590	/*
4591	 * There is an path in the code below which calls
4592	 * nfs4_purge_stale_fh(), which may generate otw calls through
4593	 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
4594	 * here to avoid nfs4_start_op() deadlock.
4595	 */
4596
4597	if (res.status && (res.array_len < args.array_len)) {
4598		/*
4599		 * either Putfh or Link failed
4600		 */
4601		e.error = geterrno4(res.status);
4602		nfs4_purge_stale_fh(e.error, vp, cr);
4603		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4604		return (e.error);
4605	}
4606
4607	resop = &res.array[1];	/* readlink res */
4608	lr_res = &resop->nfs_resop4_u.opreadlink;
4609
4610	/*
4611	 * treat symlink names as data
4612	 */
4613	linkdata = utf8_to_str((utf8string *)&lr_res->link, &len, NULL);
4614	if (linkdata != NULL) {
4615		int uio_len = len - 1;
4616		/* len includes null byte, which we won't uiomove */
4617		e.error = uiomove(linkdata, uio_len, UIO_READ, uiop);
4618		if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) {
4619			mutex_enter(&rp->r_statelock);
4620			if (rp->r_symlink.contents == NULL) {
4621				rp->r_symlink.contents = linkdata;
4622				rp->r_symlink.len = uio_len;
4623				rp->r_symlink.size = len;
4624				mutex_exit(&rp->r_statelock);
4625			} else {
4626				mutex_exit(&rp->r_statelock);
4627				kmem_free(linkdata, len);
4628			}
4629		} else {
4630			kmem_free(linkdata, len);
4631		}
4632	}
4633	if (res.status == NFS4_OK) {
4634		resop++;	/* getattr res */
4635		garp = &resop->nfs_resop4_u.opgetattr.ga_res;
4636	}
4637	e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr);
4638
4639	xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4640
4641	/*
4642	 * The over the wire error for attempting to readlink something
4643	 * other than a symbolic link is ENXIO.  However, we need to
4644	 * return EINVAL instead of ENXIO, so we map it here.
4645	 */
4646	return (e.error == ENXIO ? EINVAL : e.error);
4647}
4648
4649/*
4650 * Flush local dirty pages to stable storage on the server.
4651 *
4652 * If FNODSYNC is specified, then there is nothing to do because
4653 * metadata changes are not cached on the client before being
4654 * sent to the server.
4655 */
4656/* ARGSUSED */
4657static int
4658nfs4_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
4659{
4660	int error;
4661
4662	if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
4663		return (0);
4664	if (nfs_zone() != VTOMI4(vp)->mi_zone)
4665		return (EIO);
4666	error = nfs4_putpage_commit(vp, (offset_t)0, 0, cr);
4667	if (!error)
4668		error = VTOR4(vp)->r_error;
4669	return (error);
4670}
4671
4672/*
4673 * Weirdness: if the file was removed or the target of a rename
4674 * operation while it was open, it got renamed instead.  Here we
4675 * remove the renamed file.
4676 */
4677/* ARGSUSED */
4678void
4679nfs4_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4680{
4681	rnode4_t *rp;
4682
4683	ASSERT(vp != DNLC_NO_VNODE);
4684
4685	rp = VTOR4(vp);
4686
4687	if (IS_SHADOW(vp, rp)) {
4688		sv_inactive(vp);
4689		return;
4690	}
4691
4692	/*
4693	 * If this is coming from the wrong zone, we let someone in the right
4694	 * zone take care of it asynchronously.  We can get here due to
4695	 * VN_RELE() being called from pageout() or fsflush().  This call may
4696	 * potentially turn into an expensive no-op if, for instance, v_count
4697	 * gets incremented in the meantime, but it's still correct.
4698	 */
4699	if (nfs_zone() != VTOMI4(vp)->mi_zone) {
4700		nfs4_async_inactive(vp, cr);
4701		return;
4702	}
4703
4704	/*
4705	 * Some of the cleanup steps might require over-the-wire
4706	 * operations.  Since VOP_INACTIVE can get called as a result of
4707	 * other over-the-wire operations (e.g., an attribute cache update
4708	 * can lead to a DNLC purge), doing those steps now would lead to a
4709	 * nested call to the recovery framework, which can deadlock.  So
4710	 * do any over-the-wire cleanups asynchronously, in a separate
4711	 * thread.
4712	 */
4713
4714	mutex_enter(&rp->r_os_lock);
4715	mutex_enter(&rp->r_statelock);
4716	mutex_enter(&rp->r_statev4_lock);
4717
4718	if (vp->v_type == VREG && list_head(&rp->r_open_streams) != NULL) {
4719		mutex_exit(&rp->r_statev4_lock);
4720		mutex_exit(&rp->r_statelock);
4721		mutex_exit(&rp->r_os_lock);
4722		nfs4_async_inactive(vp, cr);
4723		return;
4724	}
4725
4726	if (rp->r_deleg_type == OPEN_DELEGATE_READ ||
4727	    rp->r_deleg_type == OPEN_DELEGATE_WRITE) {
4728		mutex_exit(&rp->r_statev4_lock);
4729		mutex_exit(&rp->r_statelock);
4730		mutex_exit(&rp->r_os_lock);
4731		nfs4_async_inactive(vp, cr);
4732		return;
4733	}
4734
4735	if (rp->r_unldvp != NULL) {
4736		mutex_exit(&rp->r_statev4_lock);
4737		mutex_exit(&rp->r_statelock);
4738		mutex_exit(&rp->r_os_lock);
4739		nfs4_async_inactive(vp, cr);
4740		return;
4741	}
4742	mutex_exit(&rp->r_statev4_lock);
4743	mutex_exit(&rp->r_statelock);
4744	mutex_exit(&rp->r_os_lock);
4745
4746	rp4_addfree(rp, cr);
4747}
4748
4749/*
4750 * nfs4_inactive_otw - nfs4_inactive, plus over-the-wire calls to free up
4751 * various bits of state.  The caller must not refer to vp after this call.
4752 */
4753
4754void
4755nfs4_inactive_otw(vnode_t *vp, cred_t *cr)
4756{
4757	rnode4_t *rp = VTOR4(vp);
4758	nfs4_recov_state_t recov_state;
4759	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
4760	vnode_t *unldvp;
4761	char *unlname;
4762	cred_t *unlcred;
4763	COMPOUND4args_clnt args;
4764	COMPOUND4res_clnt res, *resp;
4765	nfs_argop4 argop[2];
4766	int doqueue;
4767#ifdef DEBUG
4768	char *name;
4769#endif
4770
4771	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
4772	ASSERT(!IS_SHADOW(vp, rp));
4773
4774#ifdef DEBUG
4775	name = fn_name(VTOSV(vp)->sv_name);
4776	NFS4_DEBUG(nfs4_client_inactive_debug, (CE_NOTE, "nfs4_inactive_otw: "
4777	    "release vnode %s", name));
4778	kmem_free(name, MAXNAMELEN);
4779#endif
4780
4781	if (vp->v_type == VREG) {
4782		bool_t recov_failed = FALSE;
4783
4784		e.error = nfs4close_all(vp, cr);
4785		if (e.error) {
4786			/* Check to see if recovery failed */
4787			mutex_enter(&(VTOMI4(vp)->mi_lock));
4788			if (VTOMI4(vp)->mi_flags & MI4_RECOV_FAIL)
4789				recov_failed = TRUE;
4790			mutex_exit(&(VTOMI4(vp)->mi_lock));
4791			if (!recov_failed) {
4792				mutex_enter(&rp->r_statelock);
4793				if (rp->r_flags & R4RECOVERR)
4794					recov_failed = TRUE;
4795				mutex_exit(&rp->r_statelock);
4796			}
4797			if (recov_failed) {
4798				NFS4_DEBUG(nfs4_client_recov_debug,
4799				    (CE_NOTE, "nfs4_inactive_otw: "
4800				    "close failed (recovery failure)"));
4801			}
4802		}
4803	}
4804
4805redo:
4806	if (rp->r_unldvp == NULL) {
4807		rp4_addfree(rp, cr);
4808		return;
4809	}
4810
4811	/*
4812	 * Save the vnode pointer for the directory where the
4813	 * unlinked-open file got renamed, then set it to NULL
4814	 * to prevent another thread from getting here before
4815	 * we're done with the remove.  While we have the
4816	 * statelock, make local copies of the pertinent rnode
4817	 * fields.  If we weren't to do this in an atomic way, the
4818	 * the unl* fields could become inconsistent with respect
4819	 * to each other due to a race condition between this
4820	 * code and nfs_remove().  See bug report 1034328.
4821	 */
4822	mutex_enter(&rp->r_statelock);
4823	if (rp->r_unldvp == NULL) {
4824		mutex_exit(&rp->r_statelock);
4825		rp4_addfree(rp, cr);
4826		return;
4827	}
4828
4829	unldvp = rp->r_unldvp;
4830	rp->r_unldvp = NULL;
4831	unlname = rp->r_unlname;
4832	rp->r_unlname = NULL;
4833	unlcred = rp->r_unlcred;
4834	rp->r_unlcred = NULL;
4835	mutex_exit(&rp->r_statelock);
4836
4837	/*
4838	 * If there are any dirty pages left, then flush
4839	 * them.  This is unfortunate because they just
4840	 * may get thrown away during the remove operation,
4841	 * but we have to do this for correctness.
4842	 */
4843	if (nfs4_has_pages(vp) &&
4844	    ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) {
4845		ASSERT(vp->v_type != VCHR);
4846		e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, NULL);
4847		if (e.error) {
4848			mutex_enter(&rp->r_statelock);
4849			if (!rp->r_error)
4850				rp->r_error = e.error;
4851			mutex_exit(&rp->r_statelock);
4852		}
4853	}
4854
4855	recov_state.rs_flags = 0;
4856	recov_state.rs_num_retry_despite_err = 0;
4857recov_retry_remove:
4858	/*
4859	 * Do the remove operation on the renamed file
4860	 */
4861	args.ctag = TAG_INACTIVE;
4862
4863	/*
4864	 * Remove ops: putfh dir; remove
4865	 */
4866	args.array_len = 2;
4867	args.array = argop;
4868
4869	e.error = nfs4_start_op(VTOMI4(unldvp), unldvp, NULL, &recov_state);
4870	if (e.error) {
4871		kmem_free(unlname, MAXNAMELEN);
4872		crfree(unlcred);
4873		VN_RELE(unldvp);
4874		/*
4875		 * Try again; this time around r_unldvp will be NULL, so we'll
4876		 * just call rp4_addfree() and return.
4877		 */
4878		goto redo;
4879	}
4880
4881	/* putfh directory */
4882	argop[0].argop = OP_CPUTFH;
4883	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(unldvp)->r_fh;
4884
4885	/* remove */
4886	argop[1].argop = OP_CREMOVE;
4887	argop[1].nfs_argop4_u.opcremove.ctarget = unlname;
4888
4889	doqueue = 1;
4890	resp = &res;
4891
4892#if 0 /* notyet */
4893	/*
4894	 * Can't do this yet.  We may be being called from
4895	 * dnlc_purge_XXX while that routine is holding a
4896	 * mutex lock to the nc_rele list.  The calls to
4897	 * nfs3_cache_wcc_data may result in calls to
4898	 * dnlc_purge_XXX.  This will result in a deadlock.
4899	 */
4900	rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e);
4901	if (e.error) {
4902		PURGE_ATTRCACHE4(unldvp);
4903		resp = NULL;
4904	} else if (res.status) {
4905		e.error = geterrno4(res.status);
4906		PURGE_ATTRCACHE4(unldvp);
4907		/*
4908		 * This code is inactive right now
4909		 * but if made active there should
4910		 * be a nfs4_end_op() call before
4911		 * nfs4_purge_stale_fh to avoid start_op()
4912		 * deadlock. See BugId: 4948726
4913		 */
4914		nfs4_purge_stale_fh(error, unldvp, cr);
4915	} else {
4916		nfs_resop4 *resop;
4917		REMOVE4res *rm_res;
4918
4919		resop = &res.array[1];
4920		rm_res = &resop->nfs_resop4_u.opremove;
4921		/*
4922		 * Update directory cache attribute,
4923		 * readdir and dnlc caches.
4924		 */
4925		nfs4_update_dircaches(&rm_res->cinfo, unldvp, NULL, NULL, NULL);
4926	}
4927#else
4928	rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e);
4929
4930	PURGE_ATTRCACHE4(unldvp);
4931#endif
4932
4933	if (nfs4_needs_recovery(&e, FALSE, unldvp->v_vfsp)) {
4934		if (nfs4_start_recovery(&e, VTOMI4(unldvp), unldvp, NULL,
4935		    NULL, NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) {
4936			if (!e.error)
4937				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4938			nfs4_end_op(VTOMI4(unldvp), unldvp, NULL,
4939			    &recov_state, TRUE);
4940			goto recov_retry_remove;
4941		}
4942	}
4943	nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, &recov_state, FALSE);
4944
4945	/*
4946	 * Release stuff held for the remove
4947	 */
4948	VN_RELE(unldvp);
4949	if (!e.error && resp)
4950		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
4951
4952	kmem_free(unlname, MAXNAMELEN);
4953	crfree(unlcred);
4954	goto redo;
4955}
4956
4957/*
4958 * Remote file system operations having to do with directory manipulation.
4959 */
4960/* ARGSUSED3 */
4961int
4962nfs4_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
4963    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
4964    int *direntflags, pathname_t *realpnp)
4965{
4966	int error;
4967	vnode_t *vp, *avp = NULL;
4968	rnode4_t *drp;
4969
4970	*vpp = NULL;
4971	if (nfs_zone() != VTOMI4(dvp)->mi_zone)
4972		return (EPERM);
4973	/*
4974	 * if LOOKUP_XATTR, must replace dvp (object) with
4975	 * object's attrdir before continuing with lookup
4976	 */
4977	if (flags & LOOKUP_XATTR) {
4978		error = nfs4lookup_xattr(dvp, nm, &avp, flags, cr);
4979		if (error)
4980			return (error);
4981
4982		dvp = avp;
4983
4984		/*
4985		 * If lookup is for "", just return dvp now.  The attrdir
4986		 * has already been activated (from nfs4lookup_xattr), and
4987		 * the caller will RELE the original dvp -- not
4988		 * the attrdir.  So, set vpp and return.
4989		 * Currently, when the LOOKUP_XATTR flag is
4990		 * passed to VOP_LOOKUP, the name is always empty, and
4991		 * shortcircuiting here avoids 3 unneeded lock/unlock
4992		 * pairs.
4993		 *
4994		 * If a non-empty name was provided, then it is the
4995		 * attribute name, and it will be looked up below.
4996		 */
4997		if (*nm == '\0') {
4998			*vpp = dvp;
4999			return (0);
5000		}
5001
5002		/*
5003		 * The vfs layer never sends a name when asking for the
5004		 * attrdir, so we should never get here (unless of course
5005		 * name is passed at some time in future -- at which time
5006		 * we'll blow up here).
5007		 */
5008		ASSERT(0);
5009	}
5010
5011	drp = VTOR4(dvp);
5012	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
5013		return (EINTR);
5014
5015	error = nfs4lookup(dvp, nm, vpp, cr, 0);
5016	nfs_rw_exit(&drp->r_rwlock);
5017
5018	/*
5019	 * If vnode is a device, create special vnode.
5020	 */
5021	if (!error && ISVDEV((*vpp)->v_type)) {
5022		vp = *vpp;
5023		*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
5024		VN_RELE(vp);
5025	}
5026
5027	return (error);
5028}
5029
5030/* ARGSUSED */
5031static int
5032nfs4lookup_xattr(vnode_t *dvp, char *nm, vnode_t **vpp, int flags, cred_t *cr)
5033{
5034	int error;
5035	rnode4_t *drp;
5036	int cflag = ((flags & CREATE_XATTR_DIR) != 0);
5037	mntinfo4_t *mi;
5038
5039	mi = VTOMI4(dvp);
5040	if (!(mi->mi_vfsp->vfs_flag & VFS_XATTR) &&
5041	    !vfs_has_feature(mi->mi_vfsp, VFSFT_SYSATTR_VIEWS))
5042		return (EINVAL);
5043
5044	drp = VTOR4(dvp);
5045	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
5046		return (EINTR);
5047
5048	mutex_enter(&drp->r_statelock);
5049	/*
5050	 * If the server doesn't support xattrs just return EINVAL
5051	 */
5052	if (drp->r_xattr_dir == NFS4_XATTR_DIR_NOTSUPP) {
5053		mutex_exit(&drp->r_statelock);
5054		nfs_rw_exit(&drp->r_rwlock);
5055		return (EINVAL);
5056	}
5057
5058	/*
5059	 * If there is a cached xattr directory entry,
5060	 * use it as long as the attributes are valid. If the
5061	 * attributes are not valid, take the simple approach and
5062	 * free the cached value and re-fetch a new value.
5063	 *
5064	 * We don't negative entry cache for now, if we did we
5065	 * would need to check if the file has changed on every
5066	 * lookup. But xattrs don't exist very often and failing
5067	 * an openattr is not much more expensive than and NVERIFY or GETATTR
5068	 * so do an openattr over the wire for now.
5069	 */
5070	if (drp->r_xattr_dir != NULL) {
5071		if (ATTRCACHE4_VALID(dvp)) {
5072			VN_HOLD(drp->r_xattr_dir);
5073			*vpp = drp->r_xattr_dir;
5074			mutex_exit(&drp->r_statelock);
5075			nfs_rw_exit(&drp->r_rwlock);
5076			return (0);
5077		}
5078		VN_RELE(drp->r_xattr_dir);
5079		drp->r_xattr_dir = NULL;
5080	}
5081	mutex_exit(&drp->r_statelock);
5082
5083	error = nfs4openattr(dvp, vpp, cflag, cr);
5084
5085	nfs_rw_exit(&drp->r_rwlock);
5086
5087	return (error);
5088}
5089
5090static int
5091nfs4lookup(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr, int skipdnlc)
5092{
5093	int error;
5094	rnode4_t *drp;
5095
5096	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
5097
5098	/*
5099	 * If lookup is for "", just return dvp.  Don't need
5100	 * to send it over the wire, look it up in the dnlc,
5101	 * or perform any access checks.
5102	 */
5103	if (*nm == '\0') {
5104		VN_HOLD(dvp);
5105		*vpp = dvp;
5106		return (0);
5107	}
5108
5109	/*
5110	 * Can't do lookups in non-directories.
5111	 */
5112	if (dvp->v_type != VDIR)
5113		return (ENOTDIR);
5114
5115	/*
5116	 * If lookup is for ".", just return dvp.  Don't need
5117	 * to send it over the wire or look it up in the dnlc,
5118	 * just need to check access.
5119	 */
5120	if (nm[0] == '.' && nm[1] == '\0') {
5121		error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
5122		if (error)
5123			return (error);
5124		VN_HOLD(dvp);
5125		*vpp = dvp;
5126		return (0);
5127	}
5128
5129	drp = VTOR4(dvp);
5130	if (!(drp->r_flags & R4LOOKUP)) {
5131		mutex_enter(&drp->r_statelock);
5132		drp->r_flags |= R4LOOKUP;
5133		mutex_exit(&drp->r_statelock);
5134	}
5135
5136	*vpp = NULL;
5137	/*
5138	 * Lookup this name in the DNLC.  If there is no entry
5139	 * lookup over the wire.
5140	 */
5141	if (!skipdnlc)
5142		*vpp = dnlc_lookup(dvp, nm);
5143	if (*vpp == NULL) {
5144		/*
5145		 * We need to go over the wire to lookup the name.
5146		 */
5147		return (nfs4lookupnew_otw(dvp, nm, vpp, cr));
5148	}
5149
5150	/*
5151	 * We hit on the dnlc
5152	 */
5153	if (*vpp != DNLC_NO_VNODE ||
5154	    (dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
5155		/*
5156		 * But our attrs may not be valid.
5157		 */
5158		if (ATTRCACHE4_VALID(dvp)) {
5159			error = nfs4_waitfor_purge_complete(dvp);
5160			if (error) {
5161				VN_RELE(*vpp);
5162				*vpp = NULL;
5163				return (error);
5164			}
5165
5166			/*
5167			 * If after the purge completes, check to make sure
5168			 * our attrs are still valid.
5169			 */
5170			if (ATTRCACHE4_VALID(dvp)) {
5171				/*
5172				 * If we waited for a purge we may have
5173				 * lost our vnode so look it up again.
5174				 */
5175				VN_RELE(*vpp);
5176				*vpp = dnlc_lookup(dvp, nm);
5177				if (*vpp == NULL)
5178					return (nfs4lookupnew_otw(dvp,
5179					    nm, vpp, cr));
5180
5181				/*
5182				 * The access cache should almost always hit
5183				 */
5184				error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
5185
5186				if (error) {
5187					VN_RELE(*vpp);
5188					*vpp = NULL;
5189					return (error);
5190				}
5191				if (*vpp == DNLC_NO_VNODE) {
5192					VN_RELE(*vpp);
5193					*vpp = NULL;
5194					return (ENOENT);
5195				}
5196				return (0);
5197			}
5198		}
5199	}
5200
5201	ASSERT(*vpp != NULL);
5202
5203	/*
5204	 * We may have gotten here we have one of the following cases:
5205	 *	1) vpp != DNLC_NO_VNODE, our attrs have timed out so we
5206	 *		need to validate them.
5207	 *	2) vpp == DNLC_NO_VNODE, a negative entry that we always
5208	 *		must validate.
5209	 *
5210	 * Go to the server and check if the directory has changed, if
5211	 * it hasn't we are done and can use the dnlc entry.
5212	 */
5213	return (nfs4lookupvalidate_otw(dvp, nm, vpp, cr));
5214}
5215
5216/*
5217 * Go to the server and check if the directory has changed, if
5218 * it hasn't we are done and can use the dnlc entry.  If it
5219 * has changed we get a new copy of its attributes and check
5220 * the access for VEXEC, then relookup the filename and
5221 * get its filehandle and attributes.
5222 *
5223 * PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR
5224 *	if the NVERIFY failed we must
5225 *		purge the caches
5226 *		cache new attributes (will set r_time_attr_inval)
5227 *		cache new access
5228 *		recheck VEXEC access
5229 *		add name to dnlc, possibly negative
5230 *		if LOOKUP succeeded
5231 *			cache new attributes
5232 *	else
5233 *		set a new r_time_attr_inval for dvp
5234 *		check to make sure we have access
5235 *
5236 * The vpp returned is the vnode passed in if the directory is valid,
5237 * a new vnode if successful lookup, or NULL on error.
5238 */
5239static int
5240nfs4lookupvalidate_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
5241{
5242	COMPOUND4args_clnt args;
5243	COMPOUND4res_clnt res;
5244	fattr4 *ver_fattr;
5245	fattr4_change dchange;
5246	int32_t *ptr;
5247	int argoplist_size  = 7 * sizeof (nfs_argop4);
5248	nfs_argop4 *argop;
5249	int doqueue;
5250	mntinfo4_t *mi;
5251	nfs4_recov_state_t recov_state;
5252	hrtime_t t;
5253	int isdotdot;
5254	vnode_t *nvp;
5255	nfs_fh4 *fhp;
5256	nfs4_sharedfh_t *sfhp;
5257	nfs4_access_type_t cacc;
5258	rnode4_t *nrp;
5259	rnode4_t *drp = VTOR4(dvp);
5260	nfs4_ga_res_t *garp = NULL;
5261	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
5262
5263	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
5264	ASSERT(nm != NULL);
5265	ASSERT(nm[0] != '\0');
5266	ASSERT(dvp->v_type == VDIR);
5267	ASSERT(nm[0] != '.' || nm[1] != '\0');
5268	ASSERT(*vpp != NULL);
5269
5270	if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') {
5271		isdotdot = 1;
5272		args.ctag = TAG_LOOKUP_VPARENT;
5273	} else {
5274		/*
5275		 * If dvp were a stub, it should have triggered and caused
5276		 * a mount for us to get this far.
5277		 */
5278		ASSERT(!RP_ISSTUB(VTOR4(dvp)));
5279
5280		isdotdot = 0;
5281		args.ctag = TAG_LOOKUP_VALID;
5282	}
5283
5284	mi = VTOMI4(dvp);
5285	recov_state.rs_flags = 0;
5286	recov_state.rs_num_retry_despite_err = 0;
5287
5288	nvp = NULL;
5289
5290	/* Save the original mount point security information */
5291	(void) save_mnt_secinfo(mi->mi_curr_serv);
5292
5293recov_retry:
5294	e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP,
5295	    &recov_state, NULL);
5296	if (e.error) {
5297		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5298		VN_RELE(*vpp);
5299		*vpp = NULL;
5300		return (e.error);
5301	}
5302
5303	argop = kmem_alloc(argoplist_size, KM_SLEEP);
5304
5305	/* PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR */
5306	args.array_len = 7;
5307	args.array = argop;
5308
5309	/* 0. putfh file */
5310	argop[0].argop = OP_CPUTFH;
5311	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh;
5312
5313	/* 1. nverify the change info */
5314	argop[1].argop = OP_NVERIFY;
5315	ver_fattr = &argop[1].nfs_argop4_u.opnverify.obj_attributes;
5316	ver_fattr->attrmask = FATTR4_CHANGE_MASK;
5317	ver_fattr->attrlist4 = (char *)&dchange;
5318	ptr = (int32_t *)&dchange;
5319	IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change);
5320	ver_fattr->attrlist4_len = sizeof (fattr4_change);
5321
5322	/* 2. getattr directory */
5323	argop[2].argop = OP_GETATTR;
5324	argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
5325	argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
5326
5327	/* 3. access directory */
5328	argop[3].argop = OP_ACCESS;
5329	argop[3].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE |
5330	    ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP;
5331
5332	/* 4. lookup name */
5333	if (isdotdot) {
5334		argop[4].argop = OP_LOOKUPP;
5335	} else {
5336		argop[4].argop = OP_CLOOKUP;
5337		argop[4].nfs_argop4_u.opclookup.cname = nm;
5338	}
5339
5340	/* 5. resulting file handle */
5341	argop[5].argop = OP_GETFH;
5342
5343	/* 6. resulting file attributes */
5344	argop[6].argop = OP_GETATTR;
5345	argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
5346	argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
5347
5348	doqueue = 1;
5349	t = gethrtime();
5350
5351	rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);
5352
5353	if (!isdotdot && res.status == NFS4ERR_MOVED) {
5354		e.error = nfs4_setup_referral(dvp, nm, vpp, cr);
5355		if (e.error != 0 && *vpp != NULL)
5356			VN_RELE(*vpp);
5357		nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5358		    &recov_state, FALSE);
5359		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5360		kmem_free(argop, argoplist_size);
5361		return (e.error);
5362	}
5363
5364	if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) {
5365		/*
5366		 * For WRONGSEC of a non-dotdot case, send secinfo directly
5367		 * from this thread, do not go thru the recovery thread since
5368		 * we need the nm information.
5369		 *
5370		 * Not doing dotdot case because there is no specification
5371		 * for (PUTFH, SECINFO "..") yet.
5372		 */
5373		if (!isdotdot && res.status == NFS4ERR_WRONGSEC) {
5374			if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr)))
5375				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5376				    &recov_state, FALSE);
5377			else
5378				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5379				    &recov_state, TRUE);
5380			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5381			kmem_free(argop, argoplist_size);
5382			if (!e.error)
5383				goto recov_retry;
5384			(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5385			VN_RELE(*vpp);
5386			*vpp = NULL;
5387			return (e.error);
5388		}
5389
5390		if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
5391		    OP_LOOKUP, NULL, NULL, NULL) == FALSE) {
5392			nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5393			    &recov_state, TRUE);
5394
5395			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5396			kmem_free(argop, argoplist_size);
5397			goto recov_retry;
5398		}
5399	}
5400
5401	nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE);
5402
5403	if (e.error || res.array_len == 0) {
5404		/*
5405		 * If e.error isn't set, then reply has no ops (or we couldn't
5406		 * be here).  The only legal way to reply without an op array
5407		 * is via NFS4ERR_MINOR_VERS_MISMATCH.  An ops array should
5408		 * be in the reply for all other status values.
5409		 *
5410		 * For valid replies without an ops array, return ENOTSUP
5411		 * (geterrno4 xlation of VERS_MISMATCH).  For illegal replies,
5412		 * return EIO -- don't trust status.
5413		 */
5414		if (e.error == 0)
5415			e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ?
5416			    ENOTSUP : EIO;
5417		VN_RELE(*vpp);
5418		*vpp = NULL;
5419		kmem_free(argop, argoplist_size);
5420		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5421		return (e.error);
5422	}
5423
5424	if (res.status != NFS4ERR_SAME) {
5425		e.error = geterrno4(res.status);
5426
5427		/*
5428		 * The NVERIFY "failed" so the directory has changed
5429		 * First make sure PUTFH succeeded and NVERIFY "failed"
5430		 * cleanly.
5431		 */
5432		if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) ||
5433		    (res.array[1].nfs_resop4_u.opnverify.status != NFS4_OK)) {
5434			nfs4_purge_stale_fh(e.error, dvp, cr);
5435			VN_RELE(*vpp);
5436			*vpp = NULL;
5437			goto exit;
5438		}
5439
5440		/*
5441		 * We know the NVERIFY "failed" so we must:
5442		 *	purge the caches (access and indirectly dnlc if needed)
5443		 */
5444		nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE);
5445
5446		if (res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) {
5447			nfs4_purge_stale_fh(e.error, dvp, cr);
5448			VN_RELE(*vpp);
5449			*vpp = NULL;
5450			goto exit;
5451		}
5452
5453		/*
5454		 * Install new cached attributes for the directory
5455		 */
5456		nfs4_attr_cache(dvp,
5457		    &res.array[2].nfs_resop4_u.opgetattr.ga_res,
5458		    t, cr, FALSE, NULL);
5459
5460		if (res.array[3].nfs_resop4_u.opaccess.status != NFS4_OK) {
5461			nfs4_purge_stale_fh(e.error, dvp, cr);
5462			VN_RELE(*vpp);
5463			*vpp = NULL;
5464			e.error = geterrno4(res.status);
5465			goto exit;
5466		}
5467
5468		/*
5469		 * Now we know the directory is valid,
5470		 * cache new directory access
5471		 */
5472		nfs4_access_cache(drp,
5473		    args.array[3].nfs_argop4_u.opaccess.access,
5474		    res.array[3].nfs_resop4_u.opaccess.access, cr);
5475
5476		/*
5477		 * recheck VEXEC access
5478		 */
5479		cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr);
5480		if (cacc != NFS4_ACCESS_ALLOWED) {
5481			/*
5482			 * Directory permissions might have been revoked
5483			 */
5484			if (cacc == NFS4_ACCESS_DENIED) {
5485				e.error = EACCES;
5486				VN_RELE(*vpp);
5487				*vpp = NULL;
5488				goto exit;
5489			}
5490
5491			/*
5492			 * Somehow we must not have asked for enough
5493			 * so try a singleton ACCESS, should never happen.
5494			 */
5495			e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
5496			if (e.error) {
5497				VN_RELE(*vpp);
5498				*vpp = NULL;
5499				goto exit;
5500			}
5501		}
5502
5503		e.error = geterrno4(res.status);
5504		if (res.array[4].nfs_resop4_u.oplookup.status != NFS4_OK) {
5505			/*
5506			 * The lookup failed, probably no entry
5507			 */
5508			if (e.error == ENOENT && nfs4_lookup_neg_cache) {
5509				dnlc_update(dvp, nm, DNLC_NO_VNODE);
5510			} else {
5511				/*
5512				 * Might be some other error, so remove
5513				 * the dnlc entry to make sure we start all
5514				 * over again, next time.
5515				 */
5516				dnlc_remove(dvp, nm);
5517			}
5518			VN_RELE(*vpp);
5519			*vpp = NULL;
5520			goto exit;
5521		}
5522
5523		if (res.array[5].nfs_resop4_u.opgetfh.status != NFS4_OK) {
5524			/*
5525			 * The file exists but we can't get its fh for
5526			 * some unknown reason.  Remove it from the dnlc
5527			 * and error out to be safe.
5528			 */
5529			dnlc_remove(dvp, nm);
5530			VN_RELE(*vpp);
5531			*vpp = NULL;
5532			goto exit;
5533		}
5534		fhp = &res.array[5].nfs_resop4_u.opgetfh.object;
5535		if (fhp->nfs_fh4_len == 0) {
5536			/*
5537			 * The file exists but a bogus fh
5538			 * some unknown reason.  Remove it from the dnlc
5539			 * and error out to be safe.
5540			 */
5541			e.error = ENOENT;
5542			dnlc_remove(dvp, nm);
5543			VN_RELE(*vpp);
5544			*vpp = NULL;
5545			goto exit;
5546		}
5547		sfhp = sfh4_get(fhp, mi);
5548
5549		if (res.array[6].nfs_resop4_u.opgetattr.status == NFS4_OK)
5550			garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res;
5551
5552		/*
5553		 * Make the new rnode
5554		 */
5555		if (isdotdot) {
5556			e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1);
5557			if (e.error) {
5558				sfh4_rele(&sfhp);
5559				VN_RELE(*vpp);
5560				*vpp = NULL;
5561				goto exit;
5562			}
5563			/*
5564			 * XXX if nfs4_make_dotdot uses an existing rnode
5565			 * XXX it doesn't update the attributes.
5566			 * XXX for now just save them again to save an OTW
5567			 */
5568			nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL);
5569		} else {
5570			nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr,
5571			    dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
5572			/*
5573			 * If v_type == VNON, then garp was NULL because
5574			 * the last op in the compound failed and makenfs4node
5575			 * could not find the vnode for sfhp. It created
5576			 * a new vnode, so we have nothing to purge here.
5577			 */
5578			if (nvp->v_type == VNON) {
5579				vattr_t vattr;
5580
5581				vattr.va_mask = AT_TYPE;
5582				/*
5583				 * N.B. We've already called nfs4_end_fop above.
5584				 */
5585				e.error = nfs4getattr(nvp, &vattr, cr);
5586				if (e.error) {
5587					sfh4_rele(&sfhp);
5588					VN_RELE(*vpp);
5589					*vpp = NULL;
5590					VN_RELE(nvp);
5591					goto exit;
5592				}
5593				nvp->v_type = vattr.va_type;
5594			}
5595		}
5596		sfh4_rele(&sfhp);
5597
5598		nrp = VTOR4(nvp);
5599		mutex_enter(&nrp->r_statev4_lock);
5600		if (!nrp->created_v4) {
5601			mutex_exit(&nrp->r_statev4_lock);
5602			dnlc_update(dvp, nm, nvp);
5603		} else
5604			mutex_exit(&nrp->r_statev4_lock);
5605
5606		VN_RELE(*vpp);
5607		*vpp = nvp;
5608	} else {
5609		hrtime_t now;
5610		hrtime_t delta = 0;
5611
5612		e.error = 0;
5613
5614		/*
5615		 * Because the NVERIFY "succeeded" we know that the
5616		 * directory attributes are still valid
5617		 * so update r_time_attr_inval
5618		 */
5619		now = gethrtime();
5620		mutex_enter(&drp->r_statelock);
5621		if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) {
5622			delta = now - drp->r_time_attr_saved;
5623			if (delta < mi->mi_acdirmin)
5624				delta = mi->mi_acdirmin;
5625			else if (delta > mi->mi_acdirmax)
5626				delta = mi->mi_acdirmax;
5627		}
5628		drp->r_time_attr_inval = now + delta;
5629		mutex_exit(&drp->r_statelock);
5630		dnlc_update(dvp, nm, *vpp);
5631
5632		/*
5633		 * Even though we have a valid directory attr cache
5634		 * and dnlc entry, we may not have access.
5635		 * This should almost always hit the cache.
5636		 */
5637		e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
5638		if (e.error) {
5639			VN_RELE(*vpp);
5640			*vpp = NULL;
5641		}
5642
5643		if (*vpp == DNLC_NO_VNODE) {
5644			VN_RELE(*vpp);
5645			*vpp = NULL;
5646			e.error = ENOENT;
5647		}
5648	}
5649
5650exit:
5651	xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5652	kmem_free(argop, argoplist_size);
5653	(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5654	return (e.error);
5655}
5656
5657/*
5658 * We need to go over the wire to lookup the name, but
5659 * while we are there verify the directory has not
5660 * changed but if it has, get new attributes and check access
5661 *
5662 * PUTFH dfh SAVEFH LOOKUP nm GETFH GETATTR RESTOREFH
5663 *					NVERIFY GETATTR ACCESS
5664 *
5665 * With the results:
5666 *	if the NVERIFY failed we must purge the caches, add new attributes,
5667 *		and cache new access.
5668 *	set a new r_time_attr_inval
5669 *	add name to dnlc, possibly negative
5670 *	if LOOKUP succeeded
5671 *		cache new attributes
5672 */
5673static int
5674nfs4lookupnew_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
5675{
5676	COMPOUND4args_clnt args;
5677	COMPOUND4res_clnt res;
5678	fattr4 *ver_fattr;
5679	fattr4_change dchange;
5680	int32_t *ptr;
5681	nfs4_ga_res_t *garp = NULL;
5682	int argoplist_size  = 9 * sizeof (nfs_argop4);
5683	nfs_argop4 *argop;
5684	int doqueue;
5685	mntinfo4_t *mi;
5686	nfs4_recov_state_t recov_state;
5687	hrtime_t t;
5688	int isdotdot;
5689	vnode_t *nvp;
5690	nfs_fh4 *fhp;
5691	nfs4_sharedfh_t *sfhp;
5692	nfs4_access_type_t cacc;
5693	rnode4_t *nrp;
5694	rnode4_t *drp = VTOR4(dvp);
5695	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
5696
5697	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
5698	ASSERT(nm != NULL);
5699	ASSERT(nm[0] != '\0');
5700	ASSERT(dvp->v_type == VDIR);
5701	ASSERT(nm[0] != '.' || nm[1] != '\0');
5702	ASSERT(*vpp == NULL);
5703
5704	if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') {
5705		isdotdot = 1;
5706		args.ctag = TAG_LOOKUP_PARENT;
5707	} else {
5708		/*
5709		 * If dvp were a stub, it should have triggered and caused
5710		 * a mount for us to get this far.
5711		 */
5712		ASSERT(!RP_ISSTUB(VTOR4(dvp)));
5713
5714		isdotdot = 0;
5715		args.ctag = TAG_LOOKUP;
5716	}
5717
5718	mi = VTOMI4(dvp);
5719	recov_state.rs_flags = 0;
5720	recov_state.rs_num_retry_despite_err = 0;
5721
5722	nvp = NULL;
5723
5724	/* Save the original mount point security information */
5725	(void) save_mnt_secinfo(mi->mi_curr_serv);
5726
5727recov_retry:
5728	e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP,
5729	    &recov_state, NULL);
5730	if (e.error) {
5731		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5732		return (e.error);
5733	}
5734
5735	argop = kmem_alloc(argoplist_size, KM_SLEEP);
5736
5737	/* PUTFH SAVEFH LOOKUP GETFH GETATTR RESTOREFH NVERIFY GETATTR ACCESS */
5738	args.array_len = 9;
5739	args.array = argop;
5740
5741	/* 0. putfh file */
5742	argop[0].argop = OP_CPUTFH;
5743	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh;
5744
5745	/* 1. savefh for the nverify */
5746	argop[1].argop = OP_SAVEFH;
5747
5748	/* 2. lookup name */
5749	if (isdotdot) {
5750		argop[2].argop = OP_LOOKUPP;
5751	} else {
5752		argop[2].argop = OP_CLOOKUP;
5753		argop[2].nfs_argop4_u.opclookup.cname = nm;
5754	}
5755
5756	/* 3. resulting file handle */
5757	argop[3].argop = OP_GETFH;
5758
5759	/* 4. resulting file attributes */
5760	argop[4].argop = OP_GETATTR;
5761	argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
5762	argop[4].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
5763
5764	/* 5. restorefh back the directory for the nverify */
5765	argop[5].argop = OP_RESTOREFH;
5766
5767	/* 6. nverify the change info */
5768	argop[6].argop = OP_NVERIFY;
5769	ver_fattr = &argop[6].nfs_argop4_u.opnverify.obj_attributes;
5770	ver_fattr->attrmask = FATTR4_CHANGE_MASK;
5771	ver_fattr->attrlist4 = (char *)&dchange;
5772	ptr = (int32_t *)&dchange;
5773	IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change);
5774	ver_fattr->attrlist4_len = sizeof (fattr4_change);
5775
5776	/* 7. getattr directory */
5777	argop[7].argop = OP_GETATTR;
5778	argop[7].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
5779	argop[7].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
5780
5781	/* 8. access directory */
5782	argop[8].argop = OP_ACCESS;
5783	argop[8].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE |
5784	    ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP;
5785
5786	doqueue = 1;
5787	t = gethrtime();
5788
5789	rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);
5790
5791	if (!isdotdot && res.status == NFS4ERR_MOVED) {
5792		e.error = nfs4_setup_referral(dvp, nm, vpp, cr);
5793		if (e.error != 0 && *vpp != NULL)
5794			VN_RELE(*vpp);
5795		nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5796		    &recov_state, FALSE);
5797		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5798		kmem_free(argop, argoplist_size);
5799		return (e.error);
5800	}
5801
5802	if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) {
5803		/*
5804		 * For WRONGSEC of a non-dotdot case, send secinfo directly
5805		 * from this thread, do not go thru the recovery thread since
5806		 * we need the nm information.
5807		 *
5808		 * Not doing dotdot case because there is no specification
5809		 * for (PUTFH, SECINFO "..") yet.
5810		 */
5811		if (!isdotdot && res.status == NFS4ERR_WRONGSEC) {
5812			if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr)))
5813				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5814				    &recov_state, FALSE);
5815			else
5816				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5817				    &recov_state, TRUE);
5818			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5819			kmem_free(argop, argoplist_size);
5820			if (!e.error)
5821				goto recov_retry;
5822			(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5823			return (e.error);
5824		}
5825
5826		if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
5827		    OP_LOOKUP, NULL, NULL, NULL) == FALSE) {
5828			nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5829			    &recov_state, TRUE);
5830
5831			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5832			kmem_free(argop, argoplist_size);
5833			goto recov_retry;
5834		}
5835	}
5836
5837	nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE);
5838
5839	if (e.error || res.array_len == 0) {
5840		/*
5841		 * If e.error isn't set, then reply has no ops (or we couldn't
5842		 * be here).  The only legal way to reply without an op array
5843		 * is via NFS4ERR_MINOR_VERS_MISMATCH.  An ops array should
5844		 * be in the reply for all other status values.
5845		 *
5846		 * For valid replies without an ops array, return ENOTSUP
5847		 * (geterrno4 xlation of VERS_MISMATCH).  For illegal replies,
5848		 * return EIO -- don't trust status.
5849		 */