1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26/*
27 *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
28 *	All rights reserved.
29 */
30
31/*
32 * Copyright 2018 Nexenta Systems, Inc.
33 */
34
35#include <sys/param.h>
36#include <sys/types.h>
37#include <sys/systm.h>
38#include <sys/thread.h>
39#include <sys/t_lock.h>
40#include <sys/time.h>
41#include <sys/vnode.h>
42#include <sys/vfs.h>
43#include <sys/errno.h>
44#include <sys/buf.h>
45#include <sys/stat.h>
46#include <sys/cred.h>
47#include <sys/kmem.h>
48#include <sys/debug.h>
49#include <sys/dnlc.h>
50#include <sys/vmsystm.h>
51#include <sys/flock.h>
52#include <sys/share.h>
53#include <sys/cmn_err.h>
54#include <sys/tiuser.h>
55#include <sys/sysmacros.h>
56#include <sys/callb.h>
57#include <sys/acl.h>
58#include <sys/kstat.h>
59#include <sys/signal.h>
60#include <sys/list.h>
61#include <sys/zone.h>
62
63#include <rpc/types.h>
64#include <rpc/xdr.h>
65#include <rpc/auth.h>
66#include <rpc/clnt.h>
67
68#include <nfs/nfs.h>
69#include <nfs/nfs_clnt.h>
70#include <nfs/nfs_cmd.h>
71
72#include <nfs/rnode.h>
73#include <nfs/nfs_acl.h>
74#include <nfs/lm.h>
75
76#include <vm/hat.h>
77#include <vm/as.h>
78#include <vm/page.h>
79#include <vm/pvn.h>
80#include <vm/seg.h>
81#include <vm/seg_map.h>
82#include <vm/seg_vn.h>
83
84static void	nfs3_attr_cache(vnode_t *, vattr_t *, vattr_t *, hrtime_t,
85			cred_t *);
86static int	nfs_getattr_cache(vnode_t *, struct vattr *);
87static int	nfs_remove_locking_id(vnode_t *, int, char *, char *, int *);
88
89struct mi_globals {
90	kmutex_t	mig_lock;  /* lock protecting mig_list */
91	list_t		mig_list;  /* list of NFS v2 or v3 mounts in zone */
92	boolean_t	mig_destructor_called;
93};
94
95static zone_key_t mi_list_key;
96
97/* Debugging flag for PC file shares. */
98extern int	share_debug;
99
100/*
101 * Attributes caching:
102 *
103 * Attributes are cached in the rnode in struct vattr form.
104 * There is a time associated with the cached attributes (r_attrtime)
105 * which tells whether the attributes are valid. The time is initialized
106 * to the difference between current time and the modify time of the vnode
107 * when new attributes are cached. This allows the attributes for
108 * files that have changed recently to be timed out sooner than for files
109 * that have not changed for a long time. There are minimum and maximum
110 * timeout values that can be set per mount point.
111 */
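
/*
 * Illustrative worked example of the timeout scheme described above
 * (editorial sketch; the acregmin/acregmax values used here are the
 * usual mount defaults and are assumptions, not taken from this file):
 *
 *	with acregmin = 3 sec and acregmax = 60 sec,
 *	- a file whose change was detected 1 second ago is cached for
 *	  3 seconds (clamped up to acregmin),
 *	- a file whose change was detected 30 seconds ago is cached for
 *	  30 seconds, and
 *	- a file unchanged for 10 minutes is cached for 60 seconds
 *	  (clamped down to acregmax).
 *
 * Directories use acdirmin/acdirmax the same way; see
 * nfs_attrcache_va() below for the actual computation.
 */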
112
113int
114nfs_waitfor_purge_complete(vnode_t *vp)
115{
116	rnode_t *rp;
117	k_sigset_t smask;
118
119	rp = VTOR(vp);
120	if (rp->r_serial != NULL && rp->r_serial != curthread) {
121		mutex_enter(&rp->r_statelock);
122		sigintr(&smask, VTOMI(vp)->mi_flags & MI_INT);
123		while (rp->r_serial != NULL) {
124			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
125				sigunintr(&smask);
126				mutex_exit(&rp->r_statelock);
127				return (EINTR);
128			}
129		}
130		sigunintr(&smask);
131		mutex_exit(&rp->r_statelock);
132	}
133	return (0);
134}
135
136/*
137 * Validate caches by checking cached attributes. If the cached
138 * attributes have timed out, then get new attributes from the server.
 * As a side effect, this will do cache invalidation if the attributes
140 * have changed.
141 *
142 * If the attributes have not timed out and if there is a cache
143 * invalidation being done by some other thread, then wait until that
144 * thread has completed the cache invalidation.
145 */
146int
147nfs_validate_caches(vnode_t *vp, cred_t *cr)
148{
149	int error;
150	struct vattr va;
151
152	if (ATTRCACHE_VALID(vp)) {
153		error = nfs_waitfor_purge_complete(vp);
154		if (error)
155			return (error);
156		return (0);
157	}
158
159	va.va_mask = AT_ALL;
160	return (nfs_getattr_otw(vp, &va, cr));
161}
162
163/*
164 * Validate caches by checking cached attributes. If the cached
165 * attributes have timed out, then get new attributes from the server.
 * As a side effect, this will do cache invalidation if the attributes
167 * have changed.
168 *
169 * If the attributes have not timed out and if there is a cache
170 * invalidation being done by some other thread, then wait until that
171 * thread has completed the cache invalidation.
172 */
173int
174nfs3_validate_caches(vnode_t *vp, cred_t *cr)
175{
176	int error;
177	struct vattr va;
178
179	if (ATTRCACHE_VALID(vp)) {
180		error = nfs_waitfor_purge_complete(vp);
181		if (error)
182			return (error);
183		return (0);
184	}
185
186	va.va_mask = AT_ALL;
187	return (nfs3_getattr_otw(vp, &va, cr));
188}
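
/*
 * Editorial usage sketch (an assumption about callers, not something
 * established in this file): code that is about to rely on cached data
 * or cached attributes is expected to revalidate first, e.g.
 *
 *	error = nfs3_validate_caches(vp, cr);
 *	if (error)
 *		return (error);
 *
 * which either confirms the attribute cache is still valid (possibly
 * waiting out a purge in progress) or fetches fresh attributes and, as
 * a side effect, invalidates any stale caches.
 */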
189
190/*
191 * Purge all of the various NFS `data' caches.
192 */
193void
194nfs_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr)
195{
196	rnode_t *rp;
197	char *contents;
198	int size;
199	int error;
200
201	/*
202	 * Purge the DNLC for any entries which refer to this file.
203	 * Avoid recursive entry into dnlc_purge_vp() in case of a directory.
204	 */
205	rp = VTOR(vp);
206	mutex_enter(&rp->r_statelock);
207	if (vp->v_count > 1 &&
208	    (vp->v_type == VDIR || purge_dnlc == NFS_PURGE_DNLC) &&
209	    !(rp->r_flags & RINDNLCPURGE)) {
210		/*
211		 * Set the RINDNLCPURGE flag to prevent recursive entry
212		 * into dnlc_purge_vp()
213		 */
214		if (vp->v_type == VDIR)
215			rp->r_flags |= RINDNLCPURGE;
216		mutex_exit(&rp->r_statelock);
217		dnlc_purge_vp(vp);
218		mutex_enter(&rp->r_statelock);
219		if (rp->r_flags & RINDNLCPURGE)
220			rp->r_flags &= ~RINDNLCPURGE;
221	}
222
223	/*
224	 * Clear any readdir state bits and purge the readlink response cache.
225	 */
226	contents = rp->r_symlink.contents;
227	size = rp->r_symlink.size;
228	rp->r_symlink.contents = NULL;
229	mutex_exit(&rp->r_statelock);
230
	if (contents != NULL) {
		kmem_free((void *)contents, size);
	}
235
236	/*
237	 * Flush the page cache.
238	 */
239	if (vn_has_cached_data(vp)) {
240		error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
241		if (error && (error == ENOSPC || error == EDQUOT)) {
242			mutex_enter(&rp->r_statelock);
243			if (!rp->r_error)
244				rp->r_error = error;
245			mutex_exit(&rp->r_statelock);
246		}
247	}
248
249	/*
250	 * Flush the readdir response cache.
251	 */
252	if (HAVE_RDDIR_CACHE(rp))
253		nfs_purge_rddir_cache(vp);
254}
255
256/*
257 * Purge the readdir cache of all entries
258 */
259void
260nfs_purge_rddir_cache(vnode_t *vp)
261{
262	rnode_t *rp;
263	rddir_cache *rdc;
264	rddir_cache *nrdc;
265
266	rp = VTOR(vp);
267top:
268	mutex_enter(&rp->r_statelock);
269	rp->r_direof = NULL;
270	rp->r_flags &= ~RLOOKUP;
271	rp->r_flags |= RREADDIRPLUS;
272	rdc = avl_first(&rp->r_dir);
273	while (rdc != NULL) {
274		nrdc = AVL_NEXT(&rp->r_dir, rdc);
275		avl_remove(&rp->r_dir, rdc);
276		rddir_cache_rele(rdc);
277		rdc = nrdc;
278	}
279	mutex_exit(&rp->r_statelock);
280}
281
282/*
283 * Do a cache check based on the post-operation attributes.
284 * Then make them the new cached attributes.  If no attributes
285 * were returned, then mark the attributes as timed out.
286 */
287void
288nfs3_cache_post_op_attr(vnode_t *vp, post_op_attr *poap, hrtime_t t, cred_t *cr)
289{
290	vattr_t attr;
291
292	if (!poap->attributes) {
293		PURGE_ATTRCACHE(vp);
294		return;
295	}
296	(void) nfs3_cache_fattr3(vp, &poap->attr, &attr, t, cr);
297}
298
299/*
300 * Same as above, but using a vattr
301 */
302void
303nfs3_cache_post_op_vattr(vnode_t *vp, post_op_vattr *poap, hrtime_t t,
304    cred_t *cr)
305{
306	if (!poap->attributes) {
307		PURGE_ATTRCACHE(vp);
308		return;
309	}
310	nfs_attr_cache(vp, poap->fres.vap, t, cr);
311}
312
313/*
314 * Do a cache check based on the weak cache consistency attributes.
315 * These consist of a small set of pre-operation attributes and the
316 * full set of post-operation attributes.
317 *
318 * If we are given the pre-operation attributes, then use them to
319 * check the validity of the various caches.  Then, if we got the
320 * post-operation attributes, make them the new cached attributes.
321 * If we didn't get the post-operation attributes, then mark the
322 * attribute cache as timed out so that the next reference will
323 * cause a GETATTR to the server to refresh with the current
324 * attributes.
325 *
326 * Otherwise, if we didn't get the pre-operation attributes, but
327 * we did get the post-operation attributes, then use these
328 * attributes to check the validity of the various caches.  This
329 * will probably cause a flush of the caches because if the
330 * operation succeeded, the attributes of the object were changed
331 * in some way from the old post-operation attributes.  This
332 * should be okay because it is the safe thing to do.  After
333 * checking the data caches, then we make these the new cached
334 * attributes.
335 *
336 * Otherwise, we didn't get either the pre- or post-operation
337 * attributes.  Simply mark the attribute cache as timed out so
338 * the next reference will cause a GETATTR to the server to
339 * refresh with the current attributes.
340 *
341 * If an error occurred trying to convert the over the wire
342 * attributes to a vattr, then simply mark the attribute cache as
343 * timed out.
344 */
345void
346nfs3_cache_wcc_data(vnode_t *vp, wcc_data *wccp, hrtime_t t, cred_t *cr)
347{
348	vattr_t bva;
349	vattr_t ava;
350
351	if (wccp->after.attributes) {
352		if (fattr3_to_vattr(vp, &wccp->after.attr, &ava)) {
353			PURGE_ATTRCACHE(vp);
354			return;
355		}
356		if (wccp->before.attributes) {
357			bva.va_ctime.tv_sec = wccp->before.attr.ctime.seconds;
358			bva.va_ctime.tv_nsec = wccp->before.attr.ctime.nseconds;
359			bva.va_mtime.tv_sec = wccp->before.attr.mtime.seconds;
360			bva.va_mtime.tv_nsec = wccp->before.attr.mtime.nseconds;
361			bva.va_size = wccp->before.attr.size;
362			nfs3_attr_cache(vp, &bva, &ava, t, cr);
363		} else
364			nfs_attr_cache(vp, &ava, t, cr);
365	} else {
366		PURGE_ATTRCACHE(vp);
367	}
368}
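
/*
 * Summary sketch of the weak cache consistency handling above (no new
 * behavior, just the cases from the preceding comment laid out):
 *
 *	pre-op attrs	post-op attrs	action
 *	------------	-------------	------
 *	present		present		validate caches against pre-op,
 *					cache post-op (nfs3_attr_cache)
 *	absent		present		validate caches against post-op,
 *					cache post-op (nfs_attr_cache)
 *	either		absent		PURGE_ATTRCACHE
 *
 * A failure converting the post-op fattr3 to a vattr also purges the
 * attribute cache.
 */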
369
370/*
371 * Set attributes cache for given vnode using nfsattr.
372 *
373 * This routine does not do cache validation with the attributes.
374 *
375 * If an error occurred trying to convert the over the wire
376 * attributes to a vattr, then simply mark the attribute cache as
377 * timed out.
378 */
379void
380nfs_attrcache(vnode_t *vp, struct nfsfattr *na, hrtime_t t)
381{
382	rnode_t *rp;
383	struct vattr va;
384
385	if (!nattr_to_vattr(vp, na, &va)) {
386		rp = VTOR(vp);
387		mutex_enter(&rp->r_statelock);
388		if (rp->r_mtime <= t)
389			nfs_attrcache_va(vp, &va);
390		mutex_exit(&rp->r_statelock);
391	} else {
392		PURGE_ATTRCACHE(vp);
393	}
394}
395
396/*
397 * Set attributes cache for given vnode using fattr3.
398 *
399 * This routine does not do cache validation with the attributes.
400 *
401 * If an error occurred trying to convert the over the wire
402 * attributes to a vattr, then simply mark the attribute cache as
403 * timed out.
404 */
405void
406nfs3_attrcache(vnode_t *vp, fattr3 *na, hrtime_t t)
407{
408	rnode_t *rp;
409	struct vattr va;
410
411	if (!fattr3_to_vattr(vp, na, &va)) {
412		rp = VTOR(vp);
413		mutex_enter(&rp->r_statelock);
414		if (rp->r_mtime <= t)
415			nfs_attrcache_va(vp, &va);
416		mutex_exit(&rp->r_statelock);
417	} else {
418		PURGE_ATTRCACHE(vp);
419	}
420}
421
422/*
423 * Do a cache check based on attributes returned over the wire.  The
424 * new attributes are cached.
425 *
426 * If an error occurred trying to convert the over the wire attributes
427 * to a vattr, then just return that error.
428 *
 * As a side effect, the vattr argument is filled in with the converted
430 * attributes.
431 */
432int
433nfs_cache_fattr(vnode_t *vp, struct nfsfattr *na, vattr_t *vap, hrtime_t t,
434    cred_t *cr)
435{
436	int error;
437
438	error = nattr_to_vattr(vp, na, vap);
439	if (error)
440		return (error);
441	nfs_attr_cache(vp, vap, t, cr);
442	return (0);
443}
444
445/*
446 * Do a cache check based on attributes returned over the wire.  The
447 * new attributes are cached.
448 *
449 * If an error occurred trying to convert the over the wire attributes
450 * to a vattr, then just return that error.
451 *
 * As a side effect, the vattr argument is filled in with the converted
453 * attributes.
454 */
455int
456nfs3_cache_fattr3(vnode_t *vp, fattr3 *na, vattr_t *vap, hrtime_t t, cred_t *cr)
457{
458	int error;
459
460	error = fattr3_to_vattr(vp, na, vap);
461	if (error)
462		return (error);
463	nfs_attr_cache(vp, vap, t, cr);
464	return (0);
465}
466
467/*
468 * Use the passed in virtual attributes to check to see whether the
469 * data and metadata caches are valid, cache the new attributes, and
470 * then do the cache invalidation if required.
471 *
472 * The cache validation and caching of the new attributes is done
473 * atomically via the use of the mutex, r_statelock.  If required,
474 * the cache invalidation is done atomically w.r.t. the cache
475 * validation and caching of the attributes via the pseudo lock,
476 * r_serial.
477 *
478 * This routine is used to do cache validation and attributes caching
479 * for operations with a single set of post operation attributes.
480 */
481void
482nfs_attr_cache(vnode_t *vp, vattr_t *vap, hrtime_t t, cred_t *cr)
483{
484	rnode_t *rp;
485	int mtime_changed = 0;
486	int ctime_changed = 0;
487	vsecattr_t *vsp;
488	int was_serial;
489	len_t preattr_rsize;
490	boolean_t writeattr_set = B_FALSE;
491	boolean_t cachepurge_set = B_FALSE;
492
493	rp = VTOR(vp);
494
495	mutex_enter(&rp->r_statelock);
496
497	if (rp->r_serial != curthread) {
498		klwp_t *lwp = ttolwp(curthread);
499
500		was_serial = 0;
501		if (lwp != NULL)
502			lwp->lwp_nostop++;
503		while (rp->r_serial != NULL) {
504			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
505				mutex_exit(&rp->r_statelock);
506				if (lwp != NULL)
507					lwp->lwp_nostop--;
508				return;
509			}
510		}
511		if (lwp != NULL)
512			lwp->lwp_nostop--;
513	} else
514		was_serial = 1;
515
516	if (rp->r_mtime > t) {
517		if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size))
518			PURGE_ATTRCACHE_LOCKED(rp);
519		mutex_exit(&rp->r_statelock);
520		return;
521	}
522
	/*
	 * The write thread, after writing data to the file on the remote
	 * server, always sets RWRITEATTR to indicate that the remote file
	 * was modified with a WRITE operation and that the attribute
	 * cache has been marked as timed out.  If RWRITEATTR is set,
	 * then do not check for mtime and ctime changes.
	 */
530	if (!(rp->r_flags & RWRITEATTR)) {
531		if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size))
532			mtime_changed = 1;
533
534		if (rp->r_attr.va_ctime.tv_sec != vap->va_ctime.tv_sec ||
535		    rp->r_attr.va_ctime.tv_nsec != vap->va_ctime.tv_nsec)
536			ctime_changed = 1;
537	} else {
538		writeattr_set = B_TRUE;
539	}
540
541	preattr_rsize = rp->r_size;
542
543	nfs_attrcache_va(vp, vap);
544
	/*
	 * If we have updated the file size in nfs_attrcache_va, then as
	 * soon as we drop r_statelock we will be in the middle of purging
	 * all of our caches and updating them.  It is possible for another
	 * thread to pick up the new file size and read in zeroed data.
	 * Stall other threads until the cache purge is complete.
	 */
552	if ((vp->v_type == VREG) && (rp->r_size != preattr_rsize)) {
		/*
		 * If RWRITEATTR was set and we have updated the file
		 * size, the server's returned file size is not necessarily
		 * the result of this client's WRITE.  We need to purge
		 * all caches.
		 */
559		if (writeattr_set)
560			mtime_changed = 1;
561
562		if (mtime_changed && !(rp->r_flags & RINCACHEPURGE)) {
563			rp->r_flags |= RINCACHEPURGE;
564			cachepurge_set = B_TRUE;
565		}
566	}
567
568	if (!mtime_changed && !ctime_changed) {
569		mutex_exit(&rp->r_statelock);
570		return;
571	}
572
573	rp->r_serial = curthread;
574
575	mutex_exit(&rp->r_statelock);
576
577	if (mtime_changed)
578		nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
579
580	if ((rp->r_flags & RINCACHEPURGE) && cachepurge_set) {
581		mutex_enter(&rp->r_statelock);
582		rp->r_flags &= ~RINCACHEPURGE;
583		cv_broadcast(&rp->r_cv);
584		mutex_exit(&rp->r_statelock);
585		cachepurge_set = B_FALSE;
586	}
587
588	if (ctime_changed) {
589		(void) nfs_access_purge_rp(rp);
590		if (rp->r_secattr != NULL) {
591			mutex_enter(&rp->r_statelock);
592			vsp = rp->r_secattr;
593			rp->r_secattr = NULL;
594			mutex_exit(&rp->r_statelock);
595			if (vsp != NULL)
596				nfs_acl_free(vsp);
597		}
598	}
599
600	if (!was_serial) {
601		mutex_enter(&rp->r_statelock);
602		rp->r_serial = NULL;
603		cv_broadcast(&rp->r_cv);
604		mutex_exit(&rp->r_statelock);
605	}
606}
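
/*
 * Sketch of the r_serial "pseudo lock" protocol used by nfs_attr_cache()
 * and nfs3_attr_cache() (editorial summary of the code above, assuming
 * nothing beyond what those routines already do):
 *
 *	mutex_enter(&rp->r_statelock);
 *	while (rp->r_serial != NULL)		another thread is purging;
 *		cv_wait_sig(&rp->r_cv, ...);	wait for it to finish
 *	...check caches and store the new attributes...
 *	rp->r_serial = curthread;		claim the pseudo lock
 *	mutex_exit(&rp->r_statelock);
 *	...purge data, access and ACL caches without r_statelock held...
 *	mutex_enter(&rp->r_statelock);
 *	rp->r_serial = NULL;			release the pseudo lock
 *	cv_broadcast(&rp->r_cv);
 *	mutex_exit(&rp->r_statelock);
 *
 * (When nothing changed the routine returns before claiming r_serial,
 * and a thread that already holds it neither waits nor releases here.)
 */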
607
608/*
609 * Use the passed in "before" virtual attributes to check to see
610 * whether the data and metadata caches are valid, cache the "after"
611 * new attributes, and then do the cache invalidation if required.
612 *
613 * The cache validation and caching of the new attributes is done
614 * atomically via the use of the mutex, r_statelock.  If required,
615 * the cache invalidation is done atomically w.r.t. the cache
616 * validation and caching of the attributes via the pseudo lock,
617 * r_serial.
618 *
619 * This routine is used to do cache validation and attributes caching
620 * for operations with both pre operation attributes and post operation
621 * attributes.
622 */
623static void
624nfs3_attr_cache(vnode_t *vp, vattr_t *bvap, vattr_t *avap, hrtime_t t,
625    cred_t *cr)
626{
627	rnode_t *rp;
628	int mtime_changed = 0;
629	int ctime_changed = 0;
630	vsecattr_t *vsp;
631	int was_serial;
632	len_t preattr_rsize;
633	boolean_t writeattr_set = B_FALSE;
634	boolean_t cachepurge_set = B_FALSE;
635
636	rp = VTOR(vp);
637
638	mutex_enter(&rp->r_statelock);
639
640	if (rp->r_serial != curthread) {
641		klwp_t *lwp = ttolwp(curthread);
642
643		was_serial = 0;
644		if (lwp != NULL)
645			lwp->lwp_nostop++;
646		while (rp->r_serial != NULL) {
647			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
648				mutex_exit(&rp->r_statelock);
649				if (lwp != NULL)
650					lwp->lwp_nostop--;
651				return;
652			}
653		}
654		if (lwp != NULL)
655			lwp->lwp_nostop--;
656	} else
657		was_serial = 1;
658
659	if (rp->r_mtime > t) {
660		if (!CACHE_VALID(rp, avap->va_mtime, avap->va_size))
661			PURGE_ATTRCACHE_LOCKED(rp);
662		mutex_exit(&rp->r_statelock);
663		return;
664	}
665
	/*
	 * The write thread, after writing data to the file on the remote
	 * server, always sets RWRITEATTR to indicate that the remote file
	 * was modified with a WRITE operation and that the attribute
	 * cache has been marked as timed out.  If RWRITEATTR is set,
	 * then do not check for mtime and ctime changes.
	 */
673	if (!(rp->r_flags & RWRITEATTR)) {
674		if (!CACHE_VALID(rp, bvap->va_mtime, bvap->va_size))
675			mtime_changed = 1;
676
677		if (rp->r_attr.va_ctime.tv_sec != bvap->va_ctime.tv_sec ||
678		    rp->r_attr.va_ctime.tv_nsec != bvap->va_ctime.tv_nsec)
679			ctime_changed = 1;
680	} else {
681		writeattr_set = B_TRUE;
682	}
683
684	preattr_rsize = rp->r_size;
685
686	nfs_attrcache_va(vp, avap);
687
	/*
	 * If we have updated the file size in nfs_attrcache_va, then as
	 * soon as we drop r_statelock we will be in the middle of purging
	 * all of our caches and updating them.  It is possible for another
	 * thread to pick up the new file size and read in zeroed data.
	 * Stall other threads until the cache purge is complete.
	 */
695	if ((vp->v_type == VREG) && (rp->r_size != preattr_rsize)) {
		/*
		 * If RWRITEATTR was set and we have updated the file
		 * size, the server's returned file size is not necessarily
		 * the result of this client's WRITE.  We need to purge
		 * all caches.
		 */
702		if (writeattr_set)
703			mtime_changed = 1;
704
705		if (mtime_changed && !(rp->r_flags & RINCACHEPURGE)) {
706			rp->r_flags |= RINCACHEPURGE;
707			cachepurge_set = B_TRUE;
708		}
709	}
710
711	if (!mtime_changed && !ctime_changed) {
712		mutex_exit(&rp->r_statelock);
713		return;
714	}
715
716	rp->r_serial = curthread;
717
718	mutex_exit(&rp->r_statelock);
719
720	if (mtime_changed)
721		nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
722
723	if ((rp->r_flags & RINCACHEPURGE) && cachepurge_set) {
724		mutex_enter(&rp->r_statelock);
725		rp->r_flags &= ~RINCACHEPURGE;
726		cv_broadcast(&rp->r_cv);
727		mutex_exit(&rp->r_statelock);
728		cachepurge_set = B_FALSE;
729	}
730
731	if (ctime_changed) {
732		(void) nfs_access_purge_rp(rp);
733		if (rp->r_secattr != NULL) {
734			mutex_enter(&rp->r_statelock);
735			vsp = rp->r_secattr;
736			rp->r_secattr = NULL;
737			mutex_exit(&rp->r_statelock);
738			if (vsp != NULL)
739				nfs_acl_free(vsp);
740		}
741	}
742
743	if (!was_serial) {
744		mutex_enter(&rp->r_statelock);
745		rp->r_serial = NULL;
746		cv_broadcast(&rp->r_cv);
747		mutex_exit(&rp->r_statelock);
748	}
749}
750
751/*
752 * Set attributes cache for given vnode using virtual attributes.
753 *
754 * Set the timeout value on the attribute cache and fill it
755 * with the passed in attributes.
756 *
757 * The caller must be holding r_statelock.
758 */
759void
760nfs_attrcache_va(vnode_t *vp, struct vattr *va)
761{
762	rnode_t *rp;
763	mntinfo_t *mi;
764	hrtime_t delta;
765	hrtime_t now;
766
767	rp = VTOR(vp);
768
769	ASSERT(MUTEX_HELD(&rp->r_statelock));
770
771	now = gethrtime();
772
773	mi = VTOMI(vp);
774
775	/*
776	 * Delta is the number of nanoseconds that we will
777	 * cache the attributes of the file.  It is based on
778	 * the number of nanoseconds since the last time that
779	 * we detected a change.  The assumption is that files
780	 * that changed recently are likely to change again.
	 * There are minimum and maximum values, enforced separately
	 * for regular files and for directories.
783	 *
784	 * Using the time since last change was detected
785	 * eliminates direct comparison or calculation
786	 * using mixed client and server times.  NFS does
787	 * not make any assumptions regarding the client
788	 * and server clocks being synchronized.
789	 */
790	if (va->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
791	    va->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
792	    va->va_size != rp->r_attr.va_size)
793		rp->r_mtime = now;
794
795	if ((mi->mi_flags & MI_NOAC) || (vp->v_flag & VNOCACHE))
796		delta = 0;
797	else {
798		delta = now - rp->r_mtime;
799		if (vp->v_type == VDIR) {
800			if (delta < mi->mi_acdirmin)
801				delta = mi->mi_acdirmin;
802			else if (delta > mi->mi_acdirmax)
803				delta = mi->mi_acdirmax;
804		} else {
805			if (delta < mi->mi_acregmin)
806				delta = mi->mi_acregmin;
807			else if (delta > mi->mi_acregmax)
808				delta = mi->mi_acregmax;
809		}
810	}
811	rp->r_attrtime = now + delta;
812	rp->r_attr = *va;
813	/*
814	 * Update the size of the file if there is no cached data or if
815	 * the cached data is clean and there is no data being written
816	 * out.
817	 */
818	if (rp->r_size != va->va_size &&
819	    (!vn_has_cached_data(vp) ||
820	    (!(rp->r_flags & RDIRTY) && rp->r_count == 0)))
821		rp->r_size = va->va_size;
822	nfs_setswaplike(vp, va);
823	rp->r_flags &= ~RWRITEATTR;
824}
825
826/*
827 * Fill in attribute from the cache.
828 * If valid, then return 0 to indicate that no error occurred,
829 * otherwise return 1 to indicate that an error occurred.
830 */
831static int
832nfs_getattr_cache(vnode_t *vp, struct vattr *vap)
833{
834	rnode_t *rp;
835	uint_t mask = vap->va_mask;
836
837	rp = VTOR(vp);
838	mutex_enter(&rp->r_statelock);
839	if (ATTRCACHE_VALID(vp)) {
840		/*
841		 * Cached attributes are valid
842		 */
843		*vap = rp->r_attr;
844		/*
845		 * Set the caller's va_mask to the set of attributes
846		 * that were requested ANDed with the attributes that
847		 * are available.  If attributes were requested that
848		 * are not available, those bits must be turned off
849		 * in the callers va_mask.
850		 */
851		vap->va_mask &= mask;
852		mutex_exit(&rp->r_statelock);
853		return (0);
854	}
855	mutex_exit(&rp->r_statelock);
856	return (1);
857}
858
859/*
860 * Get attributes over-the-wire and update attributes cache
861 * if no error occurred in the over-the-wire operation.
862 * Return 0 if successful, otherwise error.
863 */
864int
865nfs_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)
866{
867	int error;
868	struct nfsattrstat ns;
869	int douprintf;
870	mntinfo_t *mi;
871	failinfo_t fi;
872	hrtime_t t;
873
874	mi = VTOMI(vp);
875	fi.vp = vp;
876	fi.fhp = NULL;		/* no need to update, filehandle not copied */
877	fi.copyproc = nfscopyfh;
878	fi.lookupproc = nfslookup;
879	fi.xattrdirproc = acl_getxattrdir2;
880
881	if (mi->mi_flags & MI_ACL) {
882		error = acl_getattr2_otw(vp, vap, cr);
883		if (mi->mi_flags & MI_ACL)
884			return (error);
885	}
886
887	douprintf = 1;
888
889	t = gethrtime();
890
891	error = rfs2call(mi, RFS_GETATTR,
892	    xdr_fhandle, (caddr_t)VTOFH(vp),
893	    xdr_attrstat, (caddr_t)&ns, cr,
894	    &douprintf, &ns.ns_status, 0, &fi);
895
896	if (!error) {
897		error = geterrno(ns.ns_status);
898		if (!error)
899			error = nfs_cache_fattr(vp, &ns.ns_attr, vap, t, cr);
900		else {
901			PURGE_STALE_FH(error, vp, cr);
902		}
903	}
904
905	return (error);
906}
907
/*
 * Return either cached or remote attributes.  If we get the remote
 * attributes, use them to check and invalidate the caches, then cache
 * the new attributes.
 */
912int
913nfsgetattr(vnode_t *vp, struct vattr *vap, cred_t *cr)
914{
915	int error;
916	rnode_t *rp;
917
918	/*
919	 * If we've got cached attributes, we're done, otherwise go
920	 * to the server to get attributes, which will update the cache
921	 * in the process.
922	 */
923	error = nfs_getattr_cache(vp, vap);
924	if (error)
925		error = nfs_getattr_otw(vp, vap, cr);
926
927	/* Return the client's view of file size */
928	rp = VTOR(vp);
929	mutex_enter(&rp->r_statelock);
930	vap->va_size = rp->r_size;
931	mutex_exit(&rp->r_statelock);
932
933	return (error);
934}
935
936/*
937 * Get attributes over-the-wire and update attributes cache
938 * if no error occurred in the over-the-wire operation.
939 * Return 0 if successful, otherwise error.
940 */
941int
942nfs3_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)
943{
944	int error;
945	GETATTR3args args;
946	GETATTR3vres res;
947	int douprintf;
948	failinfo_t fi;
949	hrtime_t t;
950
951	args.object = *VTOFH3(vp);
952	fi.vp = vp;
953	fi.fhp = (caddr_t)&args.object;
954	fi.copyproc = nfs3copyfh;
955	fi.lookupproc = nfs3lookup;
956	fi.xattrdirproc = acl_getxattrdir3;
957	res.fres.vp = vp;
958	res.fres.vap = vap;
959
960	douprintf = 1;
961
962	t = gethrtime();
963
964	error = rfs3call(VTOMI(vp), NFSPROC3_GETATTR,
965	    xdr_nfs_fh3, (caddr_t)&args,
966	    xdr_GETATTR3vres, (caddr_t)&res, cr,
967	    &douprintf, &res.status, 0, &fi);
968
969	if (error)
970		return (error);
971
972	error = geterrno3(res.status);
973	if (error) {
974		PURGE_STALE_FH(error, vp, cr);
975		return (error);
976	}
977
978	/*
979	 * Catch status codes that indicate fattr3 to vattr translation failure
980	 */
981	if (res.fres.status)
982		return (res.fres.status);
983
984	nfs_attr_cache(vp, vap, t, cr);
985	return (0);
986}
987
/*
 * Return either cached or remote attributes.  If we get the remote
 * attributes, use them to check and invalidate the caches, then cache
 * the new attributes.
 */
992int
993nfs3getattr(vnode_t *vp, struct vattr *vap, cred_t *cr)
994{
995	int error;
996	rnode_t *rp;
997
998	/*
999	 * If we've got cached attributes, we're done, otherwise go
1000	 * to the server to get attributes, which will update the cache
1001	 * in the process.
1002	 */
1003	error = nfs_getattr_cache(vp, vap);
1004	if (error)
1005		error = nfs3_getattr_otw(vp, vap, cr);
1006
1007	/* Return the client's view of file size */
1008	rp = VTOR(vp);
1009	mutex_enter(&rp->r_statelock);
1010	vap->va_size = rp->r_size;
1011	mutex_exit(&rp->r_statelock);
1012
1013	return (error);
1014}
1015
1016vtype_t nf_to_vt[] = {
1017	VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK
1018};
1019/*
1020 * Convert NFS Version 2 over the network attributes to the local
1021 * virtual attributes.  The mapping between the UID_NOBODY/GID_NOBODY
1022 * network representation and the local representation is done here.
1023 * Returns 0 for success, error if failed due to overflow.
1024 */
1025int
1026nattr_to_vattr(vnode_t *vp, struct nfsfattr *na, struct vattr *vap)
1027{
1028	/* overflow in time attributes? */
1029#ifndef _LP64
1030	if (!NFS2_FATTR_TIME_OK(na))
1031		return (EOVERFLOW);
1032#endif
1033
1034	vap->va_mask = AT_ALL;
1035
1036	if (na->na_type < NFNON || na->na_type > NFSOC)
1037		vap->va_type = VBAD;
1038	else
1039		vap->va_type = nf_to_vt[na->na_type];
1040	vap->va_mode = na->na_mode;
1041	vap->va_uid = (na->na_uid == NFS_UID_NOBODY) ? UID_NOBODY : na->na_uid;
1042	vap->va_gid = (na->na_gid == NFS_GID_NOBODY) ? GID_NOBODY : na->na_gid;
1043	vap->va_fsid = vp->v_vfsp->vfs_dev;
1044	vap->va_nodeid = na->na_nodeid;
1045	vap->va_nlink = na->na_nlink;
1046	vap->va_size = na->na_size;	/* keep for cache validation */
1047	/*
1048	 * nfs protocol defines times as unsigned so don't extend sign,
1049	 * unless sysadmin set nfs_allow_preepoch_time.
1050	 */
1051	NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->na_atime.tv_sec);
1052	vap->va_atime.tv_nsec = (uint32_t)(na->na_atime.tv_usec * 1000);
1053	NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->na_mtime.tv_sec);
1054	vap->va_mtime.tv_nsec = (uint32_t)(na->na_mtime.tv_usec * 1000);
1055	NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->na_ctime.tv_sec);
1056	vap->va_ctime.tv_nsec = (uint32_t)(na->na_ctime.tv_usec * 1000);
	/*
	 * Shannon's law - uncompress the received dev_t
	 * if the top half of it is zero, indicating a response
	 * from an `older style' OS.  The exception is a
	 * `new style' OS sending a major device of zero, in
	 * which case the algorithm still works because the fact
	 * that it is a new style server is hidden by the minor
	 * device not being greater than 255 (a requirement in
	 * this case).
	 */
1067	if ((na->na_rdev & 0xffff0000) == 0)
1068		vap->va_rdev = nfsv2_expdev(na->na_rdev);
1069	else
1070		vap->va_rdev = expldev(na->na_rdev);
1071
1072	vap->va_nblocks = na->na_blocks;
1073	switch (na->na_type) {
1074	case NFBLK:
1075		vap->va_blksize = DEV_BSIZE;
1076		break;
1077
1078	case NFCHR:
1079		vap->va_blksize = MAXBSIZE;
1080		break;
1081
1082	case NFSOC:
1083	default:
1084		vap->va_blksize = na->na_blocksize;
1085		break;
1086	}
1087	/*
1088	 * This bit of ugliness is a hack to preserve the
1089	 * over-the-wire protocols for named-pipe vnodes.
1090	 * It remaps the special over-the-wire type to the
1091	 * VFIFO type. (see note in nfs.h)
1092	 */
1093	if (NA_ISFIFO(na)) {
1094		vap->va_type = VFIFO;
1095		vap->va_mode = (vap->va_mode & ~S_IFMT) | S_IFIFO;
1096		vap->va_rdev = 0;
1097		vap->va_blksize = na->na_blocksize;
1098	}
1099	vap->va_seq = 0;
1100	return (0);
1101}
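
/*
 * Editorial note on the dev_t handling above: an `old style' server
 * sends a 16-bit packed device number, so the upper 16 bits of
 * na_rdev are zero and nfsv2_expdev() is used to expand it; anything
 * with nonzero upper bits is treated as an already expanded dev_t and
 * run through expldev().  Hypothetical example values: na_rdev ==
 * 0x0103 has zero upper bits and takes the nfsv2_expdev() path, while
 * na_rdev == 0x00010003 takes the expldev() path.  (The 16-bit packing
 * format itself is an assumption based on the surrounding comment,
 * which notes that old style minors do not exceed 255.)
 */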
1102
1103/*
1104 * Convert NFS Version 3 over the network attributes to the local
1105 * virtual attributes.  The mapping between the UID_NOBODY/GID_NOBODY
1106 * network representation and the local representation is done here.
1107 */
1108vtype_t nf3_to_vt[] = {
1109	VBAD, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO
1110};
1111
1112int
1113fattr3_to_vattr(vnode_t *vp, fattr3 *na, struct vattr *vap)
1114{
1115
1116#ifndef _LP64
1117	/* overflow in time attributes? */
1118	if (!NFS3_FATTR_TIME_OK(na))
1119		return (EOVERFLOW);
1120#endif
1121	if (!NFS3_SIZE_OK(na->size))
1122		/* file too big */
1123		return (EFBIG);
1124
1125	vap->va_mask = AT_ALL;
1126
1127	if (na->type < NF3REG || na->type > NF3FIFO)
1128		vap->va_type = VBAD;
1129	else
1130		vap->va_type = nf3_to_vt[na->type];
1131	vap->va_mode = na->mode;
1132	vap->va_uid = (na->uid == NFS_UID_NOBODY) ? UID_NOBODY : (uid_t)na->uid;
1133	vap->va_gid = (na->gid == NFS_GID_NOBODY) ? GID_NOBODY : (gid_t)na->gid;
1134	vap->va_fsid = vp->v_vfsp->vfs_dev;
1135	vap->va_nodeid = na->fileid;
1136	vap->va_nlink = na->nlink;
1137	vap->va_size = na->size;
1138
1139	/*
1140	 * nfs protocol defines times as unsigned so don't extend sign,
1141	 * unless sysadmin set nfs_allow_preepoch_time.
1142	 */
1143	NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->atime.seconds);
1144	vap->va_atime.tv_nsec = (uint32_t)na->atime.nseconds;
1145	NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->mtime.seconds);
1146	vap->va_mtime.tv_nsec = (uint32_t)na->mtime.nseconds;
1147	NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->ctime.seconds);
1148	vap->va_ctime.tv_nsec = (uint32_t)na->ctime.nseconds;
1149
1150	switch (na->type) {
1151	case NF3BLK:
1152		vap->va_rdev = makedevice(na->rdev.specdata1,
1153		    na->rdev.specdata2);
1154		vap->va_blksize = DEV_BSIZE;
1155		vap->va_nblocks = 0;
1156		break;
1157	case NF3CHR:
1158		vap->va_rdev = makedevice(na->rdev.specdata1,
1159		    na->rdev.specdata2);
1160		vap->va_blksize = MAXBSIZE;
1161		vap->va_nblocks = 0;
1162		break;
1163	case NF3REG:
1164	case NF3DIR:
1165	case NF3LNK:
1166		vap->va_rdev = 0;
1167		vap->va_blksize = MAXBSIZE;
1168		vap->va_nblocks = (u_longlong_t)
1169		    ((na->used + (size3)DEV_BSIZE - (size3)1) /
1170		    (size3)DEV_BSIZE);
1171		break;
1172	case NF3SOCK:
1173	case NF3FIFO:
1174	default:
1175		vap->va_rdev = 0;
1176		vap->va_blksize = MAXBSIZE;
1177		vap->va_nblocks = 0;
1178		break;
1179	}
1180	vap->va_seq = 0;
1181	return (0);
1182}
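
/*
 * Worked example of the va_nblocks computation above for regular
 * files, directories and symlinks (DEV_BSIZE is 512 here; the byte
 * count is a made-up example value):
 *
 *	na->used = 1000 bytes
 *	va_nblocks = (1000 + 512 - 1) / 512 = 2
 *
 * i.e. the space used is rounded up to whole DEV_BSIZE blocks.
 */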
1183
1184/*
1185 * Asynchronous I/O parameters.  nfs_async_threads is the high-water mark
1186 * for the demand-based allocation of async threads per-mount.  The
1187 * nfs_async_timeout is the amount of time a thread will live after it
1188 * becomes idle, unless new I/O requests are received before the thread
1189 * dies.  See nfs_async_putpage and nfs_async_start.
1190 */
1191
1192int nfs_async_timeout = -1;	/* uninitialized */
1193
1194static void	nfs_async_start(struct vfs *);
1195static void	nfs_async_pgops_start(struct vfs *);
1196static void	nfs_async_common_start(struct vfs *, int);
1197
1198static void
1199free_async_args(struct nfs_async_reqs *args)
1200{
1201	rnode_t *rp;
1202
1203	if (args->a_io != NFS_INACTIVE) {
1204		rp = VTOR(args->a_vp);
1205		mutex_enter(&rp->r_statelock);
1206		rp->r_count--;
1207		if (args->a_io == NFS_PUTAPAGE ||
1208		    args->a_io == NFS_PAGEIO)
1209			rp->r_awcount--;
1210		cv_broadcast(&rp->r_cv);
1211		mutex_exit(&rp->r_statelock);
1212		VN_RELE(args->a_vp);
1213	}
1214	crfree(args->a_cred);
1215	kmem_free(args, sizeof (*args));
1216}
1217
1218/*
1219 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
1220 * pageout(), running in the global zone, have legitimate reasons to do
1221 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts.  We avoid the problem by
 * use of a per-mount "asynchronous requests manager thread" which is
1223 * signaled by the various asynchronous work routines when there is
1224 * asynchronous work to be done.  It is responsible for creating new
1225 * worker threads if necessary, and notifying existing worker threads
1226 * that there is work to be done.
1227 *
1228 * In other words, it will "take the specifications from the customers and
1229 * give them to the engineers."
1230 *
1231 * Worker threads die off of their own accord if they are no longer
1232 * needed.
1233 *
1234 * This thread is killed when the zone is going away or the filesystem
1235 * is being unmounted.
1236 */
1237void
1238nfs_async_manager(vfs_t *vfsp)
1239{
1240	callb_cpr_t cprinfo;
1241	mntinfo_t *mi;
1242	uint_t max_threads;
1243
1244	mi = VFTOMI(vfsp);
1245
1246	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1247	    "nfs_async_manager");
1248
1249	mutex_enter(&mi->mi_async_lock);
1250	/*
1251	 * We want to stash the max number of threads that this mount was
1252	 * allowed so we can use it later when the variable is set to zero as
1253	 * part of the zone/mount going away.
1254	 *
1255	 * We want to be able to create at least one thread to handle
1256	 * asynchronous inactive calls.
1257	 */
1258	max_threads = MAX(mi->mi_max_threads, 1);
1259	/*
1260	 * We don't want to wait for mi_max_threads to go to zero, since that
1261	 * happens as part of a failed unmount, but this thread should only
1262	 * exit when the mount/zone is really going away.
1263	 *
1264	 * Once MI_ASYNC_MGR_STOP is set, no more async operations will be
1265	 * attempted: the various _async_*() functions know to do things
1266	 * inline if mi_max_threads == 0.  Henceforth we just drain out the
1267	 * outstanding requests.
1268	 *
1269	 * Note that we still create zthreads even if we notice the zone is
1270	 * shutting down (MI_ASYNC_MGR_STOP is set); this may cause the zone
1271	 * shutdown sequence to take slightly longer in some cases, but
1272	 * doesn't violate the protocol, as all threads will exit as soon as
1273	 * they're done processing the remaining requests.
1274	 */
1275	for (;;) {
1276		while (mi->mi_async_req_count > 0) {
1277			/*
1278			 * Paranoia: If the mount started out having
1279			 * (mi->mi_max_threads == 0), and the value was
1280			 * later changed (via a debugger or somesuch),
1281			 * we could be confused since we will think we
1282			 * can't create any threads, and the calling
1283			 * code (which looks at the current value of
1284			 * mi->mi_max_threads, now non-zero) thinks we
1285			 * can.
1286			 *
1287			 * So, because we're paranoid, we create threads
1288			 * up to the maximum of the original and the
1289			 * current value. This means that future
1290			 * (debugger-induced) lowerings of
1291			 * mi->mi_max_threads are ignored for our
1292			 * purposes, but who told them they could change
1293			 * random values on a live kernel anyhow?
1294			 */
1295			if (mi->mi_threads[NFS_ASYNC_QUEUE] <
1296			    MAX(mi->mi_max_threads, max_threads)) {
1297				mi->mi_threads[NFS_ASYNC_QUEUE]++;
1298				mutex_exit(&mi->mi_async_lock);
1299				VFS_HOLD(vfsp);	/* hold for new thread */
1300				(void) zthread_create(NULL, 0, nfs_async_start,
1301				    vfsp, 0, minclsyspri);
1302				mutex_enter(&mi->mi_async_lock);
1303			} else if (mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] <
1304			    NUM_ASYNC_PGOPS_THREADS) {
1305				mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE]++;
1306				mutex_exit(&mi->mi_async_lock);
1307				VFS_HOLD(vfsp); /* hold for new thread */
1308				(void) zthread_create(NULL, 0,
1309				    nfs_async_pgops_start, vfsp, 0,
1310				    minclsyspri);
1311				mutex_enter(&mi->mi_async_lock);
1312			}
1313			NFS_WAKE_ASYNC_WORKER(mi->mi_async_work_cv);
1314			ASSERT(mi->mi_async_req_count != 0);
1315			mi->mi_async_req_count--;
1316		}
1317
1318		mutex_enter(&mi->mi_lock);
1319		if (mi->mi_flags & MI_ASYNC_MGR_STOP) {
1320			mutex_exit(&mi->mi_lock);
1321			break;
1322		}
1323		mutex_exit(&mi->mi_lock);
1324
1325		CALLB_CPR_SAFE_BEGIN(&cprinfo);
1326		cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
1327		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1328	}
1329	/*
1330	 * Let everyone know we're done.
1331	 */
1332	mi->mi_manager_thread = NULL;
1333	cv_broadcast(&mi->mi_async_cv);
1334
1335	/*
1336	 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
1337	 * since CALLB_CPR_EXIT is actually responsible for releasing
1338	 * 'mi_async_lock'.
1339	 */
1340	CALLB_CPR_EXIT(&cprinfo);
1341	VFS_RELE(vfsp);	/* release thread's hold */
1342	zthread_exit();
1343}
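
/*
 * Sketch of how asynchronous work flows into the manager thread above
 * (an editorial summary; every name used here appears elsewhere in
 * this file):
 *
 *	queueing routine (e.g. nfs_async_putapage):
 *		mutex_enter(&mi->mi_async_lock);
 *		link the request onto mi_async_reqs[]/mi_async_tail[];
 *		mi->mi_async_req_count++;
 *		cv_signal(&mi->mi_async_reqs_cv);	wake the manager
 *		mutex_exit(&mi->mi_async_lock);
 *
 *	manager (nfs_async_manager above):
 *		while (mi_async_req_count > 0)
 *			create a worker zthread if under the limit;
 *			NFS_WAKE_ASYNC_WORKER(mi_async_work_cv);
 *			mi_async_req_count--;
 *
 *	workers (nfs_async_common_start):
 *		pull requests off the queues and perform the i/o.
 */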
1344
1345/*
1346 * Signal (and wait for) the async manager thread to clean up and go away.
1347 */
1348void
1349nfs_async_manager_stop(vfs_t *vfsp)
1350{
1351	mntinfo_t *mi = VFTOMI(vfsp);
1352
1353	mutex_enter(&mi->mi_async_lock);
1354	mutex_enter(&mi->mi_lock);
1355	mi->mi_flags |= MI_ASYNC_MGR_STOP;
1356	mutex_exit(&mi->mi_lock);
1357	cv_broadcast(&mi->mi_async_reqs_cv);
1358	while (mi->mi_manager_thread != NULL)
1359		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1360	mutex_exit(&mi->mi_async_lock);
1361}
1362
1363int
1364nfs_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
1365    struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
1366    u_offset_t, caddr_t, struct seg *, cred_t *))
1367{
1368	rnode_t *rp;
1369	mntinfo_t *mi;
1370	struct nfs_async_reqs *args;
1371
1372	rp = VTOR(vp);
1373	ASSERT(rp->r_freef == NULL);
1374
1375	mi = VTOMI(vp);
1376
1377	/*
1378	 * If addr falls in a different segment, don't bother doing readahead.
1379	 */
1380	if (addr >= seg->s_base + seg->s_size)
1381		return (-1);
1382
1383	/*
1384	 * If we can't allocate a request structure, punt on the readahead.
1385	 */
1386	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1387		return (-1);
1388
1389	/*
1390	 * If a lock operation is pending, don't initiate any new
1391	 * readaheads.  Otherwise, bump r_count to indicate the new
1392	 * asynchronous I/O.
1393	 */
1394	if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
1395		kmem_free(args, sizeof (*args));
1396		return (-1);
1397	}
1398	mutex_enter(&rp->r_statelock);
1399	rp->r_count++;
1400	mutex_exit(&rp->r_statelock);
1401	nfs_rw_exit(&rp->r_lkserlock);
1402
1403	args->a_next = NULL;
1404#ifdef DEBUG
1405	args->a_queuer = curthread;
1406#endif
1407	VN_HOLD(vp);
1408	args->a_vp = vp;
1409	ASSERT(cr != NULL);
1410	crhold(cr);
1411	args->a_cred = cr;
1412	args->a_io = NFS_READ_AHEAD;
1413	args->a_nfs_readahead = readahead;
1414	args->a_nfs_blkoff = blkoff;
1415	args->a_nfs_seg = seg;
1416	args->a_nfs_addr = addr;
1417
1418	mutex_enter(&mi->mi_async_lock);
1419
1420	/*
1421	 * If asyncio has been disabled, don't bother readahead.
1422	 */
1423	if (mi->mi_max_threads == 0) {
1424		mutex_exit(&mi->mi_async_lock);
1425		goto noasync;
1426	}
1427
1428	/*
1429	 * Link request structure into the async list and
1430	 * wakeup async thread to do the i/o.
1431	 */
1432	if (mi->mi_async_reqs[NFS_READ_AHEAD] == NULL) {
1433		mi->mi_async_reqs[NFS_READ_AHEAD] = args;
1434		mi->mi_async_tail[NFS_READ_AHEAD] = args;
1435	} else {
1436		mi->mi_async_tail[NFS_READ_AHEAD]->a_next = args;
1437		mi->mi_async_tail[NFS_READ_AHEAD] = args;
1438	}
1439
1440	if (mi->mi_io_kstats) {
1441		mutex_enter(&mi->mi_lock);
1442		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1443		mutex_exit(&mi->mi_lock);
1444	}
1445
1446	mi->mi_async_req_count++;
1447	ASSERT(mi->mi_async_req_count != 0);
1448	cv_signal(&mi->mi_async_reqs_cv);
1449	mutex_exit(&mi->mi_async_lock);
1450	return (0);
1451
1452noasync:
1453	mutex_enter(&rp->r_statelock);
1454	rp->r_count--;
1455	cv_broadcast(&rp->r_cv);
1456	mutex_exit(&rp->r_statelock);
1457	VN_RELE(vp);
1458	crfree(cr);
1459	kmem_free(args, sizeof (*args));
1460	return (-1);
1461}
1462
1463int
1464nfs_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
1465    int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
1466    u_offset_t, size_t, int, cred_t *))
1467{
1468	rnode_t *rp;
1469	mntinfo_t *mi;
1470	struct nfs_async_reqs *args;
1471
1472	ASSERT(flags & B_ASYNC);
1473	ASSERT(vp->v_vfsp != NULL);
1474
1475	rp = VTOR(vp);
1476	ASSERT(rp->r_count > 0);
1477
1478	mi = VTOMI(vp);
1479
1480	/*
1481	 * If we can't allocate a request structure, do the putpage
1482	 * operation synchronously in this thread's context.
1483	 */
1484	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1485		goto noasync;
1486
1487	args->a_next = NULL;
1488#ifdef DEBUG
1489	args->a_queuer = curthread;
1490#endif
1491	VN_HOLD(vp);
1492	args->a_vp = vp;
1493	ASSERT(cr != NULL);
1494	crhold(cr);
1495	args->a_cred = cr;
1496	args->a_io = NFS_PUTAPAGE;
1497	args->a_nfs_putapage = putapage;
1498	args->a_nfs_pp = pp;
1499	args->a_nfs_off = off;
1500	args->a_nfs_len = (uint_t)len;
1501	args->a_nfs_flags = flags;
1502
1503	mutex_enter(&mi->mi_async_lock);
1504
1505	/*
1506	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async I/O was disabled
1508	 * while this thread was blocked waiting for memory pressure to
1509	 * reduce or for the queue to drain.
1510	 */
1511	if (mi->mi_max_threads == 0) {
1512		mutex_exit(&mi->mi_async_lock);
1513		goto noasync;
1514	}
1515
1516	/*
1517	 * Link request structure into the async list and
1518	 * wakeup async thread to do the i/o.
1519	 */
1520	if (mi->mi_async_reqs[NFS_PUTAPAGE] == NULL) {
1521		mi->mi_async_reqs[NFS_PUTAPAGE] = args;
1522		mi->mi_async_tail[NFS_PUTAPAGE] = args;
1523	} else {
1524		mi->mi_async_tail[NFS_PUTAPAGE]->a_next = args;
1525		mi->mi_async_tail[NFS_PUTAPAGE] = args;
1526	}
1527
1528	mutex_enter(&rp->r_statelock);
1529	rp->r_count++;
1530	rp->r_awcount++;
1531	mutex_exit(&rp->r_statelock);
1532
1533	if (mi->mi_io_kstats) {
1534		mutex_enter(&mi->mi_lock);
1535		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1536		mutex_exit(&mi->mi_lock);
1537	}
1538
1539	mi->mi_async_req_count++;
1540	ASSERT(mi->mi_async_req_count != 0);
1541	cv_signal(&mi->mi_async_reqs_cv);
1542	mutex_exit(&mi->mi_async_lock);
1543	return (0);
1544
1545noasync:
1546	if (args != NULL) {
1547		VN_RELE(vp);
1548		crfree(cr);
1549		kmem_free(args, sizeof (*args));
1550	}
1551
1552	if (curproc == proc_pageout || curproc == proc_fsflush) {
1553		/*
1554		 * If we get here in the context of the pageout/fsflush,
1555		 * we refuse to do a sync write, because this may hang
1556		 * pageout (and the machine). In this case, we just
1557		 * re-mark the page as dirty and punt on the page.
1558		 *
1559		 * Make sure B_FORCE isn't set.  We can re-mark the
1560		 * pages as dirty and unlock the pages in one swoop by
1561		 * passing in B_ERROR to pvn_write_done().  However,
1562		 * we should make sure B_FORCE isn't set - we don't
1563		 * want the page tossed before it gets written out.
1564		 */
1565		if (flags & B_FORCE)
1566			flags &= ~(B_INVAL | B_FORCE);
1567		pvn_write_done(pp, flags | B_ERROR);
1568		return (0);
1569	}
1570	if (nfs_zone() != mi->mi_zone) {
1571		/*
1572		 * So this was a cross-zone sync putpage.  We pass in B_ERROR
1573		 * to pvn_write_done() to re-mark the pages as dirty and unlock
1574		 * them.
1575		 *
1576		 * We don't want to clear B_FORCE here as the caller presumably
1577		 * knows what they're doing if they set it.
1578		 */
1579		pvn_write_done(pp, flags | B_ERROR);
1580		return (EPERM);
1581	}
1582	return ((*putapage)(vp, pp, off, len, flags, cr));
1583}
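
/*
 * Summary of the noasync fallback above (editorial restatement of the
 * code, no new behavior):
 *
 *	caller context			fallback
 *	--------------			--------
 *	pageout()/fsflush()		redirty the pages via
 *					pvn_write_done(flags | B_ERROR),
 *					never write synchronously
 *	wrong zone			redirty the pages, return EPERM
 *	anything else			call (*putapage)() synchronously
 */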
1584
1585int
1586nfs_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
1587    int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
1588    size_t, int, cred_t *))
1589{
1590	rnode_t *rp;
1591	mntinfo_t *mi;
1592	struct nfs_async_reqs *args;
1593
1594	ASSERT(flags & B_ASYNC);
1595	ASSERT(vp->v_vfsp != NULL);
1596
1597	rp = VTOR(vp);
1598	ASSERT(rp->r_count > 0);
1599
1600	mi = VTOMI(vp);
1601
1602	/*
1603	 * If we can't allocate a request structure, do the pageio
1604	 * request synchronously in this thread's context.
1605	 */
1606	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1607		goto noasync;
1608
1609	args->a_next = NULL;
1610#ifdef DEBUG
1611	args->a_queuer = curthread;
1612#endif
1613	VN_HOLD(vp);
1614	args->a_vp = vp;
1615	ASSERT(cr != NULL);
1616	crhold(cr);
1617	args->a_cred = cr;
1618	args->a_io = NFS_PAGEIO;
1619	args->a_nfs_pageio = pageio;
1620	args->a_nfs_pp = pp;
1621	args->a_nfs_off = io_off;
1622	args->a_nfs_len = (uint_t)io_len;
1623	args->a_nfs_flags = flags;
1624
1625	mutex_enter(&mi->mi_async_lock);
1626
1627	/*
1628	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async I/O was disabled
1630	 * while this thread was blocked waiting for memory pressure to
1631	 * reduce or for the queue to drain.
1632	 */
1633	if (mi->mi_max_threads == 0) {
1634		mutex_exit(&mi->mi_async_lock);
1635		goto noasync;
1636	}
1637
1638	/*
1639	 * Link request structure into the async list and
1640	 * wakeup async thread to do the i/o.
1641	 */
1642	if (mi->mi_async_reqs[NFS_PAGEIO] == NULL) {
1643		mi->mi_async_reqs[NFS_PAGEIO] = args;
1644		mi->mi_async_tail[NFS_PAGEIO] = args;
1645	} else {
1646		mi->mi_async_tail[NFS_PAGEIO]->a_next = args;
1647		mi->mi_async_tail[NFS_PAGEIO] = args;
1648	}
1649
1650	mutex_enter(&rp->r_statelock);
1651	rp->r_count++;
1652	rp->r_awcount++;
1653	mutex_exit(&rp->r_statelock);
1654
1655	if (mi->mi_io_kstats) {
1656		mutex_enter(&mi->mi_lock);
1657		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1658		mutex_exit(&mi->mi_lock);
1659	}
1660
1661	mi->mi_async_req_count++;
1662	ASSERT(mi->mi_async_req_count != 0);
1663	cv_signal(&mi->mi_async_reqs_cv);
1664	mutex_exit(&mi->mi_async_lock);
1665	return (0);
1666
1667noasync:
1668	if (args != NULL) {
1669		VN_RELE(vp);
1670		crfree(cr);
1671		kmem_free(args, sizeof (*args));
1672	}
1673
1674	/*
1675	 * If we can't do it ASYNC, for reads we do nothing (but cleanup
1676	 * the page list), for writes we do it synchronously, except for
1677	 * proc_pageout/proc_fsflush as described below.
1678	 */
1679	if (flags & B_READ) {
1680		pvn_read_done(pp, flags | B_ERROR);
1681		return (0);
1682	}
1683
1684	if (curproc == proc_pageout || curproc == proc_fsflush) {
1685		/*
1686		 * If we get here in the context of the pageout/fsflush,
1687		 * we refuse to do a sync write, because this may hang
1688		 * pageout/fsflush (and the machine). In this case, we just
1689		 * re-mark the page as dirty and punt on the page.
1690		 *
1691		 * Make sure B_FORCE isn't set.  We can re-mark the
1692		 * pages as dirty and unlock the pages in one swoop by
1693		 * passing in B_ERROR to pvn_write_done().  However,
1694		 * we should make sure B_FORCE isn't set - we don't
1695		 * want the page tossed before it gets written out.
1696		 */
1697		if (flags & B_FORCE)
1698			flags &= ~(B_INVAL | B_FORCE);
1699		pvn_write_done(pp, flags | B_ERROR);
1700		return (0);
1701	}
1702
1703	if (nfs_zone() != mi->mi_zone) {
1704		/*
1705		 * So this was a cross-zone sync pageio.  We pass in B_ERROR
1706		 * to pvn_write_done() to re-mark the pages as dirty and unlock
1707		 * them.
1708		 *
1709		 * We don't want to clear B_FORCE here as the caller presumably
1710		 * knows what they're doing if they set it.
1711		 */
1712		pvn_write_done(pp, flags | B_ERROR);
1713		return (EPERM);
1714	}
1715	return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
1716}
1717
1718void
1719nfs_async_readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr,
1720    int (*readdir)(vnode_t *, rddir_cache *, cred_t *))
1721{
1722	rnode_t *rp;
1723	mntinfo_t *mi;
1724	struct nfs_async_reqs *args;
1725
1726	rp = VTOR(vp);
1727	ASSERT(rp->r_freef == NULL);
1728
1729	mi = VTOMI(vp);
1730
1731	/*
1732	 * If we can't allocate a request structure, do the readdir
1733	 * operation synchronously in this thread's context.
1734	 */
1735	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1736		goto noasync;
1737
1738	args->a_next = NULL;
1739#ifdef DEBUG
1740	args->a_queuer = curthread;
1741#endif
1742	VN_HOLD(vp);
1743	args->a_vp = vp;
1744	ASSERT(cr != NULL);
1745	crhold(cr);
1746	args->a_cred = cr;
1747	args->a_io = NFS_READDIR;
1748	args->a_nfs_readdir = readdir;
1749	args->a_nfs_rdc = rdc;
1750
1751	mutex_enter(&mi->mi_async_lock);
1752
1753	/*
1754	 * If asyncio has been disabled, then make a synchronous request.
1755	 */
1756	if (mi->mi_max_threads == 0) {
1757		mutex_exit(&mi->mi_async_lock);
1758		goto noasync;
1759	}
1760
1761	/*
1762	 * Link request structure into the async list and
1763	 * wakeup async thread to do the i/o.
1764	 */
1765	if (mi->mi_async_reqs[NFS_READDIR] == NULL) {
1766		mi->mi_async_reqs[NFS_READDIR] = args;
1767		mi->mi_async_tail[NFS_READDIR] = args;
1768	} else {
1769		mi->mi_async_tail[NFS_READDIR]->a_next = args;
1770		mi->mi_async_tail[NFS_READDIR] = args;
1771	}
1772
1773	mutex_enter(&rp->r_statelock);
1774	rp->r_count++;
1775	mutex_exit(&rp->r_statelock);
1776
1777	if (mi->mi_io_kstats) {
1778		mutex_enter(&mi->mi_lock);
1779		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1780		mutex_exit(&mi->mi_lock);
1781	}
1782
1783	mi->mi_async_req_count++;
1784	ASSERT(mi->mi_async_req_count != 0);
1785	cv_signal(&mi->mi_async_reqs_cv);
1786	mutex_exit(&mi->mi_async_lock);
1787	return;
1788
1789noasync:
1790	if (args != NULL) {
1791		VN_RELE(vp);
1792		crfree(cr);
1793		kmem_free(args, sizeof (*args));
1794	}
1795
1796	rdc->entries = NULL;
1797	mutex_enter(&rp->r_statelock);
1798	ASSERT(rdc->flags & RDDIR);
1799	rdc->flags &= ~RDDIR;
1800	rdc->flags |= RDDIRREQ;
	/*
	 * If RDDIRWAIT is set, wake up the thread sleeping in
	 * cv_wait_sig().  The woken thread will reset the flag to
	 * RDDIR and will continue with the readdir operation.
	 */
1807	if (rdc->flags & RDDIRWAIT) {
1808		rdc->flags &= ~RDDIRWAIT;
1809		cv_broadcast(&rdc->cv);
1810	}
1811	mutex_exit(&rp->r_statelock);
1812	rddir_cache_rele(rdc);
1813}
1814
1815void
1816nfs_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
1817    cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3, cred_t *))
1818{
1819	rnode_t *rp;
1820	mntinfo_t *mi;
1821	struct nfs_async_reqs *args;
1822	page_t *pp;
1823
1824	rp = VTOR(vp);
1825	mi = VTOMI(vp);
1826
1827	/*
1828	 * If we can't allocate a request structure, do the commit
1829	 * operation synchronously in this thread's context.
1830	 */
1831	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1832		goto noasync;
1833
1834	args->a_next = NULL;
1835#ifdef DEBUG
1836	args->a_queuer = curthread;
1837#endif
1838	VN_HOLD(vp);
1839	args->a_vp = vp;
1840	ASSERT(cr != NULL);
1841	crhold(cr);
1842	args->a_cred = cr;
1843	args->a_io = NFS_COMMIT;
1844	args->a_nfs_commit = commit;
1845	args->a_nfs_plist = plist;
1846	args->a_nfs_offset = offset;
1847	args->a_nfs_count = count;
1848
1849	mutex_enter(&mi->mi_async_lock);
1850
1851	/*
1852	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async I/O was disabled
1854	 * while this thread was blocked waiting for memory pressure to
1855	 * reduce or for the queue to drain.
1856	 */
1857	if (mi->mi_max_threads == 0) {
1858		mutex_exit(&mi->mi_async_lock);
1859		goto noasync;
1860	}
1861
1862	/*
1863	 * Link request structure into the async list and
1864	 * wakeup async thread to do the i/o.
1865	 */
1866	if (mi->mi_async_reqs[NFS_COMMIT] == NULL) {
1867		mi->mi_async_reqs[NFS_COMMIT] = args;
1868		mi->mi_async_tail[NFS_COMMIT] = args;
1869	} else {
1870		mi->mi_async_tail[NFS_COMMIT]->a_next = args;
1871		mi->mi_async_tail[NFS_COMMIT] = args;
1872	}
1873
1874	mutex_enter(&rp->r_statelock);
1875	rp->r_count++;
1876	mutex_exit(&rp->r_statelock);
1877
1878	if (mi->mi_io_kstats) {
1879		mutex_enter(&mi->mi_lock);
1880		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1881		mutex_exit(&mi->mi_lock);
1882	}
1883
1884	mi->mi_async_req_count++;
1885	ASSERT(mi->mi_async_req_count != 0);
1886	cv_signal(&mi->mi_async_reqs_cv);
1887	mutex_exit(&mi->mi_async_lock);
1888	return;
1889
1890noasync:
1891	if (args != NULL) {
1892		VN_RELE(vp);
1893		crfree(cr);
1894		kmem_free(args, sizeof (*args));
1895	}
1896
1897	if (curproc == proc_pageout || curproc == proc_fsflush ||
1898	    nfs_zone() != mi->mi_zone) {
1899		while (plist != NULL) {
1900			pp = plist;
1901			page_sub(&plist, pp);
1902			pp->p_fsdata = C_COMMIT;
1903			page_unlock(pp);
1904		}
1905		return;
1906	}
1907	(*commit)(vp, plist, offset, count, cr);
1908}
1909
1910void
1911nfs_async_inactive(vnode_t *vp, cred_t *cr,
1912    void (*inactive)(vnode_t *, cred_t *, caller_context_t *))
1913{
1914	mntinfo_t *mi;
1915	struct nfs_async_reqs *args;
1916
1917	mi = VTOMI(vp);
1918
1919	args = kmem_alloc(sizeof (*args), KM_SLEEP);
1920	args->a_next = NULL;
1921#ifdef DEBUG
1922	args->a_queuer = curthread;
1923#endif
1924	args->a_vp = vp;
1925	ASSERT(cr != NULL);
1926	crhold(cr);
1927	args->a_cred = cr;
1928	args->a_io = NFS_INACTIVE;
1929	args->a_nfs_inactive = inactive;
1930
1931	/*
1932	 * Note that we don't check mi->mi_max_threads here, since we
1933	 * *need* to get rid of this vnode regardless of whether someone
1934	 * set nfs3_max_threads/nfs_max_threads to zero in /etc/system.
1935	 *
1936	 * The manager thread knows about this and is willing to create
1937	 * at least one thread to accommodate us.
1938	 */
1939	mutex_enter(&mi->mi_async_lock);
1940	if (mi->mi_manager_thread == NULL) {
1941		rnode_t *rp = VTOR(vp);
1942
1943		mutex_exit(&mi->mi_async_lock);
1944		crfree(cr);	/* drop our reference */
1945		kmem_free(args, sizeof (*args));
1946		/*
1947		 * We can't do an over-the-wire call since we're in the wrong
1948		 * zone, so we need to clean up state as best we can and then
1949		 * throw away the vnode.
1950		 */
1951		mutex_enter(&rp->r_statelock);
1952		if (rp->r_unldvp != NULL) {
1953			vnode_t *unldvp;
1954			char *unlname;
1955			cred_t *unlcred;
1956
1957			unldvp = rp->r_unldvp;
1958			rp->r_unldvp = NULL;
1959			unlname = rp->r_unlname;
1960			rp->r_unlname = NULL;
1961			unlcred = rp->r_unlcred;
1962			rp->r_unlcred = NULL;
1963			mutex_exit(&rp->r_statelock);
1964
1965			VN_RELE(unldvp);
1966			kmem_free(unlname, MAXNAMELEN);
1967			crfree(unlcred);
1968		} else {
1969			mutex_exit(&rp->r_statelock);
1970		}
1971		/*
1972		 * No need to explicitly throw away any cached pages.  The
1973		 * eventual rinactive() will attempt a synchronous
1974		 * VOP_PUTPAGE() which will immediately fail since the request
1975		 * is coming from the wrong zone, and then will proceed to call
1976		 * nfs_invalidate_pages() which will clean things up for us.
1977		 */
1978		rp_addfree(VTOR(vp), cr);
1979		return;
1980	}
1981
1982	if (mi->mi_async_reqs[NFS_INACTIVE] == NULL) {
1983		mi->mi_async_reqs[NFS_INACTIVE] = args;
1984	} else {
1985		mi->mi_async_tail[NFS_INACTIVE]->a_next = args;
1986	}
1987	mi->mi_async_tail[NFS_INACTIVE] = args;
1988	/*
1989	 * Don't increment r_count, since we're trying to get rid of the vnode.
1990	 */
1991
1992	mi->mi_async_req_count++;
1993	ASSERT(mi->mi_async_req_count != 0);
1994	cv_signal(&mi->mi_async_reqs_cv);
1995	mutex_exit(&mi->mi_async_lock);
1996}
1997
1998static void
1999nfs_async_start(struct vfs *vfsp)
2000{
2001	nfs_async_common_start(vfsp, NFS_ASYNC_QUEUE);
2002}
2003
2004static void
2005nfs_async_pgops_start(struct vfs *vfsp)
2006{
2007	nfs_async_common_start(vfsp, NFS_ASYNC_PGOPS_QUEUE);
2008}
2009
2010/*
2011 * The async queues for each mounted file system are arranged as a
2012 * set of queues, one for each async i/o type.  Requests are taken
2013 * from the queues in a round-robin fashion.  A number of consecutive
2014 * requests are taken from each queue before moving on to the next
2015 * queue.  This functionality may allow the NFS Version 2 server to do
2016 * write clustering, even if the client is mixing writes and reads,
2017 * because it will take multiple write requests from the queue
2018 * before processing any of the other async i/o types.
2019 *
2020 * XXX The nfs_async_common_start thread is unsafe in light of the present
2021 * model defined by cpr to suspend the system.  Specifically, over-the-wire
2022 * calls are cpr-unsafe.  The thread should be reevaluated in case of
2023 * future updates to the cpr model.
2024 */
2025static void
2026nfs_async_common_start(struct vfs *vfsp, int async_queue)
2027{
2028	struct nfs_async_reqs *args;
2029	mntinfo_t *mi = VFTOMI(vfsp);
2030	clock_t time_left = 1;
2031	callb_cpr_t cprinfo;
2032	int i;
2033	int async_types;
2034	kcondvar_t *async_work_cv;
2035
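	/*
	 * Each mount has two pools of worker threads: one servicing the
	 * general async queues and one servicing the page-op-only queues.
	 * Select the set of queue types and the work cv for the pool this
	 * thread belongs to.
	 */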
2036	if (async_queue == NFS_ASYNC_QUEUE) {
2037		async_types = NFS_ASYNC_TYPES;
2038		async_work_cv = &mi->mi_async_work_cv[NFS_ASYNC_QUEUE];
2039	} else {
2040		async_types = NFS_ASYNC_PGOPS_TYPES;
2041		async_work_cv = &mi->mi_async_work_cv[NFS_ASYNC_PGOPS_QUEUE];
2042	}
2043
2044	/*
2045	 * Dynamic initialization of nfs_async_timeout to allow nfs to be
2046	 * built in an implementation-independent manner.
2047	 */
2048	if (nfs_async_timeout == -1)
2049		nfs_async_timeout = NFS_ASYNC_TIMEOUT;
2050
2051	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");
2052
2053	mutex_enter(&mi->mi_async_lock);
2054	for (;;) {
2055		/*
2056		 * Find the next queue containing an entry.  We start
2057		 * at the current queue pointer and then round robin
2058		 * through all of them until we either find a non-empty
2059		 * queue or have looked through all of them.
2060		 */
2061		for (i = 0; i < async_types; i++) {
2062			args = *mi->mi_async_curr[async_queue];
2063			if (args != NULL)
2064				break;
2065			mi->mi_async_curr[async_queue]++;
2066			if (mi->mi_async_curr[async_queue] ==
2067			    &mi->mi_async_reqs[async_types]) {
2068				mi->mi_async_curr[async_queue] =
2069				    &mi->mi_async_reqs[0];
2070			}
2071		}
2072		/*
2073		 * If we didn't find an entry, then block until woken up
2074		 * again and then look through the queues again.
2075		 */
2076		if (args == NULL) {
2077			/*
2078			 * Exiting is considered to be safe for CPR as well
2079			 */
2080			CALLB_CPR_SAFE_BEGIN(&cprinfo);
2081
2082			/*
2083			 * Wake up the thread waiting to unmount the file
2084			 * system only if all async threads are inactive.
2085			 *
2086			 * If we've timed out and there's nothing to do,
2087			 * then get rid of this thread.
2088			 */
2089			if (mi->mi_max_threads == 0 || time_left <= 0) {
2090				--mi->mi_threads[async_queue];
2091
2092				if (mi->mi_threads[NFS_ASYNC_QUEUE] == 0 &&
2093				    mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] == 0)
2094					cv_signal(&mi->mi_async_cv);
2095				CALLB_CPR_EXIT(&cprinfo);
2096				VFS_RELE(vfsp);	/* release thread's hold */
2097				zthread_exit();
2098				/* NOTREACHED */
2099			}
2100			time_left = cv_reltimedwait(async_work_cv,
2101			    &mi->mi_async_lock, nfs_async_timeout,
2102			    TR_CLOCK_TICK);
2103
2104			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
2105
2106			continue;
2107		}
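		/* Found a request, so reset the idle-timeout indicator. */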
2108		time_left = 1;
2109
2110		/*
2111		 * Remove the request from the async queue and then
2112		 * update the current async request queue pointer.  If
2113		 * the current queue is empty or we have removed enough
2114		 * consecutive entries from it, then reset the counter
2115		 * for this queue and then move the current pointer to
2116		 * the next queue.
2117		 */
2118		*mi->mi_async_curr[async_queue] = args->a_next;
2119		if (*mi->mi_async_curr[async_queue] == NULL ||
2120		    --mi->mi_async_clusters[args->a_io] == 0) {
2121			mi->mi_async_clusters[args->a_io] =
2122			    mi->mi_async_init_clusters;
2123			mi->mi_async_curr[async_queue]++;
2124			if (mi->mi_async_curr[async_queue] ==
2125			    &mi->mi_async_reqs[async_types]) {
2126				mi->mi_async_curr[async_queue] =
2127				    &mi->mi_async_reqs[0];
2128			}
2129		}
2130
2131		if (args->a_io != NFS_INACTIVE && mi->mi_io_kstats) {
2132			mutex_enter(&mi->mi_lock);
2133			kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
2134			mutex_exit(&mi->mi_lock);
2135		}
2136
2137		mutex_exit(&mi->mi_async_lock);
2138
2139		/*
2140		 * Dispatch the requested operation using the saved arguments.
2141		 */
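		/*
		 * Note that read-ahead is only an optimization, so it is
		 * quietly dropped if async i/o has since been disabled
		 * (mi_max_threads == 0).
		 */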
2142		if (args->a_io == NFS_READ_AHEAD && mi->mi_max_threads > 0) {
2143			(*args->a_nfs_readahead)(args->a_vp, args->a_nfs_blkoff,
2144			    args->a_nfs_addr, args->a_nfs_seg,
2145			    args->a_cred);
2146		} else if (args->a_io == NFS_PUTAPAGE) {
2147			(void) (*args->a_nfs_putapage)(args->a_vp,
2148			    args->a_nfs_pp, args->a_nfs_off,
2149			    args->a_nfs_len, args->a_nfs_flags,
2150			    args->a_cred);
2151		} else if (args->a_io == NFS_PAGEIO) {
2152			(void) (*args->a_nfs_pageio)(args->a_vp,
2153			    args->a_nfs_pp, args->a_nfs_off,
2154			    args->a_nfs_len, args->a_nfs_flags,
2155			    args->a_cred);
2156		} else if (args->a_io == NFS_READDIR) {
2157			(void) ((*args->a_nfs_readdir)(args->a_vp,
2158			    args->a_nfs_rdc, args->a_cred));
2159		} else if (args->a_io == NFS_COMMIT) {
2160			(*args->a_nfs_commit)(args->a_vp, args->a_nfs_plist,
2161			    args->a_nfs_offset, args->a_nfs_count,
2162			    args->a_cred);
2163		} else if (args->a_io == NFS_INACTIVE) {
2164			(*args->a_nfs_inactive)(args->a_vp, args->a_cred, NULL);
2165		}
2166
2167		/*
2168		 * Now, release the vnode and free the credentials
2169		 * structure.
2170		 */
2171		free_async_args(args);
2172		/*
2173		 * Reacquire the mutex; it is needed at the top of the loop.
2174		 */
2175		mutex_enter(&mi->mi_async_lock);
2176	}
2177}
2178
2179void
2180nfs_async_stop(struct vfs *vfsp)
2181{
2182	mntinfo_t *mi = VFTOMI(vfsp);
2183
2184	/*
2185	 * Wait for all outstanding async operations to complete and for the
2186	 * worker threads to exit.
2187	 */
2188	mutex_enter(&mi->mi_async_lock);
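	/*
	 * Setting mi_max_threads to zero tells the worker threads in
	 * nfs_async_common_start() to exit once their queues drain; the
	 * wakeup below prods any threads that are currently idle.
	 */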
2189	mi->mi_max_threads = 0;
2190	NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2191	while (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 ||
2192	    mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0)
2193		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
2194	mutex_exit(&mi->mi_async_lock);
2195}
2196
2197/*
2198 * nfs_async_stop_sig:
2199 * Wait for all outstanding putpage operations to complete. If a signal
2200 * is delivered we will abort and return non-zero. If we can put all the
2201 * pages we will return 0. This routine is called from nfs_unmount and
2202 * nfs3_unmount to make these operations interruptible.
2203 */
2204int
2205nfs_async_stop_sig(struct vfs *vfsp)
2206{
2207	mntinfo_t *mi = VFTOMI(vfsp);
2208	ushort_t omax;
2209	int rval;
2210
2211	/*
2212	 * Wait for all outstanding async operations to complete and for the
2213	 * worker threads to exit.
2214	 */
2215	mutex_enter(&mi->mi_async_lock);
2216	omax = mi->mi_max_threads;
2217	mi->mi_max_threads = 0;
2218	/*
2219	 * Tell all the worker threads to exit.
2220	 */
2221	NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2222	while (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 ||
2223	    mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0) {
2224		if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock))
2225			break;
2226	}
2227	rval = (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 ||
2228	    mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0); /* Interrupted */
2229	if (rval)
2230		mi->mi_max_threads = omax;
2231	mutex_exit(&mi->mi_async_lock);
2232
2233	return (rval);
2234}
2235
2236int
2237writerp(rnode_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
2238{
2239	int pagecreate;
2240	int n;
2241	int saved_n;
2242	caddr_t saved_base;
2243	u_offset_t offset;
2244	int error;
2245	int sm_error;
2246	vnode_t *vp = RTOV(rp);
2247
2248	ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
2249	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
2250	if (!vpm_enable) {
2251		ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
2252	}
2253
2254	/*
2255	 * Move bytes in at most PAGESIZE chunks. We must avoid
2256	 * spanning pages in uiomove() because page faults may cause
2257	 * the cache to be invalidated out from under us. The r_size is not
2258	 * updated until after the uiomove. If we push the last page of a
2259	 * file before r_size is correct, we will lose the data written past
2260	 * the current (and invalid) r_size.
2261	 */
2262	do {
2263		offset = uio->uio_loffset;
2264		pagecreate = 0;
2265
2266		/*
2267		 * n is the number of bytes required to satisfy the request
2268		 *   or the number of bytes to fill out the page.
2269		 */
2270		n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount);
2271
2272		/*
2273		 * Check to see if we can skip reading in the page
2274		 * and just allocate the memory.  We can do this
2275		 * if we are going to rewrite the entire mapping
2276		 * or if we are going to write to or beyond the current
2277		 * end of file from the beginning of the mapping.
2278		 *
2279		 * The read of r_size is now protected by r_statelock.
2280		 */
2281		mutex_enter(&rp->r_statelock);
2282		/*
2283		 * When pgcreated is nonzero the caller has already done
2284		 * a segmap_getmapflt with forcefault 0 and S_WRITE. With
2285		 * segkpm this means we already have at least one page
2286		 * created and mapped at base.
2287		 */
2288		pagecreate = pgcreated ||
2289		    ((offset & PAGEOFFSET) == 0 &&
2290		    (n == PAGESIZE || ((offset + n) >= rp->r_size)));
2291
2292		mutex_exit(&rp->r_statelock);
2293		if (!vpm_enable && pagecreate) {
2294			/*
2295			 * The last argument tells segmap_pagecreate() to
2296			 * always lock the page, as opposed to sometimes
2297			 * returning with the page locked. This way we avoid a
2298			 * fault on the ensuing uiomove(), but also
2299			 * more importantly (to fix bug 1094402) we can
2300			 * call segmap_fault() to unlock the page in all
2301			 * cases. An alternative would be to modify
2302			 * segmap_pagecreate() to tell us when it is
2303			 * locking a page, but that's a fairly major
2304			 * interface change.
2305			 */
2306			if (pgcreated == 0)
2307				(void) segmap_pagecreate(segkmap, base,
2308				    (uint_t)n, 1);
2309			saved_base = base;
2310			saved_n = n;
2311		}
2312
2313		/*
2314		 * The number of bytes of data in the last page cannot
2315		 * be accurately determined while the page is being
2316		 * uiomove'd to and the size of the file is being updated.
2317		 * Thus, inform threads which need to know accurately
2318		 * how much data is in the last page of the file.  They
2319		 * will not do the i/o immediately, but will arrange for
2320		 * the i/o to happen later when this modify operation
2321		 * will have finished.
2322		 */
2323		ASSERT(!(rp->r_flags & RMODINPROGRESS));
2324		mutex_enter(&rp->r_statelock);
2325		rp->r_flags |= RMODINPROGRESS;
2326		rp->r_modaddr = (offset & MAXBMASK);
2327		mutex_exit(&rp->r_statelock);
2328
2329		if (vpm_enable) {
2330			/*
2331			 * Copy data. If new pages are created, part of
2332			 * the page that is not written will be initialized
2333			 * with zeros.
2334			 */
2335			error = vpm_data_copy(vp, offset, n, uio,
2336			    !pagecreate, NULL, 0, S_WRITE);
2337		} else {
2338			error = uiomove(base, n, UIO_WRITE, uio);
2339		}
2340
2341		/*
2342		 * r_size is the maximum number of
2343		 * bytes known to be in the file.
2344		 * Make sure it is at least as high as the
2345		 * first unwritten byte pointed to by uio_loffset.
2346		 */
2347		mutex_enter(&rp->r_statelock);
2348		if (rp->r_size < uio->uio_loffset)
2349			rp->r_size = uio->uio_loffset;
2350		rp->r_flags &= ~RMODINPROGRESS;
2351		rp->r_flags |= RDIRTY;
2352		mutex_exit(&rp->r_statelock);
2353
2354		/* n = # of bytes written */
2355		n = (int)(uio->uio_loffset - offset);
2356
2357		if (!vpm_enable) {
2358			base += n;
2359		}
2360		tcount -= n;
2361		/*
2362		 * If we created pages w/o initializing them completely,
2363		 * we need to zero the part that wasn't set up.
2364		 * This happens in most EOF write cases and if
2365		 * we had some sort of error during the uiomove.
2366		 */
2367		if (!vpm_enable && pagecreate) {
2368			if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
2369				(void) kzero(base, PAGESIZE - n);
2370
2371			if (pgcreated) {
2372				/*
2373				 * Caller is responsible for this page,
2374				 * it was not created in this loop.
2375				 */
2376				pgcreated = 0;
2377			} else {
2378				/*
2379				 * For bug 1094402: segmap_pagecreate locks
2380				 * page. Unlock it. This also unlocks the
2381				 * pages allocated by page_create_va() in
2382				 * segmap_pagecreate().
2383				 */
2384				sm_error = segmap_fault(kas.a_hat, segkmap,
2385				    saved_base, saved_n,
2386				    F_SOFTUNLOCK, S_WRITE);
2387				if (error == 0)
2388					error = sm_error;
2389			}
2390		}
2391	} while (tcount > 0 && error == 0);
2392
2393	return (error);
2394}
2395
2396int
2397nfs_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
2398{
2399	rnode_t *rp;
2400	page_t *pp;
2401	u_offset_t eoff;
2402	u_offset_t io_off;
2403	size_t io_len;
2404	int error;
2405	int rdirty;
2406	int err;
2407
2408	rp = VTOR(vp);
2409	ASSERT(rp->r_count > 0);
2410
2411	if (!vn_has_cached_data(vp))
2412		return (0);
2413
2414	ASSERT(vp->v_type != VCHR);
2415
2416	/*
2417	 * If ROUTOFSPACE is set, then all writes turn into B_INVAL
2418	 * writes.  B_FORCE is set to force the VM system to actually
2419	 * invalidate the pages, even if the i/o failed.  The pages
2420	 * need to get invalidated because they can't be written out
2421	 * because there isn't any space left on either the server's
2422	 * file system or in the user's disk quota.  The B_FREE bit
2423	 * is cleared to avoid confusion as to whether this is a
2424	 * request to place the page on the freelist or to destroy
2425	 * it.
2426	 */
2427	if ((rp->r_flags & ROUTOFSPACE) ||
2428	    (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2429		flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
2430
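	/*
	 * A length of zero means process every page at or beyond "off";
	 * otherwise only the range [off, off + len) is examined below.
	 */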
2431	if (len == 0) {
2432		/*
2433		 * If doing a full file synchronous operation, then clear
2434		 * the RDIRTY bit.  If a page gets dirtied while the flush
2435		 * is happening, then RDIRTY will get set again.  The
2436		 * RDIRTY bit must get cleared before the flush so that
2437		 * we don't lose this information.
2438		 *
2439		 * If there are no full file async write operations
2440		 * pending and RDIRTY bit is set, clear it.
2441		 */
2442		if (off == (u_offset_t)0 &&
2443		    !(flags & B_ASYNC) &&
2444		    (rp->r_flags & RDIRTY)) {
2445			mutex_enter(&rp->r_statelock);
2446			rdirty = (rp->r_flags & RDIRTY);
2447			rp->r_flags &= ~RDIRTY;
2448			mutex_exit(&rp->r_statelock);
2449		} else if (flags & B_ASYNC && off == (u_offset_t)0) {
2450			mutex_enter(&rp->r_statelock);
2451			if (rp->r_flags & RDIRTY && rp->r_awcount == 0) {
2452				rdirty = (rp->r_flags & RDIRTY);
2453				rp->r_flags &= ~RDIRTY;
2454			}
2455			mutex_exit(&rp->r_statelock);
2456		} else
2457			rdirty = 0;
2458
2459		/*
2460		 * Search the entire vp list for pages >= off, and flush
2461		 * the dirty pages.
2462		 */
2463		error = pvn_vplist_dirty(vp, off, rp->r_putapage,
2464		    flags, cr);
2465
2466		/*
2467		 * If an error occurred and the file was marked as dirty
2468		 * before and we aren't forcibly invalidating pages, then
2469		 * reset the RDIRTY flag.
2470		 */
2471		if (error && rdirty &&
2472		    (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
2473			mutex_enter(&rp->r_statelock);
2474			rp->r_flags |= RDIRTY;
2475			mutex_exit(&rp->r_statelock);
2476		}
2477	} else {
2478		/*
2479		 * Do a range from [off...off + len) looking for pages
2480		 * to deal with.
2481		 */
2482		error = 0;
2483#ifdef lint
2484		io_len = 0;
2485#endif
2486		eoff = off + len;
2487		mutex_enter(&rp->r_statelock);
2488		for (io_off = off; io_off < eoff && io_off < rp->r_size;
2489		    io_off += io_len) {
2490			mutex_exit(&rp->r_statelock);
2491			/*
2492			 * If we are not invalidating, synchronously
2493			 * freeing, or writing pages, use the routine
2494			 * page_lookup_nowait() to prevent reclaiming
2495			 * them from the free list.
2496			 */
2497			if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
2498				pp = page_lookup(vp, io_off,
2499				    (flags & (B_INVAL | B_FREE)) ?
2500				    SE_EXCL : SE_SHARED);
2501			} else {
2502				pp = page_lookup_nowait(vp, io_off,
2503				    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2504			}
2505
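			/*
			 * If the page isn't there or isn't dirty, just
			 * step over it; otherwise push it out using the
			 * vnode's putapage routine.
			 */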
2506			if (pp == NULL || !pvn_getdirty(pp, flags))
2507				io_len = PAGESIZE;
2508			else {
2509				err = (*rp->r_putapage)(vp, pp, &io_off,
2510				    &io_len, flags, cr);
2511				if (!error)
2512					error = err;
2513				/*
2514				 * "io_off" and "io_len" are returned as
2515				 * the range of pages we actually wrote.
2516				 * This allows us to skip ahead more quickly
2517				 * since several pages may've been dealt
2518				 * with by this iteration of the loop.
2519				 */
2520			}
2521			mutex_enter(&rp->r_statelock);
2522		}
2523		mutex_exit(&rp->r_statelock);
2524	}
2525
2526	return (error);
2527}
2528
2529void
2530nfs_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
2531{
2532	rnode_t *rp;
2533
2534	rp = VTOR(vp);
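	/*
	 * Only one thread at a time may be truncating/invalidating the
	 * pages of this rnode; others wait on r_cv until RTRUNCATE is
	 * cleared.
	 */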
2535	mutex_enter(&rp->r_statelock);
2536	while (rp->r_flags & RTRUNCATE)
2537		cv_wait(&rp->r_cv, &rp->r_statelock);
2538	rp->r_flags |= RTRUNCATE;
2539	if (off == (u_offset_t)0) {
2540		rp->r_flags &= ~RDIRTY;
2541		if (!(rp->r_flags & RSTALE))
2542			rp->r_error = 0;
2543	}
2544	rp->r_truncaddr = off;
2545	mutex_exit(&rp->r_statelock);
2546	(void) pvn_vplist_dirty(vp, off, rp->r_putapage,
2547	    B_INVAL | B_TRUNC, cr);
2548	mutex_enter(&rp->r_statelock);
2549	rp->r_flags &= ~RTRUNCATE;
2550	cv_broadcast(&rp->r_cv);
2551	mutex_exit(&rp->r_statelock);
2552}
2553
2554static int nfs_write_error_to_cons_only = 0;
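/*
 * The format strings passed to MSG() begin with '^', which tells
 * cmn_err() to direct the message to the console only.  When
 * nfs_write_error_to_cons_only is clear, skip that first character so
 * the message goes to both the console and the system log.
 */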
2555#define	MSG(x)	(nfs_write_error_to_cons_only ? (x) : (x) + 1)
2556
2557/*
2558 * Print a file handle
2559 */
2560void
2561nfs_printfhandle(nfs_fhandle *fhp)
2562{
2563	int *ip;
2564	char *buf;
2565	size_t bufsize;
2566	char *cp;
2567
2568	/*
2569	 * 13 == "(file handle:"
2570 * maximum of NFS_FHANDLE_LEN / sizeof (*ip) elements in fh_buf times
2571	 *	1 == ' '
2572	 *	8 == maximum strlen of "%x"
2573	 * 3 == ")\n\0"
2574	 */
2575	bufsize = 13 + ((NFS_FHANDLE_LEN / sizeof (*ip)) * (1 + 8)) + 3;
2576	buf = kmem_alloc(bufsize, KM_NOSLEEP);
2577	if (buf == NULL)
2578		return;
2579
2580	cp = buf;
2581	(void) strcpy(cp, "(file handle:");
2582	while (*cp != '\0')
2583		cp++;
2584	for (ip = (int *)fhp->fh_buf;
2585	    ip < (int *)&fhp->fh_buf[fhp->fh_len];
2586	    ip++) {
2587		(void) sprintf(cp, " %x", *ip);
2588		while (*cp != '\0')
2589			cp++;
2590	}
2591	(void) strcpy(cp, ")\n");
2592
2593	zcmn_err(getzoneid(), CE_CONT, MSG("^%s"), buf);
2594
2595	kmem_free(buf, bufsize);
2596}
2597
2598/*
2599 * Notify the system administrator that an NFS write error has
2600 * occurred.
2601 */
2602
2603/* seconds between ENOSPC/EDQUOT messages */
2604clock_t nfs_write_error_interval = 5;
2605
2606void
2607nfs_write_error(vnode_t *vp, int error, cred_t *cr)
2608{
2609	mntinfo_t *mi;
2610	clock_t now;
2611
2612	mi = VTOMI(vp);
2613	/*
2614	 * In case of forced unmount or zone shutdown, do not print any
2615	 * messages since it can flood the console with error messages.
2616	 */
2617	if (FS_OR_ZONE_GONE(mi->mi_vfsp))
2618		return;
2619
2620	/*
2621	 * No use in flooding the console with ENOSPC
2622	 * messages from the same file system.
2623	 */
2624	now = ddi_get_lbolt();
2625	if ((error != ENOSPC && error != EDQUOT) ||
2626	    now - mi->mi_printftime > 0) {
2627		zoneid_t zoneid = mi->mi_zone->zone_id;
2628
2629#ifdef DEBUG
2630		nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
2631		    mi->mi_vers, VTOR(vp)->r_server->sv_hostname, NULL);
2632#else
2633		nfs_perror(error, "NFS write error on host %s: %m.\n",
2634		    VTOR(vp)->r_server->sv_hostname, NULL);
2635#endif
2636		if (error == ENOSPC || error == EDQUOT) {
2637			zcmn_err(zoneid, CE_CONT,
2638			    MSG("^File: userid=%d, groupid=%d\n"),
2639			    crgetuid(cr), crgetgid(cr));
2640			if (crgetuid(CRED()) != crgetuid(cr) ||
2641			    crgetgid(CRED()) != crgetgid(cr)) {
2642				zcmn_err(zoneid, CE_CONT,
2643				    MSG("^User: userid=%d, groupid=%d\n"),
2644				    crgetuid(CRED()), crgetgid(CRED()));
2645			}
2646			mi->mi_printftime = now +
2647			    nfs_write_error_interval * hz;
2648		}
2649		nfs_printfhandle(&VTOR(vp)->r_fh);
2650#ifdef DEBUG
2651		if (error == EACCES) {
2652			zcmn_err(zoneid, CE_CONT,
2653			    MSG("^nfs_bio: cred is%s kcred\n"),
2654			    cr == kcred ? "" : " not");
2655		}
2656#endif
2657	}
2658}
2659
2660/* ARGSUSED */
2661static void *
2662nfs_mi_init(zoneid_t zoneid)
2663{
2664	struct mi_globals *mig;
2665
2666	mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
2667	mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
2668	list_create(&mig->mig_list, sizeof (mntinfo_t),
2669	    offsetof(mntinfo_t, mi_zone_node));
2670	mig->mig_destructor_called = B_FALSE;
2671	return (mig);
2672}
2673
2674/*
2675 * Callback routine to tell all NFS mounts in the zone to stop creating new
2676 * threads.  Existing threads should exit.
2677 */
2678/* ARGSUSED */
2679static void
2680nfs_mi_shutdown(zoneid_t zoneid, void *data)
2681{
2682	struct mi_globals *mig = data;
2683	mntinfo_t *mi;
2684
2685	ASSERT(mig != NULL);
2686again:
2687	mutex_enter(&mig->mig_lock);
2688	for (mi = list_head(&mig->mig_list); mi != NULL;
2689	    mi = list_next(&mig->mig_list, mi)) {
2690
2691		/*
2692		 * If we've done the shutdown work for this FS, skip.
2693		 * Once we go off the end of the list, we're done.
2694		 */
2695		if (mi->mi_flags & MI_DEAD)
2696			continue;
2697
2698		/*
2699		 * We will do work, so not done.  Get a hold on the FS.
2700		 */
2701		VFS_HOLD(mi->mi_vfsp);
2702
2703		/*
2704		 * purge the DNLC for this filesystem
2705		 */
2706		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
2707
2708		mutex_enter(&mi->mi_async_lock);
2709		/*
2710		 * Tell existing async worker threads to exit.
2711		 */
2712		mi->mi_max_threads = 0;
2713		NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2714		/*
2715		 * Set MI_ASYNC_MGR_STOP so the async manager thread starts
2716		 * getting ready to exit when it's done with its current work.
2717		 * Also set MI_DEAD to note we've acted on this FS.
2718		 */
2719		mutex_enter(&mi->mi_lock);
2720		mi->mi_flags |= (MI_ASYNC_MGR_STOP|MI_DEAD);
2721		mutex_exit(&mi->mi_lock);
2722		/*
2723		 * Wake up the async manager thread.
2724		 */
2725		cv_broadcast(&mi->mi_async_reqs_cv);
2726		mutex_exit(&mi->mi_async_lock);
2727
2728		/*
2729		 * Drop lock and release FS, which may change list, then repeat.
2730		 * We're done when every mi has been done or the list is empty.
2731		 */
2732		mutex_exit(&mig->mig_lock);
2733		VFS_RELE(mi->mi_vfsp);
2734		goto again;
2735	}
2736	mutex_exit(&mig->mig_lock);
2737}
2738
2739static void
2740nfs_mi_free_globals(struct mi_globals *mig)
2741{
2742	list_destroy(&mig->mig_list);	/* makes sure the list is empty */
2743	mutex_destroy(&mig->mig_lock);
2744	kmem_free(mig, sizeof (*mig));
2745
2746}
2747
2748/* ARGSUSED */
2749static void
2750nfs_mi_destroy(zoneid_t zoneid, void *data)
2751{
2752	struct mi_globals *mig = data;
2753
2754	ASSERT(mig != NULL);
2755	mutex_enter(&mig->mig_lock);
2756	if (list_head(&mig->mig_list) != NULL) {
2757		/* Still waiting for VFS_FREEVFS() */
2758		mig->mig_destructor_called = B_TRUE;
2759		mutex_exit(&mig->mig_lock);
2760		return;
2761	}
2762	nfs_mi_free_globals(mig);
2763}
2764
2765/*
2766 * Add an NFS mount to the per-zone list of NFS mounts.
2767 */
2768void
2769nfs_mi_zonelist_add(mntinfo_t *mi)
2770{
2771	struct mi_globals *mig;
2772
2773	mig = zone_getspecific(mi_list_key, mi->mi_zone);
2774	mutex_enter(&mig->mig_lock);
2775	list_insert_head(&mig->mig_list, mi);
2776	mutex_exit(&mig->mig_lock);
2777}
2778
2779/*
2780 * Remove an NFS mount from the per-zone list of NFS mounts.
2781 */
2782static void
2783nfs_mi_zonelist_remove(mntinfo_t *mi)
2784{
2785	struct mi_globals *mig;
2786
2787	mig = zone_getspecific(mi_list_key, mi->mi_zone);
2788	mutex_enter(&mig->mig_lock);
2789	list_remove(&mig->mig_list, mi);
2790	/*
2791	 * We can be called asynchronously by VFS_FREEVFS() after the zone
2792	 * shutdown/destroy callbacks have executed; if so, clean up the zone's
2793	 * mi globals.
2794	 */
2795	if (list_head(&mig->mig_list) == NULL &&
2796	    mig->mig_destructor_called == B_TRUE) {
2797		nfs_mi_free_globals(mig);
2798		return;
2799	}
2800	mutex_exit(&mig->mig_lock);
2801}
2802
2803/*
2804 * NFS Client initialization routine.  This routine should only be called
2805 * once.  It performs the following tasks:
2806 *	- Initialize all global locks
2807 *	- Call sub-initialization routines (localize access to variables)
2808 */
2809int
2810nfs_clntinit(void)
2811{
2812#ifdef DEBUG
2813	static boolean_t nfs_clntup = B_FALSE;
2814#endif
2815	int error;
2816
2817#ifdef DEBUG
2818	ASSERT(nfs_clntup == B_FALSE);
2819#endif
2820
2821	error = nfs_subrinit();
2822	if (error)
2823		return (error);
2824
2825	error = nfs_vfsinit();
2826	if (error) {
2827		/*
2828		 * Cleanup nfs_subrinit() work
2829		 */
2830		nfs_subrfini();
2831		return (error);
2832	}
2833	zone_key_create(&mi_list_key, nfs_mi_init, nfs_mi_shutdown,
2834	    nfs_mi_destroy);
2835
2836	nfs4_clnt_init();
2837
2838	nfscmd_init();
2839
2840#ifdef DEBUG
2841	nfs_clntup = B_TRUE;
2842#endif
2843
2844	return (0);
2845}
2846
2847/*
2848 * This routine is only called if the NFS Client has been initialized but
2849 * the module failed to be installed.  This routine cleans up the previously
2850 * allocated/initialized work.
2851 */
2852void
2853nfs_clntfini(void)
2854{
2855	(void) zone_key_delete(mi_list_key);
2856	nfs_subrfini();
2857	nfs_vfsfini();
2858	nfs4_clnt_fini();
2859	nfscmd_fini();
2860}
2861
2862/*
2863 * nfs_lockrelease:
2864 *
2865 * Release any locks on the given vnode that are held by the current
2866 * process.
2867 */
2868void
2869nfs_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr)
2870{
2871	flock64_t ld;
2872	struct shrlock shr;
2873	char *buf;
2874	int remote_lock_possible;
2875	int ret;
2876
2877	ASSERT((uintptr_t)vp > KERNELBASE);
2878
2879	/*
2880	 * Generate an explicit unlock operation for the entire file.  As a
2881	 * partial optimization, only generate the unlock if there is a
2882	 * lock registered for the file.  We could check whether this
2883	 * particular process has any locks on the file, but that would
2884	 * require the local locking code to provide yet another query
2885	 * routine.  Note that no explicit synchronization is needed here.
2886	 * At worst, flk_has_remote_locks() will return a false positive,
2887	 * in which case the unlock call wastes time but doesn't harm
2888	 * correctness.
2889	 *
2890	 * In addition, an unlock request is generated if the process
2891	 * is listed as possibly having a lock on the file because the
2892	 * server and client lock managers may have gotten out of sync.
2893	 * N.B. It is important to make sure nfs_remove_locking_id() is
2894	 * called here even if flk_has_remote_locks(vp) reports true.
2895	 * If it is not called and there is an entry on the process id
2896	 * list, that entry will never get removed.
2897	 */
2898	remote_lock_possible = nfs_remove_locking_id(vp, RLMPL_PID,
2899	    (char *)&(ttoproc(curthread)->p_pid), NULL, NULL);
2900	if (remote_lock_possible || flk_has_remote_locks(vp)) {
2901		ld.l_type = F_UNLCK;	/* set to unlock entire file */
2902		ld.l_whence = 0;	/* unlock from start of file */
2903		ld.l_start = 0;
2904		ld.l_len = 0;		/* do entire file */
2905		ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, cr,
2906		    NULL);
2907
2908		if (ret != 0) {
2909			/*
2910			 * If VOP_FRLOCK fails, make sure we unregister
2911			 * local locks before we continue.
2912			 */
2913			ld.l_pid = ttoproc(curthread)->p_pid;
2914			lm_register_lock_locally(vp, NULL, &ld, flag, offset);
2915#ifdef DEBUG
2916			nfs_perror(ret,
2917			    "NFS lock release error on vp %p: %m.\n",
2918			    (void *)vp, NULL);
2919#endif
2920		}
2921
2922		/*
2923		 * The call to VOP_FRLOCK may put the pid back on the
2924		 * list.  We need to remove it.
2925		 */
2926		(void) nfs_remove_locking_id(vp, RLMPL_PID,
2927		    (char *)&(ttoproc(curthread)->p_pid), NULL, NULL);
2928	}
2929
2930	/*
2931	 * As long as the vp has a share matching our pid,
2932	 * pluck it off and unshare it.  There are circumstances in
2933	 * which the call to nfs_remove_locking_id() may put the
2934	 * owner back on the list, in which case we simply do a
2935	 * redundant and harmless unshare.
2936	 */
2937	buf = kmem_alloc(MAX_SHR_OWNER_LEN, KM_SLEEP);
2938	while (nfs_remove_locking_id(vp, RLMPL_OWNER,
2939	    (char *)NULL, buf, &shr.s_own_len)) {
2940		shr.s_owner = buf;
2941		shr.s_access = 0;
2942		shr.s_deny = 0;
2943		shr.s_sysid = 0;
2944		shr.s_pid = curproc->p_pid;
2945
2946		ret = VOP_SHRLOCK(vp, F_UNSHARE, &shr, flag, cr, NULL);
2947#ifdef DEBUG
2948		if (ret != 0) {
2949			nfs_perror(ret,
2950			    "NFS share release error on vp %p: %m.\n",
2951			    (void *)vp, NULL);
2952		}
2953#endif
2954	}
2955	kmem_free(buf, MAX_SHR_OWNER_LEN);
2956}
2957
2958/*
2959 * nfs_lockcompletion:
2960 *
2961 * If the vnode has a lock that makes it unsafe to cache the file, mark it
2962 * as non cachable (set VNOCACHE bit).
2963 */
2964
2965void
2966nfs_lockcompletion(vnode_t *vp, int cmd)
2967{
2968#ifdef DEBUG
2969	rnode_t *rp = VTOR(vp);
2970
2971	ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2972#endif
2973
2974	if (cmd == F_SETLK || cmd == F_SETLKW) {
2975		if (!lm_safemap(vp)) {
2976			mutex_enter(&vp->v_lock);
2977			vp->v_flag |= VNOCACHE;
2978			mutex_exit(&vp->v_lock);
2979		} else {
2980			mutex_enter(&vp->v_lock);
2981			vp->v_flag &= ~VNOCACHE;
2982			mutex_exit(&vp->v_lock);
2983		}
2984	}
2985	/*
2986	 * The cached attributes of the file are stale after acquiring
2987	 * the lock on the file. They were updated when the file was
2988	 * opened, but not updated when the lock was acquired. Therefore the
2989	 * cached attributes are invalidated after the lock is obtained.
2990	 */
2991	PURGE_ATTRCACHE(vp);
2992}
2993
2994/*
2995 * The lock manager holds state making it possible for the client
2996 * and server to be out of sync.  For example, if the response from
2997 * the server granting a lock request is lost, the server will think
2998 * the lock is granted and the client will think the lock is lost.
2999 * The client can tell when it is not certain whether it is in sync
3000 * with the server.
3001 *
3002 * To deal with this, a list of processes for which the client is
3003 * not sure if the server holds a lock is attached to the rnode.
3004 * When such a process closes the rnode, an unlock request is sent
3005 * to the server to unlock the entire file.
3006 *
3007 * The list is kept as a singly linked NULL terminated list.
3008 * Because it is only added to under extreme error conditions, the
3009 * list shouldn't get very big.  DEBUG kernels print a message if
3010 * the list gets bigger than nfs_lmpl_high_water.  The limit is
3011 * arbitrarily chosen and can be tuned at runtime.
3012 */
3013#ifdef DEBUG
3014/* int nfs_lmpl_high_water = 8; */
3015int nfs_lmpl_high_water = 128;
3016int nfs_cnt_add_locking_id = 0;
3017int nfs_len_add_locking_id = 0;
3018#endif /* DEBUG */
3019
3020/*
3021 * Record that the nfs lock manager server may be holding a lock on
3022 * a vnode for a process.
3023 *
3024 * Because the nfs lock manager server holds state, it is possible
3025 * for the server to get out of sync with the client.  This routine is called
3026 * from the client when it is no longer sure if the server is in sync
3027 * with the client.  nfs_lockrelease() will then notice this and send
3028 * an unlock request when the file is closed.
3029 */
3030void
3031nfs_add_locking_id(vnode_t *vp, pid_t pid, int type, char *id, int len)
3032{
3033	rnode_t *rp;
3034	lmpl_t *new;
3035	lmpl_t *cur;
3036	lmpl_t **lmplp;
3037#ifdef DEBUG
3038	int list_len = 1;
3039#endif /* DEBUG */
3040
3041#ifdef DEBUG
3042	++nfs_cnt_add_locking_id;
3043#endif /* DEBUG */
3044	/*
3045	 * allocate new lmpl_t now so we don't sleep
3046	 * later after grabbing mutexes
3047	 */
3048	ASSERT(len < MAX_SHR_OWNER_LEN);
3049	new = kmem_alloc(sizeof (*new), KM_SLEEP);
3050	new->lmpl_type = type;
3051	new->lmpl_pid = pid;
3052	new->lmpl_owner = kmem_alloc(len, KM_SLEEP);
3053	bcopy(id, new->lmpl_owner, len);
3054	new->lmpl_own_len = len;
3055	new->lmpl_next = (lmpl_t *)NULL;
3056#ifdef DEBUG
3057	if (type == RLMPL_PID) {
3058		ASSERT(len == sizeof (pid_t));
3059		ASSERT(pid == *(pid_t *)new->lmpl_owner);
3060	} else {
3061		ASSERT(type == RLMPL_OWNER);
3062	}
3063#endif
3064
3065	rp = VTOR(vp);
3066	mutex_enter(&rp->r_statelock);
3067
3068	/*
3069	 * Add this id to the list for this rnode only if the
3070	 * rnode is active and the id is not already there.
3071	 */
3072	ASSERT(rp->r_flags & RHASHED);
3073	lmplp = &(rp->r_lmpl);
3074	for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) {
3075		if (cur->lmpl_pid == pid &&
3076		    cur->lmpl_type == type &&
3077		    cur->lmpl_own_len == len &&
3078		    bcmp(cur->lmpl_owner, new->lmpl_owner, len) == 0) {
3079			kmem_free(new->lmpl_owner, len);
3080			kmem_free(new, sizeof (*new));
3081			break;
3082		}
3083		lmplp = &cur->lmpl_next;
3084#ifdef DEBUG
3085		++list_len;
3086#endif /* DEBUG */
3087	}
3088	if (cur == (lmpl_t *)NULL) {
3089		*lmplp = new;
3090#ifdef DEBUG
3091		if (list_len > nfs_len_add_locking_id) {
3092			nfs_len_add_locking_id = list_len;
3093		}
3094		if (list_len > nfs_lmpl_high_water) {
3095			cmn_err(CE_WARN, "nfs_add_locking_id: long list "
3096			    "vp=%p is %d", (void *)vp, list_len);
3097		}
3098#endif /* DEBUG */
3099	}
3100
3101#ifdef DEBUG
3102	if (share_debug) {
3103		int nitems = 0;
3104		int npids = 0;
3105		int nowners = 0;
3106
3107		/*
3108		 * Count the number of things on r_lmpl after the add.
3109		 */
3110		for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL;
3111		    cur = cur->lmpl_next) {
3112			nitems++;
3113			if (cur->lmpl_type == RLMPL_PID) {
3114				npids++;
3115			} else if (cur->lmpl_type == RLMPL_OWNER) {
3116				nowners++;
3117			} else {
3118				cmn_err(CE_PANIC, "nfs_add_locking_id: "
3119				    "unrecognized lmpl_type %d",
3120				    cur->lmpl_type);
3121			}
3122		}
3123
3124		cmn_err(CE_CONT, "nfs_add_locking_id(%s): %d PIDs + %d "
3125		    "OWNs = %d items left on r_lmpl\n",
3126		    (type == RLMPL_PID) ? "P" : "O", npids, nowners, nitems);
3127	}
3128#endif
3129
3130	mutex_exit(&rp->r_statelock);
3131}
3132
3133/*
3134 * Remove an id from the lock manager id list.
3135 *
3136 * If the id is not in the list return 0.  If it was found and
3137 * removed, return 1.
3138 */
3139static int
3140nfs_remove_locking_id(vnode_t *vp, int type, char *id, char *rid, int *rlen)
3141{
3142	lmpl_t *cur;
3143	lmpl_t **lmplp;
3144	rnode_t *rp;
3145	int rv = 0;
3146
3147	ASSERT(type == RLMPL_PID || type == RLMPL_OWNER);
3148
3149	rp = VTOR(vp);
3150
3151	mutex_enter(&rp->r_statelock);
3152	ASSERT(rp->r_flags & RHASHED);
3153	lmplp = &(rp->r_lmpl);
3154
3155	/*
3156	 * Search through the list and remove the entry for this id
3157	 * if it is there.  The special case id == NULL allows removal
3158	 * of the first share on the r_lmpl list belonging to the
3159	 * current process (if any), without regard to further details
3160	 * of its identity.
3161	 */
3162	for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) {
3163		if (cur->lmpl_type == type &&
3164		    cur->lmpl_pid == curproc->p_pid &&
3165		    (id == (char *)NULL ||
3166		    bcmp(cur->lmpl_owner, id, cur->lmpl_own_len) == 0)) {
3167			*lmplp = cur->lmpl_next;
3168			ASSERT(cur->lmpl_own_len < MAX_SHR_OWNER_LEN);
3169			if (rid != NULL) {
3170				bcopy(cur->lmpl_owner, rid, cur->lmpl_own_len);
3171				*rlen = cur->lmpl_own_len;
3172			}
3173			kmem_free(cur->lmpl_owner, cur->lmpl_own_len);
3174			kmem_free(cur, sizeof (*cur));
3175			rv = 1;
3176			break;
3177		}
3178		lmplp = &cur->lmpl_next;
3179	}
3180
3181#ifdef DEBUG
3182	if (share_debug) {
3183		int nitems = 0;
3184		int npids = 0;
3185		int nowners = 0;
3186
3187		/*
3188		 * Count the number of things left on r_lmpl after the remove.
3189		 */
3190		for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL;
3191		    cur = cur->lmpl_next) {
3192			nitems++;
3193			if (cur->lmpl_type == RLMPL_PID) {
3194				npids++;
3195			} else if (cur->lmpl_type == RLMPL_OWNER) {
3196				nowners++;
3197			} else {
3198				cmn_err(CE_PANIC,
3199				    "nrli: unrecognized lmpl_type %d",
3200				    cur->lmpl_type);
3201			}
3202		}
3203
3204		cmn_err(CE_CONT,
3205		"nrli(%s): %d PIDs + %d OWNs = %d items left on r_lmpl\n",
3206		    (type == RLMPL_PID) ? "P" : "O",
3207		    npids,
3208		    nowners,
3209		    nitems);
3210	}
3211#endif
3212
3213	mutex_exit(&rp->r_statelock);
3214	return (rv);
3215}
3216
3217void
3218nfs_free_mi(mntinfo_t *mi)
3219{
3220	ASSERT(mi->mi_flags & MI_ASYNC_MGR_STOP);
3221	ASSERT(mi->mi_manager_thread == NULL);
3222	ASSERT(mi->mi_threads[NFS_ASYNC_QUEUE] == 0 &&
3223	    mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] == 0);
3224
3225	/*
3226	 * Remove the node from the global list before we start tearing it down.
3227	 */
3228	nfs_mi_zonelist_remove(mi);
3229	if (mi->mi_klmconfig) {
3230		lm_free_config(mi->mi_klmconfig);
3231		kmem_free(mi->mi_klmconfig, sizeof (struct knetconfig));
3232	}
3233	mutex_destroy(&mi->mi_lock);
3234	mutex_destroy(&mi->mi_remap_lock);
3235	mutex_destroy(&mi->mi_async_lock);
3236	mutex_destroy(&mi->mi_rnodes_lock);
3237	cv_destroy(&mi->mi_failover_cv);
3238	cv_destroy(&mi->mi_async_work_cv[NFS_ASYNC_QUEUE]);
3239	cv_destroy(&mi->mi_async_work_cv[NFS_ASYNC_PGOPS_QUEUE]);
3240	cv_destroy(&mi->mi_async_reqs_cv);
3241	cv_destroy(&mi->mi_async_cv);
3242	list_destroy(&mi->mi_rnodes);
3243	zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFS);
3244	kmem_free(mi, sizeof (*mi));
3245}
3246
3247static int
3248mnt_kstat_update(kstat_t *ksp, int rw)
3249{
3250	mntinfo_t *mi;
3251	struct mntinfo_kstat *mik;
3252	vfs_t *vfsp;
3253	int i;
3254
3255	/* this is a read-only kstat. Bail out on a write */
3256	if (rw == KSTAT_WRITE)
3257		return (EACCES);
3258
3259	/*
3260	 * We don't want to wait here as kstat_chain_lock could be held by
3261	 * dounmount(). dounmount() takes vfs_reflock before the chain lock
3262	 * and thus could lead to a deadlock.
3263	 */
3264	vfsp = (struct vfs *)ksp->ks_private;
3265
3266
3267	mi = VFTOMI(vfsp);
3268
3269	mik = (struct mntinfo_kstat *)ksp->ks_data;
3270
3271	(void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
3272	mik->mik_vers = (uint32_t)mi->mi_vers;
3273	mik->mik_flags = mi->mi_flags;
3274	mik->mik_secmod = mi->mi_curr_serv->sv_secdata->secmod;
3275	mik->mik_curread = (uint32_t)mi->mi_curread;
3276	mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
3277	mik->mik_retrans = mi->mi_retrans;
3278	mik->mik_timeo = mi->mi_timeo;
3279	mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
3280	mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
3281	mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
3282	mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
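	/* Copy the per-call-type RPC timer statistics. */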
3283	for (i = 0; i < NFS_CALLTYPES + 1; i++) {
3284		mik->mik_timers[i].srtt = (uint32_t)mi->mi_timers[i].rt_srtt;
3285		mik->mik_timers[i].deviate =
3286		    (uint32_t)mi->mi_timers[i].rt_deviate;
3287		mik->mik_timers[i].rtxcur =
3288		    (uint32_t)mi->mi_timers[i].rt_rtxcur;
3289	}
3290	mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
3291	mik->mik_failover = (uint32_t)mi->mi_failover;
3292	mik->mik_remap = (uint32_t)mi->mi_remap;
3293	(void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
3294
3295	return (0);
3296}
3297
3298void
3299nfs_mnt_kstat_init(struct vfs *vfsp)
3300{
3301	mntinfo_t *mi = VFTOMI(vfsp);
3302
3303	/*
3304	 * Create the version specific kstats.
3305	 *
3306	 * PSARC 2001/697 Contract Private Interface
3307	 * All nfs kstats are under SunMC contract
3308	 * Please refer to the PSARC listed above and contact
3309	 * SunMC before making any changes!
3310	 *
3311	 * Changes must be reviewed by Solaris File Sharing
3312	 * Changes must be communicated to contract-2001-697@sun.com
3313	 *
3314	 */
3315
3316	mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
3317	    NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
3318	if (mi->mi_io_kstats) {
3319		if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
3320			kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
3321		mi->mi_io_kstats->ks_lock = &mi->mi_lock;
3322		kstat_install(mi->mi_io_kstats);
3323	}
3324
3325	if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
3326	    getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
3327	    sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
3328		if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
3329			kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
3330		mi->mi_ro_kstats->ks_update = mnt_kstat_update;
3331		mi->mi_ro_kstats->ks_private = (void *)vfsp;
3332		kstat_install(mi->mi_ro_kstats);
3333	}
3334}
3335
3336nfs_delmapcall_t *
3337nfs_init_delmapcall()
3338{
3339	nfs_delmapcall_t	*delmap_call;
3340
3341	delmap_call = kmem_alloc(sizeof (nfs_delmapcall_t), KM_SLEEP);
3342	delmap_call->call_id = curthread;
3343	delmap_call->error = 0;
3344
3345	return (delmap_call);
3346}
3347
3348void
3349nfs_free_delmapcall(nfs_delmapcall_t *delmap_call)
3350{
3351	kmem_free(delmap_call, sizeof (nfs_delmapcall_t));
3352}
3353
3354/*
3355 * Searches for the current delmap caller (based on curthread) in the list of
3356 * callers.  If it is found, we remove it and free the delmap caller.
3357 * Returns:
3358 *	0 if the caller wasn't found
3359 *	1 if the caller was found, removed and freed.  *errp is set to what
3360 *	the result of the delmap was.
3361 */
3362int
3363nfs_find_and_delete_delmapcall(rnode_t *rp, int *errp)
3364{
3365	nfs_delmapcall_t	*delmap_call;
3366
3367	/*
3368	 * If the list doesn't exist yet, we create it and return
3369	 * that the caller wasn't found.  No list = no callers.
3370	 */
3371	mutex_enter(&rp->r_statelock);
3372	if (!(rp->r_flags & RDELMAPLIST)) {
3373		/* The list does not exist */
3374		list_create(&rp->r_indelmap, sizeof (nfs_delmapcall_t),
3375		    offsetof(nfs_delmapcall_t, call_node));
3376		rp->r_flags |= RDELMAPLIST;
3377		mutex_exit(&rp->r_statelock);
3378		return (0);
3379	} else {
3380		/* The list exists so search it */
3381		for (delmap_call = list_head(&rp->r_indelmap);
3382		    delmap_call != NULL;
3383		    delmap_call = list_next(&rp->r_indelmap, delmap_call)) {
3384			if (delmap_call->call_id == curthread) {
3385				/* current caller is in the list */
3386				*errp = delmap_call->error;
3387				list_remove(&rp->r_indelmap, delmap_call);
3388				mutex_exit(&rp->r_statelock);
3389				nfs_free_delmapcall(delmap_call);
3390				return (1);
3391			}
3392		}
3393	}
3394	mutex_exit(&rp->r_statelock);
3395	return (0);
3396}
3397